In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from itertools import product
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error


# Load data (replace with your URL or file path if needed)
file_path = 'C:/Users/Usuario/Desktop/Cursos PUCP/Ciclo 2024-2/wage2015_subsample_inference.csv'
data = pd.read_csv(file_path)

df_lwage = data[['lwage']]

# Experience polynomials and indicator variables
exp_vars = ['exp1', 'exp2', 'exp3', 'exp4']
indicator_vars = ['hsg', 'scl', 'clg', 'ad', 'so', 'we', 'ne']

# Step 1: Create dummies for occ2 and ind2
data_with_dummies = pd.get_dummies(data, columns=['occ2', 'ind2'], drop_first=True)

# Step 2: Add the dummy variables to the list of variables for interaction
dummy_vars = [col for col in data_with_dummies.columns if col.startswith('occ2_') or col.startswith('ind2_')]
interaction_vars = exp_vars + indicator_vars + dummy_vars

# Step 3: Create two-way interactions (including interactions with sex)
interaction_terms = []
for var1, var2 in product(['sex'] + interaction_vars, repeat=2):
    if var1 != var2:  # Exclude self-interactions
        interaction_terms.append(f"{var1}_x_{var2}")  # Storing interaction as string

# Step 4: Create a DataFrame for interactions
interaction_df = pd.DataFrame(index=data_with_dummies.index)

# Calculate interactions and add them to the interaction DataFrame
for term in interaction_terms:
    var1, var2 = term.split('_x_')  # Extracting the variables from the interaction term
    interaction_df[term] = data_with_dummies[var1] * data_with_dummies[var2]

# Concatenate the interaction terms with the original data
data_with_interactions = pd.concat([data_with_dummies, interaction_df], axis=1)

# Step 5: Prepare the outcome variable (Y) and normalize it
Y = df_lwage.values.flatten()
scaler_y = StandardScaler()
Y_normalized = scaler_y.fit_transform(Y.reshape(-1, 1)).flatten()

# Step 6: Prepare the predictor matrix (X) without an intercept
basic_vars_with_dummies = ['sex'] + interaction_vars + interaction_terms
X = data_with_interactions[basic_vars_with_dummies].values

# Normalize the predictor matrix (X)
scaler_x = StandardScaler()
X_normalized = scaler_x.fit_transform(X)

# Step 7: Split the data into training and testing sets (90% train, 10% test)
X_train, X_test, Y_train, Y_test = train_test_split(X_normalized, Y_normalized, test_size=0.1, random_state=42)

  interaction_df[term] = data_with_dummies[var1] * data_with_dummies[var2]
  interaction_df[term] = data_with_dummies[var1] * data_with_dummies[var2]
  interaction_df[term] = data_with_dummies[var1] * data_with_dummies[var2]
  interaction_df[term] = data_with_dummies[var1] * data_with_dummies[var2]
  interaction_df[term] = data_with_dummies[var1] * data_with_dummies[var2]
  interaction_df[term] = data_with_dummies[var1] * data_with_dummies[var2]
  interaction_df[term] = data_with_dummies[var1] * data_with_dummies[var2]
  interaction_df[term] = data_with_dummies[var1] * data_with_dummies[var2]
  interaction_df[term] = data_with_dummies[var1] * data_with_dummies[var2]
  interaction_df[term] = data_with_dummies[var1] * data_with_dummies[var2]
  interaction_df[term] = data_with_dummies[var1] * data_with_dummies[var2]
  interaction_df[term] = data_with_dummies[var1] * data_with_dummies[var2]
  interaction_df[term] = data_with_dummies[var1] * data_with_dummies[var2]
  interaction_df[term] = 

In [2]:
# Print out the shapes of the datasets to confirm
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"Y_train shape: {Y_train.shape}")
print(f"Y_test shape: {Y_test.shape}")

X_train shape: (4635, 2809)
X_test shape: (515, 2809)
Y_train shape: (4635,)
Y_test shape: (515,)


In [3]:

# Function to generate a logarithmically spaced grid
def log_spaced_grid(lower_bound, upper_bound, log_spacing):
    log_lower = np.log(lower_bound)
    log_upper = np.log(upper_bound)
    grid = np.exp(np.arange(log_lower, log_upper, log_spacing))
    return grid

# Function to generate k folds for cross-validation
def generate_k_folds(array, k):
    n = array.shape[0]
    indices = np.random.permutation(n)
    fold_sizes = [n // k] * k
    for i in range(n % k):
        fold_sizes[i] += 1
    
    folds = []
    current_index = 0
    for size in fold_sizes:
        fold_mask = np.zeros(n, dtype=bool)
        fold_mask[indices[current_index:current_index + size]] = True
        folds.append(fold_mask)
        current_index += size
    
    return folds

# Function to perform cross-validation and find the optimal lambda
def lasso_cross_validation(Y, X, lambda_bounds, log_spacing, k):
    # Step 1: Generate grid of lambda values
    lambda_grid = log_spaced_grid(lambda_bounds[0], lambda_bounds[1], log_spacing)
    
    # Step 2: Generate k folds for cross-validation
    folds = generate_k_folds(X, k)
    
    all_mse = []  # To store the MSE for each lambda
    optimal_lambda = None
    optimal_coef = None
    min_mse = float('inf')
    
    # Step 3: Loop over each lambda value
    for lamb in lambda_grid:
        mse_fold = []  # Store MSE for each fold for the current lambda
        
        # Step 4: Loop over each fold
        for fold in folds:
            # Split the data into training and testing sets based on the fold
            X_train, X_test = X[~fold], X[fold]
            Y_train, Y_test = Y[~fold], Y[fold]
            
            # Step 5: Fit Lasso model with the current lambda
            lasso_model = Lasso(alpha=lamb, max_iter=50000)
            lasso_model.fit(X_train, Y_train)
            
            # Step 6: Predict on the test set
            Y_pred = lasso_model.predict(X_test)
            
            # Step 7: Calculate the MSE for the fold
            mse = mean_squared_error(Y_test, Y_pred)
            mse_fold.append(mse)
        
        # Average MSE across all folds for the current lambda
        avg_mse = np.mean(mse_fold)
        all_mse.append(avg_mse)
        
        # Step 8: Check if the current lambda yields a lower MSE
        if avg_mse < min_mse:
            min_mse = avg_mse
            optimal_lambda = lamb
            optimal_coef = lasso_model.coef_
    
    # Step 9: Return results in a dictionary
    results = {
        'optimal_lambda': optimal_lambda,
        'optimal_coef': optimal_coef,
        'all_lambdas': lambda_grid,
        'all_mse': np.array(all_mse)
    }
    
    return results

In [4]:
# Set lambda bounds, log spacing, and number of folds
lambda_bounds = (1e-4, 1e2)
log_spacing = 0.5
k_folds = 5

# Find the optimal lambda using cross-validation
results = lasso_cross_validation(Y_train, X_train, lambda_bounds, log_spacing, k_folds)

# Print results
print("Optimal Lambda:", results['optimal_lambda'])
print("Optimal Coefficients:", results['optimal_coef'])
print("All Lambda Values:", results['all_lambdas'])
print("All MSE values:", results['all_mse'])

# Evaluate the model with the optimal lambda on the test set
lasso_model_optimal = Lasso(alpha=results['optimal_lambda'], max_iter=50000)
lasso_model_optimal.fit(X_train, Y_train)
Y_pred_test = lasso_model_optimal.predict(X_test)
test_mse = mean_squared_error(Y_test, Y_pred_test)

print(f"Test MSE with optimal lambda: {test_mse}")