## GROUP 1 - Python
* DEL CARPIO CUENCA, GABRIEL SEBASTIAN
* ESPINOSA CALDERON, MAURICIO GUSTAVO
* JAIME MARTINEZ, KEVIN OSWALDO
* MELLIZO ANTAZU, MILAGROS ESTEFANY
* QUISPE ROBLADILLO, ALMENDRA VALERIA

In [18]:
import pandas as pd
import numpy as np
import itertools
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from formulaic import Formula
import warnings
warnings.filterwarnings("ignore")

# 1 Data analysis

1. Import the data set. Make sure the column names are imported as intended.

In [19]:
# Data
data = pd.read_csv('wage2015_subsample_inference.csv') 
data = data.drop('rownames',axis=1)
data
#list(data.columns)

Unnamed: 0,wage,lwage,sex,shs,hsg,scl,clg,ad,mw,so,we,ne,exp1,exp2,exp3,exp4,occ,occ2,ind,ind2
0,9.615385,2.263364,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,7.0,0.49,0.343,0.2401,3600.0,11,8370.0,18
1,48.076923,3.872802,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,31.0,9.61,29.791,92.3521,3050.0,10,5070.0,9
2,11.057692,2.403126,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,18.0,3.24,5.832,10.4976,6260.0,19,770.0,4
3,13.942308,2.634928,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,25.0,6.25,15.625,39.0625,420.0,1,6990.0,12
4,28.846154,3.361977,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,22.0,4.84,10.648,23.4256,2015.0,6,9470.0,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5145,14.769231,2.692546,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,9.0,0.81,0.729,0.6561,4700.0,16,4970.0,9
5146,23.076923,3.138833,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,12.0,1.44,1.728,2.0736,4110.0,13,8680.0,20
5147,38.461538,3.649659,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,11.0,1.21,1.331,1.4641,1550.0,4,3680.0,6
5148,32.967033,3.495508,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,10.0,1.00,1.000,1.0000,2920.0,9,6570.0,11


2. As in Group Assignment 1, generate the extra-flexible model. This means that it contains all two-way interactions between the experience polynomials and the indicator variables

2.1. Generate the array for the outcome variable $Y$ and normalize it

In [20]:
y = data["lwage"].to_numpy().reshape(-1, 1)
# Normalize Y
y_normalized = StandardScaler().fit_transform(y)

In [21]:
#print(y_normalized)
#print(y_normalized.shape)

2.2. Generate the array for the predictors 
$X$ (do not generate an intercept) and normalize its colums.

In [22]:
# Matrix
extra_flexible = Formula("sex + (exp1 + exp2 + exp3 + exp4 + hsg + scl + clg + ad + so + we + ne + C(occ2) + C(ind2)) ^ 2").get_model_matrix(data)
# Eliminate intercept
extra_flexible = extra_flexible.loc[:, extra_flexible.columns != 'Intercept']
# conitnue variable
cont = ['exp1', 'exp2', 'exp3', 'exp4']

# Normalization
extra_flexible[cont] = StandardScaler().fit_transform(extra_flexible[cont])
# Final array
X_normalized = extra_flexible.to_numpy()

print(X_normalized.shape)

(5150, 979)


3. Split between training and testing samples. The testing sample should be 10% of the total.

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, train_size = .9, shuffle = True)

## 2. Creating the Lasso Cross-Validation Procedure

4. Program a function that generates a logarithmically spaced grid. 

In [24]:
def log_spaced_grid(lower_bound, upper_bound, num_points):
    log_lower = np.log(lower_bound)
    log_upper = np.log(upper_bound)
    log_grid = np.linspace(log_lower, log_upper, num_points)
    return np.exp(log_grid)

5. Program a function to generate $k$ folds.

In [25]:
def create_k_folds(X, k):
    # Number of samples
    total_samples = X.shape[0] 
    
    # Size of each fold
    fold_sizes = np.full(k, total_samples // k, dtype=int)
    fold_sizes[:total_samples % k] += 1  
    
    # Generate shuffled indices
    indices = np.arange(total_samples)
    np.random.shuffle(indices)
    
    folds = []  # Store folds here
    current_index = 0
    
    # Create the folds
    for size in fold_sizes:
        start, end = current_index, current_index + size
        mask = np.zeros(total_samples, dtype=bool)
        mask[indices[start:end]] = True
        folds.append(mask)
        current_index = end
    
    return folds

6. Program a function that integrates those that you programmed in the last two items to find the value of 
$λ$ that minimizes the testing mean square error across folds. It should take the following inputs:

* Y: an array for the outcome variable.
* X: an array of predictors.
* lambda_bounds: the lower and upper bounds of the grid of lambda values.
* k: number of folds

In [26]:
def opt_lambda(Y, X, lambda_bounds, k):
    # Grid of lambda values
    lambdas = log_spaced_grid(lambda_bounds[0], lambda_bounds[1], num_points=100)
    
    # k folds 
    folds = create_k_folds(X, k)
    
    # Array to store MSE values
    all_mse = np.zeros((len(lambdas), k))
    
    # Evaluating each lambda
    for i, lambda_val in enumerate(lambdas):
        for j, fold in enumerate(folds):
            # Split data into training and testing sets
            X_train, X_test = X[~fold], X[fold]
            Y_train, Y_test = Y[~fold], Y[fold]
            
            # Fit Lasso model
            model = Lasso(alpha=lambda_val, fit_intercept=True)
            model.fit(X_train, Y_train)
            
            # Make predictions
            Y_pred = model.predict(X_test)
            all_mse[i, j] = np.mean((Y_test - Y_pred) ** 2)

    # MSE 
    avg_mse = np.mean(all_mse, axis=1)
    
    # optimal lambda
    optimal_index = np.argmin(avg_mse)
    optimal_lambda_value = lambdas[optimal_index]
    optimal_model = Lasso(alpha=optimal_lambda_value, fit_intercept=True)
    optimal_model.fit(X, Y)
    
    result = {
        'optimal_lambda': optimal_lambda_value,
        'optimal_coef': optimal_model.coef_,
        'all_lambdas': lambdas,
        'all_mse': avg_mse
    }
    
    return result


7. Program a function for predicting the outcome variable through model estimated with the optimal lambda. It should take as inputs
* optimal_model: A dictionary with the values outputed by the function defined for the previous point.
* X: an array of predictors.

The output should be an array of predicted values.

In [27]:
def lasso_predict(optimal_model, X):
    
    optimal_lambda = optimal_model.get('optimal_lambda')
    optimal_coef = optimal_model.get('optimal_coef')
    lasso_model = Lasso(alpha=optimal_lambda, fit_intercept=True)
    
    lasso_model.coef_ = optimal_coef  
    lasso_model.intercept_ = 0       # no intercept
    
    # Predictions
    predicciones = lasso_model.predict(X)
    
    return predicciones  

## 3. Applying the Lasso Cross-Validation Procedure

8. Fit a simple OLS model with the training sample.

In [28]:
# OLS model
ols = LinearRegression().fit(X_train, y_train)  

In [29]:
# Y predicted
y_pred = ols.predict(X_test)

In [30]:
# Metrics for the ols model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE (OLS): {mse:.4f}")
print(f"R^2 (OLS): {r2:.4f}")


MSE (OLS): 0.2394
R^2 (OLS): 0.2322


9. Find the optimal lambda and its related coefficients with the function programmed in 3. and the training sample. Print the lambda and the coefficients.

In [None]:
lambda_bounds = (0.1, 1)
result = opt_lambda(y_train, X_train, lambda_bounds, k=5)
# Results
print(f"Optimal lambda: {result['optimal_lambda']}")
print(f"Coefficients: {result['optimal_coef']}")

10. Use each language's hdm package to fit a Lasso model with the theoretical optimal lambda value

In [31]:
# !git clone https://github.com/maxhuppertz/hdmpy


Cloning into 'hdmpy'...


In [34]:
import sys
sys.path.append(r"C:\Users\almen\OneDrive\Escritorio\ABC_SUBIR\CausalAI-Course\Labs\Assignment\Assigment_2\hdmpy")

import hdmpy as hdm

In [None]:
optimal_lambda = result['optimal_lambda']  

# Lasso model
lasso_hdm = hdm.LassoHDM(alpha=optimal_lambda)  
lasso_hdm.fit(X_train, y_train)  

# Coefficients
coefficients = lasso_hdm.coef_  

# Results
print(f"Optimal lambda: {optimal_lambda}")  
print(f"Coefficients from hdm Lasso model: {coefficients}")  

11. Report the testing MSE and  $R^2$ for the OLS model and, the cross-validation lambda Lasso model, and the hdm theoretical lambda model. 

In [37]:
# Metrics:
# OLS
print(f"MSE (OLS): {mse:.4f}")
print(f"R^2 (OLS): {r2:.4f}")

MSE (OLS): 0.2394
R^2 (OLS): 0.2322


In [None]:
# Lasso cross validation
mse_lasso_cv = mean_squared_error(y_test, lasso_predict(result, X_test))
r2_lasso_cv = r2_score(y_test, lasso_predict(result, X_test))
print(f"MSE (Lasso CV): {mse_lasso_cv:.4f}")
print(f"R^2 (Lasso CV): {r2_lasso_cv:.4f}")

In [None]:
# Lasso  hdmpy
y_pred_lasso_hdm = lasso_hdm.predict(X_test)
mse_lasso_hdm = mean_squared_error(y_test, y_pred_lasso_hdm)
r2_lasso_hdm = r2_score(y_test, y_pred_lasso_hdm)
print(f"MSE (Lasso HDM): {mse_lasso_hdm:.4f}")
print(f"R^2 (Lasso HDM): {r2_lasso_hdm:.4f}")