### Cross-Validation in Lasso Regression - Manual Implementation Task

In [1]:
# Libraries
import pandas as pd
import numpy as np
import pyreadr as rr
import math
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Import relevant packages for lasso 
from sklearn.model_selection import KFold
from sklearn.linear_model import LassoCV
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error



**1. Data Preparation**

Load the March Supplement of the U.S. Current Population Survey, year 2015. (wage2015_subsample_inference.Rdata)

In [2]:
rdata_read = rr.read_r("../../data/wage2015_subsample_inference.Rdata")
data = rdata_read[ 'data' ]

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5150 entries, 10 to 32643
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   wage    5150 non-null   float64 
 1   lwage   5150 non-null   float64 
 2   sex     5150 non-null   float64 
 3   shs     5150 non-null   float64 
 4   hsg     5150 non-null   float64 
 5   scl     5150 non-null   float64 
 6   clg     5150 non-null   float64 
 7   ad      5150 non-null   float64 
 8   mw      5150 non-null   float64 
 9   so      5150 non-null   float64 
 10  we      5150 non-null   float64 
 11  ne      5150 non-null   float64 
 12  exp1    5150 non-null   float64 
 13  exp2    5150 non-null   float64 
 14  exp3    5150 non-null   float64 
 15  exp4    5150 non-null   float64 
 16  occ     5150 non-null   category
 17  occ2    5150 non-null   category
 18  ind     5150 non-null   category
 19  ind2    5150 non-null   category
dtypes: category(4), float64(16)
memory usage: 736.3+ KB


We separate the X and y from flexible model

In [4]:
# 2. flexible model
flex = 'lwage ~ sex + shs+hsg+scl+clg+occ2+ind2+mw+so+we + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we)'
flex_results_0 = smf.ols(flex, data=data)
flex_results = smf.ols(flex, data=data).fit()
print(flex_results.summary()) # estimated coefficients
print( "Number of regressors in the basic model:",len(flex_results.params), '\n') # number of regressors in the Flexible Model

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.351
Model:                            OLS   Adj. R-squared:                  0.319
Method:                 Least Squares   F-statistic:                     10.83
Date:                Wed, 10 Apr 2024   Prob (F-statistic):          2.69e-305
Time:                        23:12:28   Log-Likelihood:                -3301.9
No. Observations:                5150   AIC:                             7096.
Df Residuals:                    4904   BIC:                             8706.
Df Model:                         245                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           3.2797      0.284     

In [5]:
# Get exogenous variables from flexible model
X = flex_results_0.exog
X.shape

(5150, 246)

In [6]:
# Set endogenous variable
lwage = data["lwage"]
lwage.shape

(5150,)

**2. Define a Range of Alpha (Lambda in our equation) Values**

Create a list or array of alpha values to iterate over. These will be the different regularization strengths you will test. You should use at least 5 different values of alpha. Hint: You can use small numbers from 0.1 to 0.5

In [7]:
alpha_values = [round(0.1 + i * 0.1, 1) for i in range(5)]

**3. Partition the Dataset for k-Fold Cross-Validation**

Divide the dataset into k subsets (or folds). If you are working with a regression task, ensure the data is shuffled. For classification tasks, the folds should be stratified.

check: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html

In [9]:
kf = KFold(n_splits=3)

for i, (train_index, test_index) in enumerate(kf.split(X)):
     print(f"Fold {i}:")
     print(f"  Train: index={train_index}")
     print(f"  Test:  index={test_index}")

Fold 0:
  Train: index=[1717 1718 1719 ... 5147 5148 5149]
  Test:  index=[   0    1    2 ... 1714 1715 1716]
Fold 1:
  Train: index=[   0    1    2 ... 5147 5148 5149]
  Test:  index=[1717 1718 1719 ... 3431 3432 3433]
Fold 2:
  Train: index=[   0    1    2 ... 3431 3432 3433]
  Test:  index=[3434 3435 3436 ... 5147 5148 5149]


**4. Lasso Regression Implementation**

Implement a function to fit a Lasso Regression model given a training dataset and an alpha value. The function should return the model's coefficients and intercept.

In [None]:
def cvK(X, y, tuning_params, partitions, k):
  n_tuning_params = tuning_params.shape[0]

  partition = partitions[k]
  TRAIN = np.delete(np.arange(0, X.shape[0]), partition)
  TEST = partition
  X_train = X[TRAIN, :]
  y_train = y[TRAIN]

  X_test = X[TEST, :]
  y_test = y[TEST]

  accuracies = np.zeros(n_tuning_params)
  for i in range(0, n_tuning_params):
    lasso = svm.SVC(C = tuning_params[i], kernel = "linear") # change for lasso
    accuracies[i] = svc.fit(X_train, y_train).score(X_test, y_test) # change for lasso
  return accuracies

In [None]:
K = 5
tuning_params = np.logspace(-6, -1, 10)
partitions = np.array_split(np.random.permutation([i for i in range(0, X.shape[0])]), K)

Accuracies = [cvK(X, y, tuning_params, partitions, k) for k in range(0, K)]

CV_accuracy = np.mean(Accuracies, axis = 0)
best_tuning_param = tuning_params[np.argmax(CV_accuracy)]
print(best_tuning_param)