In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from utils import gather_data, mean_squared_percentage_error

# LASSO Model for Feature Selection
The goal of this IPython notebook is to run a LASSO model and use its results to determine if any features are extraneous.

In [2]:
# Load the dataset through the project-local helper imported from `utils`;
# the returned pair is unpacked into `data` and `label`.
# NOTE(review): presumably `data` holds the feature columns and `label` the
# regression target, and the Index shown in this cell's output appears to be
# printed by gather_data() itself — confirm against utils.
data, label = gather_data()

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [3]:
# Hold out 20% of the rows as a test set. A fixed random_state makes the
# shuffle (and therefore every metric and coefficient computed from this
# split) reproducible across Restart & Run All; without it each run draws a
# different split.
x_train, x_test, y_train, y_test = train_test_split(
    data, label, test_size=0.2, random_state=42
)

In [4]:
# Sweep the L1 penalty strength (Lasso's `alpha`) and record, for each value,
# the held-out metrics and the fitted coefficients.
# `stats` maps alpha -> {"MSE", "R2", "MSPE", "coef"}.
stats = {}
for l1 in [1e-5, 1e-4, 1e-3, 0.01, 0.1, 0.5, 1]:
    lasso = Lasso(alpha=l1)
    lasso.fit(x_train, y_train)
    # Predict once per alpha and reuse the result for every metric instead of
    # re-running the model on x_test for each one.
    y_pred = lasso.predict(x_test)
    stats[l1] = {
        "MSE": mean_squared_error(y_test, y_pred),
        "R2": r2_score(y_test, y_pred),
        # Project-local metric from utils (presumably a mean squared
        # percentage error — confirm its exact definition there).
        "MSPE": mean_squared_percentage_error(y_test, y_pred),
        # A coefficient of exactly 0 means LASSO dropped that feature.
        "coef": lasso.coef_,
    }
stats

{1e-05: {'MSE': 0.25713672443725305,
  'R2': 0.8101095268790461,
  'MSPE': 1.3602337975345378,
  'coef': array([ 2.1178995 ,  1.56771813,  1.83978337,  0.91066813,  0.53044798,
         -1.15713972,  1.97306178,  0.04336621, -0.80520365])},
 0.0001: {'MSE': 0.25693980075339273,
  'R2': 0.8102549511920407,
  'MSPE': 1.3594881663598308,
  'coef': array([ 2.12378027,  1.55632517,  1.83508557,  0.90934462,  0.52804258,
         -1.15206406,  1.96760085,  0.02029405, -0.80108403])},
 0.001: {'MSE': 0.25588719609910715,
  'R2': 0.8110322792701247,
  'MSPE': 1.3575817845339877,
  'coef': array([ 2.19377416,  1.51725962,  1.73634423,  0.8852941 ,  0.50351197,
         -1.11554415,  1.93465893, -0.        , -0.74809429])},
 0.01: {'MSE': 0.27082588552158526,
  'R2': 0.80000034748967,
  'MSPE': 1.4490460255272826,
  'coef': array([ 2.90482274,  1.19578384,  0.69874324,  0.63419314,  0.25729436,
         -0.764823  ,  1.62755798, -0.        , -0.20739185])},
 0.1: {'MSE': 0.7984275468274187,
  'R

In [5]:
# Pick the alpha with the highest R2 among the entries recorded in `stats`.
# Iterating over the dict's own keys avoids repeating the alpha grid here, so
# this cell cannot drift out of sync with the sweep that populated `stats`.
# max() returns the first maximal key in insertion order, matching a
# strict-greater-than scan that starts from the first alpha.
# NOTE(review): the R2 values in `stats` were computed on the test split, so
# this selects the hyperparameter on test data; consider a validation split
# or cross-validation if the reported score should be unbiased.
best_l1 = max(stats, key=lambda alpha: stats[alpha]["R2"])
print(stats[best_l1])

{'MSE': 0.25588719609910715, 'R2': 0.8110322792701247, 'MSPE': 1.3575817845339877, 'coef': array([ 2.19377416,  1.51725962,  1.73634423,  0.8852941 ,  0.50351197,
       -1.11554415,  1.93465893, -0.        , -0.74809429])}


In the best LASSO model, the only unused feature (the only one whose coefficient is shrunk to zero) is negative affect. This makes sense because its counterpart feature, positive affect, is also present; the two are likely heavily correlated, so only one of them is useful.