# Ridge Regression


In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from utils import gather_data, mean_squared_percentage_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV

In [2]:
data, label = gather_data()

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [3]:
print(data)

[[0.2892698  0.45066231 0.64734587 ... 0.41429704 0.25819549 0.61207211]
 [0.31694052 0.55230844 0.65178176 ... 0.48142144 0.23709242 0.6115452 ]
 [0.33432687 0.5390752  0.65621765 ... 0.51690668 0.27532381 0.29935741]
 ...
 [0.43496329 0.75916231 0.68579032 ... 0.65843368 0.23535433 0.45645502]
 [0.42205181 0.71724266 0.69281387 ... 0.66065806 0.34573638 0.57730186]
 [0.42783685 0.6851505  0.69983736 ... 0.60991722 0.24168187 0.66570318]]


In [4]:
print(label)

[3.7235899  4.40177822 4.75838089 ... 2.69352317 3.1598022  3.15457821]


In [5]:
''' Split data into testing and training set with 80% for training and 20% for testing '''
x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2)

In [6]:
x_train

array([[0.74405835, 0.92781067, 0.84400415, ..., 0.6406585 , 0.15541473,
        0.75972617],
       [0.6280678 , 0.63935614, 0.92089312, ..., 0.5790717 , 0.35042667,
        0.50697839],
       [0.62946575, 0.85013747, 0.85997336, ..., 0.65389031, 0.32941639,
        0.53804529],
       ...,
       [0.64758501, 0.90077788, 0.9040367 , ..., 0.81482398, 0.27811393,
        0.29270831],
       [0.42783685, 0.6851505 , 0.69983736, ..., 0.60991722, 0.24168187,
        0.66570318],
       [0.87190125, 0.89216608, 0.94307262, ..., 0.70711368, 0.19643481,
        0.62293565]])

In [7]:
''' Try different alpha values with grid search and k fold cross validation (with 10 folds)'''
param_grid = {"alpha":[0.001, 0.01, 0.1, 1]}

# fit GridSearchCV
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(Ridge(), param_grid, cv=10)

# fit the grid search object on the training data
grid_search.fit(X=x_train, y=y_train)

print("Best parameters: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# performance of the best found parameters on the test set
print("Test set score: ", grid_search.score(x_test, y_test))

Best parameters:  {'alpha': 1}
Best cross-validation score:  0.7767231151850428
Test set score:  0.7924081622740287


In [8]:
''' Train model with the best parameters '''
model = Ridge(alpha=0.1).fit(X=x_train, y=y_train)
y_pred = model.predict(x_test)
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"R2: {r2_score(y_test, y_pred)}")
print(f"MSPE: {mean_squared_percentage_error(y_test, y_pred)}")

MSE: 0.26883945698320977
R2: 0.792883318348627
MSPE: 1.2661112172506996


In [9]:
''' Basis function expansion with k fold cross validation（10 folds) while testing different parameters w/ gridsearch cv '''
degrees= [1,2,3,4,5,6,7,8,9,10]

def poly_alphas_k_fold_cv(degrees_lst):
    
    scores = {}
    
    for i in degrees:
        print("Degree: ", i)
        poly = PolynomialFeatures(degree = i, include_bias = False, interaction_only = False)
        x_train_poly = poly.fit_transform(x_train)
        x_train_poly = pd.DataFrame(data = x_train_poly)
        x_test_poly = poly.fit_transform(x_test)
        x_test_poly = pd.DataFrame(data = x_test_poly)
        param_grid = {"alpha":[0.001, 0.01, 0.1, 1]}

        # fit GridSearchCV (implements k-fold cross validation with the param grid above)
        grid_search = GridSearchCV(Ridge(), param_grid, cv=10)

        # fit the grid search object on the training data
        grid_search.fit(X=x_train_poly, y=y_train)

        mse = mean_squared_error(y_test, grid_search.predict(x_test_poly))
        mspe = mean_squared_percentage_error(y_test, grid_search.predict(x_test_poly))
        r2 = r2_score(y_test, grid_search.predict(x_test_poly))
        coefficients = grid_search.best_estimator_.coef_

        scores[i] = {'MSE: ':mse, 'MSPE: ': mspe, 'R2: ': r2, 'Coefficients: ': coefficients}
        
        # performance of the best found parameters on the test set
        print("MSE: ", mse)
        print("MSPE: ", mspe)
        print("R2 Score: ", r2)
        print("Coefficients: ", coefficients)

        print("Best parameters: ", grid_search.best_params_)
        print("Best cross-validation score: ", grid_search.best_score_, "\n")

    return scores

results = poly_alphas_k_fold_cv(degrees)
print(results)

Degree:  1
MSE:  0.2694562141659668
MSPE:  1.2687413174029314
R2 Score:  0.7924081622740287
Coefficients:  [ 2.14943696  1.59624896  1.68500519  0.87385923  0.39690635 -1.17421388
  1.83261645 -0.1689297  -0.74844847]
Best parameters:  {'alpha': 1}
Best cross-validation score:  0.7767231151850428 

Degree:  2
MSE:  0.23193154065061886
MSPE:  1.043225117680832
R2 Score:  0.821317556548826
Coefficients:  [ 0.08883467 -3.96614263 -3.30225628  2.5208538   2.96818499  2.49990558
  1.13032888  0.9060206   2.38635879  1.45986803 -4.03823634  2.60941326
 -3.25161912  2.25035574  2.29122047  4.87983821 -3.80010735  0.04002852
  2.69751063  7.06920651  3.06639025  0.42957436 -3.53134721 -3.30562452
  5.46687167 -1.21820878 -1.47539784  1.414773   -2.47629663  2.75514288
 -5.62232628 -3.44849061  4.13329763  0.17906089  0.10663933 -3.15451833
 -0.02647792 -3.50095063 -1.0225678  -0.47506482 -0.07275782 -3.58639648
  0.41845203 -0.01716691 -0.17442838 -1.05621231 -4.16229479  0.54620463
  4.057720

MSE:  0.20666944878357713
MSPE:  0.9369798267921146
R2 Score:  0.8407797318477465
Coefficients:  [ 0.65146901  0.35609902  0.13290968 ... -0.30603633 -0.40984965
  0.20123231]
Best parameters:  {'alpha': 0.1}
Best cross-validation score:  0.8258853229628267 

Degree:  6
MSE:  0.1987970964717187
MSPE:  0.9015831042226672
R2 Score:  0.8468446730060099
Coefficients:  [ 0.73619636  0.60190529  0.14635447 ... -0.32302013 -0.28208595
 -0.01275655]
Best parameters:  {'alpha': 0.1}
Best cross-validation score:  0.8277837102670761 

Degree:  7
MSE:  0.19199106087886525
MSPE:  0.8673699997266319
R2 Score:  0.8520881128009391
Coefficients:  [ 0.69280154  0.60505503  0.10169202 ... -0.27066273 -0.16223624
  0.02299264]
Best parameters:  {'alpha': 0.1}
Best cross-validation score:  0.8289564533994023 

Degree:  8
MSE:  0.1880305280300484
MSPE:  0.8453866893908702
R2 Score:  0.8551393480266873
Coefficients:  [ 0.59490913  0.49148405  0.03567456 ... -0.20016536 -0.06123966
  0.18325755]
Best paramete

In [12]:
''' Find degree with best R^2 value '''
max_score = 0
max_degree = 0
for degree, stats in results.items():
    if stats['R2: '] > max_score:
        max_score = stats['R2: ']
        max_degree = degree
        
        
print(f'Degree with best score: ', max_degree)
print(f'Corresponding r-squared value: ', max_score)

Degree with best score:  8
Corresponding r-squared value:  0.8551393480266873


In [13]:
''' Train model with the best degree (from basis function expansion) and best parameters '''
poly = PolynomialFeatures(degree = 8)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.fit_transform(x_test)

model = Ridge(alpha=0.1).fit(X=x_train_poly, y=y_train)
y_pred = model.predict(x_test_poly)
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"R2: {r2_score(y_test, y_pred)}")
print(f"MSPE: {mean_squared_percentage_error(y_test, y_pred)}")

MSE: 0.18803052803035875
R2: 0.8551393480264482
MSPE: 0.8453866893931209
