# Ridge Regression


In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from utils import gather_data, mean_squared_percentage_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the train/test split from the project's `utils` helper.
# NOTE(review): gather_data is defined outside this notebook; the recorded output
# shows it printing the feature column names as a side effect. Split proportions,
# seeding and any feature scaling happen there -- confirm against utils.
x_train, x_test, y_train, y_test = gather_data()

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [3]:
# Sanity-check the training feature matrix returned by gather_data().
print(x_train)

[[0.59691466 0.54038894 0.84252554 ... 0.40171635 0.24253631 0.61630201]
 [0.3711317  0.63775593 0.74567504 ... 0.64917082 0.24598596 0.89956087]
 [0.92383477 0.95814401 0.94750857 ... 0.74425793 0.21106331 0.57473975]
 ...
 [0.54243438 0.81330019 0.81502294 ... 0.78726292 0.33403653 0.68708342]
 [0.44528346 0.83141011 0.70708269 ... 0.66721171 0.19417724 0.40184477]
 [0.70435793 0.92250597 0.84666569 ... 0.54543263 0.20621966 0.50782478]]


In [4]:
# Sanity-check the training target vector returned by gather_data().
print(y_train)

[4.10183716 2.90273428 7.04073143 ... 5.31255007 4.54733515 5.87646627]


In [5]:
# Tune Ridge's regularisation strength (alpha) with a grid search that scores
# every candidate by 10-fold cross-validation on the training data.

param_grid = {"alpha": [0.001, 0.01, 0.1, 1]}

# Build the search object (GridSearchCV is imported with the other
# scikit-learn names in the notebook's import cell).
grid_search = GridSearchCV(Ridge(), param_grid, cv=10)

# Run the cross-validated search on the training data only.
grid_search.fit(X=x_train, y=y_train)

print("Best parameters: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# Score of the refitted best estimator on the held-out test set.
print("Test set score: ", grid_search.score(x_test, y_test))

Best parameters:  {'alpha': 1}
Best cross-validation score:  0.7673660773430219
Test set score:  0.8114508518140432


In [6]:
# Training-set metrics for the Ridge model selected by the grid search.

# With the default refit=True, GridSearchCV has already refitted the winning
# alpha on the full training set, so reuse that estimator instead of
# hard-coding the alpha value and fitting an identical model again.
model = grid_search.best_estimator_
y_train_pred = model.predict(x_train)

# Report the alpha actually in use so the heading cannot drift from the model.
print(f'Training results of model with best alpha = {model.alpha}:\n')
print(f"R2: {r2_score(y_train, y_train_pred)}")
print(f"MSE: {mean_squared_error(y_train, y_train_pred)}")
print(f"MSPE: {mean_squared_percentage_error(y_train, y_train_pred)}")

Training results of model with best alpha = 1:

R2: 0.7058949401360483
MSE: 0.28695419851422693
MSPE: 1.3874206812213723


In [7]:
# Test-set metrics for the Ridge model selected by the grid search.

# Reuse the estimator GridSearchCV refitted on the full training set (default
# refit=True) rather than hard-coding alpha and fitting the same model again.
model = grid_search.best_estimator_
y_pred = model.predict(x_test)

# Report the alpha actually in use so the heading cannot drift from the model.
print(f'Testing results of model with best alpha = {model.alpha}:\n')
print(f"R2: {r2_score(y_test, y_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"MSPE: {mean_squared_percentage_error(y_test, y_pred)}")

Testing results of model with best alpha = 1:

R2: 0.8114508518140432
MSE: 0.2626773531329258
MSPE: 1.355472703558621


In [8]:
# Basis function expansion with k-fold cross-validation (10 folds), testing
# different alpha values with GridSearchCV for every polynomial degree.

# Polynomial degrees 1 through 10, inclusive.
degrees = list(range(1, 11))

def poly_alphas_k_fold_cv(degrees_lst):
    """Tune Ridge's alpha on polynomial expansions of several degrees.

    For each degree in ``degrees_lst`` the notebook-level ``x_train`` and
    ``x_test`` are expanded with ``PolynomialFeatures`` (no bias column), a
    10-fold ``GridSearchCV`` over alpha is fitted on the expanded training
    data, and the search object's predictions are scored against ``y_test``.
    Progress and metrics are printed for every degree.

    Args:
        degrees_lst: Iterable of polynomial degrees to evaluate.

    Returns:
        dict mapping each degree to a dict with the keys ``'MSE: '``,
        ``'MSPE: '``, ``'R2: '``, ``'Alpha: '`` (the ``best_params_`` dict)
        and ``'Coefficients: '`` (``coef_`` of the best estimator).

    NOTE(review): the stored metrics are measured on the test set. Picking a
    degree from them uses test data for model selection; ``best_score_``
    (printed below) is the cross-validated alternative.
    """
    # Same alpha candidates for every degree, so build the grid once.
    param_grid = {"alpha": [0.001, 0.01, 0.1, 1]}
    scores = {}

    # Iterate the argument, not the notebook-level `degrees` list, so callers
    # can actually choose which degrees are evaluated.
    for degree in degrees_lst:
        print("Degree: ", degree)
        poly = PolynomialFeatures(degree=degree, include_bias=False, interaction_only=False)

        # Fit the expansion on the training data only; the test data is merely
        # transformed with the already-fitted expansion.
        x_train_poly = pd.DataFrame(data=poly.fit_transform(x_train))
        x_test_poly = pd.DataFrame(data=poly.transform(x_test))

        # k-fold cross-validation (k=10) over the alpha grid.
        grid_search = GridSearchCV(Ridge(), param_grid, cv=10)
        grid_search.fit(X=x_train_poly, y=y_train)

        # Predict once and reuse the predictions for every metric.
        y_test_pred = grid_search.predict(x_test_poly)
        mse = mean_squared_error(y_test, y_test_pred)
        mspe = mean_squared_percentage_error(y_test, y_test_pred)
        r2 = r2_score(y_test, y_test_pred)
        coefficients = grid_search.best_estimator_.coef_

        scores[degree] = {
            'MSE: ': mse,
            'MSPE: ': mspe,
            'R2: ': r2,
            'Alpha: ': grid_search.best_params_,
            'Coefficients: ': coefficients,
        }

        # Performance of the best found parameters on the test set.
        print("MSE: ", mse)
        print("MSPE: ", mspe)
        print("R2 Score: ", r2)
        print("Coefficients: ", coefficients)

        print("Best parameters: ", grid_search.best_params_)
        print("Best cross-validation score: ", grid_search.best_score_, "\n")

    return scores

# Run the degree/alpha sweep; `results` maps each degree to its metrics dict
# and is read by the best-degree selection further down.
results = poly_alphas_k_fold_cv(degrees)
# NOTE(review): this prints the whole dict, coefficient arrays included, on top
# of the per-degree output the function already printed.
print(results)

Degree:  1
MSE:  0.2626773531329258
MSPE:  1.3554727035586203
R2 Score:  0.8114508518140432
Coefficients:  [ 2.08998458  1.59663455  1.70469817  0.98236942  0.41532465 -1.10079894
  1.92176893  0.02415932 -0.65362029]
Best parameters:  {'alpha': 1}
Best cross-validation score:  0.7673660773430219 

Degree:  2
MSE:  0.21365149756785587
MSPE:  1.0558128487710754
R2 Score:  0.8466414885234208
Coefficients:  [-3.59809446e-01 -3.67657792e+00 -3.35111425e+00  3.11069130e+00
  3.53744458e+00  3.01602151e+00 -9.71443116e-01  2.20955615e+00
  3.18294894e+00  1.44622106e+00 -5.83839157e+00  3.99921272e+00
 -2.75806361e+00  2.58624827e+00  3.41646380e+00  4.68364923e+00
 -3.94614423e+00 -5.34419820e-01  3.79978217e+00  7.15502632e+00
  3.21635657e+00 -2.33965696e-01 -3.82481532e+00 -4.87088433e+00
  4.59424452e+00 -6.86584403e-01 -2.14390542e+00 -1.17331816e+00
 -3.58336862e+00  9.85335067e-01 -9.87290194e-01 -2.48440566e+00
  4.35590299e+00  7.87491741e-01  5.78515608e-01 -3.54141928e+00
  8.111

MSE:  0.1874490084115335
MSPE:  0.8993949110700755
R2 Score:  0.8654495698134598
Coefficients:  [ 0.52799254  0.54393753  0.22424868 ... -0.13699343 -0.24256436
  0.08766144]
Best parameters:  {'alpha': 0.1}
Best cross-validation score:  0.8143737215954392 

Degree:  6
MSE:  0.18541814104898915
MSPE:  0.8835938952454421
R2 Score:  0.8669073213353149
Coefficients:  [ 0.58876835  0.83856822  0.29945984 ... -0.23534682 -0.26902452
 -0.09340214]
Best parameters:  {'alpha': 0.1}
Best cross-validation score:  0.8172321897080366 

Degree:  7
MSE:  0.18462407354332536
MSPE:  0.8818056509006503
R2 Score:  0.8674773010081316
Coefficients:  [ 0.51683457  0.83189114  0.29088237 ... -0.23647869 -0.20717829
 -0.06549261]
Best parameters:  {'alpha': 0.1}
Best cross-validation score:  0.8202781156058876 

Degree:  8
MSE:  0.18485493682399212
MSPE:  0.8901902615628138
R2 Score:  0.8673115879217236
Coefficients:  [ 0.39186829  0.67353233  0.23333442 ... -0.18134087 -0.10481617
  0.05953582]
Best paramet

In [12]:
# Find the degree with the best R^2 value.
# NOTE(review): the R^2 stored in `results` is computed on the test set, so
# this selection uses test data; consider selecting on the cross-validation
# score instead.

if not results:
    raise ValueError("results is empty; run the degree sweep before selecting a degree")

# max() returns the first maximum, matching a strict '>' scan, but needs no
# artificial floor of 0: a sweep whose best R^2 is <= 0 still yields a real
# degree and alpha instead of the placeholder zeros.
max_degree, best_stats = max(results.items(), key=lambda item: item[1]['R2: '])
max_score = best_stats['R2: ']
best_alpha = best_stats['Alpha: ']  # best_params_ dict, e.g. {'alpha': 0.1}

print('Degree with best score: ', max_degree)
print('Corresponding r-squared value: ', max_score)
print('Best alpha value: ', best_alpha)

Degree with best score:  7
Corresponding r-squared value:  0.8674773010081316
Best alpha value:  {'alpha': 0.1}


In [19]:
# Training-set metrics for the polynomial Ridge model with the selected
# degree and alpha.

# Take degree and alpha from the selection step instead of hard-coding them,
# and build the expansion exactly as the sweep did (include_bias=False) so the
# refitted model has the same feature set as the one that was selected.
poly = PolynomialFeatures(degree=max_degree, include_bias=False)
x_train_poly = poly.fit_transform(x_train)

# best_alpha is the best_params_ dict ({'alpha': value}), so unpack it.
model = Ridge(**best_alpha).fit(X=x_train_poly, y=y_train)
y_train_pred = model.predict(x_train_poly)

print(f'Training results of model with best degree = {poly.degree} and alpha = {model.alpha}:\n')
print(f"R2: {r2_score(y_train, y_train_pred)}")
print(f"MSE: {mean_squared_error(y_train, y_train_pred)}")
print(f"MSPE: {mean_squared_percentage_error(y_train, y_train_pred)}")

Training results of model with best degree = 7 and alpha = 0.1:

R2: 0.8682073469863125
MSE: 0.1694809696964119
MSPE: 0.7657550116665577


In [20]:
# Test-set metrics for the polynomial Ridge model fitted in the previous cell.

# Only transform the test data: the expansion was fitted on the training data
# and must not be re-fitted on the test set.
x_test_poly = poly.transform(x_test)
y_pred = model.predict(x_test_poly)

# Read degree and alpha back from the fitted objects so the heading always
# describes the model that produced these numbers.
print(f'Testing results of model with best degree = {poly.degree} and alpha = {model.alpha}:\n')
print(f"R2: {r2_score(y_test, y_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"MSPE: {mean_squared_percentage_error(y_test, y_pred)}")

Testing results of model with best degree = 7 and alpha = 0.1:

R2: 0.8674773010083061
MSE: 0.18462407354308222
MSPE: 0.8818056508980487
