# Ridge Regression


In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from utils import gather_data, mean_squared_percentage_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the modelling inputs via the project helper imported from `utils`;
# it returns a pair that is unpacked into the features (`data`) and the
# regression target (`label`).
data, label = gather_data()

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [4]:
# Sanity check: eyeball the feature values returned by gather_data().
print(data)

[[0.2892698  0.45066231 0.64734587 ... 0.41429704 0.25819549 0.61207211]
 [0.31694052 0.55230844 0.65178176 ... 0.48142144 0.23709242 0.6115452 ]
 [0.33432687 0.5390752  0.65621765 ... 0.51690668 0.27532381 0.29935741]
 ...
 [0.43496329 0.75916231 0.68579032 ... 0.65843368 0.23535433 0.45645502]
 [0.42205181 0.71724266 0.69281387 ... 0.66065806 0.34573638 0.57730186]
 [0.42783685 0.6851505  0.69983736 ... 0.60991722 0.24168187 0.66570318]]


In [5]:
# Sanity check: eyeball the target values returned by gather_data().
print(label)

[3.7235899  4.40177822 4.75838089 ... 2.69352317 3.1598022  3.15457821]


In [6]:
# Hold out 20% of the samples as a test set. The split is random, so pin
# random_state: without it every kernel restart produces a different split
# and none of the scores reported below can be reproduced.
x_train, x_test, y_train, y_test = train_test_split(
    data, label, test_size=0.2, random_state=42
)

In [7]:
# Show the training features as this cell's output.
x_train

array([[0.7524493 , 0.91092735, 0.83328408, ..., 0.58674085, 0.14249723,
        0.5843038 ],
       [0.62595324, 0.76682776, 0.89132044, ..., 0.48930058, 0.31451604,
        0.1134464 ],
       [0.43515825, 0.65929979, 0.67654886, ..., 0.63463145, 0.36709338,
        0.62122107],
       ...,
       [0.47409341, 0.61740076, 0.6854207 , ..., 0.69269502, 0.37802941,
        0.71481675],
       [0.79937991, 0.91351074, 0.90344523, ..., 0.6232006 , 0.25280887,
        0.2062867 ],
       [0.59894794, 0.70291817, 0.90019222, ..., 0.47950411, 0.42925256,
        0.79117841]])

In [9]:
# Candidate regularisation strengths for Ridge.
param_grid = {"alpha": [0.001, 0.01, 0.1, 1]}

# Build the search object: 10-fold cross-validation over every alpha in the
# grid. (GridSearchCV is already imported in the imports cell at the top.)
grid_search = GridSearchCV(Ridge(), param_grid, cv=10)

# Run the cross-validated search on the training data only.
grid_search.fit(X=x_train, y=y_train)

print("Best parameters: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# Performance of the best found parameters on the held-out test set.
print("Test set score: ", grid_search.score(x_test, y_test))

Best parameters:  {'alpha': 1}
Best cross-validation score:  0.787453633019639
Test set score:  0.7539868397853904


In [10]:
# Polynomial degrees to evaluate: 1 through 10 inclusive.
degrees = list(range(1, 11))

def poly_alphas_k_fold_cv(degrees_lst):
    """Grid-search Ridge's alpha on polynomial features of several degrees.

    For every degree in ``degrees_lst`` the notebook-level ``x_train`` /
    ``x_test`` arrays are expanded with ``PolynomialFeatures``, a 10-fold
    ``GridSearchCV`` over ``alpha`` is fitted on the expanded training data,
    and the selected model is evaluated on the expanded test data. Progress
    and metrics are printed as each degree completes.

    Parameters
    ----------
    degrees_lst : iterable of int
        Polynomial degrees to evaluate.

    Returns
    -------
    dict
        Maps each degree to a dict with the keys ``'MSE: '``, ``'MSPE: '``,
        ``'R2: '`` and ``'Coefficients: '`` (the trailing ``': '`` is part of
        each key and is relied on by later cells).

    Notes
    -----
    Reads ``x_train``, ``x_test``, ``y_train`` and ``y_test`` from the
    enclosing notebook namespace rather than taking them as arguments.
    """
    scores = {}

    # The alpha grid does not depend on the degree, so build it once.
    param_grid = {"alpha": [0.001, 0.01, 0.1, 1]}

    # Iterate over the argument, not the notebook-level `degrees` list, so the
    # function actually honours what the caller passes in.
    for degree in degrees_lst:
        print("Degree: ", degree)
        poly = PolynomialFeatures(degree=degree, include_bias=False, interaction_only=False)

        # Fit the expansion on the training data only and merely *apply* it to
        # the test data; transformers should never be fitted on test samples.
        x_train_poly = pd.DataFrame(data=poly.fit_transform(x_train))
        x_test_poly = pd.DataFrame(data=poly.transform(x_test))

        # k-fold cross-validation (k=10) over the alpha grid above.
        grid_search = GridSearchCV(Ridge(), param_grid, cv=10)
        grid_search.fit(X=x_train_poly, y=y_train)

        # Predict once and reuse the predictions for every metric.
        y_pred = grid_search.predict(x_test_poly)
        mse = mean_squared_error(y_test, y_pred)
        # mean_squared_percentage_error is a project helper from `utils`;
        # its exact definition is not visible in this notebook.
        mspe = mean_squared_percentage_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        coefficients = grid_search.best_estimator_.coef_

        scores[degree] = {'MSE: ': mse, 'MSPE: ': mspe, 'R2: ': r2, 'Coefficients: ': coefficients}

        # Performance of the best found parameters on the test set.
        print("MSE: ", mse)
        print("MSPE: ", mspe)
        print("R2 Score: ", r2)
        print("Coefficients: ", coefficients)

        print("Best parameters: ", grid_search.best_params_)
        print("Best cross-validation score: ", grid_search.best_score_, "\n")

    return scores

# Run the degree sweep; `results` maps each degree to its metrics dict and is
# consumed by the best-degree selection cell below.
results = poly_alphas_k_fold_cv(degrees)
print(results)

Degree:  1
MSE:  0.308836584169313
MSPE:  1.3692715295816742
R2 Score:  0.7539868397853904
Coefficients:  [ 2.18882646  1.65718022  1.62819208  1.0183995   0.50704229 -1.07207273
  1.76993343 -0.15978827 -0.70349786]
Best parameters:  {'alpha': 1}
Best cross-validation score:  0.787453633019639 

Degree:  2
MSE:  0.2554497858414649
MSPE:  1.1288055751081374
R2 Score:  0.7965137153033943
Coefficients:  [-0.11312123 -2.73292053 -5.19866065  2.42123741  2.76361434  2.86290337
 -1.94684441 -0.21934786  3.38216034  0.98876989 -4.14934204  4.78830723
 -3.0110674   1.98585184  2.28283603  3.48689126 -4.05371122 -0.10323742
  2.63691552  6.04544572  2.4336308  -0.02845744 -4.36915853 -2.4774705
  5.8452479  -0.6304532  -1.95082393  0.2568807  -3.08106552  3.28607063
 -1.82735266 -2.98499321  3.77948501  0.69590143  0.4000414  -3.31169162
  0.68030297 -1.09595603 -1.14576973 -1.03398217 -0.39877792 -1.9910049
  1.63148713  0.11074887 -0.26173277 -0.52297964 -3.71364466  0.1846069
  4.00987564  

MSE:  0.22468640348542238
MSPE:  0.9984502558227251
R2 Score:  0.8210192217758765
Coefficients:  [ 0.51868884  0.53757653  0.01469028 ... -0.17826061 -0.24854708
  0.21407688]
Best parameters:  {'alpha': 0.1}
Best cross-validation score:  0.8321851124834954 

Degree:  6
MSE:  0.21829887567536976
MSPE:  0.9785436033449357
R2 Score:  0.8261074010365572
Coefficients:  [ 0.52143423  0.8314936   0.10223697 ... -0.22245792 -0.2267698
 -0.09801136]
Best parameters:  {'alpha': 0.1}
Best cross-validation score:  0.8338453692105009 

Degree:  7
MSE:  0.21573100012093335
MSPE:  0.9737771690636494
R2 Score:  0.8281529202935582
Coefficients:  [ 0.42367728  0.87918887  0.09883482 ... -0.19441267 -0.18719224
 -0.16852537]
Best parameters:  {'alpha': 0.1}
Best cross-validation score:  0.8344803285832544 

Degree:  8
MSE:  0.22384697433800357
MSPE:  0.9977168744817768
R2 Score:  0.8216878945559755
Coefficients:  [ 0.15293423  0.1898451  -0.00065455 ... -0.03083914 -0.03518174
 -0.04420044]
Best paramet

In [20]:
# Pick the polynomial degree with the highest R² stored in `results`.
# Start from -inf / None rather than 0: R² can be negative, and a 0 baseline
# would then report a non-existent "degree 0" as the winner.
# NOTE(review): the R² values in `results` are computed on the test set, so
# selecting the degree with them makes the reported score optimistic —
# consider selecting on the cross-validation score instead.
max_score = float("-inf")
max_degree = None
for degree, stats in results.items():
    if stats['R2: '] > max_score:
        max_score = stats['R2: ']
        max_degree = degree

# Plain string literals: the messages have no placeholders, so no f-prefix.
print('Degree with best score: ', max_degree)
print('Corresponding r-squared value: ', max_score)

Degree with best score:  7
Corresponding r-squared value:  0.8281529202935582
