# Ridge Regression


In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from utils import gather_data, mean_squared_percentage_error

In [2]:
# Load the feature matrix (data) and target vector (label) through the project-local helper in utils.
data, label = gather_data()

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [3]:
# Wrap the array in a DataFrame only to get a tabular printout; columns receive default integer labels.
print(pd.DataFrame(data))

             0         1         2         3         4         5         6  \
0     0.289270  0.450662  0.647346  0.718114  0.173169  0.881686  0.414297   
1     0.316941  0.552308  0.651782  0.678896  0.195469  0.850035  0.481421   
2     0.334327  0.539075  0.656218  0.600127  0.125859  0.706766  0.516907   
3     0.329900  0.521104  0.660654  0.495901  0.167723  0.731109  0.479835   
4     0.343900  0.520637  0.665089  0.530935  0.241247  0.775620  0.613513   
...        ...       ...       ...       ...       ...       ...       ...   
1742  0.442230  0.754147  0.671743  0.752826 -0.113937  0.751208  0.733641   
1743  0.447607  0.775388  0.678767  0.762675 -0.084747  0.844209  0.657524   
1744  0.434963  0.759162  0.685790  0.631908 -0.081540  0.830652  0.658434   
1745  0.422052  0.717243  0.692814  0.643303 -0.029376  0.788523  0.660658   
1746  0.427837  0.685151  0.699837  0.667636 -0.109439  0.756945  0.609917   

             7         8  
0     0.258195  0.612072  
1     0.2

In [4]:
# Raw array view of the same feature matrix.
print(data)

[[0.2892698  0.45066231 0.64734587 ... 0.41429704 0.25819549 0.61207211]
 [0.31694052 0.55230844 0.65178176 ... 0.48142144 0.23709242 0.6115452 ]
 [0.33432687 0.5390752  0.65621765 ... 0.51690668 0.27532381 0.29935741]
 ...
 [0.43496329 0.75916231 0.68579032 ... 0.65843368 0.23535433 0.45645502]
 [0.42205181 0.71724266 0.69281387 ... 0.66065806 0.34573638 0.57730186]
 [0.42783685 0.6851505  0.69983736 ... 0.60991722 0.24168187 0.66570318]]


In [5]:
# Target values that are passed alongside data to train_test_split below.
print(label)

[3.7235899  4.40177822 4.75838089 ... 2.69352317 3.1598022  3.15457821]


In [6]:
# Hold out 20% of the rows for testing. Pinning random_state makes the split,
# and therefore every score reported further down, reproducible after a
# kernel restart instead of changing on each run.
x_train, x_test, y_train, y_test = train_test_split(
    data, label, test_size=0.2, random_state=42
)

In [7]:
# Inspect the training features produced by the split.
x_train

array([[0.86963011, 0.95870245, 0.93538378, ..., 0.74534106, 0.23269877,
        0.29256928],
       [0.40408633, 0.56703901, 0.39331658, ..., 0.5502581 , 0.24485564,
        0.45666796],
       [0.7585062 , 0.88702017, 0.85546353, ..., 0.62099171, 0.18952154,
        0.47770366],
       ...,
       [0.85245969, 0.92593759, 0.93479231, ..., 0.64691454, 0.23081189,
        0.34976545],
       [0.19896349, 0.42224002, 0.66242794, ..., 0.57171529, 0.19034494,
        0.85119969],
       [0.78705687, 0.90528989, 0.91113408, ..., 0.64671546, 0.33349761,
        0.45241681]])

In [8]:
# Baseline: ridge regression with a strong penalty (alpha=100) on the untransformed features.
model = Ridge(alpha=100)
model.fit(x_train, y_train)

# Coefficient of determination on the data the model saw and on the held-out rows.
train_r2 = r2_score(y_train, model.predict(x_train))
test_r2 = r2_score(y_test, model.predict(x_test))
print("R-squared value for training set: ", train_r2)
print("R-squared value for testing set: ", test_r2)

R-squared value for training set:  0.576939025696424
R-squared value for testing set:  0.5819084401915762


In [9]:
# Held-out error according to the utils metric, followed by the fitted ridge weights.
baseline_test_mspe = mean_squared_percentage_error(y_test, model.predict(x_test))
print(baseline_test_mspe)
print(model.coef_)

2.27183552927527
[ 1.2067912   0.73376199  0.6563281   0.59784408  0.2495387  -0.64831039
  0.50451657 -0.20164115 -0.23120324]


In [10]:
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths to compare.
param_grid = {"alpha": [0.001, 0.01, 0.1, 1, 10, 100]}

# Build the 5-fold cross-validated search over alpha, then fit it on the training split.
grid_search = GridSearchCV(estimator=Ridge(), param_grid=param_grid, cv=5)
grid_search.fit(x_train, y_train)

# Winning alpha and its mean cross-validation score.
print("Best parameters: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# Score of the refitted best model on the held-out test split.
print("Test set score: ", grid_search.score(x_test, y_test))

Best parameters:  {'alpha': 1}
Best cross-validation score:  0.782372796330527
Test set score:  0.7694072938349543


In [11]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV

In [12]:
# Rebind x_train to a DataFrame (default integer column labels) and display it.
# Only x_train is converted here; x_test is not touched by this cell.
x_train = pd.DataFrame(x_train)
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.869630,0.958702,0.935384,0.834730,0.312668,0.579600,0.745341,0.232699,0.292569
1,0.404086,0.567039,0.393317,0.412588,0.200928,0.681960,0.550258,0.244856,0.456668
2,0.758506,0.887020,0.855464,0.714466,-0.074105,0.823048,0.620992,0.189522,0.477704
3,0.456463,0.792133,0.799054,0.719029,-0.099997,0.922627,0.593906,0.146872,0.457723
4,0.655946,0.889900,0.910543,0.790898,-0.100814,0.842899,0.803392,0.291769,0.271787
...,...,...,...,...,...,...,...,...,...
1392,0.619874,0.532586,0.845483,0.658724,-0.270160,0.320888,0.443486,0.250088,0.610842
1393,0.549274,0.756932,0.861748,0.873569,0.020092,0.782105,0.735027,0.281604,0.384267
1394,0.852460,0.925938,0.934792,0.800878,0.164432,0.792179,0.646915,0.230812,0.349765
1395,0.198963,0.422240,0.662428,0.489863,-0.061204,0.677108,0.571715,0.190345,0.851200


In [13]:
# Polynomial degrees to sweep: 1 through 10 inclusive.
degrees = list(range(1, 11))

def poly_alphas_k_fold_cv(degrees_lst):
    """Grid-search the ridge ``alpha`` for each polynomial degree and score on the test split.

    For every degree in ``degrees_lst`` the training features are expanded with
    ``PolynomialFeatures``, a 5-fold ``GridSearchCV`` over ``alpha`` is fitted on
    the expanded training data, and the refitted best model is evaluated on the
    identically expanded test data. Metrics are printed as each degree finishes.

    Reads the notebook-level ``x_train``, ``x_test``, ``y_train`` and ``y_test``.

    Args:
        degrees_lst: Iterable of polynomial degrees to evaluate.

    Returns:
        dict mapping each degree to a dict with the keys ``'MSE: '``,
        ``'MSPE: '``, ``'R2: '`` and ``'Coefficients: '``.
    """
    # The alpha grid does not depend on the degree, so build it once.
    param_grid = {"alpha": [0.001, 0.01, 0.1, 1, 10, 100]}
    scores = {}

    # Iterate over the argument, not the notebook-level `degrees` list, so the
    # caller actually controls which degrees are evaluated.
    for degree in degrees_lst:
        print("Degree: ", degree)
        poly = PolynomialFeatures(degree=degree, include_bias=False, interaction_only=False)
        x_train_poly = pd.DataFrame(data=poly.fit_transform(x_train))
        # Reuse the transformer fitted on the training features; the test split
        # must only ever be transformed, never fitted on.
        x_test_poly = pd.DataFrame(data=poly.transform(x_test))

        # k-fold cross validation (k=5) over the alpha grid, fitted on training data only.
        grid_search = GridSearchCV(Ridge(), param_grid, cv=5)
        grid_search.fit(X=x_train_poly, y=y_train)

        # Predict on the test split once and reuse the result for every metric.
        y_pred = grid_search.predict(x_test_poly)
        mse = mean_squared_error(y_test, y_pred)
        mspe = mean_squared_percentage_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        coefficients = grid_search.best_estimator_.coef_

        scores[degree] = {'MSE: ': mse, 'MSPE: ': mspe, 'R2: ': r2, 'Coefficients: ': coefficients}

        # Performance of the best found parameters on the test set.
        print("MSE: ", mse)
        print("MSPE: ", mspe)
        print("R2 Score: ", r2)
        print("Coefficients: ", coefficients)

        print("Best parameters: ", grid_search.best_params_)
        print("Best cross-validation score: ", grid_search.best_score_, "\n")

    return scores

# Run the sweep for the degrees defined above, then dump the collected per-degree metrics dict.
results = poly_alphas_k_fold_cv(degrees)
print(results)

Degree:  1
MSE:  0.2763590878602458
MSPE:  1.2161922738888746
R2 Score:  0.7694072938349543
Coefficients:  [ 2.18099334  1.46370738  1.79615339  0.87957833  0.46504741 -1.10265852
  2.00574557 -0.20254652 -0.71116297]
Best parameters:  {'alpha': 1}
Best cross-validation score:  0.782372796330527 

Degree:  2
MSE:  0.24223770454871896
MSPE:  1.0594336755767255
R2 Score:  0.7978780134947279
Coefficients:  [ 0.69447045 -3.784288   -4.77205593  1.78787675  4.12398887  2.50504536
 -0.74692017  0.74486008  2.37809225  0.55412607 -4.56955982  5.30248841
 -3.05549945  2.28034548  1.53221966  4.73241396 -4.56567294 -1.08698397
  3.77571316  5.64548251  3.95555997  0.33221673 -3.92116819 -4.64046507
  5.4557266  -1.20798279 -2.39183387  0.44329928 -3.66282224  3.75307284
 -2.98091725 -2.55719582  5.23062882 -0.72354152  0.26643028 -2.70992922
  1.25604372 -2.32428771  0.22573678 -0.38396745 -0.02038586 -3.66869205
  0.56762316 -0.48006345 -0.43884806 -0.72217545 -3.63038349 -0.01502118
  4.30193

In [15]:
# Degree-4 polynomial expansion: no constant column, and pure powers are kept
# alongside the interaction terms.
poly = PolynomialFeatures(
    degree=4,
    include_bias=False,
    interaction_only=False,
)

In [16]:
# Fit the polynomial expansion on the training features and keep the result as a DataFrame.
x_train_poly = pd.DataFrame(poly.fit_transform(x_train))
x_train_poly

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,704,705,706,707,708,709,710,711,712,713
0,0.869630,0.958702,0.935384,0.834730,0.312668,0.579600,0.745341,0.232699,0.292569,0.756257,...,0.047552,0.009392,0.011808,0.014846,0.018666,0.002932,0.003686,0.004635,0.005827,0.007327
1,0.404086,0.567039,0.393317,0.412588,0.200928,0.681960,0.550258,0.244856,0.456668,0.163286,...,0.063144,0.008078,0.015066,0.028098,0.052404,0.003595,0.006704,0.012503,0.023319,0.043491
2,0.758506,0.887020,0.855464,0.714466,-0.074105,0.823048,0.620992,0.189522,0.477704,0.575332,...,0.088001,0.004227,0.010655,0.026857,0.067696,0.001290,0.003252,0.008197,0.020660,0.052076
3,0.456463,0.792133,0.799054,0.719029,-0.099997,0.922627,0.593906,0.146872,0.457723,0.208358,...,0.073899,0.001882,0.005864,0.018275,0.056954,0.000465,0.001450,0.004519,0.014085,0.043894
4,0.655946,0.889900,0.910543,0.790898,-0.100814,0.842899,0.803392,0.291769,0.271787,0.430265,...,0.047677,0.019955,0.018588,0.017315,0.016129,0.007247,0.006751,0.006288,0.005858,0.005456
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1392,0.619874,0.532586,0.845483,0.658724,-0.270160,0.320888,0.443486,0.250088,0.610842,0.384244,...,0.073387,0.006937,0.016943,0.041384,0.101080,0.003912,0.009554,0.023337,0.057000,0.139224
1393,0.549274,0.756932,0.861748,0.873569,0.020092,0.782105,0.735027,0.281604,0.384267,0.301702,...,0.079776,0.016414,0.022398,0.030564,0.041706,0.006289,0.008581,0.011710,0.015979,0.021804
1394,0.852460,0.925938,0.934792,0.800878,0.164432,0.792179,0.646915,0.230812,0.349765,0.726688,...,0.051197,0.007955,0.012054,0.018267,0.027681,0.002838,0.004301,0.006517,0.009876,0.014966
1395,0.198963,0.422240,0.662428,0.489863,-0.061204,0.677108,0.571715,0.190345,0.851200,0.039586,...,0.236822,0.003943,0.017632,0.078847,0.352593,0.001313,0.005870,0.026251,0.117391,0.524960


In [17]:
# Expand the test features with the transformer already fitted on x_train.
# Preprocessing is learned from training data only, so the test split is
# transformed but never fitted on.
x_test_poly = pd.DataFrame(data=poly.transform(x_test))
x_test_poly

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,704,705,706,707,708,709,710,711,712,713
0,0.849922,0.937495,0.934201,0.812733,0.288102,0.418611,0.712405,0.209572,0.440121,0.722367,...,0.098310,0.006557,0.013771,0.028920,0.060735,0.001929,0.004051,0.008508,0.017867,0.037522
1,0.690167,0.950980,0.889250,0.882047,-0.048693,0.911756,0.825811,0.232063,0.360040,0.476330,...,0.088402,0.010320,0.016012,0.024842,0.038542,0.002900,0.004500,0.006981,0.010831,0.016804
2,0.504471,0.843355,0.819607,0.898377,0.071607,0.783429,0.796158,0.248383,0.495585,0.254491,...,0.155681,0.012200,0.024342,0.048569,0.096907,0.003806,0.007594,0.015152,0.030233,0.060322
3,0.899804,0.946834,0.953423,0.947621,0.252847,0.298814,0.796321,0.209410,0.586872,0.809647,...,0.218405,0.007313,0.020494,0.057434,0.160960,0.001923,0.005389,0.015104,0.042328,0.118624
4,0.671333,0.670689,0.861008,0.777061,-0.285531,0.723407,0.509633,0.240044,0.443021,0.450688,...,0.050976,0.007049,0.013010,0.024010,0.044313,0.003320,0.006128,0.011309,0.020872,0.038521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,0.871351,0.959701,0.943073,0.941436,0.219611,0.191016,0.801433,0.217578,0.579889,0.759253,...,0.215985,0.008255,0.022001,0.058637,0.156279,0.002241,0.005973,0.015919,0.042428,0.113078
346,0.477599,0.904273,0.889102,0.917871,0.206922,0.902516,0.659908,0.226463,0.582115,0.228101,...,0.147565,0.007664,0.019701,0.050640,0.130170,0.002630,0.006761,0.017378,0.044671,0.114825
347,0.836346,0.872384,0.951944,0.737739,-0.061615,0.921075,0.535316,0.235880,0.333916,0.699475,...,0.031952,0.007026,0.009946,0.014079,0.019931,0.003096,0.004382,0.006204,0.008782,0.012432
348,0.788716,0.904635,0.944921,0.838069,-0.227364,0.922192,0.659149,0.326253,0.354542,0.622073,...,0.054614,0.022890,0.024875,0.027032,0.029376,0.011330,0.012312,0.013380,0.014540,0.015801


In [18]:
from sklearn.model_selection import GridSearchCV

# Same alpha candidates as before, now searched on the polynomial features.
param_grid = {"alpha": [0.001, 0.01, 0.1, 1, 10, 100]}

# Build the 5-fold cross-validated search, then fit it on the expanded training data.
grid_search = GridSearchCV(estimator=Ridge(), param_grid=param_grid, cv=5)
grid_search.fit(x_train_poly, y_train)

# Winning alpha and its mean cross-validation score.
print("Best parameters: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# Score of the refitted best model on the expanded test split.
print("Test set score: ", grid_search.score(x_test_poly, y_test))

Best parameters:  {'alpha': 0.01}
Best cross-validation score:  0.8346263683422606
Test set score:  0.8036969587749181
