# Ridge Regression


In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from utils import gather_data, mean_squared_percentage_error

In [2]:
data, label = gather_data()

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [3]:
print(pd.DataFrame(data))

             0         1          2         3         4         5         6  \
0     7.302574  0.450662  50.500000  0.718114  0.173169  0.881686  0.414297   
1     7.472446  0.552308  50.799999  0.678896  0.195469  0.850035  0.481421   
2     7.579183  0.539075  51.099998  0.600127  0.125859  0.706766  0.516907   
3     7.552006  0.521104  51.400002  0.495901  0.167723  0.731109  0.479835   
4     7.637953  0.520637  51.700001  0.530935  0.241247  0.775620  0.613513   
...        ...       ...        ...       ...       ...       ...       ...   
1742  8.241609  0.754147  52.150002  0.752826 -0.113937  0.751208  0.733641   
1743  8.274620  0.775388  52.625000  0.762675 -0.084747  0.844209  0.657524   
1744  8.196998  0.759162  53.099998  0.631908 -0.081540  0.830652  0.658434   
1745  8.117733  0.717243  53.575001  0.643303 -0.029376  0.788523  0.660658   
1746  8.153248  0.685151  54.049999  0.667636 -0.109439  0.756945  0.609917   

             7         8  
0     0.258195  0.612072

In [4]:
print(data)

[[ 7.30257368  0.45066231 50.5        ...  0.41429704  0.25819549
   0.61207211]
 [ 7.47244644  0.55230844 50.79999924 ...  0.48142144  0.23709242
   0.6115452 ]
 [ 7.57918262  0.5390752  51.09999847 ...  0.51690668  0.27532381
   0.29935741]
 ...
 [ 8.19699764  0.75916231 53.09999847 ...  0.65843368  0.23535433
   0.45645502]
 [ 8.117733    0.71724266 53.57500076 ...  0.66065806  0.34573638
   0.57730186]
 [ 8.15324783  0.6851505  54.04999924 ...  0.60991722  0.24168187
   0.66570318]]


In [5]:
print(label)

[3.7235899  4.40177822 4.75838089 ... 2.69352317 3.1598022  3.15457821]


In [6]:
x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2)

In [7]:
x_train

array([[ 5.52672339,  0.88767153, 64.40000153, ...,  0.72955298,
         0.35095045,  0.26013625],
       [ 8.54010963,  0.66911107, 57.25      , ...,  0.71478927,
         0.24751894,  0.67028856],
       [ 7.21011782,  0.59073722, 47.04000092, ...,  0.58781487,
         0.36960143,  0.68757826],
       ...,
       [10.47578526,  0.92935014, 66.34999847, ...,  0.51809251,
         0.21355972,  0.3237581 ],
       [ 9.35310936,  0.88496137, 63.92499924, ...,  0.54996824,
         0.21962425,  0.1256534 ],
       [ 7.65750265,  0.48355186, 52.        , ...,  0.54741699,
         0.2733281 ,  0.48284727]])

In [8]:
model = Ridge(alpha=100).fit(X=x_train, y=y_train)

print("R-squared value for training set: ", r2_score(y_train, model.predict(x_train)))
print("R-squared value for testing set: ", r2_score(y_test, model.predict(x_test)))

R-squared value for training set:  0.7055722543040251
R-squared value for testing set:  0.719184761142188


In [9]:
print(mean_squared_percentage_error(y_test, model.predict(x_test)))
print(model.coef_)

1.7951065496867402
[ 0.43762674  0.26640076  0.04744484  0.3206959   0.28291171 -0.35519124
  0.39114412 -0.12916618  0.02616558]


In [10]:
param_grid = {"alpha":[0.001, 0.01, 0.1, 1, 10, 100]}

# fit GridSearchCV
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(Ridge(), param_grid, cv=5)

# fit the grid search object on the training data
grid_search.fit(X=x_train, y=y_train)

print("Best parameters: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# performance of the best found parameters on the test set
print("Test set score: ", grid_search.score(x_test, y_test))

Best parameters:  {'alpha': 0.1}
Best cross-validation score:  0.7797597223293579
Test set score:  0.7796467652581058


In [11]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV

In [12]:
x_train = pd.DataFrame(x_train)
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,5.526723,0.887672,64.400002,0.625526,0.130596,0.839340,0.729553,0.350950,0.260136
1,8.540110,0.669111,57.250000,0.783046,0.076573,0.838610,0.714789,0.247519,0.670289
2,7.210118,0.590737,47.040001,0.716396,0.149183,0.924901,0.587815,0.369601,0.687578
3,9.190257,0.904828,62.080002,0.719413,0.406894,0.970144,0.757047,0.241678,0.653533
4,9.057288,0.796106,65.599998,0.914904,-0.075258,0.663289,0.825840,0.289977,0.702381
...,...,...,...,...,...,...,...,...,...
1392,10.223091,0.858734,66.599998,0.557721,-0.199827,0.907530,0.649912,0.244536,0.279212
1393,10.311100,0.881854,68.349998,0.830843,-0.123667,0.639480,0.565855,0.203388,0.502480
1394,10.475785,0.929350,66.349998,0.698945,-0.239050,0.851745,0.518093,0.213560,0.323758
1395,9.353109,0.884961,63.924999,0.502542,0.009299,0.891075,0.549968,0.219624,0.125653


In [15]:
degrees= [1,2,3,4,5,6,7,8,9]

def poly_alphas_k_fold_cv(degrees_lst):
    
    scores = {}
    
    for i in degrees:
        print("Degree: ", i)
        poly = PolynomialFeatures(degree = i, include_bias = False, interaction_only = False)
        x_train_poly = poly.fit_transform(x_train)
        x_train_poly = pd.DataFrame(data = x_train_poly)
        x_test_poly = poly.fit_transform(x_test)
        x_test_poly = pd.DataFrame(data = x_test_poly)
        param_grid = {"alpha":[0.001, 0.01, 0.1, 1, 10, 100]}

        # fit GridSearchCV (implements k-fold cross validation with the param grid above)
        grid_search = GridSearchCV(Ridge(), param_grid, cv=5)

        # fit the grid search object on the training data
        grid_search.fit(X=x_train_poly, y=y_train)

        mse = mean_squared_error(y_test, grid_search.predict(x_test_poly))
        mspe = mean_squared_percentage_error(y_test, grid_search.predict(x_test_poly))
        r2 = r2_score(y_test, grid_search.predict(x_test_poly))
        coefficients = grid_search.best_estimator_.coef_

        scores[i] = {'MSE: ':mse, 'MSPE: ': mspe, 'R2: ': r2, 'Coefficients: ': coefficients}
        
        # performance of the best found parameters on the test set
        print("MSE: ", mse)
        print("MSPE: ", mspe)
        print("R2 Score: ", r2)
        print("Coefficients: ", coefficients)

        print("Best parameters: ", grid_search.best_params_)
        print("Best cross-validation score: ", grid_search.best_score_, "\n")

    return scores

results = poly_alphas_k_fold_cv(degrees)
print(results)

    
    
    

Degree:  1
MSE:  0.28621844617557907
MSPE:  1.4403849050161723
R2 Score:  0.7796467652581058
Coefficients:  [ 0.332842    1.7114041   0.02603298  0.77820578  0.47052234 -1.2764679
  2.01157569  0.02974885 -0.71161031]
Best parameters:  {'alpha': 0.1}
Best cross-validation score:  0.7797597223293579 

Degree:  2
MSE:  0.24073127832741176
MSPE:  1.0863307188939242
R2 Score:  0.8146663270946006
Coefficients:  [ 5.17507214e-02 -2.78228159e+00 -1.77483235e-01  5.39401256e+00
  2.82931782e+00  2.97509871e-01 -3.89963503e+00  5.21886101e+00
  3.68676986e+00  3.36659681e-02 -1.18072649e+00  1.48898032e-02
 -4.32950545e-01  5.10311976e-01  2.84493889e-01  5.51418269e-01
 -1.29262851e+00 -3.33820853e-01  3.50017701e+00  1.47316263e-01
  3.39897961e+00 -1.89845345e+00 -2.73219988e+00 -2.21165142e+00
  7.60027112e+00 -4.00211293e-01 -1.12517069e-03  4.06334110e-03
 -5.78623425e-02  7.64726592e-02 -4.32576296e-02  1.91910814e-02
  9.60863603e-02  3.41184873e-01  7.60318231e-01 -4.74345647e+00
  5.1

  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,


MSE:  0.4968304286635361
MSPE:  2.8758025623280914
R2 Score:  0.617501270316263
Coefficients:  [-6.31369984e-01  4.91791508e-01 -6.32063649e-01  1.98961677e-01
  3.76160276e-01 -3.81693087e-02  5.52773010e-02 -3.49238852e-01
  6.08743811e-01 -1.32003482e+00 -2.25748883e-01  4.24954898e-01
  5.58850944e-01  5.01319783e-01 -5.38653934e-01 -3.33847849e-01
 -3.02916750e-01  2.27265836e-01 -1.91259822e-01 -1.77920909e-02
 -1.59173144e-02 -1.98934567e-01  5.93095723e-01 -5.75264981e-02
  1.84647898e-01 -3.03543218e-02 -2.30514301e-02  7.61863174e-02
 -4.30027800e-02  5.08057347e-02 -2.59268238e-02  2.70274072e-01
 -4.96615508e-02  8.28582441e-02  2.75430229e-01  1.70109913e-01
  7.17053247e-02  6.06356672e-02  7.00708204e-02  2.65413642e-01
  1.80252693e-01  2.71807267e-01  1.75085312e-01  5.23438834e-01
  1.84967601e-01  7.72642902e-02 -3.23870440e-01  5.34765171e-01
 -1.54577879e-01  7.04558161e-02  7.54283854e-02  7.13938018e-02
  1.88340966e-01  9.27692390e-01 -5.49126530e-03  2.27798044

  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return li

MSE:  0.33036291788393757
MSPE:  1.67589112186462
R2 Score:  0.7456609154050129
Coefficients:  [-6.94576636e-04  1.45961465e-03  9.56283496e-05  2.20520491e-04
  7.02076784e-04 -1.56893847e-04  7.06712224e-06 -9.68343336e-04
  1.09574368e-03 -3.62303768e-03  3.54190578e-03 -1.27736772e-02
  8.19497561e-04  2.35960668e-03 -1.25344321e-03 -8.95153680e-04
 -2.85058596e-03  3.61125054e-03  3.77572560e-04  2.81974686e-02
  4.28392054e-04 -3.06536959e-04  1.31413592e-03  2.37824077e-04
  3.96687880e-04  5.77801681e-04 -3.22589239e-03  3.33970742e-03
  1.55817194e-02 -6.12129783e-04 -8.52255567e-04 -1.69101845e-02
  1.97774779e-02  1.63767218e-04  5.19626432e-04  1.09889823e-04
  1.40345538e-04 -2.32510514e-04  1.53992223e-04  3.25008156e-04
  3.21803565e-04  3.74428283e-04  3.00399077e-04  7.98425356e-04
  2.23300084e-04 -5.71643084e-06 -8.31584709e-04  8.28060475e-04
 -3.19346494e-04 -1.51613958e-05  1.93608075e-04 -2.70970957e-04
  2.78683432e-05  1.12656846e-03 -6.21614609e-03 -3.09315297



MSE:  0.5334144587882551
MSPE:  3.0427803351328464
R2 Score:  0.5893360367836502
Coefficients:  [-7.04177840e-08  3.12025871e-07 -1.04450709e-06 ...  9.08156209e-09
  8.01924571e-08  7.43357507e-07]
Best parameters:  {'alpha': 1}
Best cross-validation score:  0.6905548678063795 

Degree:  6




MSE:  0.23622961126753417
MSPE:  0.9883088898322265
R2 Score:  0.8181320607383594
Coefficients:  [ 1.34998161e-10  4.81297456e-11 -7.80939011e-10 ...  9.49705942e-11
  1.23859612e-10  4.89259934e-11]
Best parameters:  {'alpha': 100}
Best cross-validation score:  0.5571904416270449 

Degree:  7




MSE:  0.35302525786365035
MSPE:  1.4918616284630102
R2 Score:  0.7282136823979308
Coefficients:  [ 1.23671402e-13 -4.10013622e-15 -8.22968445e-13 ...  1.93813603e-14
  2.57512466e-14  1.46462358e-14]
Best parameters:  {'alpha': 0.001}
Best cross-validation score:  0.2901638486682276 

Degree:  8




MSE:  0.5711729149974062
MSPE:  2.1063560783020914
R2 Score:  0.560266638651838
Coefficients:  [-4.19180731e-17 -3.03493983e-18  1.34363753e-16 ...  1.91998392e-18
  3.33177577e-18  2.87687963e-18]
Best parameters:  {'alpha': 0.001}
Best cross-validation score:  -0.3612322829508492 

Degree:  9




MSE:  0.4023392264636234
MSPE:  1.9795731453531058
R2 Score:  0.6902479515229255
Coefficients:  [-3.04810323e-21 -2.41272415e-22  1.75829678e-21 ... -1.47258941e-22
 -5.68333486e-22 -2.14055822e-21]
Best parameters:  {'alpha': 0.001}
Best cross-validation score:  -0.07175473552429627 

{1: {'MSE: ': 0.28621844617557907, 'MSPE: ': 1.4403849050161723, 'R2: ': 0.7796467652581058, 'Coefficients: ': array([ 0.332842  ,  1.7114041 ,  0.02603298,  0.77820578,  0.47052234,
       -1.2764679 ,  2.01157569,  0.02974885, -0.71161031])}, 2: {'MSE: ': 0.24073127832741176, 'MSPE: ': 1.0863307188939242, 'R2: ': 0.8146663270946006, 'Coefficients: ': array([ 5.17507214e-02, -2.78228159e+00, -1.77483235e-01,  5.39401256e+00,
        2.82931782e+00,  2.97509871e-01, -3.89963503e+00,  5.21886101e+00,
        3.68676986e+00,  3.36659681e-02, -1.18072649e+00,  1.48898032e-02,
       -4.32950545e-01,  5.10311976e-01,  2.84493889e-01,  5.51418269e-01,
       -1.29262851e+00, -3.33820853e-01,  3.50017701e+00, 

In [None]:
x_train_poly = poly.fit_transform(x_train)
x_train_poly = pd.DataFrame(data = x_train_poly)
x_train_poly

In [None]:
x_test_poly = poly.fit_transform(x_test)
x_test_poly = pd.DataFrame(data = x_test_poly)
x_test_poly

In [None]:
param_grid = {"alpha":[0.001, 0.01, 0.1, 1, 10, 100]}

# fit GridSearchCV
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(Ridge(), param_grid, cv=5)

# fit the grid search object on the training data
grid_search.fit(X=x_train_poly, y=y_train)

print("Best parameters: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# performance of the best found parameters on the test set
print("Test set score: ", grid_search.score(x_test_poly, y_test))