In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from utils import gather_data, mean_squared_percentage_error

In [2]:
data, label = gather_data()

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [3]:
print(data)

[[0.2892698  0.45066231 0.64734587 ... 0.41429704 0.25819549 0.61207211]
 [0.31694052 0.55230844 0.65178176 ... 0.48142144 0.23709242 0.6115452 ]
 [0.33432687 0.5390752  0.65621765 ... 0.51690668 0.27532381 0.29935741]
 ...
 [0.43496329 0.75916231 0.68579032 ... 0.65843368 0.23535433 0.45645502]
 [0.42205181 0.71724266 0.69281387 ... 0.66065806 0.34573638 0.57730186]
 [0.42783685 0.6851505  0.69983736 ... 0.60991722 0.24168187 0.66570318]]


In [4]:
print(label)

[3.7235899  4.40177822 4.75838089 ... 2.69352317 3.1598022  3.15457821]


In [5]:
x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2)

In [6]:
x_train = pd.DataFrame(x_train)
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.495999,0.855703,0.870250,0.817625,0.040930,0.631003,0.775327,0.315903,0.505156
1,0.596354,0.877814,0.885406,0.890671,-0.137969,0.885330,0.721723,0.195284,0.442859
2,0.769585,0.847027,0.909434,0.528629,-0.177682,0.804879,0.379292,0.350773,0.513677
3,0.311889,0.610780,0.682833,0.717770,0.075185,0.873861,0.521151,0.438134,0.817156
4,0.344937,0.616173,0.785968,0.924232,0.057375,0.163810,0.765132,0.308199,0.988120
...,...,...,...,...,...,...,...,...,...
1392,0.441424,0.846308,0.699394,0.708659,0.014951,0.922664,0.705513,0.227972,0.458982
1393,0.794969,0.908348,0.933314,0.887748,0.050461,0.909118,0.593501,0.290812,0.176851
1394,0.298691,0.724308,0.596777,0.306132,0.028461,0.961074,0.571168,0.262727,0.161063
1395,0.490619,0.779225,0.778205,0.527447,-0.153867,0.777314,0.630861,0.272322,0.292110


In [8]:
from sklearn.svm import SVC  
param_grid = {'C': [0.1, 1, 10, 100, 1000]}
grid_search = GridSearchCV(LinearSVR(max_iter = 1000000), param_grid, cv=10)

# fit the grid search object on the training data
grid_search.fit(X=x_train, y=y_train)

print("Best parameters: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# performance of the best found parameters on the test set
print("Test set score: ", grid_search.score(x_test, y_test))


Best parameters:  {'C': 1}
Best cross-validation score:  0.7796812927517935
Test set score:  0.769543556781862


In [None]:
import time
from tqdm import tqdm
degrees= [1,2,3,4,5,6,7,8,9,10]

def polynomial_k_fold_cv(degrees_lst):
    
    scores = {}
    
    for i in tqdm(degrees):
        print("Degree: ", i)
        poly = PolynomialFeatures(degree = i, include_bias = False, interaction_only = False)
        x_train_poly = poly.fit_transform(x_train)
        x_train_poly = pd.DataFrame(data = x_train_poly)
        x_test_poly = poly.fit_transform(x_test)
        x_test_poly = pd.DataFrame(data = x_test_poly)
        param_grid = {'C': [0.1, 1, 10, 100]}

        # fit GridSearchCV (implements k-fold cross validation with the param grid above)
        grid_search = GridSearchCV(LinearSVR(max_iter = 1000000), param_grid, cv=10)

        # fit the grid search object on the training data
        grid_search.fit(X=x_train_poly, y=y_train)

        mse = mean_squared_error(y_test, grid_search.predict(x_test_poly))
        mspe = mean_squared_percentage_error(y_test, grid_search.predict(x_test_poly))
        r2 = r2_score(y_test, grid_search.predict(x_test_poly))
        coefficients = grid_search.best_estimator_.coef_

        scores[i] = {'MSE: ':mse, 'MSPE: ': mspe, 'R2: ': r2, 'Coefficients: ': coefficients}

        # performance of the best found parameters on the test set
        print("MSE: ", mse)
        print("MSPE: ", mspe)
        print("R2 Score: ", r2)
        print("Coefficients: ", coefficients)

        print("Best parameters: ", grid_search.best_params_)
        print("Best cross-validation score: ", grid_search.best_score_, "\n")

    return scores

results = polynomial_k_fold_cv(degrees)
print(results)

  0%|                                                    | 0/10 [00:00<?, ?it/s]

Degree:  1


 10%|████▍                                       | 1/10 [00:02<00:21,  2.44s/it]

MSE:  0.30784377441023775
MSPE:  1.589927311454641
R2 Score:  0.7690101758961801
Coefficients:  [ 2.38997676  2.04050991  1.19521107  0.97912063  0.3139096  -1.05610938
  1.92371517  0.10150756 -0.46777802]
Best parameters:  {'C': 1}
Best cross-validation score:  0.7795095869286807 

Degree:  2


 20%|████████▊                                   | 2/10 [00:40<03:04, 23.11s/it]

MSE:  0.26395895641462946
MSPE:  1.2357014898339675
R2 Score:  0.8019390418739115
Coefficients:  [ 1.20616579 -6.05489146 -1.95157438  5.70687457  2.90142311  3.43023605
  0.09096631  3.58247785  2.82918124  2.25060904 -7.18270731  4.17678959
 -3.74195886  2.36866847  2.13894327  6.75048003 -6.24337286 -1.56570255
  3.94879982  7.68385894  5.8799588  -0.12019185 -2.46524637 -4.95790733
  7.43126156 -2.30708686 -3.24429643  0.02897165 -4.18206233  2.29809552
 -6.59393969 -4.33745397  7.63640986 -1.77435557  0.34913411 -4.25601651
 -0.11314946 -3.43233339 -1.50041062 -0.48662085 -0.35473275 -1.73974041
  0.96304673  0.80319337 -0.70344868 -0.4229747  -2.95744615 -0.04533131
  5.55344754  7.35846623 -2.74852596  0.44767059 -4.72285454 -2.32127739]
Best parameters:  {'C': 100}
Best cross-validation score:  0.8131537104898671 

Degree:  3


 30%|████████████▉                              | 3/10 [07:57<24:46, 212.30s/it]

MSE:  0.2537497298807596
MSPE:  1.1713969070824277
R2 Score:  0.809599510063702
Coefficients:  [ 1.56501607  0.10645685 -1.39673828  0.6095929   3.71624265 -0.31280003
  2.08410985  0.58569716  2.18885355  4.1823568  -6.80997703  3.34559212
  2.65841156 -2.3030999  -2.13276846  4.74268852  2.08908514 -3.6415734
 -3.111851   -2.06731974  3.71975873  1.13361444 -0.38296015  0.4356786
  3.46873828 -2.92416052 -2.04552014 -1.60260883 -4.30771451  2.25804288
 -1.02799984  3.11588024 -1.46471024  1.65833146  1.82855933 -0.32367186
  2.03254851 -0.66228521 -1.07512872 -7.08448364  1.22032136 -1.57605982
 -0.94322857  4.25323131  0.41021776 -1.32824694 -1.93803877  1.36513666
 -2.52006976  3.43519918  0.48040303 -1.24716623  0.64328042 -0.48280823
 -8.31125383  5.6738663   3.48160024 -2.28987672  3.00159521  3.66728634
  1.00909932 -0.09754511 -1.04057889 -1.90250137 -0.4759938  -1.71449515
 -2.21830793 -4.02937117 -1.87019133 -4.43409987  5.45259194 -0.04785408
  2.26752324  1.69698335  1.967

 40%|████████████████                        | 4/10 [53:58<2:01:51, 1218.59s/it]

MSE:  0.24168572572945987
MSPE:  1.1112023717238673
R2 Score:  0.8186517061077347
Coefficients:  [ 2.66049833e+00  3.22488962e+00 -2.03936549e-01 -5.76447050e-01
  4.88634906e+00 -2.59166235e+00  1.64185750e+00 -1.43330220e+00
  3.33710150e+00  2.10451457e+00 -5.62601289e+00  2.61070321e+00
  2.02511207e+00 -1.70589097e+00 -1.27763369e+00  4.28112611e+00
  3.84973212e+00 -1.48341094e+00 -9.13397392e-01 -1.29944083e+00
  3.67578861e+00  3.06605031e-01  7.21762016e-01  1.15324111e+00
  1.52172609e+00 -1.13129376e+00 -2.52118087e+00 -1.69963348e+00
 -2.62023925e+00  1.73330065e+00  1.65668703e-01  3.33819796e+00
 -1.51885359e+00  2.16761920e-01  2.64554715e+00 -1.44700687e+00
  2.68400479e+00 -8.81583129e-02  9.38677561e-01 -5.03098134e+00
  1.50123777e+00 -1.19457943e+00 -1.57513216e+00  3.37184255e+00
  1.66143210e+00 -3.29190129e+00 -1.96990124e+00 -7.00718399e-01
 -1.88139571e+00  6.59614154e-01  2.60457466e+00 -9.29547215e-01
 -1.05119671e+00  7.50350171e-01 -1.56834550e+00  6.082361

 50%|███████████████████                   | 5/10 [3:37:47<6:00:17, 4323.54s/it]

MSE:  0.24213598160730979
MSPE:  1.1288269022651407
R2 Score:  0.8183138577097104
Coefficients:  [ 1.0396615   0.89978192  0.11659659 ... -0.21343887 -0.89109061
 -0.68000129]
Best parameters:  {'C': 10}
Best cross-validation score:  0.8307231502829848 

Degree:  6


 60%|█████████████████████              | 6/10 [13:24:54<16:28:42, 14830.56s/it]

MSE:  0.23732228645149153
MSPE:  1.0986624570908932
R2 Score:  0.8219258021106068
Coefficients:  [ 0.99922782  0.93454971  0.25082071 ... -0.26630723 -0.52446808
 -0.86167699]
Best parameters:  {'C': 10}
Best cross-validation score:  0.8308282227765569 

Degree:  7
