In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from utils import gather_data, mean_squared_percentage_error
import time

In [2]:
x_train, x_test, y_train, y_test = gather_data()

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [3]:
print(x_train)

[[0.59691466 0.54038894 0.84252554 ... 0.40171635 0.24253631 0.61630201]
 [0.3711317  0.63775593 0.74567504 ... 0.64917082 0.24598596 0.89956087]
 [0.92383477 0.95814401 0.94750857 ... 0.74425793 0.21106331 0.57473975]
 ...
 [0.54243438 0.81330019 0.81502294 ... 0.78726292 0.33403653 0.68708342]
 [0.44528346 0.83141011 0.70708269 ... 0.66721171 0.19417724 0.40184477]
 [0.70435793 0.92250597 0.84666569 ... 0.54543263 0.20621966 0.50782478]]


In [4]:
print(y_train)

[4.10183716 2.90273428 7.04073143 ... 5.31255007 4.54733515 5.87646627]


In [5]:
''' Try different C values with grid search and k fold cross validation (with 10 folds)'''

param_grid = {'C': [0.1, 1, 10, 100, 1000]}
grid_search = GridSearchCV(LinearSVR(max_iter = 1000000), param_grid, cv=10)

# fit the grid search object on the training data
grid_search.fit(X=x_train, y=y_train)

print("Best parameters: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# performance of the best found parameters on the test set
print("Test set score: ", grid_search.score(x_test, y_test))


Best parameters:  {'C': 0.1}
Best cross-validation score:  0.7645820072156463
Test set score:  0.8017600439354854


In [6]:
''' Results on model with the best parameters on training data '''

model = LinearSVR(C=0.1).fit(X=x_train, y=y_train)
y_train_pred = model.predict(x_train)

print('Training results of model with best alpha = 0.1:\n')
print(f"R2: {r2_score(y_train, y_train_pred)}")
print(f"MSE: {mean_squared_error(y_train, y_train_pred)}")
print(f"MSPE: {mean_squared_percentage_error(y_train, y_train_pred)}")

Training results of model with best alpha = 0.1:

R2: 0.7726516799232959
MSE: 0.29236238033275164
MSPE: 1.440899887915134


In [7]:
''' Results on model with the best parameters on testing data '''

y_pred = model.predict(x_test)

print('Testing results of model with best alpha = 0.1:\n')
print(f"R2: {r2_score(y_test, y_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"MSPE: {mean_squared_percentage_error(y_test, y_pred)}")

Testing results of model with best alpha = 0.1:

R2: 0.8018008101923688
MSE: 0.2761213141117576
MSPE: 1.4693637473879915


In [8]:
''' Basis function expansion with k fold cross validation（10 folds) while testing different parameters w/ gridsearch cv '''

degrees= [1,2,3,4,5,6]

def polynomial_k_fold_cv(degrees_lst):
    
    scores = {}
    
    for i in degrees:
        print("Degree: ", i)
        poly = PolynomialFeatures(degree = i, include_bias = False, interaction_only = False)
        x_train_poly = poly.fit_transform(x_train)
        x_train_poly = pd.DataFrame(data = x_train_poly)
        x_test_poly = poly.fit_transform(x_test)
        x_test_poly = pd.DataFrame(data = x_test_poly)
        param_grid = {'C': [0.1, 1, 10, 100]}

        # fit GridSearchCV (implements k-fold cross validation with the param grid above)
        grid_search = GridSearchCV(LinearSVR(max_iter = 1000000), param_grid, cv=10)

        # fit the grid search object on the training data
        grid_search.fit(X=x_train_poly, y=y_train)

        mse = mean_squared_error(y_test, grid_search.predict(x_test_poly))
        mspe = mean_squared_percentage_error(y_test, grid_search.predict(x_test_poly))
        r2 = r2_score(y_test, grid_search.predict(x_test_poly))
        coefficients = grid_search.best_estimator_.coef_

        scores[i] = {'MSE: ':mse, 'MSPE: ': mspe, 'R2: ': r2, 'Coefficients: ': coefficients}

        # performance of the best found parameters on the test set
        print("MSE: ", mse)
        print("MSPE: ", mspe)
        print("R2 Score: ", r2)
        print("Coefficients: ", coefficients)

        print("Best parameters: ", grid_search.best_params_)
        print("Best cross-validation score: ", grid_search.best_score_, "\n")

    return scores

results = polynomial_k_fold_cv(degrees)
print(results)

Degree:  1
MSE:  0.2762273976243259
MSPE:  1.473895754432031
R2 Score:  0.8017246637119327
Coefficients:  [ 2.14552332  1.87338858  1.38800153  1.17589812  0.35605231 -0.99694024
  1.59704699  0.01656403 -0.40187092]
Best parameters:  {'C': 0.1}
Best cross-validation score:  0.7644415711589871 

Degree:  2
MSE:  0.23328083855839782
MSPE:  1.2041978257138908
R2 Score:  0.8325515965739406
Coefficients:  [-1.01848126 -0.73068832 -1.62130874  1.72283546  1.41020689  1.83010765
 -0.21149918  1.88332377  2.29786905  2.02011605 -1.44309457  0.72223828
 -2.29529248  1.50761675  3.14161763  2.66144071 -3.62150446 -0.55581427
  1.12396371  3.34085711  1.44304148  0.75823019 -1.50741462 -1.77078708
  2.40782402 -0.27734753 -0.60194222 -0.07338014 -2.33516365 -0.66742641
  1.25441433 -1.77697671  3.79593742  0.81070918 -0.20642541 -2.55602283
  0.6139565   0.16338    -0.38336361 -1.19309296 -0.57432194 -0.17391151
  0.55814036 -0.10980293  0.14555166 -0.31508345 -1.82958234 -0.22527815
  1.2559849



MSE:  0.18636951206058935
MSPE:  0.8998793693050432
R2 Score:  0.8662244295987165
Coefficients:  [ 4.78383868e-01 -6.55603465e-02  5.84064723e-02  2.97227279e-01
  1.37729526e+00  3.72717300e-01  1.37750932e+00  1.22320621e+00
  6.37737847e-01  1.28528641e+00 -1.34412154e+00  8.22656185e-01
  1.05930869e+00 -3.16868484e-01 -3.56760103e-01  1.49917113e+00
  1.13866393e-01 -8.66327841e-01 -8.53699816e-01 -2.58562948e-01
  8.44223301e-01  3.24133257e-01 -1.42985698e-01 -1.95646196e-01
  9.59185200e-01 -7.43477948e-01 -3.30758500e-01  3.74795150e-02
 -2.30240825e-01  1.92550293e-01  1.69921390e-01  6.87965862e-01
  8.89280615e-02  4.13041625e-01  9.06342144e-01 -9.34198161e-02
  8.11722660e-01  4.87003121e-01 -6.37932245e-01 -1.82333043e+00
  8.69042812e-01 -5.78587817e-01  2.84552379e-01  4.79564607e-01
  3.53206241e-01  1.10787706e-01  2.24530422e-01  3.73696908e-01
  2.33572341e-02  1.76492666e+00  2.51601861e-01  4.29095300e-01
  4.91219149e-01  8.50473317e-01 -1.75232309e-01  3.718849



MSE:  0.18358503833764295
MSPE:  0.8738614015524938
R2 Score:  0.8682231178843489
Coefficients:  [ 0.53083859  0.98149017  0.3151446  ... -0.04215272 -0.51958735
 -1.05810024]
Best parameters:  {'C': 10}
Best cross-validation score:  0.8124138681456377 

Degree:  6




MSE:  0.19006365440628628
MSPE:  0.9039532337590401
R2 Score:  0.8635727834470729
Coefficients:  [ 0.3975896   1.36172226  0.24576575 ... -0.20781626 -0.21862544
 -0.79210743]
Best parameters:  {'C': 10}
Best cross-validation score:  0.810189858510365 

{1: {'MSE: ': 0.2762273976243259, 'MSPE: ': 1.473895754432031, 'R2: ': 0.8017246637119327, 'Coefficients: ': array([ 2.14552332,  1.87338858,  1.38800153,  1.17589812,  0.35605231,
       -0.99694024,  1.59704699,  0.01656403, -0.40187092])}, 2: {'MSE: ': 0.23328083855839782, 'MSPE: ': 1.2041978257138908, 'R2: ': 0.8325515965739406, 'Coefficients: ': array([-1.01848126, -0.73068832, -1.62130874,  1.72283546,  1.41020689,
        1.83010765, -0.21149918,  1.88332377,  2.29786905,  2.02011605,
       -1.44309457,  0.72223828, -2.29529248,  1.50761675,  3.14161763,
        2.66144071, -3.62150446, -0.55581427,  1.12396371,  3.34085711,
        1.44304148,  0.75823019, -1.50741462, -1.77078708,  2.40782402,
       -0.27734753, -0.60194222, 

In [10]:
''' Find degree with best R^2 value '''

max_score = 0
max_degree = 0

for degree, stats in results.items():
    if stats['R2: '] > max_score:
        max_score = stats['R2: ']
        max_degree = degree

        
        
print(f'Degree with best score: ', max_degree)
print(f'Corresponding r-squared value: ', max_score)
print(f'Best C value: 10')

Degree with best score:  5
Corresponding r-squared value:  0.8682231178843489
Best C value: 10


In [14]:
''' Results on model with the best parameters on training data '''

poly = PolynomialFeatures(degree = 5)
x_train_poly = poly.fit_transform(x_train)

model = LinearSVR(C=10, max_iter = 1000000).fit(X=x_train_poly, y=y_train)
y_train_pred = model.predict(x_train_poly)

print('Training results of model with best degree = 5 and C = 10:\n')
print(f"R2: {r2_score(y_train, y_train_pred)}")
print(f"MSE: {mean_squared_error(y_train, y_train_pred)}")
print(f"MSPE: {mean_squared_percentage_error(y_train, y_train_pred)}")

Training results of model with best degree = 5 and C = 10:

R2: 0.8459950348185099
MSE: 0.1980452645892966
MSPE: 0.899387000896579


In [15]:
''' Results on model with the best parameters on testing data '''

x_test_poly = poly.fit_transform(x_test)
y_pred = model.predict(x_test_poly)

print('Testing results of model with best degree = 5 and C = 10:\n')
print(f"R2: {r2_score(y_test, y_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"MSPE: {mean_squared_percentage_error(y_test, y_pred)}")

Testing results of model with best degree = 5 and C = 10:

R2: 0.8681865781312849
MSE: 0.18363594371542555
MSPE: 0.8750179900778241
