In [87]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import sklearn.metrics

import statsmodels.formula.api as smf
import statsmodels.api as sm
import patsy

In [88]:
df = pd.read_pickle('final_data.pkl')
df

Unnamed: 0,Country,Country Code,Year,GDP per Capita (PPP),Budget Surplus or Deficit (%GDP),Gross Savings %GDP,Inflation Rate on Consumer Prices (Annual %),Tax Revenue %GDP,Health Expenditure (%GDP),Education Expenditure (%GDP),"Science, Tech, Innovation R&D Expenditure (%GDP)",Military Expenditure %GDP,Fossil Fuel (%),Hydroelectric Plants (%),Renewable Energy Sources (%),SPI Year,Social Progress Index
0,Angola,AGO,2013,7682.477158,0.900000,32.029882,8.777814,14.184763,37.049564,3.42132,0.032290,4.455239,36.800000,60.250000,1.587500,2014,38.51
1,Botswana,BWA,2013,14707.701699,-7.900000,36.664808,5.884607,25.609893,33.341854,9.63292,0.537280,2.056396,99.625000,0.000000,0.562500,2014,62.99
2,Burkina Faso,BFA,2013,1683.107411,4.800000,16.917850,0.533739,14.948642,28.408018,4.59384,0.509164,1.392049,81.725000,9.562500,8.737500,2014,42.94
3,Burundi,BDI,2013,740.984231,-1.500000,3.933888,7.937958,31.351000,33.161716,5.99536,0.210280,2.367638,13.800000,77.050000,10.400000,2014,38.10
6,Ethiopia,ETH,2013,1283.570926,-6.000000,28.334779,7.464022,8.764725,17.755827,4.49855,0.604740,0.806417,4.125000,83.562500,11.900000,2014,39.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552,Malaysia,MYS,2018,28186.704943,0.300000,25.728034,0.884709,12.029692,21.869098,4.53477,1.436650,0.984002,78.000000,18.000000,4.000000,2019,74.17
553,Mongolia,MNG,2018,12196.750263,-11.133333,25.950801,6.812436,16.776785,30.284721,4.08578,0.102930,0.761588,87.000000,2.000000,11.000000,2019,65.60
554,New Zealand,NZL,2018,42810.837512,-4.800000,18.684779,1.598297,28.001078,35.873516,6.44277,1.365700,1.161388,23.266667,58.000000,20.000000,2019,88.93
556,Philippines,PHL,2018,8717.047981,0.000000,33.807051,5.211605,14.047533,19.911036,2.65295,0.164150,1.129324,68.900000,16.666667,15.733333,2019,63.40


In [89]:
df1 = df.copy()

# Initial Model (Original Data)

In [90]:
test_holdout = df1[df1['Year'] == 2018]
X_test, y_test = test_holdout.drop(columns = ['Country', 'Country Code', 'Year', 'SPI Year', 'Social Progress Index'], axis = 1), df1['Social Progress Index']

In [91]:
modeling_set = df1[df1['Year'] != 2018]
X, y = modeling_set.drop(columns = ['Country', 'Country Code', 'Year', 'SPI Year', 'Social Progress Index'], axis = 1), modeling_set['Social Progress Index']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [92]:
#set up the models we're choosing from:

lm = LinearRegression()

#Feature scaling for train, val, and test so that we can run our ridge and lasso models on each
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train.values)
X_val_scaled = scaler.transform(X_val.values)
X_test_scaled = scaler.transform(X_test.values)

lm_ridge = Ridge(alpha=1)
lm_lasso = Lasso(alpha=0.15)

#Feature transforms for train, val, and test so that we can run our poly model on each
poly = PolynomialFeatures(degree=2) 

X_train_poly = poly.fit_transform(X_train.values)
X_val_poly = poly.transform(X_val.values)
X_test_poly = poly.transform(X_test.values)

lm_poly = LinearRegression()

#standardize the poly transforms
X_train_poly_scaled = scaler.fit_transform(X_train_poly)
X_val_poly_scaled = scaler.transform(X_val_poly)
X_test_poly_scaled = scaler.transform(X_test_poly)

lm_poly_ridge = Ridge(alpha=1)
lm_poly_lasso = Lasso(alpha=1)

In [93]:
#validate

lm.fit(X_train, y_train)
print(f'Linear Regression train R^2: {lm.score(X_train, y_train):.3f}')
print(f'Linear Regression val R^2: {lm.score(X_val, y_val):.3f}' + '\n')

lm_ridge.fit(X_train_scaled, y_train)
print(f'Ridge Regression train R^2: {lm_ridge.score(X_train_scaled, y_train):.3f}')
print(f'Ridge Regression val R^2: {lm_ridge.score(X_val_scaled, y_val):.3f}' + '\n')

lm_lasso.fit(X_train_scaled, y_train)
print(f'Lasso Regression train R^2: {lm_lasso.score(X_train_scaled, y_train):.3f}')
print(f'Lasso Regression val R^2: {lm_lasso.score(X_val_scaled, y_val):.3f}' + '\n')

lm_poly.fit(X_train_poly, y_train)
print(f'Degree 2 polynomial regression train R^2: {lm_poly.score(X_train_poly, y_train):.3f}')
print(f'Degree 2 polynomial regression val R^2: {lm_poly.score(X_val_poly, y_val):.3f}' + '\n')

lm_poly_ridge.fit(X_train_poly, y_train)
print(f'Degree 2 polynomial+Ridge regression train R^2: {lm_poly_ridge.score(X_train_poly, y_train):.3f}')
print(f'Degree 2 polynomial+Ridge regression val R^2: {lm_poly_ridge.score(X_val_poly, y_val):.3f}' + '\n')

lm_poly_lasso.fit(X_train_poly, y_train)
print(f'Degree 2 polynomial+Lasso regression train R^2: {lm_poly_lasso.score(X_train_poly_scaled, y_train):.3f}')
print(f'Degree 2 polynomial+Lasso regression val R^2: {lm_poly_lasso.score(X_val_poly_scaled, y_val):.3f}' + '\n')

Linear Regression train R^2: 0.738
Linear Regression val R^2: 0.677

Ridge Regression train R^2: 0.738
Ridge Regression val R^2: 0.677

Lasso Regression train R^2: 0.737
Lasso Regression val R^2: 0.675

Degree 2 polynomial regression train R^2: 0.960
Degree 2 polynomial regression val R^2: 0.858

Degree 2 polynomial+Ridge regression train R^2: 0.958
Degree 2 polynomial+Ridge regression val R^2: 0.866

Degree 2 polynomial+Lasso regression train R^2: -1.594
Degree 2 polynomial+Lasso regression val R^2: -1.931



  return linalg.solve(A, Xy, sym_pos=True,
  model = cd_fast.enet_coordinate_descent(


In [94]:
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
alphavec = 10**np.linspace(-2,2,200)

lasso_model = LassoCV(alphas = alphavec, cv=5)
lasso_model.fit(X_train_scaled, y_train)

LassoCV(alphas=array([1.00000000e-02, 1.04737090e-02, 1.09698580e-02, 1.14895100e-02,
       1.20337784e-02, 1.26038293e-02, 1.32008840e-02, 1.38262217e-02,
       1.44811823e-02, 1.51671689e-02, 1.58856513e-02, 1.66381689e-02,
       1.74263339e-02, 1.82518349e-02, 1.91164408e-02, 2.00220037e-02,
       2.09704640e-02, 2.19638537e-02, 2.30043012e-02, 2.40940356e-02,
       2.52353917e-02, 2.64308149e-0...
       3.44896226e+01, 3.61234270e+01, 3.78346262e+01, 3.96268864e+01,
       4.15040476e+01, 4.34701316e+01, 4.55293507e+01, 4.76861170e+01,
       4.99450512e+01, 5.23109931e+01, 5.47890118e+01, 5.73844165e+01,
       6.01027678e+01, 6.29498899e+01, 6.59318827e+01, 6.90551352e+01,
       7.23263390e+01, 7.57525026e+01, 7.93409667e+01, 8.30994195e+01,
       8.70359136e+01, 9.11588830e+01, 9.54771611e+01, 1.00000000e+02]),
        cv=5)

In [95]:
lasso_model.alpha_

0.2552908068239518

In [96]:
lm_lasso.coef_.tolist()

[7.891932009426851,
 0.0,
 -1.3403098958658275,
 -0.36580845884185054,
 -0.0,
 3.906026414842399,
 0.04809051553289415,
 2.0183961232642575,
 -2.9252625300333848,
 0.8688625017764914,
 -0.6703974835746682,
 0.6038247948603778]

In [97]:
X.columns.tolist()

['GDP per Capita (PPP)',
 'Budget Surplus or Deficit (%GDP)',
 'Gross Savings %GDP',
 'Inflation Rate on Consumer Prices (Annual %)',
 'Tax Revenue %GDP',
 'Health Expenditure (%GDP)',
 'Education Expenditure (%GDP)',
 'Science, Tech, Innovation R&D Expenditure (%GDP)',
 'Military Expenditure %GDP',
 'Fossil Fuel (%)',
 'Hydroelectric Plants (%)',
 'Renewable Energy Sources (%)']

# Cross validation with Base Models

In [98]:
from sklearn.model_selection import KFold

#this helps with the way kf will generate indices below
X, y = np.array(X), np.array(y)

In [99]:
#run the CV
X, y = np.array(X), np.array(y)
kf = KFold(n_splits=5, shuffle=True)
cv_lm_train_r2s, cv_lm_reg_train_r2s, cv_lm_lasso_train_r2s, cv_lm_poly_train_r2s, cv_lm_poly_reg_train_r2s, cv_lm_poly_lasso_train_r2s = [], [], [], [], [], []
cv_lm_r2s, cv_lm_reg_r2s, cv_lm_lasso_r2s, cv_lm_poly_r2s, cv_lm_poly_reg_r2s, cv_lm_poly_lasso_r2s = [], [], [], [], [], [] #collect the validation results for both models

for train_ind, val_ind in kf.split(X,y):
    
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind] 
    
    #define models
    #simple linear regression
    lm = LinearRegression()
    lm_reg = Ridge(alpha=1)
    lm_lasso = Lasso(alpha=1)
    lm_poly = LinearRegression(fit_intercept=True)
    lm_poly_lasso = Lasso(alpha=1)

    #fit models
    lm.fit(X_train, y_train)
    cv_lm_train_r2s.append(lm.score(X_train, y_train))
    cv_lm_r2s.append(lm.score(X_val, y_val))
    
    #ridge with feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    
    lm_reg.fit(X_train_scaled, y_train)
    cv_lm_reg_train_r2s.append(lm_reg.score(X_train_scaled, y_train))
    cv_lm_reg_r2s.append(lm_reg.score(X_val_scaled, y_val))
    
    #Lasso with scaled features
    lm_lasso.fit(X_train_scaled, y_train)
    cv_lm_lasso_train_r2s.append(lm_reg.score(X_train_scaled, y_train))
    cv_lm_lasso_r2s.append(lm_reg.score(X_val_scaled, y_val))
    
    #poly
    poly = PolynomialFeatures(degree=2) 
    X_train_poly = poly.fit_transform(X_train)
    X_val_poly = poly.transform(X_val)
    
    lm_poly.fit(X_train_poly, y_train)
    cv_lm_poly_train_r2s.append(lm_poly.score(X_train_poly, y_train))
    cv_lm_poly_r2s.append(lm_poly.score(X_val_poly, y_val))
    
    #poly + lasso (w scaling of poly terms)
    X_train_poly_scaled = scaler.fit_transform(X_train_poly)
    X_val_poly_scaled = scaler.transform(X_val_poly)
    
    lm_reg.fit(X_train_poly_scaled, y_train)
    cv_lm_poly_reg_train_r2s.append(lm_reg.score(X_train_poly_scaled, y_train))
    cv_lm_poly_reg_r2s.append(lm_reg.score(X_val_poly_scaled, y_val))
    
    lm_lasso.fit(X_train_poly_scaled, y_train)
    cv_lm_poly_lasso_train_r2s.append(lm_lasso.score(X_train_poly_scaled, y_train))
    cv_lm_poly_lasso_r2s.append(lm_lasso.score(X_val_poly_scaled, y_val))
    
    
# print('Simple regression scores (train): ', cv_lm_train_r2s)
# print('Simple regression scores: ', cv_lm_r2s, '\n')
# print('Ridge scores (train): ', cv_lm_reg_train_r2s)
# print('Ridge scores: ', cv_lm_reg_r2s, '\n')
# print('Poly scores (train): ', cv_lm_poly_train_r2s)
# print('Poly scores: ', cv_lm_poly_r2s, '\n')


print(f'Simple mean cv (train) r^2: {np.mean(cv_lm_train_r2s):.3f} +- {np.std(cv_lm_train_r2s):.3f}')
print(f'Simple mean cv r^2: {np.mean(cv_lm_r2s):.3f} +- {np.std(cv_lm_r2s):.3f}' + '\n')

print(f'Ridge mean cv (train) r^2: {np.mean(cv_lm_reg_train_r2s):.3f} +- {np.std(cv_lm_reg_train_r2s):.3f}')
print(f'Ridge mean cv r^2: {np.mean(cv_lm_reg_r2s):.3f} +- {np.std(cv_lm_reg_r2s):.3f}'+ '\n')

print(f'Lasso mean cv (train) r^2: {np.mean(cv_lm_lasso_train_r2s):.3f} +- {np.std(cv_lm_lasso_train_r2s):.3f}')
print(f'Lasso mean cv r^2: {np.mean(cv_lm_lasso_r2s):.3f} +- {np.std(cv_lm_lasso_r2s):.3f}'+ '\n')

print(f'Poly mean cv (train) r^2: {np.mean(cv_lm_poly_train_r2s):.3f} +- {np.std(cv_lm_poly_train_r2s):.3f}')
print(f'Poly mean cv r^2: {np.mean(cv_lm_poly_r2s):.3f} +- {np.std(cv_lm_poly_r2s):.3f}'+ '\n')

print(f'Poly + Ridge mean cv (train) r^2: {np.mean(cv_lm_poly_reg_train_r2s):.3f} +- {np.std(cv_lm_poly_reg_train_r2s):.3f}')
print(f'Poly + Ridge mean cv r^2: {np.mean(cv_lm_poly_reg_r2s):.3f} +- {np.std(cv_lm_poly_reg_r2s):.3f}'+ '\n')

print(f'Poly + Lasso mean cv (train) r^2: {np.mean(cv_lm_poly_lasso_train_r2s):.3f} +- {np.std(cv_lm_poly_lasso_train_r2s):.3f}')
print(f'Poly + Lasso mean cv r^2: {np.mean(cv_lm_poly_lasso_r2s):.3f} +- {np.std(cv_lm_poly_lasso_r2s):.3f}'+ '\n')

Simple mean cv (train) r^2: 0.731 +- 0.009
Simple mean cv r^2: 0.712 +- 0.044

Ridge mean cv (train) r^2: 0.730 +- 0.009
Ridge mean cv r^2: 0.713 +- 0.044

Lasso mean cv (train) r^2: 0.730 +- 0.009
Lasso mean cv r^2: 0.713 +- 0.044

Poly mean cv (train) r^2: 0.822 +- 0.116
Poly mean cv r^2: 0.431 +- 0.588

Poly + Ridge mean cv (train) r^2: 0.938 +- 0.003
Poly + Ridge mean cv r^2: 0.848 +- 0.041

Poly + Lasso mean cv (train) r^2: 0.757 +- 0.005
Poly + Lasso mean cv r^2: 0.738 +- 0.033



# Taking Log of GDP per Capita and SPI

In [110]:
df1['log_GDP_per_capita'] = np.log(df1['GDP per Capita (PPP)'])
df1['log_Social_Progress_Index'] = np.log(df1['Social Progress Index'])

In [111]:
test_holdout = df1[df1['Year'] == 2018]
X_test, y_test = test_holdout.drop(columns = ['Country', 'Country Code', 'Year', 'SPI Year', 'Social Progress Index', 'log_Social_Progress_Index'], axis = 1), test_holdout['log_Social_Progress_Index']

In [112]:
df1

Unnamed: 0,Country,Country Code,Year,GDP per Capita (PPP),Budget Surplus or Deficit (%GDP),Gross Savings %GDP,Inflation Rate on Consumer Prices (Annual %),Tax Revenue %GDP,Health Expenditure (%GDP),Education Expenditure (%GDP),"Science, Tech, Innovation R&D Expenditure (%GDP)",Military Expenditure %GDP,Fossil Fuel (%),Hydroelectric Plants (%),Renewable Energy Sources (%),SPI Year,Social Progress Index,log_GDP_per_capita,log_Social_Progress_Index
0,Angola,AGO,2013,7682.477158,0.900000,32.029882,8.777814,14.184763,37.049564,3.42132,0.032290,4.455239,36.800000,60.250000,1.587500,2014,38.51,8.946697,3.650918
1,Botswana,BWA,2013,14707.701699,-7.900000,36.664808,5.884607,25.609893,33.341854,9.63292,0.537280,2.056396,99.625000,0.000000,0.562500,2014,62.99,9.596127,4.142976
2,Burkina Faso,BFA,2013,1683.107411,4.800000,16.917850,0.533739,14.948642,28.408018,4.59384,0.509164,1.392049,81.725000,9.562500,8.737500,2014,42.94,7.428397,3.759804
3,Burundi,BDI,2013,740.984231,-1.500000,3.933888,7.937958,31.351000,33.161716,5.99536,0.210280,2.367638,13.800000,77.050000,10.400000,2014,38.10,6.607979,3.640214
6,Ethiopia,ETH,2013,1283.570926,-6.000000,28.334779,7.464022,8.764725,17.755827,4.49855,0.604740,0.806417,4.125000,83.562500,11.900000,2014,39.31,7.157401,3.671479
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552,Malaysia,MYS,2018,28186.704943,0.300000,25.728034,0.884709,12.029692,21.869098,4.53477,1.436650,0.984002,78.000000,18.000000,4.000000,2019,74.17,10.246606,4.306360
553,Mongolia,MNG,2018,12196.750263,-11.133333,25.950801,6.812436,16.776785,30.284721,4.08578,0.102930,0.761588,87.000000,2.000000,11.000000,2019,65.60,9.408925,4.183576
554,New Zealand,NZL,2018,42810.837512,-4.800000,18.684779,1.598297,28.001078,35.873516,6.44277,1.365700,1.161388,23.266667,58.000000,20.000000,2019,88.93,10.664547,4.487850
556,Philippines,PHL,2018,8717.047981,0.000000,33.807051,5.211605,14.047533,19.911036,2.65295,0.164150,1.129324,68.900000,16.666667,15.733333,2019,63.40,9.073036,4.149464


In [113]:
modeling_set = df1[df1['Year'] != 2018]
X, y = modeling_set.drop(columns = ['Country', 'Country Code', 'Year', 'SPI Year', 'Social Progress Index', 'log_Social_Progress_Index'], axis = 1), modeling_set['log_Social_Progress_Index']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [114]:
#set up the models we're choosing from:

lm = LinearRegression()

#Feature scaling for train, val, and test so that we can run our ridge and lasso models on each
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train.values)
X_val_scaled = scaler.transform(X_val.values)
X_test_scaled = scaler.transform(X_test.values)

lm_ridge = Ridge(alpha=1)
lm_lasso = Lasso(alpha=1)

#Feature transforms for train, val, and test so that we can run our poly model on each
poly = PolynomialFeatures(degree=2) 

X_train_poly = poly.fit_transform(X_train.values)
X_val_poly = poly.transform(X_val.values)
X_test_poly = poly.transform(X_test.values)

lm_poly = LinearRegression()

#standardize the poly transforms
X_train_poly_scaled = scaler.fit_transform(X_train_poly)
X_val_poly_scaled = scaler.transform(X_val_poly)
X_test_poly_scaled = scaler.transform(X_test_poly)

In [115]:
#validate

lm.fit(X_train, y_train)
print(f'Linear Regression train R^2: {lm.score(X_train, y_train):.3f}')
print(f'Linear Regression val R^2: {lm.score(X_val, y_val):.3f}' + '\n')

lm_ridge.fit(X_train_scaled, y_train)
print(f'Ridge Regression train R^2: {lm_ridge.score(X_train_scaled, y_train):.3f}')
print(f'Ridge Regression val R^2: {lm_ridge.score(X_val_scaled, y_val):.3f}' + '\n')

lm_lasso.fit(X_train_scaled, y_train)
print(f'Lasso Regression train R^2: {lm_lasso.score(X_train_scaled, y_train):.3f}')
print(f'Lasso Regression val R^2: {lm_lasso.score(X_val_scaled, y_val):.3f}' + '\n')

lm_poly.fit(X_train_poly, y_train)
print(f'Degree 2 polynomial regression train R^2: {lm_poly.score(X_train_poly, y_train):.3f}')
print(f'Degree 2 polynomial regression val R^2: {lm_poly.score(X_val_poly, y_val):.3f}' + '\n')

lm_ridge.fit(X_train_poly, y_train)
print(f'Degree 2 polynomial+Ridge regression train R^2: {lm_ridge.score(X_train_poly, y_train):.3f}')
print(f'Degree 2 polynomial+Ridge regression val R^2: {lm_ridge.score(X_val_poly, y_val):.3f}' + '\n')

lm_lasso.fit(X_train_poly, y_train)
print(f'Degree 2 polynomial+Lasso regression train R^2: {lm_lasso.score(X_train_poly_scaled, y_train):.3f}')
print(f'Degree 2 polynomial+Lasso regression val R^2: {lm_lasso.score(X_val_poly_scaled, y_val):.3f}' + '\n')

Linear Regression train R^2: 0.922
Linear Regression val R^2: 0.887

Ridge Regression train R^2: 0.922
Ridge Regression val R^2: 0.887

Lasso Regression train R^2: 0.000
Lasso Regression val R^2: -0.023

Degree 2 polynomial regression train R^2: 0.980
Degree 2 polynomial regression val R^2: 0.848

Degree 2 polynomial+Ridge regression train R^2: 0.979
Degree 2 polynomial+Ridge regression val R^2: 0.850

Degree 2 polynomial+Lasso regression train R^2: -1.484
Degree 2 polynomial+Lasso regression val R^2: -0.847



  return linalg.solve(A, Xy, sym_pos=True,
  model = cd_fast.enet_coordinate_descent(


In [124]:
X, y = np.array(X), np.array(y)
#run the CV
X, y = np.array(X), np.array(y)
kf = KFold(n_splits=5, shuffle=True)
cv_lm_train_r2s, cv_lm_reg_train_r2s, cv_lm_lasso_train_r2s, cv_lm_poly_train_r2s, cv_lm_poly_reg_train_r2s, cv_lm_poly_lasso_train_r2s = [], [], [], [], [], []
cv_lm_r2s, cv_lm_reg_r2s, cv_lm_lasso_r2s, cv_lm_poly_r2s, cv_lm_poly_reg_r2s, cv_lm_poly_lasso_r2s = [], [], [], [], [], [] #collect the validation results for both models

for train_ind, val_ind in kf.split(X,y):
    
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind] 
    
    #define models
    #simple linear regression
    lm = LinearRegression()
    lm_reg = Ridge(alpha=1)
    lm_lasso = Lasso(alpha=1000)
    lm_poly = LinearRegression(fit_intercept=True)
    lm_poly_lasso = Lasso(alpha=1)

    #fit models
    lm.fit(X_train, y_train)
    cv_lm_train_r2s.append(lm.score(X_train, y_train))
    cv_lm_r2s.append(lm.score(X_val, y_val))
    
    #ridge with feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    
    lm_reg.fit(X_train_scaled, y_train)
    cv_lm_reg_train_r2s.append(lm_reg.score(X_train_scaled, y_train))
    cv_lm_reg_r2s.append(lm_reg.score(X_val_scaled, y_val))
    
    #Lasso with scaled features
    lm_lasso.fit(X_train_scaled, y_train)
    cv_lm_lasso_train_r2s.append(lm_reg.score(X_train_scaled, y_train))
    cv_lm_lasso_r2s.append(lm_reg.score(X_val_scaled, y_val))
    
    #poly
    poly = PolynomialFeatures(degree=2) 
    X_train_poly = poly.fit_transform(X_train)
    X_val_poly = poly.transform(X_val)
    
    lm_poly.fit(X_train_poly, y_train)
    cv_lm_poly_train_r2s.append(lm_poly.score(X_train_poly, y_train))
    cv_lm_poly_r2s.append(lm_poly.score(X_val_poly, y_val))
    
    #poly + lasso (w scaling of poly terms)
    X_train_poly_scaled = scaler.fit_transform(X_train_poly)
    X_val_poly_scaled = scaler.transform(X_val_poly)
    
    lm_reg.fit(X_train_poly_scaled, y_train)
    cv_lm_poly_reg_train_r2s.append(lm_reg.score(X_train_poly_scaled, y_train))
    cv_lm_poly_reg_r2s.append(lm_reg.score(X_val_poly_scaled, y_val))
    
    lm_lasso.fit(X_train_poly_scaled, y_train)
    cv_lm_poly_lasso_train_r2s.append(lm_lasso.score(X_train_poly_scaled, y_train))
    cv_lm_poly_lasso_r2s.append(lm_lasso.score(X_val_poly_scaled, y_val))
    
    
# print('Simple regression scores (train): ', cv_lm_train_r2s)
# print('Simple regression scores: ', cv_lm_r2s, '\n')
# print('Ridge scores (train): ', cv_lm_reg_train_r2s)
# print('Ridge scores: ', cv_lm_reg_r2s, '\n')
# print('Poly scores (train): ', cv_lm_poly_train_r2s)
# print('Poly scores: ', cv_lm_poly_r2s, '\n')


print(f'Simple mean cv (train) r^2: {np.mean(cv_lm_train_r2s):.3f} +- {np.std(cv_lm_train_r2s):.3f}')
print(f'Simple mean cv r^2: {np.mean(cv_lm_r2s):.3f} +- {np.std(cv_lm_r2s):.3f}' + '\n')

print(f'Ridge mean cv (train) r^2: {np.mean(cv_lm_reg_train_r2s):.3f} +- {np.std(cv_lm_reg_train_r2s):.3f}')
print(f'Ridge mean cv r^2: {np.mean(cv_lm_reg_r2s):.3f} +- {np.std(cv_lm_reg_r2s):.3f}'+ '\n')

print(f'Lasso mean cv (train) r^2: {np.mean(cv_lm_lasso_train_r2s):.3f} +- {np.std(cv_lm_lasso_train_r2s):.3f}')
print(f'Lasso mean cv r^2: {np.mean(cv_lm_lasso_r2s):.3f} +- {np.std(cv_lm_lasso_r2s):.3f}'+ '\n')

print(f'Poly mean cv (train) r^2: {np.mean(cv_lm_poly_train_r2s):.3f} +- {np.std(cv_lm_poly_train_r2s):.3f}')
print(f'Poly mean cv r^2: {np.mean(cv_lm_poly_r2s):.3f} +- {np.std(cv_lm_poly_r2s):.3f}'+ '\n')

print(f'Poly + Ridge mean cv (train) r^2: {np.mean(cv_lm_poly_reg_train_r2s):.3f} +- {np.std(cv_lm_poly_reg_train_r2s):.3f}')
print(f'Poly + Ridge mean cv r^2: {np.mean(cv_lm_poly_reg_r2s):.3f} +- {np.std(cv_lm_poly_reg_r2s):.3f}'+ '\n')

print(f'Poly + Lasso mean cv (train) r^2: {np.mean(cv_lm_poly_lasso_train_r2s):.3f} +- {np.std(cv_lm_poly_lasso_train_r2s):.3f}')
print(f'Poly + Lasso mean cv r^2: {np.mean(cv_lm_poly_lasso_r2s):.3f} +- {np.std(cv_lm_poly_lasso_r2s):.3f}'+ '\n')

Simple mean cv (train) r^2: 0.916 +- 0.005
Simple mean cv r^2: 0.906 +- 0.019

Ridge mean cv (train) r^2: 0.916 +- 0.005
Ridge mean cv r^2: 0.906 +- 0.019

Lasso mean cv (train) r^2: 0.916 +- 0.005
Lasso mean cv r^2: 0.906 +- 0.019

Poly mean cv (train) r^2: 0.950 +- 0.033
Poly mean cv r^2: 0.867 +- 0.044

Poly + Ridge mean cv (train) r^2: 0.966 +- 0.003
Poly + Ridge mean cv r^2: 0.892 +- 0.066

Poly + Lasso mean cv (train) r^2: 0.000 +- 0.000
Poly + Lasso mean cv r^2: -0.030 +- 0.034



In [119]:
from sklearn.metrics import mean_squared_error, r2_score
lr = LinearRegression()
lr.fit(X, y)
pred = lr.predict(X_test)
r2_score(y_test, pred)

0.9234291111686543

In [None]:
test_holdout['Social Progress Index'].head()

In [None]:
y_test

# Modeling with transformed variables

In [125]:
df_logs = df.copy()

In [139]:
log_features = ['GDP per Capita (PPP)', 'Inflation Rate on Consumer Prices (Annual %)', 'Tax Revenue %GDP', 'Science, Tech, Innovation R&D Expenditure (%GDP)',
               'Military Expenditure %GDP', 'Hydroelectric Plants (%)', 'Renewable Energy Sources (%)']

In [140]:
def logtransform(columnlist):
    '''
    input: list of features
    output: new columns with log of feature
    '''
    for elem in columnlist:
        df_logs[elem] = df_logs[elem].apply(np.log1p)
    return df_logs

In [141]:
logtransform(log_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_logs[elem] = df_logs[elem].apply(np.log1p)


Unnamed: 0,Country,Country Code,Year,GDP per Capita (PPP),Budget Surplus or Deficit (%GDP),Gross Savings %GDP,Inflation Rate on Consumer Prices (Annual %),Tax Revenue %GDP,Health Expenditure (%GDP),Education Expenditure (%GDP),"Science, Tech, Innovation R&D Expenditure (%GDP)",Military Expenditure %GDP,Fossil Fuel (%),Hydroelectric Plants (%),Renewable Energy Sources (%),SPI Year,Social Progress Index
0,Angola,AGO,2013,2.297254,0.900000,32.029882,1.187879,1.313802,37.049564,3.42132,0.031285,0.991983,36.800000,2.170434,0.815347,2014,38.51
1,Botswana,BWA,2013,2.360495,-7.900000,36.664808,1.074759,1.454253,33.341854,9.63292,0.357685,0.750112,99.625000,0.000000,0.559616,2014,62.99
2,Burkina Faso,BFA,2013,2.131677,4.800000,16.917850,0.356071,1.326909,28.408018,4.59384,0.344693,0.627088,81.725000,1.409114,1.375215,2014,42.94
3,Burundi,BDI,2013,2.029375,-1.500000,3.933888,1.160117,1.498874,33.161716,5.99536,0.174669,0.794896,13.800000,2.280116,1.440996,2014,38.10
6,Ethiopia,ETH,2013,2.099021,-6.000000,28.334779,1.142892,1.187470,17.755827,4.49855,0.387275,0.464580,4.125000,2.316612,1.492823,2014,39.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552,Malaysia,MYS,2018,2.420070,0.300000,25.728034,0.490892,1.271790,21.869098,4.53477,0.636907,0.521834,78.000000,1.656825,1.098612,2019,74.17
553,Mongolia,MNG,2018,2.342671,-11.133333,25.950801,1.117014,1.355292,30.284721,4.08578,0.093463,0.448662,87.000000,0.881374,1.462474,2019,65.60
554,New Zealand,NZL,2018,2.456556,-4.800000,18.684779,0.670317,1.474153,35.873516,6.44277,0.621154,0.571404,23.266667,2.153595,1.699669,2019,88.93
556,Philippines,PHL,2018,2.309874,0.000000,33.807051,1.039011,1.311359,19.911036,2.65295,0.141492,0.562927,68.900000,1.625800,1.602721,2019,63.40


In [130]:
df_logs['Hydroelectric Plants (%)'] = np.sqrt(df['Hydroelectric Plants (%)'])
df_logs['Renewable Energy Sources (%)'] = np.sqrt(df['Renewable Energy Sources (%)'])

In [131]:
df_logs

Unnamed: 0,Country,Country Code,Year,GDP per Capita (PPP),Budget Surplus or Deficit (%GDP),Gross Savings %GDP,Inflation Rate on Consumer Prices (Annual %),Tax Revenue %GDP,Health Expenditure (%GDP),Education Expenditure (%GDP),"Science, Tech, Innovation R&D Expenditure (%GDP)",Military Expenditure %GDP,Fossil Fuel (%),Hydroelectric Plants (%),Renewable Energy Sources (%),SPI Year,Social Progress Index
0,Angola,AGO,2013,8.946827,0.900000,32.029882,2.280116,2.720292,37.049564,3.42132,0.031780,1.696576,36.800000,7.762087,1.259960,2014,38.51
1,Botswana,BWA,2013,9.596195,-7.900000,36.664808,1.929288,3.281283,33.341854,9.63292,0.430015,1.117236,99.625000,0.000000,0.750000,2014,62.99
2,Burkina Faso,BFA,2013,7.428991,4.800000,16.917850,0.427708,2.769374,28.408018,4.59384,0.411556,0.872150,81.725000,3.092329,2.955926,2014,42.94
3,Burundi,BDI,2013,6.609328,-1.500000,3.933888,2.190307,3.476645,33.161716,5.99536,0.190852,1.214212,13.800000,8.777813,3.224903,2014,38.10
6,Ethiopia,ETH,2013,7.158180,-6.000000,28.334779,2.135824,2.278776,17.755827,4.49855,0.472962,0.591345,4.125000,9.141253,3.449638,2014,39.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552,Malaysia,MYS,2018,10.246641,0.300000,25.728034,0.633774,2.567231,21.869098,4.53477,0.890624,0.685116,78.000000,4.242641,2.000000,2019,74.17
553,Mongolia,MNG,2018,9.409007,-11.133333,25.950801,2.055717,2.877893,30.284721,4.08578,0.097970,0.566216,87.000000,1.414214,3.316625,2019,65.60
554,New Zealand,NZL,2018,10.664570,-4.800000,18.684779,0.954856,3.367333,35.873516,6.44277,0.861074,0.770751,23.266667,7.615773,4.472136,2019,88.93
556,Philippines,PHL,2018,9.073151,0.000000,33.807051,1.826419,2.711214,19.911036,2.65295,0.151991,0.755805,68.900000,4.082483,3.966527,2019,63.40


In [None]:
q_low = df["Budget Surplus or Deficit (%GDP)"].quantile(0.01)
q_hi  = df["Budget Surplus or Deficit (%GDP)"].quantile(0.99)

df_filtered = df[(df["Budget Surplus or Deficit (%GDP)"] < q_hi) & (df["Budget Surplus or Deficit (%GDP)"] > q_low)]

In [None]:
sns.distplot((df_filtered['Inflation Rate on Consumer Prices (Annual %)']))

In [None]:
sns.distplot(np.log(df_filtered['Tax Revenue %GDP']))

In [None]:
sns.distplot(np.log(df_filtered['Science, Tech, Innovation R&D Expenditure (%GDP)']))

In [None]:
sns.distplot(np.sqrt(df_filtered['Hydroelectric Plants (%)']))

In [None]:
sns.distplot(np.sqrt(df_filtered['Renewable Energy Sources (%)']))

In [143]:
df_logs = df_logs[~df_logs.isin([np.nan, np.inf, -np.inf]).any(1)]


In [144]:
test_holdout = df_logs[df_logs['Year'] == 2018]
X_test, y_test = test_holdout.drop(columns = ['Country', 'Country Code', 'Year', 'SPI Year', 'Social Progress Index'], axis = 1), df_logs['Social Progress Index']

In [145]:
modeling_set = df_logs[df_logs['Year'] != 2018]
X, y = modeling_set.drop(columns = ['Country', 'Country Code', 'Year', 'SPI Year', 'Social Progress Index'], axis = 1), modeling_set['Social Progress Index']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [148]:
#set up the models we're choosing from:

lm = LinearRegression()

#Feature scaling for train, val, and test so that we can run our ridge and lasso models on each
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train.values)
X_val_scaled = scaler.transform(X_val.values)
X_test_scaled = scaler.transform(X_test.values)

lm_ridge = Ridge(alpha=1)
lm_lasso = Lasso(alpha=1)

#Feature transforms for train, val, and test so that we can run our poly model on each
poly = PolynomialFeatures(degree=2) 

X_train_poly = poly.fit_transform(X_train.values)
X_val_poly = poly.transform(X_val.values)
X_test_poly = poly.transform(X_test.values)

lm_poly = LinearRegression()

#standardize the poly transforms
X_train_poly_scaled = scaler.fit_transform(X_train_poly)
X_val_poly_scaled = scaler.transform(X_val_poly)
X_test_poly_scaled = scaler.transform(X_test_poly)

lm_poly_ridge = Ridge(alpha=1)
lm_poly_lasso = Lasso(alpha=1)

In [149]:
#validate

lm.fit(X_train, y_train)
print(f'Linear Regression train R^2: {lm.score(X_train, y_train):.3f}')
print(f'Linear Regression val R^2: {lm.score(X_val, y_val):.3f}' + '\n')

lm_ridge.fit(X_train_scaled, y_train)
print(f'Ridge Regression train R^2: {lm_ridge.score(X_train_scaled, y_train):.3f}')
print(f'Ridge Regression val R^2: {lm_ridge.score(X_val_scaled, y_val):.3f}' + '\n')

lm_lasso.fit(X_train_scaled, y_train)
print(f'Lasso Regression train R^2: {lm_lasso.score(X_train_scaled, y_train):.3f}')
print(f'Lasso Regression val R^2: {lm_lasso.score(X_val_scaled, y_val):.3f}' + '\n')

lm_poly.fit(X_train_poly, y_train)
print(f'Degree 2 polynomial regression train R^2: {lm_poly.score(X_train_poly, y_train):.3f}')
print(f'Degree 2 polynomial regression val R^2: {lm_poly.score(X_val_poly, y_val):.3f}' + '\n')

lm_poly_ridge.fit(X_train_poly, y_train)
print(f'Degree 2 polynomial+Ridge regression train R^2: {lm_poly_ridge.score(X_train_poly, y_train):.3f}')
print(f'Degree 2 polynomial+Ridge regression val R^2: {lm_poly_ridge.score(X_val_poly, y_val):.3f}' + '\n')

lm_poly_lasso.fit(X_train_poly, y_train)
print(f'Degree 2 polynomial+Lasso regression train R^2: {lm_poly_lasso.score(X_train_poly_scaled, y_train):.3f}')
print(f'Degree 2 polynomial+Lasso regression val R^2: {lm_poly_lasso.score(X_val_poly_scaled, y_val):.3f}' + '\n')

Linear Regression train R^2: 0.934
Linear Regression val R^2: 0.907

Ridge Regression train R^2: 0.933
Ridge Regression val R^2: 0.907

Lasso Regression train R^2: 0.908
Lasso Regression val R^2: 0.850

Degree 2 polynomial regression train R^2: 0.976
Degree 2 polynomial regression val R^2: 0.905

Degree 2 polynomial+Ridge regression train R^2: 0.967
Degree 2 polynomial+Ridge regression val R^2: 0.904

Degree 2 polynomial+Lasso regression train R^2: -36.432
Degree 2 polynomial+Lasso regression val R^2: -27.282



  model = cd_fast.enet_coordinate_descent(


In [55]:
lm_lasso.coef_

array([11.00683084,  0.        , -0.841031  , -0.        ,  0.        ,
        0.44157817,  0.        ,  1.80480416, -0.79660008, -0.        ,
        0.        ,  0.50832812])

In [60]:
X_test.columns.tolist()

['GDP per Capita (PPP)',
 'Budget Surplus or Deficit (%GDP)',
 'Gross Savings %GDP',
 'Inflation Rate on Consumer Prices (Annual %)',
 'Tax Revenue %GDP',
 'Health Expenditure (%GDP)',
 'Education Expenditure (%GDP)',
 'Science, Tech, Innovation R&D Expenditure (%GDP)',
 'Military Expenditure %GDP',
 'Fossil Fuel (%)',
 'Hydroelectric Plants (%)',
 'Renewable Energy Sources (%)']

In [61]:
lm_ridge.coef_

array([11.96190251,  0.03137076, -2.17712399, -0.57026937,  0.458494  ,
        0.19412156,  0.55274568,  2.35152604, -1.46622731,  0.36288048,
        0.91601401,  0.77454114])

In [152]:
X, y = np.array(X), np.array(y)
#run the CV
X, y = np.array(X), np.array(y)
kf = KFold(n_splits=5, shuffle=True)
cv_lm_train_r2s, cv_lm_reg_train_r2s, cv_lm_lasso_train_r2s, cv_lm_poly_train_r2s, cv_lm_poly_ridge_train_r2s, cv_lm_poly_lasso_train_r2s = [], [], [], [], [], []
cv_lm_r2s, cv_lm_reg_r2s, cv_lm_lasso_r2s, cv_lm_poly_r2s, cv_lm_poly_ridge_r2s, cv_lm_poly_lasso_r2s = [], [], [], [], [], [] #collect the validation results for both models

for train_ind, val_ind in kf.split(X,y):
    
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind] 
    
    #define models
    #simple linear regression
    lm = LinearRegression()
    lm_reg = Ridge(alpha=1)
    lm_lasso = Lasso(alpha=1)
    lm_poly = LinearRegression(fit_intercept=True)
    lm_poly_lasso = Lasso(alpha=1)
    lm_poly_ridge = Lasso(alpha=1)

    #fit models
    lm.fit(X_train, y_train)
    cv_lm_train_r2s.append(lm.score(X_train, y_train))
    cv_lm_r2s.append(lm.score(X_val, y_val))
    
    #ridge with feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    
    lm_reg.fit(X_train_scaled, y_train)
    cv_lm_reg_train_r2s.append(lm_reg.score(X_train_scaled, y_train))
    cv_lm_reg_r2s.append(lm_reg.score(X_val_scaled, y_val))
    
    #Lasso with scaled features
    lm_lasso.fit(X_train_scaled, y_train)
    cv_lm_lasso_train_r2s.append(lm_reg.score(X_train_scaled, y_train))
    cv_lm_lasso_r2s.append(lm_reg.score(X_val_scaled, y_val))
    
    #poly
    poly = PolynomialFeatures(degree=2) 
    X_train_poly = poly.fit_transform(X_train)
    X_val_poly = poly.transform(X_val)
    
    lm_poly.fit(X_train_poly, y_train)
    cv_lm_poly_train_r2s.append(lm_poly.score(X_train_poly, y_train))
    cv_lm_poly_r2s.append(lm_poly.score(X_val_poly, y_val))
    
    #poly + lasso (w scaling of poly terms)
    X_train_poly_scaled = scaler.fit_transform(X_train_poly)
    X_val_poly_scaled = scaler.transform(X_val_poly)
    
    lm_poly_ridge.fit(X_train_poly_scaled, y_train)
    cv_lm_poly_ridge_train_r2s.append(lm_poly_ridge.score(X_train_poly_scaled, y_train))
    cv_lm_poly_ridge_r2s.append(lm_poly_ridge.score(X_val_poly_scaled, y_val))
    
    lm_poly_lasso.fit(X_train_poly_scaled, y_train)
    cv_lm_poly_lasso_train_r2s.append(lm_poly_lasso.score(X_train_poly_scaled, y_train))
    cv_lm_poly_lasso_r2s.append(lm_poly_lasso.score(X_val_poly_scaled, y_val))
    
    
# print('Simple regression scores (train): ', cv_lm_train_r2s)
# print('Simple regression scores: ', cv_lm_r2s, '\n')
# print('Ridge scores (train): ', cv_lm_reg_train_r2s)
# print('Ridge scores: ', cv_lm_reg_r2s, '\n')
# print('Poly scores (train): ', cv_lm_poly_train_r2s)
# print('Poly scores: ', cv_lm_poly_r2s, '\n')


print(f'Simple mean cv (train) r^2: {np.mean(cv_lm_train_r2s):.3f} +- {np.std(cv_lm_train_r2s):.3f}')
print(f'Simple mean cv r^2: {np.mean(cv_lm_r2s):.3f} +- {np.std(cv_lm_r2s):.3f}' + '\n')

print(f'Ridge mean cv (train) r^2: {np.mean(cv_lm_reg_train_r2s):.3f} +- {np.std(cv_lm_reg_train_r2s):.3f}')
print(f'Ridge mean cv r^2: {np.mean(cv_lm_reg_r2s):.3f} +- {np.std(cv_lm_reg_r2s):.3f}'+ '\n')

print(f'Lasso mean cv (train) r^2: {np.mean(cv_lm_lasso_train_r2s):.3f} +- {np.std(cv_lm_lasso_train_r2s):.3f}')
print(f'Lasso mean cv r^2: {np.mean(cv_lm_lasso_r2s):.3f} +- {np.std(cv_lm_lasso_r2s):.3f}'+ '\n')

print(f'Poly mean cv (train) r^2: {np.mean(cv_lm_poly_train_r2s):.3f} +- {np.std(cv_lm_poly_train_r2s):.3f}')
print(f'Poly mean cv r^2: {np.mean(cv_lm_poly_r2s):.3f} +- {np.std(cv_lm_poly_r2s):.3f}'+ '\n')

print(f'Poly + Ridge mean cv (train) r^2: {np.mean(cv_lm_poly_reg_train_r2s):.3f} +- {np.std(cv_lm_poly_reg_train_r2s):.3f}')
print(f'Poly + Ridge mean cv r^2: {np.mean(cv_lm_poly_reg_r2s):.3f} +- {np.std(cv_lm_poly_reg_r2s):.3f}'+ '\n')

print(f'Poly + Lasso mean cv (train) r^2: {np.mean(cv_lm_poly_lasso_train_r2s):.3f} +- {np.std(cv_lm_poly_lasso_train_r2s):.3f}')
print(f'Poly + Lasso mean cv r^2: {np.mean(cv_lm_poly_lasso_r2s):.3f} +- {np.std(cv_lm_poly_lasso_r2s):.3f}'+ '\n')

Simple mean cv (train) r^2: 0.929 +- 0.002
Simple mean cv r^2: 0.920 +- 0.011

Ridge mean cv (train) r^2: 0.929 +- 0.002
Ridge mean cv r^2: 0.920 +- 0.011

Lasso mean cv (train) r^2: 0.929 +- 0.002
Lasso mean cv r^2: 0.920 +- 0.011

Poly mean cv (train) r^2: 0.900 +- 0.061
Poly mean cv r^2: 0.496 +- 0.720

Poly + Ridge mean cv (train) r^2: nan +- nan
Poly + Ridge mean cv r^2: nan +- nan

Poly + Lasso mean cv (train) r^2: 0.916 +- 0.003
Poly + Lasso mean cv r^2: 0.907 +- 0.012



  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(
  ret = ret.dtype.type(ret / rcount)


In [None]:
lm = LinearRegression()
lm_reg = Ridge(alpha=1)
lm_lasso = Lasso(alpha=1)
lm_poly = LinearRegression(fit_intercept=True)
lm_poly_lasso = Lasso(alpha=1)

poly = PolynomialFeatures(degree=2) 
X_test_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_val)

X_train_poly_scaled = scaler.fit_transform(X_train_poly)
X_val_poly_scaled = scaler.transform(X_val_poly)

lm_reg.fit(X_train_poly_scaled, y_train)
cv_lm_poly_reg_train_r2s.append(lm_reg.score(X_train_poly_scaled, y_train))
cv_lm_poly_reg_r2s.append(lm_reg.score(X_val_poly_scaled, y_val))
