In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from utils import *

In [3]:
data, label = gather_data()

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [4]:
x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2)
kfold = KFold(n_splits=10, shuffle=True)

In [5]:
x_train

array([[0.5260864 , 0.8129217 , 0.8132486 , ..., 0.81144178, 0.35112527,
        0.75820857],
       [0.22822541, 0.81766069, 0.67455273, ..., 0.70999801, 0.16615444,
        0.7923069 ],
       [0.83489252, 0.95492065, 0.93641882, ..., 0.76281816, 0.17171657,
        0.60901547],
       ...,
       [0.69896376, 0.9002558 , 0.8698803 , ..., 0.50173569, 0.23276807,
        0.4479163 ],
       [0.79622812, 0.91720337, 0.92385034, ..., 0.59189552, 0.29536614,
        0.3319163 ],
       [0.68009092, 0.77366722, 0.69761944, ..., 0.66504985, 0.27272177,
        0.69006956]])

In [6]:
basis_expansion(x_train,1)

array([[0.5260864 , 0.8129217 , 0.8132486 , ..., 0.35112527, 0.75820857,
        1.        ],
       [0.22822541, 0.81766069, 0.67455273, ..., 0.16615444, 0.7923069 ,
        1.        ],
       [0.83489252, 0.95492065, 0.93641882, ..., 0.17171657, 0.60901547,
        1.        ],
       ...,
       [0.69896376, 0.9002558 , 0.8698803 , ..., 0.23276807, 0.4479163 ,
        1.        ],
       [0.79622812, 0.91720337, 0.92385034, ..., 0.29536614, 0.3319163 ,
        1.        ],
       [0.68009092, 0.77366722, 0.69761944, ..., 0.27272177, 0.69006956,
        1.        ]])

In [7]:
basis_expansion(x_train,2)

array([[0.5260864 , 0.8129217 , 0.8132486 , ..., 0.12328896, 0.57488024,
        1.        ],
       [0.22822541, 0.81766069, 0.67455273, ..., 0.0276073 , 0.62775022,
        1.        ],
       [0.83489252, 0.95492065, 0.93641882, ..., 0.02948658, 0.37089984,
        1.        ],
       ...,
       [0.69896376, 0.9002558 , 0.8698803 , ..., 0.05418098, 0.20062901,
        1.        ],
       [0.79622812, 0.91720337, 0.92385034, ..., 0.08724116, 0.11016843,
        1.        ],
       [0.68009092, 0.77366722, 0.69761944, ..., 0.07437716, 0.47619599,
        1.        ]])

In [9]:
stats = []
best_stat = []

temp = {
    "n": -1,
    "MSE": -1,
    "R2": -1,
    "MSPE": -1,
    "Depth": -1
}

best_stat.append(temp)

for i in range(1,8):
    x_expansion = basis_expansion(x_train,i)
    
    for d in range (2,5):
        stat = {
            "n": i,
            "MSE": [],
            "R2": [],
            "MSPE": [],
            "Depth": []
        }
        for train_index, val_index in kfold.split(x_expansion, y_train):

            model = RandomForestRegressor(max_depth=d, random_state=0)
            model.fit(x_expansion[train_index], y_train[train_index])
            stat["MSE"].append(mean_squared_error(y_train[val_index], model.predict(x_expansion[val_index])))
            stat["R2"].append(r2_score(y_train[val_index], model.predict(x_expansion[val_index])))
            stat["MSPE"].append(mean_squared_percentage_error(y_train[val_index], model.predict(x_expansion[val_index])))
            stat["Depth"].append(d)
        stat["MSE"] = np.mean(stat["MSE"])
        stat["R2"] = np.mean(stat["R2"])
        stat["MSPE"] = np.mean(stat["MSPE"])
        stats.append(stat)
        if stat["R2"] > best_stat[0]["R2"]:
            best_stat.pop()
            best_stat.append(stat)
stats

[{'n': 1,
  'MSE': 0.39674064883268506,
  'R2': 0.697223088178902,
  'MSPE': 1.9539194681936156,
  'Depth': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]},
 {'n': 1,
  'MSE': 0.2918224782050104,
  'R2': 0.7786622455704157,
  'MSPE': 1.4818973808011286,
  'Depth': [3, 3, 3, 3, 3, 3, 3, 3, 3, 3]},
 {'n': 1,
  'MSE': 0.2517452637357101,
  'R2': 0.807276974444655,
  'MSPE': 1.2580592133854824,
  'Depth': [4, 4, 4, 4, 4, 4, 4, 4, 4, 4]},
 {'n': 2,
  'MSE': 0.3974780605623099,
  'R2': 0.695803565827512,
  'MSPE': 1.9802390329626043,
  'Depth': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]},
 {'n': 2,
  'MSE': 0.291015800939133,
  'R2': 0.7776730002364999,
  'MSPE': 1.4759677749514535,
  'Depth': [3, 3, 3, 3, 3, 3, 3, 3, 3, 3]},
 {'n': 2,
  'MSE': 0.24916397502767856,
  'R2': 0.811067500997738,
  'MSPE': 1.2480080461726515,
  'Depth': [4, 4, 4, 4, 4, 4, 4, 4, 4, 4]},
 {'n': 3,
  'MSE': 0.3984486894857672,
  'R2': 0.6942506821320087,
  'MSPE': 1.9690055271378366,
  'Depth': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]},
 {'n': 3,
  'MSE

In [10]:
best_stat


[{'n': 5,
  'MSE': 0.24839729273430441,
  'R2': 0.8112708682675818,
  'MSPE': 1.2487793273838945,
  'Depth': [4, 4, 4, 4, 4, 4, 4, 4, 4, 4]}]