In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from utils import *

In [2]:
# Load the feature matrix (`data`) and the regression target (`label`).
# `gather_data` is not defined in this notebook; presumably it comes from the
# `from utils import *` star-import above — TODO confirm and import it explicitly.
# The column names shown in the cell output appear to be emitted by the helper itself.
data, label = gather_data()

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [3]:
# Hold out 20% of the rows as a final test set and prepare a 10-fold
# cross-validation splitter for model selection on the remaining 80%.
#
# Both splitters shuffle the data, so both are seeded: without a fixed
# random_state the train/test partition changes on every kernel run, and a
# shuffled KFold yields *different* folds on every call to `.split()`, which
# would make scores from separate `.split()` calls incomparable.
# NOTE: seeding changes the concrete rows in each split, so outputs recorded
# from earlier unseeded runs will not match exactly after re-execution.
SPLIT_SEED = 0

x_train, x_test, y_train, y_test = train_test_split(
    data, label, test_size=0.2, random_state=SPLIT_SEED
)
kfold = KFold(n_splits=10, shuffle=True, random_state=SPLIT_SEED)

In [4]:
# Display the training feature matrix (NumPy truncates the repr of large arrays).
x_train

array([[0.34333197, 0.52859724, 0.67839715, ..., 0.49140996, 0.33927605,
        0.26055714],
       [0.31014674, 0.61620486, 0.65289074, ..., 0.56352717, 0.46756676,
        0.51302177],
       [0.28185526, 0.56316155, 0.6810587 , ..., 0.69905132, 0.24782868,
        0.6415652 ],
       ...,
       [0.76849751, 0.83233321, 0.94041108, ..., 0.59701514, 0.38543332,
        0.18829274],
       [0.78515607, 0.92237872, 0.91017304, ..., 0.66973627, 0.25319004,
        0.32765862],
       [0.83358933, 0.79772383, 0.97745081, ..., 0.57878023, 0.21714573,
        0.38908401]])

In [5]:
# Sanity-check the degree-1 expansion. Judging by the displayed output, the
# original columns are kept and a trailing constant column of ones is appended
# — TODO confirm against the `basis_expansion` implementation.
basis_expansion(x_train,1)

array([[0.34333197, 0.52859724, 0.67839715, ..., 0.33927605, 0.26055714,
        1.        ],
       [0.31014674, 0.61620486, 0.65289074, ..., 0.46756676, 0.51302177,
        1.        ],
       [0.28185526, 0.56316155, 0.6810587 , ..., 0.24782868, 0.6415652 ,
        1.        ],
       ...,
       [0.76849751, 0.83233321, 0.94041108, ..., 0.38543332, 0.18829274,
        1.        ],
       [0.78515607, 0.92237872, 0.91017304, ..., 0.25319004, 0.32765862,
        1.        ],
       [0.83358933, 0.79772383, 0.97745081, ..., 0.21714573, 0.38908401,
        1.        ]])

In [6]:
# Sanity-check the degree-2 expansion. The displayed trailing feature columns
# look like element-wise squares of the original features, followed by the
# constant column of ones — TODO confirm against the `basis_expansion` implementation.
basis_expansion(x_train,2)

array([[0.34333197, 0.52859724, 0.67839715, ..., 0.11510824, 0.06789003,
        1.        ],
       [0.31014674, 0.61620486, 0.65289074, ..., 0.21861867, 0.26319133,
        1.        ],
       [0.28185526, 0.56316155, 0.6810587 , ..., 0.06141905, 0.41160591,
        1.        ],
       ...,
       [0.76849751, 0.83233321, 0.94041108, ..., 0.14855884, 0.03545416,
        1.        ],
       [0.78515607, 0.92237872, 0.91017304, ..., 0.0641052 , 0.10736017,
        1.        ],
       [0.83358933, 0.79772383, 0.97745081, ..., 0.04715227, 0.15138637,
        1.        ]])

In [9]:
# Sweep the basis-expansion degree from 1 to 7. For each degree, cross-validate
# a shallow random forest on the expanded training features and record the
# mean validation MSE, R2 and MSPE across the folds.
#
# NOTE(review): `kfold.split` is called once per degree. If `kfold` shuffles
# without a fixed random_state, each degree is evaluated on different folds,
# which adds noise to the comparison between degrees — confirm how `kfold`
# is constructed.
stats = []

for i in range(1, 8):
    # The expansion depends only on the degree, so build it once per degree
    # and let every fold index into the same matrix.
    x_expansion = basis_expansion(x_train, i)
    fold_mse, fold_r2, fold_mspe = [], [], []

    for train_index, val_index in kfold.split(x_expansion, y_train):
        model = RandomForestRegressor(max_depth=2, random_state=0)
        model.fit(x_expansion[train_index], y_train[train_index])

        # Predict the held-out fold once and score that single prediction with
        # every metric, instead of re-running `predict` for each metric.
        y_val = y_train[val_index]
        y_pred = model.predict(x_expansion[val_index])

        fold_mse.append(mean_squared_error(y_val, y_pred))
        fold_r2.append(r2_score(y_val, y_pred))
        fold_mspe.append(mean_squared_percentage_error(y_val, y_pred))

    # One summary record per degree: the fold-averaged value of each metric.
    stat = {
        "n": i,
        "MSE": np.mean(fold_mse),
        "R2": np.mean(fold_r2),
        "MSPE": np.mean(fold_mspe),
    }
    stats.append(stat)
stats

[{'n': 1,
  'MSE': 0.38929271052596204,
  'R2': 0.70428535351608,
  'MSPE': 1.9238732870447737},
 {'n': 2,
  'MSE': 0.3938433241565798,
  'R2': 0.7018448721545187,
  'MSPE': 1.9452915792251069},
 {'n': 3,
  'MSE': 0.38873532212786704,
  'R2': 0.702765240965405,
  'MSPE': 1.9151675607123035},
 {'n': 4,
  'MSE': 0.388284616336256,
  'R2': 0.70720717900893,
  'MSPE': 1.9089633757231907},
 {'n': 5,
  'MSE': 0.38771092171130983,
  'R2': 0.7062831940324961,
  'MSPE': 1.9185329575819583},
 {'n': 6,
  'MSE': 0.3888955838535888,
  'R2': 0.7058091330615508,
  'MSPE': 1.924877891426544},
 {'n': 7,
  'MSE': 0.390205440323452,
  'R2': 0.7061362987312351,
  'MSPE': 1.9262401306137555}]