In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from utils import *

In [2]:
data, label = gather_data()

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [3]:
x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2)
kfold = KFold(n_splits=10, shuffle=True)

In [4]:
x_train

array([[0.34333197, 0.52859724, 0.67839715, ..., 0.49140996, 0.33927605,
        0.26055714],
       [0.31014674, 0.61620486, 0.65289074, ..., 0.56352717, 0.46756676,
        0.51302177],
       [0.28185526, 0.56316155, 0.6810587 , ..., 0.69905132, 0.24782868,
        0.6415652 ],
       ...,
       [0.76849751, 0.83233321, 0.94041108, ..., 0.59701514, 0.38543332,
        0.18829274],
       [0.78515607, 0.92237872, 0.91017304, ..., 0.66973627, 0.25319004,
        0.32765862],
       [0.83358933, 0.79772383, 0.97745081, ..., 0.57878023, 0.21714573,
        0.38908401]])

In [5]:
basis_expansion(x_train,1)

array([[0.34333197, 0.52859724, 0.67839715, ..., 0.33927605, 0.26055714,
        1.        ],
       [0.31014674, 0.61620486, 0.65289074, ..., 0.46756676, 0.51302177,
        1.        ],
       [0.28185526, 0.56316155, 0.6810587 , ..., 0.24782868, 0.6415652 ,
        1.        ],
       ...,
       [0.76849751, 0.83233321, 0.94041108, ..., 0.38543332, 0.18829274,
        1.        ],
       [0.78515607, 0.92237872, 0.91017304, ..., 0.25319004, 0.32765862,
        1.        ],
       [0.83358933, 0.79772383, 0.97745081, ..., 0.21714573, 0.38908401,
        1.        ]])

In [6]:
basis_expansion(x_train,2)

array([[0.34333197, 0.52859724, 0.67839715, ..., 0.11510824, 0.06789003,
        1.        ],
       [0.31014674, 0.61620486, 0.65289074, ..., 0.21861867, 0.26319133,
        1.        ],
       [0.28185526, 0.56316155, 0.6810587 , ..., 0.06141905, 0.41160591,
        1.        ],
       ...,
       [0.76849751, 0.83233321, 0.94041108, ..., 0.14855884, 0.03545416,
        1.        ],
       [0.78515607, 0.92237872, 0.91017304, ..., 0.0641052 , 0.10736017,
        1.        ],
       [0.83358933, 0.79772383, 0.97745081, ..., 0.04715227, 0.15138637,
        1.        ]])

In [12]:
stats = []


for i in range(1,8):
    x_expansion = basis_expansion(x_train,i)
    
    for d in range (2,5):
        stat = {
            "n": i,
            "MSE": [],
            "R2": [],
            "MSPE": [],
            "Depth": []
        }
        for train_index, val_index in kfold.split(x_expansion, y_train):

            model = RandomForestRegressor(max_depth=d, random_state=0)
            model.fit(x_expansion[train_index], y_train[train_index])
            stat["MSE"].append(mean_squared_error(y_train[val_index], model.predict(x_expansion[val_index])))
            stat["R2"].append(r2_score(y_train[val_index], model.predict(x_expansion[val_index])))
            stat["MSPE"].append(mean_squared_percentage_error(y_train[val_index], model.predict(x_expansion[val_index])))
            stat["Depth"].append(d)
        stat["MSE"] = np.mean(stat["MSE"])
        stat["R2"] = np.mean(stat["R2"])
        stat["MSPE"] = np.mean(stat["MSPE"])
        stats.append(stat)
stats

[{'n': 1,
  'MSE': 0.38886844711605884,
  'R2': 0.7059578621462937,
  'MSPE': 1.9400488241791627,
  'Depth': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]},
 {'n': 1,
  'MSE': 0.2952002192047957,
  'R2': 0.7767032185496832,
  'MSPE': 1.4962882112259621,
  'Depth': [3, 3, 3, 3, 3, 3, 3, 3, 3, 3]},
 {'n': 1,
  'MSE': 0.2503767423493141,
  'R2': 0.8099876377897965,
  'MSPE': 1.2605608525722916,
  'Depth': [4, 4, 4, 4, 4, 4, 4, 4, 4, 4]},
 {'n': 2,
  'MSE': 0.39060194008587346,
  'R2': 0.7036240817551538,
  'MSPE': 1.9353816114980895,
  'Depth': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]},
 {'n': 2,
  'MSE': 0.2919247852910858,
  'R2': 0.7756946537288036,
  'MSPE': 1.4746112112255547,
  'Depth': [3, 3, 3, 3, 3, 3, 3, 3, 3, 3]},
 {'n': 2,
  'MSE': 0.252620863909025,
  'R2': 0.8083860522922883,
  'MSPE': 1.2597792286187368,
  'Depth': [4, 4, 4, 4, 4, 4, 4, 4, 4, 4]},
 {'n': 3,
  'MSE': 0.38817829613391125,
  'R2': 0.7074912547146812,
  'MSPE': 1.9311771913772873,
  'Depth': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]},
 {'n': 3,
 