In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from utils import *

In [2]:
data, label = gather_data()

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [3]:
x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2)
kfold = KFold(n_splits=10, shuffle=True)

In [4]:
x_train

array([[0.66676948, 0.9066931 , 0.85731182, ..., 0.68706363, 0.3246991 ,
        0.19853479],
       [0.60243466, 0.77939796, 0.86854947, ..., 0.79599208, 0.25596791,
        0.53549528],
       [0.70695595, 0.91674048, 0.8765341 , ..., 0.55898947, 0.18982141,
        0.4608126 ],
       ...,
       [0.65310752, 0.90740263, 0.90728971, ..., 0.82477254, 0.27805644,
        0.29837251],
       [0.69982546, 0.90305102, 0.91054272, ..., 0.79179639, 0.2081836 ,
        0.46723404],
       [0.49320658, 0.73346853, 0.69540147, ..., 0.6822111 , 0.23596877,
        0.55879074]])

In [5]:
basis_expansion(x_train,1)

array([[0.66676948, 0.9066931 , 0.85731182, ..., 0.3246991 , 0.19853479,
        1.        ],
       [0.60243466, 0.77939796, 0.86854947, ..., 0.25596791, 0.53549528,
        1.        ],
       [0.70695595, 0.91674048, 0.8765341 , ..., 0.18982141, 0.4608126 ,
        1.        ],
       ...,
       [0.65310752, 0.90740263, 0.90728971, ..., 0.27805644, 0.29837251,
        1.        ],
       [0.69982546, 0.90305102, 0.91054272, ..., 0.2081836 , 0.46723404,
        1.        ],
       [0.49320658, 0.73346853, 0.69540147, ..., 0.23596877, 0.55879074,
        1.        ]])

In [6]:
basis_expansion(x_train,2)

array([[0.66676948, 0.9066931 , 0.85731182, ..., 0.10542951, 0.03941606,
        1.        ],
       [0.60243466, 0.77939796, 0.86854947, ..., 0.06551957, 0.2867552 ,
        1.        ],
       [0.70695595, 0.91674048, 0.8765341 , ..., 0.03603217, 0.21234825,
        1.        ],
       ...,
       [0.65310752, 0.90740263, 0.90728971, ..., 0.07731539, 0.08902615,
        1.        ],
       [0.69982546, 0.90305102, 0.91054272, ..., 0.04334041, 0.21830765,
        1.        ],
       [0.49320658, 0.73346853, 0.69540147, ..., 0.05568126, 0.31224709,
        1.        ]])

In [43]:
stats = []
best_stat = []

temp = {
    "n": -1,
    "MSE": -1,
    "R2": -1,
    "MSPE": -1,
    "Depth": -1
}

best_stat.append(temp)

samples = [2,5,10]
min_samples_leaf = [1, 2, 4]

for i in range(1,8):
    x_expansion = basis_expansion(x_train,i)
    
    for d in range (2,5):
        for s in samples:
            for l in min_samples_leaf:
                stat = {
                    "n": i,
                    "MSE": [],
                    "R2": [],
                    "MSPE": [],
                    "Depth": d,
                    "Min_sample":s,
                    "Leaf_samples": l
                }
                for train_index, val_index in kfold.split(x_expansion, y_train):

                    model = RandomForestRegressor(max_depth=d, random_state=0, min_samples_split=s, min_samples_leaf=l)
                    model.fit(x_expansion[train_index], y_train[train_index])
                    stat["MSE"].append(mean_squared_error(y_train[val_index], model.predict(x_expansion[val_index])))
                    stat["R2"].append(r2_score(y_train[val_index], model.predict(x_expansion[val_index])))
                    stat["MSPE"].append(mean_squared_percentage_error(y_train[val_index], model.predict(x_expansion[val_index])))
                stat["MSE"] = np.mean(stat["MSE"])
                stat["R2"] = np.mean(stat["R2"])
                stat["MSPE"] = np.mean(stat["MSPE"])
                stats.append(stat)
                if stat["R2"] > best_stat[0]["R2"]:
                    best_stat.pop()
                    best_stat.append(stat)
stats

[{'n': 1,
  'MSE': 0.37808064364415384,
  'R2': 0.7069336091544832,
  'MSPE': 1.7137840763744674,
  'Depth': 2,
  'Min_sample': 2,
  'Leaf_samples': 1},
 {'n': 1,
  'MSE': 0.38476651308589477,
  'R2': 0.6995791933743023,
  'MSPE': 1.7380940266594909,
  'Depth': 2,
  'Min_sample': 2,
  'Leaf_samples': 2},
 {'n': 1,
  'MSE': 0.3883321305622852,
  'R2': 0.6979967179941056,
  'MSPE': 1.7641896863834652,
  'Depth': 2,
  'Min_sample': 2,
  'Leaf_samples': 4},
 {'n': 1,
  'MSE': 0.38553347442412095,
  'R2': 0.6976468753286553,
  'MSPE': 1.7493727460230477,
  'Depth': 2,
  'Min_sample': 5,
  'Leaf_samples': 1},
 {'n': 1,
  'MSE': 0.3905785697242331,
  'R2': 0.6961745596001463,
  'MSPE': 1.7706663669990277,
  'Depth': 2,
  'Min_sample': 5,
  'Leaf_samples': 2},
 {'n': 1,
  'MSE': 0.3841195117235431,
  'R2': 0.6997490864880633,
  'MSPE': 1.7435189303696603,
  'Depth': 2,
  'Min_sample': 5,
  'Leaf_samples': 4},
 {'n': 1,
  'MSE': 0.3832861561022566,
  'R2': 0.7026880394734012,
  'MSPE': 1.732906

In [50]:
best_stat

[{'n': 6,
  'MSE': 0.24406138301164365,
  'R2': 0.8108643663432991,
  'MSPE': 1.1590217701286334,
  'Depth': 4,
  'Min_sample': 10,
  'Leaf_samples': 2}]

In [45]:
x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2)
x_train = basis_expansion(x_train,best_stat[0]["n"])
x_test = basis_expansion(x_test,best_stat[0]["n"])

10

In [64]:
model = RandomForestRegressor(max_depth=best_stat[0]["Depth"],random_state=0, min_samples_split=best_stat[0]["Min_sample"], min_samples_leaf=best_stat[0]["Leaf_samples"])
model.fit(x_train, y_train)
mse = mean_squared_error(y_test, model.predict(x_test))
r_2 = r2_score(y_test, model.predict(x_test))
mspe = mean_squared_percentage_error(y_test, model.predict(x_test))

In [65]:
mse

0.26760107180585846

In [66]:
r_2

0.786572969806818

In [67]:
mspe

1.2857617986153596