In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from utils import *

In [2]:
data, label = gather_data()

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [3]:
x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2)
kfold = KFold(n_splits=10, shuffle=True)

In [4]:
x_train

array([[0.66676948, 0.9066931 , 0.85731182, ..., 0.68706363, 0.3246991 ,
        0.19853479],
       [0.60243466, 0.77939796, 0.86854947, ..., 0.79599208, 0.25596791,
        0.53549528],
       [0.70695595, 0.91674048, 0.8765341 , ..., 0.55898947, 0.18982141,
        0.4608126 ],
       ...,
       [0.65310752, 0.90740263, 0.90728971, ..., 0.82477254, 0.27805644,
        0.29837251],
       [0.69982546, 0.90305102, 0.91054272, ..., 0.79179639, 0.2081836 ,
        0.46723404],
       [0.49320658, 0.73346853, 0.69540147, ..., 0.6822111 , 0.23596877,
        0.55879074]])

In [5]:
basis_expansion(x_train,1)

array([[0.66676948, 0.9066931 , 0.85731182, ..., 0.3246991 , 0.19853479,
        1.        ],
       [0.60243466, 0.77939796, 0.86854947, ..., 0.25596791, 0.53549528,
        1.        ],
       [0.70695595, 0.91674048, 0.8765341 , ..., 0.18982141, 0.4608126 ,
        1.        ],
       ...,
       [0.65310752, 0.90740263, 0.90728971, ..., 0.27805644, 0.29837251,
        1.        ],
       [0.69982546, 0.90305102, 0.91054272, ..., 0.2081836 , 0.46723404,
        1.        ],
       [0.49320658, 0.73346853, 0.69540147, ..., 0.23596877, 0.55879074,
        1.        ]])

In [6]:
basis_expansion(x_train,2)

array([[0.66676948, 0.9066931 , 0.85731182, ..., 0.10542951, 0.03941606,
        1.        ],
       [0.60243466, 0.77939796, 0.86854947, ..., 0.06551957, 0.2867552 ,
        1.        ],
       [0.70695595, 0.91674048, 0.8765341 , ..., 0.03603217, 0.21234825,
        1.        ],
       ...,
       [0.65310752, 0.90740263, 0.90728971, ..., 0.07731539, 0.08902615,
        1.        ],
       [0.69982546, 0.90305102, 0.91054272, ..., 0.04334041, 0.21830765,
        1.        ],
       [0.49320658, 0.73346853, 0.69540147, ..., 0.05568126, 0.31224709,
        1.        ]])

In [8]:
stats = []
best_stat = []

temp = {
    "n": -1,
    "MSE": -1,
    "R2": -1,
    "MSPE": -1,
    "Depth": -1
}

best_stat.append(temp)

samples = [2,5,10]
min_samples_leaf = [1, 2, 4]

for i in range(1,8):
    x_expansion = basis_expansion(x_train,i)
    
    for d in range (2,5):
        for s in samples:
            stat = {
                "n": i,
                "MSE": [],
                "R2": [],
                "MSPE": [],
                "Depth": [],
                "Min_sample":[]
            }
            for train_index, val_index in kfold.split(x_expansion, y_train):

                model = RandomForestRegressor(max_depth=d, random_state=0, min_samples_split=s)
                model.fit(x_expansion[train_index], y_train[train_index])
                stat["MSE"].append(mean_squared_error(y_train[val_index], model.predict(x_expansion[val_index])))
                stat["R2"].append(r2_score(y_train[val_index], model.predict(x_expansion[val_index])))
                stat["MSPE"].append(mean_squared_percentage_error(y_train[val_index], model.predict(x_expansion[val_index])))
            stat["Depth"].append(d)
            stat["Min_sample"].append(s)
            stat["MSE"] = np.mean(stat["MSE"])
            stat["R2"] = np.mean(stat["R2"])
            stat["MSPE"] = np.mean(stat["MSPE"])
            stats.append(stat)
            if stat["R2"] > best_stat[0]["R2"]:
                best_stat.pop()
                best_stat.append(stat)
stats

[{'n': 1,
  'MSE': 0.4031786292000481,
  'R2': 0.691536794669091,
  'MSPE': 1.965594327562058,
  'Depth': [2],
  'Min_sample': [2]},
 {'n': 1,
  'MSE': 0.39925691558118687,
  'R2': 0.6942737583787743,
  'MSPE': 1.9608150777818374,
  'Depth': [2],
  'Min_sample': [5]},
 {'n': 1,
  'MSE': 0.3977506766251509,
  'R2': 0.6953860720195673,
  'MSPE': 1.9601205707935143,
  'Depth': [2],
  'Min_sample': [10]},
 {'n': 1,
  'MSE': 0.29586751769256187,
  'R2': 0.7730581496371263,
  'MSPE': 1.4933306299584999,
  'Depth': [3],
  'Min_sample': [2]},
 {'n': 1,
  'MSE': 0.2998488145408818,
  'R2': 0.7719040085911884,
  'MSPE': 1.5074745386786044,
  'Depth': [3],
  'Min_sample': [5]},
 {'n': 1,
  'MSE': 0.29225129943705264,
  'R2': 0.773326621592995,
  'MSPE': 1.4819285886257088,
  'Depth': [3],
  'Min_sample': [10]},
 {'n': 1,
  'MSE': 0.25484047079412464,
  'R2': 0.803545752064888,
  'MSPE': 1.2760691700440743,
  'Depth': [4],
  'Min_sample': [2]},
 {'n': 1,
  'MSE': 0.25636041910956064,
  'R2': 0.804

In [20]:
best_stat

[{'n': 3,
  'MSE': 0.2502149533927715,
  'R2': 0.8093204051826082,
  'MSPE': 1.2499092194678019,
  'Depth': [4],
  'Min_sample': [5]}]

In [36]:
x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2)
x_train = basis_expansion(x_train,best_stat[0]["n"])
x_test = basis_expansion(x_test,best_stat[0]["n"])

In [37]:
model = RandomForestRegressor(max_depth=best_stat[0]["Depth"][0],random_state=0, min_samples_split=5)
model.fit(x_train, y_train)
mse = mean_squared_error(y_test, model.predict(x_test))
r_2 = r2_score(y_test, model.predict(x_test))
mspe = mean_squared_percentage_error(y_test, model.predict(x_test))

In [38]:
mse

0.2650352446157085

In [39]:
r_2

0.8037981644770132

In [40]:
mspe

1.5857716042789818