In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from utils import *

In [3]:
x_train, x_test, y_train, y_test = gather_data()

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [3]:
kfold = KFold(n_splits=10, shuffle=True)

In [4]:
basis_expansion(x_train,1)

array([[0.59691466, 0.54038894, 0.84252554, ..., 0.24253631, 0.61630201,
        1.        ],
       [0.3711317 , 0.63775593, 0.74567504, ..., 0.24598596, 0.89956087,
        1.        ],
       [0.92383477, 0.95814401, 0.94750857, ..., 0.21106331, 0.57473975,
        1.        ],
       ...,
       [0.54243438, 0.81330019, 0.81502294, ..., 0.33403653, 0.68708342,
        1.        ],
       [0.44528346, 0.83141011, 0.70708269, ..., 0.19417724, 0.40184477,
        1.        ],
       [0.70435793, 0.92250597, 0.84666569, ..., 0.20621966, 0.50782478,
        1.        ]])

In [5]:
basis_expansion(x_train,2)

array([[0.59691466, 0.54038894, 0.84252554, ..., 0.05882386, 0.37982817,
        1.        ],
       [0.3711317 , 0.63775593, 0.74567504, ..., 0.06050909, 0.80920976,
        1.        ],
       [0.92383477, 0.95814401, 0.94750857, ..., 0.04454772, 0.33032578,
        1.        ],
       ...,
       [0.54243438, 0.81330019, 0.81502294, ..., 0.1115804 , 0.47208363,
        1.        ],
       [0.44528346, 0.83141011, 0.70708269, ..., 0.0377048 , 0.16147922,
        1.        ],
       [0.70435793, 0.92250597, 0.84666569, ..., 0.04252655, 0.25788601,
        1.        ]])

In [6]:
stats = []
best_stat = []

temp = {
    "n": -1,
    "MSE": -1,
    "R2": -1,
    "MSPE": -1,
    "Depth": -1
}

best_stat.append(temp)

samples = [2,5,10]
min_samples_leaf = [1, 2, 4]

for i in range(1,8):
    x_expansion = basis_expansion(x_train,i)
    
    for d in range (2,5):
        for s in samples:
            for l in min_samples_leaf:
                stat = {
                    "n": i,
                    "MSE": [],
                    "R2": [],
                    "MSPE": [],
                    "Depth": d,
                    "Min_sample":s,
                    "Leaf_samples": l
                }
                for train_index, val_index in kfold.split(x_expansion, y_train):

                    model = RandomForestRegressor(max_depth=d, random_state=0, min_samples_split=s, min_samples_leaf=l)
                    model.fit(x_expansion[train_index], y_train[train_index])
                    stat["MSE"].append(mean_squared_error(y_train[val_index], model.predict(x_expansion[val_index])))
                    stat["R2"].append(r2_score(y_train[val_index], model.predict(x_expansion[val_index])))
                    stat["MSPE"].append(mean_squared_percentage_error(y_train[val_index], model.predict(x_expansion[val_index])))
                stat["MSE"] = np.mean(stat["MSE"])
                stat["R2"] = np.mean(stat["R2"])
                stat["MSPE"] = np.mean(stat["MSPE"])
                stats.append(stat)
                if stat["R2"] > best_stat[0]["R2"]:
                    best_stat.pop()
                    best_stat.append(stat)
stats

[{'n': 1,
  'MSE': 0.38908989827812834,
  'R2': 0.6957225566930937,
  'MSPE': 1.9349858294514892,
  'Depth': 2,
  'Min_sample': 2,
  'Leaf_samples': 1},
 {'n': 1,
  'MSE': 0.39735981199554976,
  'R2': 0.68892616712995,
  'MSPE': 1.9646262987613656,
  'Depth': 2,
  'Min_sample': 2,
  'Leaf_samples': 2},
 {'n': 1,
  'MSE': 0.3905109555924028,
  'R2': 0.6947055964242613,
  'MSPE': 1.9422112489410897,
  'Depth': 2,
  'Min_sample': 2,
  'Leaf_samples': 4},
 {'n': 1,
  'MSE': 0.3856769696781049,
  'R2': 0.6969037811713965,
  'MSPE': 1.9216720660257924,
  'Depth': 2,
  'Min_sample': 5,
  'Leaf_samples': 1},
 {'n': 1,
  'MSE': 0.3872856941342572,
  'R2': 0.6918940318231249,
  'MSPE': 1.9244301820666494,
  'Depth': 2,
  'Min_sample': 5,
  'Leaf_samples': 2},
 {'n': 1,
  'MSE': 0.390702147685435,
  'R2': 0.6943710413286748,
  'MSPE': 1.9448536403645722,
  'Depth': 2,
  'Min_sample': 5,
  'Leaf_samples': 4},
 {'n': 1,
  'MSE': 0.3909262921933883,
  'R2': 0.6948398224419489,
  'MSPE': 1.9420478402

In [7]:
best_stat

[{'n': 1,
  'MSE': 0.2497741829127314,
  'R2': 0.8046396101552427,
  'MSPE': 1.2561352141370694,
  'Depth': 4,
  'Min_sample': 2,
  'Leaf_samples': 4}]

In [4]:
x_train = basis_expansion(x_train, 1)
x_test = basis_expansion(x_test, 1)

In [5]:
model = RandomForestRegressor(max_depth=4, min_samples_split=2, min_samples_leaf=4)
model.fit(x_train, y_train)
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)
print(f"Train MSE: {mean_squared_error(y_train, y_pred_train)}")
print(f"Train R2: {r2_score(y_train, y_pred_train)}")
print(f"Train MSPE: {mean_squared_percentage_error(y_train, y_pred_train)}")
print(f"Test MSE: {mean_squared_error(y_test, y_pred_test)}")
print(f"Test R2: {r2_score(y_test, y_pred_test)}")
print(f"Test MSPE: {mean_squared_percentage_error(y_test, y_pred_test)}")

Train MSE: 0.21744987307997413
Train R2: 0.8309055245433485
Train MSPE: 1.0762707358175807
Test MSE: 0.2278121361656486
Test R2: 0.8364770174963663
Test MSPE: 1.1844067920927464
