In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from utils import *

In [3]:
data, label = gather_data()

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [4]:
x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2)
kfold = KFold(n_splits=10, shuffle=True)

In [5]:
x_train

array([[0.4565977 , 0.70592159, 0.73606389, ..., 0.72987455, 0.22564766,
        0.57765406],
       [0.7357707 , 0.91380167, 0.89871361, ..., 0.74227607, 0.28032336,
        0.41303211],
       [0.56987034, 0.79365999, 0.84474345, ..., 0.74240822, 0.34573647,
        0.25170478],
       ...,
       [0.56775221, 0.79595876, 0.82995712, ..., 0.73572916, 0.37641224,
        0.40174228],
       [0.22359955, 0.77126515, 0.66154074, ..., 0.71419024, 0.11524848,
        0.58404481],
       [0.43228126, 0.71181863, 0.81587314, ..., 0.51524228, 0.41232789,
        0.39373705]])

In [6]:
basis_expansion(x_train,1)

array([[0.4565977 , 0.70592159, 0.73606389, ..., 0.22564766, 0.57765406,
        1.        ],
       [0.7357707 , 0.91380167, 0.89871361, ..., 0.28032336, 0.41303211,
        1.        ],
       [0.56987034, 0.79365999, 0.84474345, ..., 0.34573647, 0.25170478,
        1.        ],
       ...,
       [0.56775221, 0.79595876, 0.82995712, ..., 0.37641224, 0.40174228,
        1.        ],
       [0.22359955, 0.77126515, 0.66154074, ..., 0.11524848, 0.58404481,
        1.        ],
       [0.43228126, 0.71181863, 0.81587314, ..., 0.41232789, 0.39373705,
        1.        ]])

In [7]:
basis_expansion(x_train,2)

array([[0.4565977 , 0.70592159, 0.73606389, ..., 0.05091687, 0.33368422,
        1.        ],
       [0.7357707 , 0.91380167, 0.89871361, ..., 0.07858118, 0.17059553,
        1.        ],
       [0.56987034, 0.79365999, 0.84474345, ..., 0.11953371, 0.0633553 ,
        1.        ],
       ...,
       [0.56775221, 0.79595876, 0.82995712, ..., 0.14168618, 0.16139686,
        1.        ],
       [0.22359955, 0.77126515, 0.66154074, ..., 0.01328221, 0.34110834,
        1.        ],
       [0.43228126, 0.71181863, 0.81587314, ..., 0.17001429, 0.15502886,
        1.        ]])

In [17]:
stats = []
best_stat = []

temp = {
    "n": -1,
    "MSE": -1,
    "R2": -1,
    "MSPE": -1,
    "Depth": -1
}

best_stat.append(temp)

samples = [2,5,10]
min_samples_leaf = [1, 2, 4]

for i in range(1,8):
    x_expansion = basis_expansion(x_train,i)
    
    for d in range (2,5):
        for s in samples:
            for l in min_samples_leaf:
                stat = {
                    "n": i,
                    "MSE": [],
                    "R2": [],
                    "MSPE": [],
                    "Depth": d,
                    "Min_sample":s,
                    "Leaf_samples": l
                }
                for train_index, val_index in kfold.split(x_expansion, y_train):

                    model = RandomForestRegressor(max_depth=d, random_state=0, min_samples_split=s, min_samples_leaf=l)
                    model.fit(x_expansion[train_index], y_train[train_index])
                    stat["MSE"].append(mean_squared_error(y_train[val_index], model.predict(x_expansion[val_index])))
                    stat["R2"].append(r2_score(y_train[val_index], model.predict(x_expansion[val_index])))
                    stat["MSPE"].append(mean_squared_percentage_error(y_train[val_index], model.predict(x_expansion[val_index])))
                stat["MSE"] = np.mean(stat["MSE"])
                stat["R2"] = np.mean(stat["R2"])
                stat["MSPE"] = np.mean(stat["MSPE"])
                stats.append(stat)
                if stat["R2"] > best_stat[0]["R2"]:
                    best_stat.pop()
                    best_stat.append(stat)
stats

[{'n': 1,
  'MSE': 0.4035578531331573,
  'R2': 0.6871396429875037,
  'MSPE': 1.9814491643363397,
  'Depth': 2,
  'Min_sample': 2,
  'Leaf_samples': 1},
 {'n': 1,
  'MSE': 0.4018204735799781,
  'R2': 0.6886393838167141,
  'MSPE': 1.9939979899680609,
  'Depth': 2,
  'Min_sample': 2,
  'Leaf_samples': 2},
 {'n': 1,
  'MSE': 0.39836936305793963,
  'R2': 0.6931221001360226,
  'MSPE': 1.9771322370515019,
  'Depth': 2,
  'Min_sample': 2,
  'Leaf_samples': 4},
 {'n': 1,
  'MSE': 0.40676288680852946,
  'R2': 0.6857319003706444,
  'MSPE': 2.0064136139422537,
  'Depth': 2,
  'Min_sample': 5,
  'Leaf_samples': 1},
 {'n': 1,
  'MSE': 0.39687483086131603,
  'R2': 0.6937972698837317,
  'MSPE': 1.966253574987702,
  'Depth': 2,
  'Min_sample': 5,
  'Leaf_samples': 2},
 {'n': 1,
  'MSE': 0.39910537306012916,
  'R2': 0.6922204293760188,
  'MSPE': 1.9799446640476497,
  'Depth': 2,
  'Min_sample': 5,
  'Leaf_samples': 4},
 {'n': 1,
  'MSE': 0.4007507109768122,
  'R2': 0.6899931912770214,
  'MSPE': 1.980418

In [18]:
best_stat

[{'n': 2,
  'MSE': 0.24962770521480296,
  'R2': 0.8058785638030015,
  'MSPE': 1.2459977438795293,
  'Depth': 4,
  'Min_sample': 10,
  'Leaf_samples': 1}]

In [19]:
x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2)
x_train = basis_expansion(x_train,best_stat[0]["n"])
x_test = basis_expansion(x_test,best_stat[0]["n"])

In [20]:
model = RandomForestRegressor(max_depth=best_stat[0]["Depth"],random_state=0, min_samples_split=best_stat[0]["Min_sample"], min_samples_leaf=best_stat[0]["Leaf_samples"])
model.fit(x_train, y_train)
mse = mean_squared_error(y_test, model.predict(x_test))
r_2 = r2_score(y_test, model.predict(x_test))
mspe = mean_squared_percentage_error(y_test, model.predict(x_test))

In [21]:
mse

0.259269319613042

In [22]:
r_2

0.8053014468204451

In [24]:
mspe

1.3053727190620659