In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import RANSACRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from utils import *

In [2]:
# Load the inputs (`data`) and the targets (`label`) that are split into train
# and test sets further down. `gather_data` is not imported by name, so it
# presumably comes from the `utils` star import -- confirm. The cell ends in an
# assignment, so the column Index shown in the output is emitted by the call
# itself rather than by the notebook's last-expression display.
data, label = gather_data()

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [3]:
# Hold out 20% of the samples for testing and build the 10-fold splitter used
# for model selection. Both steps shuffle, so they are seeded to make the
# notebook reproducible under Restart & Run All; the seed matches the
# random_state already passed to the RANSAC model.
RANDOM_STATE = 12

x_train, x_test, y_train, y_test = train_test_split(
    data, label, test_size=0.2, random_state=RANDOM_STATE
)
kfold = KFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

In [4]:
# Display the training split of the features as a sanity check (the repr in the
# output shows a NumPy array, truncated by NumPy's own summarisation).
x_train

array([[0.67702472, 0.77658302, 0.87431613, ..., 0.46946874, 0.24354921,
        0.2177181 ],
       [0.77484071, 0.8831805 , 0.85346745, ..., 0.76394188, 0.28592944,
        0.38348442],
       [0.84621312, 0.94086909, 0.93952393, ..., 0.75164288, 0.19467255,
        0.42059374],
       ...,
       [0.72311072, 0.77042299, 0.86884526, ..., 0.78543299, 0.15487497,
        0.7859844 ],
       [0.9183826 , 0.930291  , 0.96968808, ..., 0.75563037, 0.19152015,
        0.84997946],
       [0.59293672, 0.88034195, 0.82359899, ..., 0.77463913, 0.32409787,
        0.95849234]])

In [5]:
# Preview the degree-1 basis expansion. In the displayed output the visible
# leading columns match x_train and the last column is all ones.
basis_expansion(x_train,1)

array([[0.67702472, 0.77658302, 0.87431613, ..., 0.24354921, 0.2177181 ,
        1.        ],
       [0.77484071, 0.8831805 , 0.85346745, ..., 0.28592944, 0.38348442,
        1.        ],
       [0.84621312, 0.94086909, 0.93952393, ..., 0.19467255, 0.42059374,
        1.        ],
       ...,
       [0.72311072, 0.77042299, 0.86884526, ..., 0.15487497, 0.7859844 ,
        1.        ],
       [0.9183826 , 0.930291  , 0.96968808, ..., 0.19152015, 0.84997946,
        1.        ],
       [0.59293672, 0.88034195, 0.82359899, ..., 0.32409787, 0.95849234,
        1.        ]])

In [6]:
# Preview the degree-2 basis expansion. In the displayed output the columns just
# before the final all-ones column equal the squares of the last raw features
# (e.g. 0.2177181**2 ~= 0.0474), so the expansion appears to append element-wise
# powers -- confirm against the helper's source.
basis_expansion(x_train,2)

array([[0.67702472, 0.77658302, 0.87431613, ..., 0.05931622, 0.04740117,
        1.        ],
       [0.77484071, 0.8831805 , 0.85346745, ..., 0.08175565, 0.1470603 ,
        1.        ],
       [0.84621312, 0.94086909, 0.93952393, ..., 0.0378974 , 0.17689909,
        1.        ],
       ...,
       [0.72311072, 0.77042299, 0.86884526, ..., 0.02398626, 0.61777147,
        1.        ],
       [0.9183826 , 0.930291  , 0.96968808, ..., 0.03667997, 0.72246508,
        1.        ],
       [0.59293672, 0.88034195, 0.82359899, ..., 0.10503943, 0.91870756,
        1.        ]])

In [14]:
# Cross-validated search over the basis-expansion degree for a RANSAC regressor.
# For every degree the training features are expanded, a fresh model is fitted
# on each K-fold split, and the fold-averaged MSE / R2 / MSPE are recorded.
stats = []      # one summary dict per degree, in increasing degree order
best_stat = []  # at most one dict: the summary with the highest mean R2

for i in range(1, 9):
    x_expansion = basis_expansion(x_train, i)
    fold_scores = {"MSE": [], "R2": [], "MSPE": []}

    for train_index, val_index in kfold.split(x_expansion, y_train):
        # NOTE(review): min_samples=10 stays fixed while the number of expanded
        # columns presumably grows with the degree -- confirm the minimal
        # sample is still large enough for the higher degrees.
        model = RANSACRegressor(min_samples=10, max_trials=200,
                                loss='absolute_error', random_state=12,
                                residual_threshold=10)
        model.fit(x_expansion[train_index], y_train[train_index])

        # Predict once per fold and reuse the result for all three metrics.
        y_val = y_train[val_index]
        y_pred = model.predict(x_expansion[val_index])
        fold_scores["MSE"].append(mean_squared_error(y_val, y_pred))
        fold_scores["R2"].append(r2_score(y_val, y_pred))
        fold_scores["MSPE"].append(mean_squared_percentage_error(y_val, y_pred))

    stat = {
        "n": i,
        "MSE": np.mean(fold_scores["MSE"]),
        "R2": np.mean(fold_scores["R2"]),
        "MSPE": np.mean(fold_scores["MSPE"]),
    }
    stats.append(stat)

    # R2 has no lower bound, so a numeric sentinel such as -1 could outrank
    # every real candidate. The first evaluated degree seeds the best entry
    # instead, and later degrees replace it only on a strictly higher mean R2
    # (ties keep the lower degree).
    if not best_stat or stat["R2"] > best_stat[0]["R2"]:
        best_stat = [stat]
stats

[{'n': 1,
  'MSE': 0.2933026286879105,
  'R2': 0.7717256611942822,
  'MSPE': 1.4388976742571857},
 {'n': 2,
  'MSE': 0.2627031411561614,
  'R2': 0.7964131923121479,
  'MSPE': 1.2638602424395153},
 {'n': 3,
  'MSE': 0.26691236923515943,
  'R2': 0.7915438274715412,
  'MSPE': 1.2586786308940678},
 {'n': 4,
  'MSE': 0.3113028109447855,
  'R2': 0.7604921824294718,
  'MSPE': 1.6003197570238061},
 {'n': 5,
  'MSE': 0.5876545283814798,
  'R2': 0.5574681741243068,
  'MSPE': 3.5413078822608703},
 {'n': 6,
  'MSE': 1.8658557717361213,
  'R2': -0.41274277149279115,
  'MSPE': 12.753336756122131},
 {'n': 7,
  'MSE': 7312.916973590221,
  'R2': -6188.509232819182,
  'MSPE': 51887.12479842932},
 {'n': 8,
  'MSE': 35587.71950067166,
  'R2': -30100.065717849637,
  'MSPE': 252507.87672223133}]

In [15]:
# Show the per-degree summary that the search cell kept as having the highest
# fold-averaged R2 (a one-element list of dicts).
best_stat

[{'n': 2,
  'MSE': 0.2627031411561614,
  'R2': 0.7964131923121479,
  'MSPE': 1.2638602424395153}]