In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import RANSACRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from utils import *

In [2]:
# Load the train/test split from the project helper in utils.
# NOTE(review): whether gather_data() seeds its own split is not visible here — confirm.
x_train, x_test, y_train, y_test = gather_data()

# Shuffled 10-fold CV used for degree selection below. The shuffle is seeded so
# that the folds (and therefore the CV scores and the chosen degree) are
# identical after Restart Kernel -> Run All. 12 matches the seed already used
# for the RANSAC models in this notebook.
kfold = KFold(n_splits=10, shuffle=True, random_state=12)

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [3]:
# Inspect the training feature matrix returned by gather_data().
# NOTE(review): the displayed values all lie in [0, 1], so the features are
# presumably already scaled inside utils — confirm.
x_train

array([[0.59691466, 0.54038894, 0.84252554, ..., 0.40171635, 0.24253631,
        0.61630201],
       [0.3711317 , 0.63775593, 0.74567504, ..., 0.64917082, 0.24598596,
        0.89956087],
       [0.92383477, 0.95814401, 0.94750857, ..., 0.74425793, 0.21106331,
        0.57473975],
       ...,
       [0.54243438, 0.81330019, 0.81502294, ..., 0.78726292, 0.33403653,
        0.68708342],
       [0.44528346, 0.83141011, 0.70708269, ..., 0.66721171, 0.19417724,
        0.40184477],
       [0.70435793, 0.92250597, 0.84666569, ..., 0.54543263, 0.20621966,
        0.50782478]])

In [4]:
# Preview the degree-1 expansion. Judging by the output, the original columns
# are kept and a constant column of ones is appended last (presumably the
# intercept term — verify against utils.basis_expansion).
basis_expansion(x_train,1)

array([[0.59691466, 0.54038894, 0.84252554, ..., 0.24253631, 0.61630201,
        1.        ],
       [0.3711317 , 0.63775593, 0.74567504, ..., 0.24598596, 0.89956087,
        1.        ],
       [0.92383477, 0.95814401, 0.94750857, ..., 0.21106331, 0.57473975,
        1.        ],
       ...,
       [0.54243438, 0.81330019, 0.81502294, ..., 0.33403653, 0.68708342,
        1.        ],
       [0.44528346, 0.83141011, 0.70708269, ..., 0.19417724, 0.40184477,
        1.        ],
       [0.70435793, 0.92250597, 0.84666569, ..., 0.20621966, 0.50782478,
        1.        ]])

In [5]:
# Preview the degree-2 expansion. The columns shown just before the trailing
# ones are the squares of the last original features, so the helper appears to
# append per-feature powers (no cross terms visible here — TODO confirm).
basis_expansion(x_train,2)

array([[0.59691466, 0.54038894, 0.84252554, ..., 0.05882386, 0.37982817,
        1.        ],
       [0.3711317 , 0.63775593, 0.74567504, ..., 0.06050909, 0.80920976,
        1.        ],
       [0.92383477, 0.95814401, 0.94750857, ..., 0.04454772, 0.33032578,
        1.        ],
       ...,
       [0.54243438, 0.81330019, 0.81502294, ..., 0.1115804 , 0.47208363,
        1.        ],
       [0.44528346, 0.83141011, 0.70708269, ..., 0.0377048 , 0.16147922,
        1.        ],
       [0.70435793, 0.92250597, 0.84666569, ..., 0.04252655, 0.25788601,
        1.        ]])

In [6]:
# Model selection: for each expansion degree 1..8, run K-fold cross-validation
# on the training set with a RANSAC regressor and record the mean MSE, R2 and
# MSPE across folds. The degree with the highest mean R2 is kept in best_stat.
#
# NOTE(review): residual_threshold=10 is very large compared with the CV MSE
# reported below (~0.3), so nearly every sample is presumably counted as an
# inlier — confirm this is intended.
# NOTE(review): min_samples=10 looks smaller than the number of expanded
# columns for higher degrees, which may explain the blow-up for n >= 6 — verify.
stats = []

# best_stat stays a one-element list because later cells read best_stat[0].
# The sentinel R2 is -inf (not -1) so that the first evaluated degree always
# replaces the placeholder, even when every mean R2 is below -1; otherwise the
# placeholder's n = -1 would be passed to basis_expansion later on.
best_stat = [{
    "n": -1,
    "MSE": -1,
    "R2": -np.inf,
    "MSPE": -1
}]

for i in range(1, 9):
    x_expansion = basis_expansion(x_train, i)
    fold_mse, fold_r2, fold_mspe = [], [], []

    for train_index, val_index in kfold.split(x_expansion, y_train):
        model = RANSACRegressor(min_samples=10, max_trials=200,
                                loss='absolute_error', random_state=12,
                                residual_threshold=10)
        model.fit(x_expansion[train_index], y_train[train_index])

        # Predict once per fold and reuse the result for all three metrics.
        y_val = y_train[val_index]
        y_val_pred = model.predict(x_expansion[val_index])
        fold_mse.append(mean_squared_error(y_val, y_val_pred))
        fold_r2.append(r2_score(y_val, y_val_pred))
        fold_mspe.append(mean_squared_percentage_error(y_val, y_val_pred))

    # Fold-averaged scores for this degree.
    stat = {
        "n": i,
        "MSE": np.mean(fold_mse),
        "R2": np.mean(fold_r2),
        "MSPE": np.mean(fold_mspe)
    }
    stats.append(stat)

    # Strict '>' keeps the lowest degree when two degrees tie on mean R2.
    if stat["R2"] > best_stat[0]["R2"]:
        best_stat[0] = stat
stats

[{'n': 1,
  'MSE': 0.2907790177284104,
  'R2': 0.7729220421248381,
  'MSPE': 1.405066806845282},
 {'n': 2,
  'MSE': 0.26388404458731884,
  'R2': 0.7933153976455418,
  'MSPE': 1.255984080565467},
 {'n': 3,
  'MSE': 0.2743758153039756,
  'R2': 0.7836307891467473,
  'MSPE': 1.3012282394349586},
 {'n': 4,
  'MSE': 0.3091953048498569,
  'R2': 0.7598518617142409,
  'MSPE': 1.5600695844836354},
 {'n': 5,
  'MSE': 0.27245389380555385,
  'R2': 0.7846017055485179,
  'MSPE': 1.293741094821753},
 {'n': 6,
  'MSE': 7.8959115748444875,
  'R2': -5.618504567632564,
  'MSPE': 55.37914898948425},
 {'n': 7,
  'MSE': 147493.1338142447,
  'R2': -105983.3513193551,
  'MSPE': 1046517.3006092433},
 {'n': 8,
  'MSE': 52703.418133433115,
  'R2': -37204.89454853794,
  'MSPE': 373949.4799212336}]

In [7]:
# Show the cross-validation summary of the degree with the highest mean R2
# (a one-element list holding that degree's stat dict).
best_stat

[{'n': 2,
  'MSE': 0.26388404458731884,
  'R2': 0.7933153976455418,
  'MSPE': 1.255984080565467}]

In [8]:
# Apply the selected expansion degree to both splits.
# WARNING: this rebinds x_train / x_test to their expanded versions, so running
# this cell a second time expands the already-expanded matrices again. Re-run
# the data-loading cell first if this cell needs to be re-executed.
x_train, x_test = (
    basis_expansion(features, best_stat[0]["n"])
    for features in (x_train, x_test)
)

In [9]:
# Refit RANSAC on the full (expanded) training set with the same
# hyperparameters used during cross-validation, then score it on the test set.
model = RANSACRegressor(min_samples=10, max_trials=200,
                        loss='absolute_error', random_state=12,
                        residual_threshold=10)
model.fit(x_train, y_train)

# Predict once and reuse the predictions for every metric instead of calling
# model.predict(x_test) three times.
y_test_pred = model.predict(x_test)
mse = mean_squared_error(y_test, y_test_pred)
r_2 = r2_score(y_test, y_test_pred)
mspe = mean_squared_percentage_error(y_test, y_test_pred)

In [10]:
# Test-set mean squared error of the final model.
mse

0.23420397817920552

In [11]:
# Test-set coefficient of determination (R2) of the final model.
r_2

0.8318889692591606

In [12]:
# Test-set value of utils.mean_squared_percentage_error for the final model.
# NOTE(review): the scaling (fraction vs. percent) is defined in utils — confirm.
mspe

1.183527215943002