In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import RANSACRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from utils import *

In [4]:
data, label = gather_data()

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [5]:
x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2)
kfold = KFold(n_splits=10, shuffle=True)

In [6]:
x_train

array([[0.62946575, 0.85013747, 0.85997336, ..., 0.65389031, 0.32941639,
        0.53804529],
       [0.95002308, 0.80791056, 0.9776726 , ..., 0.66285312, 0.14768755,
        0.85616577],
       [0.57848157, 0.86955368, 0.88540588, ..., 0.71200293, 0.26641792,
        0.5922662 ],
       ...,
       [0.54956506, 0.85905248, 0.78929469, ..., 0.80473065, 0.23561792,
        0.34004632],
       [0.90342677, 0.93833929, 0.95549309, ..., 0.74078178, 0.20158459,
        0.5779885 ],
       [0.39133609, 0.82295787, 0.75232884, ..., 0.69422931, 0.16507852,
        0.61491633]])

In [7]:
basis_expansion(x_train,1)

array([[0.62946575, 0.85013747, 0.85997336, ..., 0.32941639, 0.53804529,
        1.        ],
       [0.95002308, 0.80791056, 0.9776726 , ..., 0.14768755, 0.85616577,
        1.        ],
       [0.57848157, 0.86955368, 0.88540588, ..., 0.26641792, 0.5922662 ,
        1.        ],
       ...,
       [0.54956506, 0.85905248, 0.78929469, ..., 0.23561792, 0.34004632,
        1.        ],
       [0.90342677, 0.93833929, 0.95549309, ..., 0.20158459, 0.5779885 ,
        1.        ],
       [0.39133609, 0.82295787, 0.75232884, ..., 0.16507852, 0.61491633,
        1.        ]])

In [6]:
basis_expansion(x_train,2)

array([[0.67702472, 0.77658302, 0.87431613, ..., 0.05931622, 0.04740117,
        1.        ],
       [0.77484071, 0.8831805 , 0.85346745, ..., 0.08175565, 0.1470603 ,
        1.        ],
       [0.84621312, 0.94086909, 0.93952393, ..., 0.0378974 , 0.17689909,
        1.        ],
       ...,
       [0.72311072, 0.77042299, 0.86884526, ..., 0.02398626, 0.61777147,
        1.        ],
       [0.9183826 , 0.930291  , 0.96968808, ..., 0.03667997, 0.72246508,
        1.        ],
       [0.59293672, 0.88034195, 0.82359899, ..., 0.10503943, 0.91870756,
        1.        ]])

In [8]:
stats = []
best_stat = []

temp = {
    "n": -1,
    "MSE": -1,
    "R2": -1,
    "MSPE": -1,
    "Depth": -1
}

best_stat.append(temp)


for i in range(1,9):
    x_expansion = basis_expansion(x_train,i)
    stat = {
            "n": i,
            "MSE": [],
            "R2": [],
            "MSPE": []
    }
    for train_index, val_index in kfold.split(x_expansion, y_train):
        
        model = RANSACRegressor(min_samples=10, max_trials=200,
                             loss='absolute_error', random_state=12,
                             residual_threshold=10)
        model.fit(x_expansion[train_index], y_train[train_index])
        stat["MSE"].append(mean_squared_error(y_train[val_index], model.predict(x_expansion[val_index])))
        stat["R2"].append(r2_score(y_train[val_index], model.predict(x_expansion[val_index])))
        stat["MSPE"].append(mean_squared_percentage_error(y_train[val_index], model.predict(x_expansion[val_index])))
    stat["MSE"] = np.mean(stat["MSE"])
    stat["R2"] = np.mean(stat["R2"])
    stat["MSPE"] = np.mean(stat["MSPE"])
    stats.append(stat)
    if stat["R2"] > best_stat[0]["R2"]:
        best_stat.pop()
        best_stat.append(stat)
stats

[{'n': 1,
  'MSE': 0.2866886008309331,
  'R2': 0.7772896223678832,
  'MSPE': 1.4068412379199213},
 {'n': 2,
  'MSE': 0.26335811972573187,
  'R2': 0.7947103104641323,
  'MSPE': 1.2721431552573885},
 {'n': 3,
  'MSE': 0.25608917586200086,
  'R2': 0.7993569340806339,
  'MSPE': 1.1913242700075368},
 {'n': 4,
  'MSE': 0.25650162337488824,
  'R2': 0.7977477553748266,
  'MSPE': 1.1995513869034449},
 {'n': 5,
  'MSE': 0.3674457390536509,
  'R2': 0.726128090275125,
  'MSPE': 1.6176034072536385},
 {'n': 6,
  'MSE': 0.48854315459610803,
  'R2': 0.6106053104797614,
  'MSPE': 2.064126690880401},
 {'n': 7,
  'MSE': 0.25642629834310443,
  'R2': 0.7993880896907081,
  'MSPE': 1.2141238032953334},
 {'n': 8,
  'MSE': 92.62369493589053,
  'R2': -66.20569328738223,
  'MSPE': 624.1116070926965}]

In [56]:
best_stat

[{'n': 7,
  'MSE': 0.25642629834310443,
  'R2': 0.7993880896907081,
  'MSPE': 1.2141238032953334}]

In [51]:
x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2)
x_train = basis_expansion(x_train,best_stat[0]["n"])
x_test = basis_expansion(x_test,best_stat[0]["n"])

In [52]:
model = RANSACRegressor(min_samples=10, max_trials=200,
                             loss='absolute_error', random_state=12,
                             residual_threshold=10)
model.fit(x_train, y_train)
mse = mean_squared_error(y_test, model.predict(x_test))
r_2 = r2_score(y_test, model.predict(x_test))
mspe = mean_squared_percentage_error(y_test, model.predict(x_test))

In [53]:
mse

0.22253287441387032

In [54]:
r_2

0.833638714292985

In [55]:
mspe

0.9921272945840178