In [3]:
import sys
import logging
import warnings

import numpy as np
import scipy as sp

%reload_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
# %matplotlib inline
# %config InlineBackend.figure_format = 'retina'

import seaborn as sns
# Configure seaborn in a single call. `sns.set` (an alias of `set_theme`) resets
# the plotting context to its default "notebook" whenever `context` is not
# passed, so calling it *after* `sns.set_context("poster")` silently discarded
# the poster scaling. Passing context, style and rc together avoids that
# ordering trap.
sns.set(
    context="poster",
    style="whitegrid",
    rc={"figure.figsize": (16, 9.0)},
)

import pandas as pd
# Limit how much of a DataFrame is rendered: at most 10 rows and 120 columns.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 120

# Configure the root logger to emit INFO-and-above records on stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [4]:
# Read the training table; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer="../train.csv", index_col=0)
# Show the first five rows as the cell output.
df.head(n=5)


Unnamed: 0_level_0,y,x1,x2,x3,x4,x5
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,-6.822679,0.02,0.05,-0.09,-0.43,-0.08
1,-6.32629,-0.13,0.11,-0.08,-0.29,-0.03
2,-9.302728,0.08,0.06,-0.07,-0.41,-0.03
3,-7.371893,0.02,-0.12,0.01,-0.43,-0.02
4,-6.027647,-0.14,-0.12,-0.08,-0.02,-0.08


In [5]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of `df`.
df.describe()


Unnamed: 0,y,x1,x2,x3,x4,x5
count,700.0,700.0,700.0,700.0,700.0,700.0
mean,-6.308362,-0.057814,-0.054686,-0.059786,-0.281814,-0.0559
std,1.968879,0.138694,0.120136,0.083381,0.150631,0.052697
min,-12.719333,-0.45,-0.42,-0.35,-0.52,-0.56
25%,-7.657336,-0.15,-0.14,-0.11,-0.39,-0.08
50%,-6.335041,-0.07,-0.06,-0.05,-0.33,-0.05
75%,-5.005213,0.02,0.02,-0.01,-0.2,-0.03
max,1.332858,0.38,0.36,0.19,0.44,0.07


In [6]:
# Split the frame positionally: column 0 is the target, every remaining column
# is an input feature.
data = df.to_numpy()
y = data[:, 0]
X = data[:, 1:]

# Feature map applied element-wise to X: linear, squared, exponential and cosine
# terms, followed by one constant column of ones.
feature_blocks = [
    X,
    np.square(X),
    np.exp(X),
    np.cos(X),
    np.ones((len(X), 1)),
]
Phi = np.hstack(feature_blocks)
Phi.shape


(700, 21)

In [7]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression

# Compare the cross-validated RMSE of plain least squares for K = 2..10 folds.
Ks = [*range(2, 11)]
scores = []

# cross_validate clones the estimator for every split, so one template suffices.
ols_template = LinearRegression(fit_intercept=False)

for K in Ks:
    results = cross_validate(
        ols_template,
        Phi,
        y,
        scoring="neg_root_mean_squared_error",
        cv=K,
    )
    # The scorer reports negated RMSE; flip the sign of the fold average.
    mean_rmse = -results["test_score"].mean()
    scores.append(mean_rmse)

scores


[2.5050351507700395,
 2.538780042499441,
 2.0903173031122084,
 2.099364722928896,
 2.1863566496901394,
 2.1156335029784876,
 2.048492339811766,
 2.061990988756873,
 2.090929493436905]

## Ridge

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

# Coarse grid of Ridge regularisation strengths (kept in the order listed).
lambdas = [0.001, 0.005, 0.01, 0.25, 0.1, 0.3, 0.5, *range(1, 20), 50, 100]
param_grid = dict(alpha=lambdas)

# 5-fold grid search; `fit` returns the fitted search object itself.
clf = GridSearchCV(
    estimator=Ridge(fit_intercept=False),
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
).fit(Phi, y)

# Scores are negated RMSE values, so flip the sign to get RMSE per candidate.
scores = -clf.cv_results_["mean_test_score"]
best_lambda = clf.best_params_["alpha"]

print(f"Best lambda ({best_lambda}) had average score {min(scores)} on validation set")



Best lambda (11) had average score 1.960164319155829 on validation set


In [9]:
# Refine the Ridge search on 100 evenly spaced alphas in [10, 12].
lambdas = [*np.linspace(10, 12, num=100)]
param_grid = dict(alpha=lambdas)

scores = []

# 5-fold grid search; `fit` returns the fitted search object itself.
clf = GridSearchCV(
    estimator=Ridge(fit_intercept=False),
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
).fit(Phi, y)

# Scores are negated RMSE values, so flip the sign to get RMSE per candidate.
scores = -clf.cv_results_["mean_test_score"]
best_lambda = clf.best_params_["alpha"]

print(f"Best lambda ({best_lambda}) had average score {min(scores)} on validation set")



Best lambda (11.070707070707071) had average score 1.960164302519951 on validation set


## Lasso

In [10]:
from sklearn.linear_model import Lasso

# Coarse grid of Lasso regularisation strengths (kept in the order listed).
lambdas = [0.001, 0.005, 0.01, 0.25, 0.1, 0.3, 0.5, *range(1, 20), 50, 100]
param_grid = dict(alpha=lambdas)

scores = []

# 5-fold grid search over alpha with the iteration cap raised to 10000.
lasso_template = Lasso(fit_intercept=False, max_iter=10000)
clf = GridSearchCV(
    estimator=lasso_template,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
).fit(Phi, y)

# Scores are negated RMSE values, so flip the sign to get RMSE per candidate.
scores = -clf.cv_results_["mean_test_score"]
best_lambda = clf.best_params_["alpha"]

print(f"Best lambda ({best_lambda}) had average score {min(scores)} on validation set")



Best lambda (0.01) had average score 1.960050279175897 on validation set


In [11]:
# Refine the Lasso search on 100 evenly spaced alphas in [0.005, 0.25].
lambdas = [*np.linspace(0.005, 0.25, num=100)]
param_grid = dict(alpha=lambdas)

scores = []

# 5-fold grid search over alpha with the iteration cap raised to 10000.
lasso_template = Lasso(fit_intercept=False, max_iter=10000)
clf = GridSearchCV(
    estimator=lasso_template,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
).fit(Phi, y)

# Scores are negated RMSE values, so flip the sign to get RMSE per candidate.
scores = -clf.cv_results_["mean_test_score"]
best_lambda = clf.best_params_["alpha"]

print(f"Best lambda ({best_lambda}) had average score {min(scores)} on validation set")



Best lambda (0.03469696969696969) had average score 1.9579524087438933 on validation set


## Huber loss

In [12]:
from sklearn.linear_model import HuberRegressor

# Coarse grid of Huber regularisation strengths (kept in the order listed);
# max_iter is pinned to 1000 through a single-valued grid entry.
lambdas = [0.001, 0.005, 0.01, 0.25, 0.1, 0.3, 0.5, *range(1, 21), 50, 100]
param_grid = dict(alpha=lambdas, max_iter=[1000])

scores = []

# 5-fold grid search; `fit` returns the fitted search object itself.
clf = GridSearchCV(
    estimator=HuberRegressor(fit_intercept=False),
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
).fit(Phi, y)

# Scores are negated RMSE values, so flip the sign to get RMSE per candidate.
scores = -clf.cv_results_["mean_test_score"]
best_lambda = clf.best_params_["alpha"]

print(f"Best lambda ({best_lambda}) had average score {min(scores)} on validation set")



Best lambda (20) had average score 1.960586821311555 on validation set


In [13]:
# Refine the Huber search on 100 evenly spaced alphas in [19, 22];
# max_iter is pinned to 1000 through a single-valued grid entry.
lambdas = [*np.linspace(19.0, 22.0, num=100)]
param_grid = dict(alpha=lambdas, max_iter=[1000])

scores = []

# 5-fold grid search; `fit` returns the fitted search object itself.
clf = GridSearchCV(
    estimator=HuberRegressor(fit_intercept=False),
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
).fit(Phi, y)

# Scores are negated RMSE values, so flip the sign to get RMSE per candidate.
scores = -clf.cv_results_["mean_test_score"]
best_lambda = clf.best_params_["alpha"]

print(f"Best lambda ({best_lambda}) had average score {min(scores)} on validation set")



Best lambda (21.96969696969697) had average score 1.960572506287732 on validation set


In [14]:
# Shift the Huber search window to 100 evenly spaced alphas in [21.8, 23];
# max_iter is pinned to 1000 through a single-valued grid entry.
lambdas = [*np.linspace(21.8, 23.0, num=100)]
param_grid = dict(alpha=lambdas, max_iter=[1000])

scores = []

# 5-fold grid search; `fit` returns the fitted search object itself.
clf = GridSearchCV(
    estimator=HuberRegressor(fit_intercept=False),
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
).fit(Phi, y)

# Scores are negated RMSE values, so flip the sign to get RMSE per candidate.
scores = -clf.cv_results_["mean_test_score"]
best_lambda = clf.best_params_["alpha"]

print(f"Best lambda ({best_lambda}) had average score {min(scores)} on validation set")



Best lambda (21.96969696969697) had average score 1.960572506287732 on validation set


## Bagging

In [28]:
# Fit one Ridge model per fold of a K-fold split and keep the fitted estimators.
# NOTE(review): alpha=22.0 is close to the value the Huber search selected
# (about 21.97), while the Ridge search selected about 11.07 - confirm that this
# is the intended regularisation strength.
K = 8
scores = cross_validate(
    estimator=Ridge(alpha=22.0, fit_intercept=False),
    X=Phi,
    y=y,
    cv=K,
    return_estimator=True,
)

# Average the K coefficient vectors: start from zeros and add coef_ / K per fold.
bagging_weights = sum(
    (fold_model.coef_ / K for fold_model in scores["estimator"]),
    np.zeros(Phi.shape[1]),
)

print(bagging_weights)


[ 0.08172633 -0.15688982 -0.23201978  0.20436172  0.06445781 -0.07404427
  0.03332326  0.04181989 -0.09066717  0.00871613 -0.54409555 -0.72937594
 -0.80117497 -0.42567185 -0.52064818 -0.5511962  -0.60454698 -0.60876488
 -0.5434128  -0.59206614 -0.58798886]
