In [72]:
import sys
import logging
import warnings

import numpy as np
import scipy as sp

%reload_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
# %matplotlib inline
# %config InlineBackend.figure_format = 'retina'

import seaborn as sns
# Configure seaborn in a single call. `sns.set(...)` re-applies a plotting
# context every time it runs (defaulting to "notebook"), so calling it after
# `sns.set_context("poster")` silently discarded the poster context. Passing
# context, style and rc together makes all three settings take effect.
sns.set(context="poster", style="whitegrid", rc={'figure.figsize': (16, 9.)})

import pandas as pd
# Keep rendered frames compact: at most 10 rows, but up to 120 columns.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 120

# Route INFO-and-above log records to stdout so they appear in cell output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [73]:
# Read the training table; the first CSV column is used as the row index.
train_csv = '../train.csv'
df = pd.read_csv(train_csv, index_col=0)

# Preview the first rows.
df.head()

Unnamed: 0_level_0,y,x1,x2,x3,x4,x5
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,-6.822679,0.02,0.05,-0.09,-0.43,-0.08
1,-6.32629,-0.13,0.11,-0.08,-0.29,-0.03
2,-9.302728,0.08,0.06,-0.07,-0.41,-0.03
3,-7.371893,0.02,-0.12,0.01,-0.43,-0.02
4,-6.027647,-0.14,-0.12,-0.08,-0.02,-0.08


In [74]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,y,x1,x2,x3,x4,x5
count,700.0,700.0,700.0,700.0,700.0,700.0
mean,-6.308362,-0.057814,-0.054686,-0.059786,-0.281814,-0.0559
std,1.968879,0.138694,0.120136,0.083381,0.150631,0.052697
min,-12.719333,-0.45,-0.42,-0.35,-0.52,-0.56
25%,-7.657336,-0.15,-0.14,-0.11,-0.39,-0.08
50%,-6.335041,-0.07,-0.06,-0.05,-0.33,-0.05
75%,-5.005213,0.02,0.02,-0.01,-0.2,-0.03
max,1.332858,0.38,0.36,0.19,0.44,0.07


In [75]:
# Split the table positionally: column 0 is the target, the rest are inputs.
data = df.to_numpy()
y = data[:, 0]
X = data[:, 1:]

# Feature map stacked column-wise: [x, x^2, exp(x), cos(x), 1].
bias_column = np.ones((len(X), 1))
Phi = np.hstack([X, np.square(X), np.exp(X), np.cos(X), bias_column])
Phi.shape

(700, 21)

In [76]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression

# Baseline: unregularised least squares on Phi, scored by K-fold CV RMSE
# for several fold counts.
Ks = [2, 5, 10]
baseline = LinearRegression()  # prototype; cross_validate fits its own clones per fold
scores = []

for K in Ks:
    results = cross_validate(
        baseline,
        Phi,
        y,
        cv=K,
        scoring='neg_root_mean_squared_error',
    )
    # The scorer reports negated RMSE, so flip the sign of the fold average.
    scores.append(-results['test_score'].mean())

scores

[2.5050351506821755, 2.0993647229310843, 2.090929493439855]

## Ridge

In [77]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

# For each fold count K, grid-search the Ridge penalty and record the winning
# alpha together with its mean cross-validated RMSE.
Ks = np.arange(2, 10)
# NOTE(review): 0.25 sits between 0.01 and 0.1 -- possibly a typo for 0.025 or
# 0.05; the value is left unchanged so the recorded results stay valid.
lambdas = [.001, .005, .01, .25, .1, .3, .5] + list(range(1,20)) + [50, 100]
param_grid = {'alpha': lambdas}

best_scores = {}
best_lambdas = {}

for K in Ks:
    clf = GridSearchCV(Ridge(), param_grid, cv=K, scoring='neg_root_mean_squared_error')
    clf.fit(Phi, y)

    # Scores are negated RMSE: the largest one is the smallest RMSE.
    mean_neg_rmse = clf.cv_results_['mean_test_score']
    best_scores[K] = -max(mean_neg_rmse)
    best_lambdas[K] = clf.best_estimator_.alpha

print('Best lambdas:')
for K in Ks:
    print(f'K={K}:     {best_lambdas[K]} with score {best_scores[K]}')

Best lambdas:
K=2:     19 with score 1.9641841653229637
K=3:     4 with score 1.956727582642327
K=4:     4 with score 1.95814408487681
K=5:     4 with score 1.9614694302731703
K=6:     3 with score 1.9530755999700011
K=7:     5 with score 1.9588700209493335
K=8:     5 with score 1.9499058563629912
K=9:     3 with score 1.9555750247079782


In [84]:
# Refine the Ridge penalty on a dense grid of 100 values in [2, 6] with 8-fold CV.
lambdas = list(np.linspace(2, 6, 100))
param_grid = {'alpha': lambdas}

clf = GridSearchCV(Ridge(), param_grid, cv=8, scoring='neg_root_mean_squared_error')
clf.fit(Phi, y)

# The scorer returns negated RMSE; flip the sign to get one RMSE per candidate.
scores = -clf.cv_results_['mean_test_score']
# Read the winner directly from the search object instead of re-deriving it
# from the refit estimator and the score array.
best_lambda = clf.best_params_['alpha']

print(f'Best lambda ({best_lambda}) had average score {-clf.best_score_} on validation set')

Best lambda (4.787878787878788) had average score 1.9499034976074852 on validation set


## Lasso

In [79]:
from sklearn.linear_model import Lasso

# Coarse search of the Lasso penalty over several orders of magnitude, 8-fold CV.
# NOTE(review): 0.25 sits between 0.01 and 0.1 -- possibly a typo for 0.025 or
# 0.05; the value is left unchanged so the recorded results stay valid.
lambdas = [.001, .005, .01, .25, .1, .3, .5] + list(range(1,20)) + [50, 100]
param_grid = {'alpha': lambdas}

clf = GridSearchCV(Lasso(), param_grid, cv=8, scoring='neg_root_mean_squared_error')
clf.fit(Phi, y)

# The scorer returns negated RMSE; flip the sign to get one RMSE per candidate.
scores = -clf.cv_results_['mean_test_score']
# Read the winner directly from the search object instead of re-deriving it
# from the refit estimator and the score array.
best_lambda = clf.best_params_['alpha']

print(f'Best lambda ({best_lambda}) had average score {-clf.best_score_} on validation set')

Best lambda (0.005) had average score 1.9459369332624643 on validation set


In [80]:
# Refine the Lasso penalty on a dense grid of 100 values in [0.001, 0.01] with 8-fold CV.
lambdas = list(np.linspace(0.001, 0.01, 100))
param_grid = {'alpha': lambdas}

clf = GridSearchCV(Lasso(), param_grid, cv=8, scoring='neg_root_mean_squared_error')
clf.fit(Phi, y)

# The scorer returns negated RMSE; flip the sign to get one RMSE per candidate.
scores = -clf.cv_results_['mean_test_score']
# Read the winner directly from the search object instead of re-deriving it
# from the refit estimator and the score array.
best_lambda = clf.best_params_['alpha']

print(f'Best lambda ({best_lambda}) had average score {-clf.best_score_} on validation set')

Best lambda (0.0038181818181818187) had average score 1.9456735716763538 on validation set


## Huber loss

In [82]:
from sklearn.linear_model import HuberRegressor

# Coarse search of the Huber regressor's alpha penalty, 8-fold CV.
# max_iter is pinned to 1000 through the grid so every candidate uses it.
# NOTE(review): 0.25 sits between 0.01 and 0.1 -- possibly a typo for 0.025 or
# 0.05; the value is left unchanged so the recorded results stay valid.
lambdas = [.001, .005, .01, .25, .1, .3, .5] + list(range(1,20)) + [50, 100]
param_grid = {'alpha': lambdas, 'max_iter': [1000]}

clf = GridSearchCV(HuberRegressor(), param_grid, cv=8, scoring='neg_root_mean_squared_error')
clf.fit(Phi, y)

# The scorer returns negated RMSE; flip the sign to get one RMSE per candidate.
scores = -clf.cv_results_['mean_test_score']
# Read the winner directly from the search object instead of re-deriving it
# from the refit estimator and the score array.
best_lambda = clf.best_params_['alpha']

print(f'Best lambda ({best_lambda}) had average score {-clf.best_score_} on validation set')


Best lambda (3) had average score 1.9502500174257702 on validation set


In [83]:
# Refine the Huber alpha on a dense grid of 100 values in [2.5, 3.5] with 8-fold CV.
# max_iter is pinned to 1000 through the grid so every candidate uses it.
lambdas = list(np.linspace(2.5, 3.5, 100))
param_grid = {'alpha': lambdas, 'max_iter': [1000]}

clf = GridSearchCV(HuberRegressor(), param_grid, cv=8, scoring='neg_root_mean_squared_error')
clf.fit(Phi, y)

# The scorer returns negated RMSE; flip the sign to get one RMSE per candidate.
scores = -clf.cv_results_['mean_test_score']
# Read the winner directly from the search object instead of re-deriving it
# from the refit estimator and the score array.
best_lambda = clf.best_params_['alpha']

print(f'Best lambda ({best_lambda}) had average score {-clf.best_score_} on validation set')

Best lambda (2.8737373737373737) had average score 1.950221632454722 on validation set
