In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
# np.random.seed(123)
sns.set_style("whitegrid")
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Bigger font
# sns.set_context("poster")
# sns.set(style="white", font_scale=2)
# Gray style
sns.set(font_scale=2)
plt.style.use('fivethirtyeight')
# Figure size
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 10, 4
plt.rc('figure.subplot', wspace=.33)
# Slides
from notebook.services.config import ConfigManager
cm = ConfigManager()
cm.update('livereveal', {
        'width': 1024,
        'height': 768,
        'scroll': True,
});

- When using a polynomial model, a too high degree = overfitting
- If parameteres are too large e.g. [130, -525.8, 102.6], the model is overfitted

## Validation Strategies

There are 4 recommended ways to avoid overfitting.

Having a dataset with $N$ samples (this would be the TRAIN set in Kaggle competitions).

In [3]:
X_train = np.array([[1, 1], 
              [2, 2], 
              [3, 3], 
              [4, 4], 
              [5, 5], 
              [6, 6]])
y_train = np.array([1, 1, 1, 1, 2, 2])

## a. Hold-out Validation

- This shuffles and splits the data into TRAIN and TEST.
- In Deep Learning, this is what's preferred.

![](overfitting1.png)

In [4]:
from sklearn.model_selection import ShuffleSplit
rs = ShuffleSplit(n_splits=1, test_size=.20)

for a_index, e_index in rs.split(X_train):
    print("TRAIN:", a_index, "TEST:", e_index)
    X_a, X_e = X_train[a_index], X_train[e_index]
    y_a, y_e = y_train[a_index], y_train[e_index]

TRAIN: [0 2 3 5] TEST: [4 1]


## b. K-fold / K-fold Cross Validation

- This can be seen as a repeated hold-out and we use every part of TRAIN as VALIDATION in each of the K iterations.
- Then, we need to average scores over these K folds.

![](images/kfold_iterations.jpg)

In [5]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)

for a_index, e_index in kf.split(X_train):
    print("TRAIN:", a_index, "TEST:", e_index)
    X_a, X_e = X_train[a_index], X_train[e_index]
    y_a, y_e = y_train[a_index], y_train[e_index]

TRAIN: [2 3 4 5] TEST: [0 1]
TRAIN: [0 1 4 5] TEST: [2 3]
TRAIN: [0 1 2 3] TEST: [4 5]


## c. Leave-one-out Validation

- Here the number of folds is the length of TRAIN, which is $N$.
- It does the same as K fold but it splits the data into $N - 1$ TRAIN samples and $1$ sample is the TEST set.
- It can be super slow

In [6]:
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()

for a_index, e_index in loo.split(X_train):
    print("TRAIN:", a_index, "TEST:", e_index)
    X_a, X_e = X_train[a_index], X_train[e_index]
    y_a, y_e = y_train[a_index], y_train[e_index]

TRAIN: [1 2 3 4 5] TEST: [0]
TRAIN: [0 2 3 4 5] TEST: [1]
TRAIN: [0 1 3 4 5] TEST: [2]
TRAIN: [0 1 2 4 5] TEST: [3]
TRAIN: [0 1 2 3 5] TEST: [4]
TRAIN: [0 1 2 3 4] TEST: [5]


## d. Stratified K-fold

- This applies stratification, which returns stratified folds. The folds are made by preserving the percentage of samples for each class.
- Useful for **small datasets, unbalanced datasets, multiclass classification**.

![](images/stratification.jpg)

The first folds would come from the simple k-fold algorithm, and the other would use stratified folds. Those gray rows have scores a model would get.

In [7]:
data = pd.DataFrame(np.concatenate((X_train, y_train[:, np.newaxis]),1))
data

Unnamed: 0,0,1,2
0,1,1,1
1,2,2,1
2,3,3,1
3,4,4,1
4,5,5,2
5,6,6,2


In [8]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=3)

for a_index, e_index in skf.split(X_train, y_train):
    print("TRAIN:", a_index, "TEST:", e_index)

TRAIN: [2 3 5] TEST: [0 1 4]
TRAIN: [0 1 3 4] TEST: [2 5]
TRAIN: [0 1 2 4 5] TEST: [3]


## e. Predefined Split

In [9]:
from sklearn.model_selection import PredefinedSplit
# from sklearn.cross_validation import PredefinedSplit ## DEPRECATED

ps = PredefinedSplit(test_fold=([-1,0,0,0,1,1]))  ## only parameter = test_fold, 1 = test data, -1 = not in test

for a_index, e_index in ps.split():
    print("TRAIN:", a_index, "TEST:", e_index)
    X_a, X_e = X_train[a_index], X_train[e_index]
    y_a, y_e = y_train[a_index], y_train[e_index]

TRAIN: [0 4 5] TEST: [1 2 3]
TRAIN: [0 1 2 3] TEST: [4 5]


## Validation strategies that consider a correct splitting

After doing feature engineering, we should get a training and a validation set, because we also do the splitting part there.

In [10]:
X_train = np.array([[1, 1], 
              [2, 2], 
              [3, 3], 
              [4, 4], 
              [5, 5], 
              [6, 6]])
y_train = np.array([1, 1, 1, 1, 2, 2])

X_val = np.array([[7, 7], 
              [8, 8], 
              [9, 9], 
              [10, 10]])
y_val = np.array([2, 2, 3, 3])

# This is for training the final models
X = np.concatenate((X_train, X_val))
y = np.append(y_train, y_val)

# This is for testing the final model
X_test_leaderboard = np.array([[11, 11], 
                              [12, 12], 
                              [13, 13], 
                              [14, 14]])
y_test_leaderboard = np.array([3, 3, 4, 4])

This only serves as input for GridSearch.

It simply separates train and test for the method.

In [11]:
train_ind = np.zeros(X.shape[0])
for i in range(0, len(X_train)):
    train_ind[i] = -1
ps = PredefinedSplit(test_fold=(train_ind))

In [12]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

params = {'alpha':[0.1,0.2,0.3],
         'normalize':[True, False]}

model = Lasso()

from sklearn.metrics import mean_squared_error
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))
score = make_scorer(rmse, greater_is_better=False)

best_model = GridSearchCV(model, params, verbose=3, n_jobs=-1, cv=ps, scoring=score)
best_model.fit(X, y);

Fitting 1 folds for each of 6 candidates, totalling 6 fits
[CV] alpha=0.1, normalize=True .......................................
[CV] alpha=0.1, normalize=False ......................................
[CV] alpha=0.2, normalize=True .......................................
[CV] alpha=0.2, normalize=False ......................................
[CV]  alpha=0.1, normalize=True, score=-0.8502469500376576, total=   0.0s
[CV]  alpha=0.1, normalize=False, score=-0.37552080237434643, total=   0.0s
[CV] alpha=0.3, normalize=True .......................................
[CV]  alpha=0.2, normalize=False, score=-0.5064034403955454, total=   0.0s
[CV]  alpha=0.2, normalize=True, score=-1.2692955176439846, total=   0.0s
[CV] alpha=0.3, normalize=False ......................................
[CV]  alpha=0.3, normalize=True, score=-1.2692955176439846, total=   0.0s
[CV]  alpha=0.3, normalize=False, score=-0.6584734630851129, total=   0.0s


[Parallel(n_jobs=-1)]: Done   2 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.0s finished


In [13]:
pred_val = best_model.predict(X_val)
pred_val

array([2.14545455, 2.37575758, 2.60606061, 2.83636364])

In [14]:
print('Train RMSE %f' % rmse(y_train, best_model.predict(X_train)))
print('Val RMSE %f' % rmse(y_val, pred_val))

Train RMSE 0.264361
Val RMSE 0.293392


In [15]:
from sklearn.model_selection import RandomizedSearchCV

In contrast to GridSearchCV, not all parameter values are tried out, but rather a sample from them. I don't think this is useful though.

## Validation problems

- Too little data.
- Diverse or inconsistent data. Very similar samples with different targets.
    - In a time series, for example, a month can have more holydays than previous ones, so using previous months would be unfavorable.

#### How to do more thorough validation

- Increase K
- Tune the model on one set of k-fold splits and evaluate it on the other.