<div class="alert alert-block alert-info">
    <h1 align="center">SKLearn Time Series Cross Validation & Grid Search</h1>
    <h3 align="center"> Machine Learning Course</h3>
    <h5 align="center">Alireza Aghamohammadi (http://linkedin.com/in/aaghamohammadi)</h5>
    <h5 align="center">Mohammad Talaei (https://www.linkedin.com/in/mohammad-talaei-9a4742150/)</h5>
</div>

#### Load Dependencies

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import train_test_split

#### Set Configuration

In [2]:
plt.style.use('seaborn')

#### Set Hyperparameters

In [58]:
PATH_DATA = "../data/"

#### Load Data

In [4]:
house = pd.read_csv(PATH_DATA + "boston_house_prices.csv", header=1)
house.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [5]:
x = house.drop(house.MEDV.name, axis=1)
y = house.MEDV

#### Creating time series folds

The conventional form of cross validation (KFold) is shown below

<img src="../img/Kfold.png"/>

But when it comes to time series, the test set should always be more recent than train set
So there are two methods for creating time series folds:

##### Time series split

<img src="../img/TS.png"/>

In [6]:
def TS_crossval(X,y,k=5):
    wsize = X.shape[0]//(k+1)
    train_lb = 0
    train_ub = train_lb + wsize
    X_train={}; y_train={}
    X_test={}; y_test={}
    for i in range(1,k):
        #train_ub = train_lb + wsize
        X_train[i] = X[train_lb:train_ub]; y_train[i] = y[train_lb:train_ub]
        X_test[i] = X[train_ub:train_ub + wsize]; y_test[i] = y[train_ub:train_ub + wsize]
        train_ub += wsize
    
    X_train[i+1] = X[train_lb:train_ub]; y_train[i+1] = y[train_lb:train_ub]
    X_test[i+1] = X[train_ub:]; y_test[i+1] = y[train_ub:]
    return X_train,X_test,y_train,y_test

##### Blocking time series split

<img src="../img/TS_block.png"/>

In [7]:
def TS_crossval_block(X,y,k=5):
    wsize = X.shape[0]//(k+1)
    train_lb = 0
    train_ub = train_lb + wsize
    X_train={}; y_train={}
    X_test={}; y_test={}
    for i in range(1,k):
        train_ub = train_lb + wsize
        X_train[i] = X[train_lb:train_ub]; y_train[i] = y[train_lb:train_ub]
        X_test[i] = X[train_ub:train_ub + wsize]; y_test[i] = y[train_ub:train_ub + wsize]
        train_lb += wsize
    
    train_ub = train_lb + wsize
    X_train[i+1] = X[train_lb:train_ub]; y_train[i+1] = y[train_lb:train_ub]
    X_test[i+1] = X[train_ub:]; y_test[i+1] = y[train_ub:]
    return X_train,X_test,y_train,y_test

In [62]:
folds = 5
X_train,X_test,y_train,y_test = TS_crossval(x,y,folds)

In [63]:
X_train[1]

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,0.08387,0.0,12.83,0,0.437,5.874,36.6,4.5026,5,398,18.7,396.06,9.10
80,0.04113,25.0,4.86,0,0.426,6.727,33.5,5.4007,4,281,19.0,396.90,5.29
81,0.04462,25.0,4.86,0,0.426,6.619,70.4,5.4007,4,281,19.0,395.63,7.22
82,0.03659,25.0,4.86,0,0.426,6.302,32.2,5.4007,4,281,19.0,396.90,6.72


In [64]:
X_test[1]

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
84,0.05059,0.0,4.49,0,0.449,6.389,48.0,4.7794,3,247,18.5,396.90,9.62
85,0.05735,0.0,4.49,0,0.449,6.630,56.1,4.4377,3,247,18.5,392.30,6.53
86,0.05188,0.0,4.49,0,0.449,6.015,45.1,4.4272,3,247,18.5,395.99,12.86
87,0.07151,0.0,4.49,0,0.449,6.121,56.8,3.7476,3,247,18.5,395.15,8.44
88,0.05660,0.0,3.41,0,0.489,7.007,86.3,3.4217,2,270,17.8,396.90,5.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,1.51902,0.0,19.58,1,0.605,8.375,93.9,2.1620,5,403,14.7,388.45,3.32
164,2.24236,0.0,19.58,0,0.605,5.854,91.8,2.4220,5,403,14.7,395.11,11.64
165,2.92400,0.0,19.58,0,0.605,6.101,93.0,2.2834,5,403,14.7,240.16,9.81
166,2.01019,0.0,19.58,0,0.605,7.929,96.2,2.0459,5,403,14.7,369.30,3.70


#### Grid Search for Time Series

In [70]:
param_grid = {"max_features": [0.8,0.9,1], "max_depth":[3,5,7,12,None]}

In [71]:
rf = RandomForestRegressor(n_estimators=200,random_state=7)

In [72]:
grid_search = GridSearchCV(rf, param_grid=param_grid, 
                           cv=[(list(X_train[i].index),list(X_test[i].index)) for i in range(1,folds+1)])

In [73]:
grid_search.fit(x,y);

In [75]:
Grid_cv=pd.DataFrame(grid_search.cv_results_)
Grid_cv

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.285607,0.03205,0.017594,0.000495,3.0,0.8,"{'max_depth': 3, 'max_features': 0.8}",0.615696,0.636308,0.896278,0.194605,0.443036,0.557185,0.232018,10
1,0.293603,0.018516,0.018401,0.001023,3.0,0.9,"{'max_depth': 3, 'max_features': 0.9}",0.62882,0.618112,0.900832,0.19449,0.469924,0.562436,0.230635,9
2,0.254997,0.017838,0.018403,0.001019,3.0,1.0,"{'max_depth': 3, 'max_features': 1}",0.216464,0.155877,0.374979,-0.011712,0.165589,0.180239,0.124008,15
3,0.312798,0.032194,0.018405,0.001744,5.0,0.8,"{'max_depth': 5, 'max_features': 0.8}",0.608542,0.63643,0.900272,0.303213,0.418777,0.573447,0.204432,4
4,0.325597,0.049837,0.018403,0.000802,5.0,0.9,"{'max_depth': 5, 'max_features': 0.9}",0.623749,0.629895,0.901209,0.292835,0.395565,0.568651,0.211383,6
5,0.247799,0.008638,0.019001,0.001789,5.0,1.0,"{'max_depth': 5, 'max_features': 1}",0.226321,0.298902,0.536987,0.122644,0.289854,0.294942,0.136398,14
6,0.343006,0.049693,0.018394,0.000794,7.0,0.8,"{'max_depth': 7, 'max_features': 0.8}",0.605877,0.645998,0.899344,0.299346,0.379855,0.566084,0.212067,8
7,0.357415,0.051829,0.018789,0.001162,7.0,0.9,"{'max_depth': 7, 'max_features': 0.9}",0.61642,0.633769,0.897568,0.31512,0.407754,0.574126,0.202349,3
8,0.258397,0.017606,0.018599,0.000792,7.0,1.0,"{'max_depth': 7, 'max_features': 1}",0.249128,0.382445,0.603266,0.152483,0.328229,0.34311,0.151408,12
9,0.391203,0.082366,0.018996,0.00126,12.0,0.8,"{'max_depth': 12, 'max_features': 0.8}",0.607798,0.638381,0.893856,0.302734,0.389488,0.566451,0.207336,7


In [74]:
print(f"Best parameters: {grid_search.best_params_}")

Best parameters: {'max_depth': 12, 'max_features': 0.9}


#### *What we learned*

* We learned how to do cross validation for time series and how to use it for grid search 