<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#model-selection" data-toc-modified-id="model-selection-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>model selection</a></span><ul class="toc-item"><li><span><a href="#Finding-the-indices-for-KFold" data-toc-modified-id="Finding-the-indices-for-KFold-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Finding the indices for <code>KFold</code></a></span></li><li><span><a href="#Finding-indices-with-GroupKFold" data-toc-modified-id="Finding-indices-with-GroupKFold-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Finding indices with <code>GroupKFold</code></a></span></li></ul></li></ul></div>

# model selection


In [1]:
import sklearn
import numpy as np


## Finding the indices for `KFold`

The class `KFold` allows us to generate the train and validation indicies for performing training and validation indicies with different parts of our data.


In [2]:
from sklearn.model_selection import KFold
X = np.random.randn(100,4)
X.shape

(100, 4)

In [3]:
folds = KFold(10,shuffle=False)
splits = folds.split(X)
for tr_ind,va_ind in splits:
    print(va_ind)

[0 1 2 3 4 5 6 7 8 9]
[10 11 12 13 14 15 16 17 18 19]
[20 21 22 23 24 25 26 27 28 29]
[30 31 32 33 34 35 36 37 38 39]
[40 41 42 43 44 45 46 47 48 49]
[50 51 52 53 54 55 56 57 58 59]
[60 61 62 63 64 65 66 67 68 69]
[70 71 72 73 74 75 76 77 78 79]
[80 81 82 83 84 85 86 87 88 89]
[90 91 92 93 94 95 96 97 98 99]


If shuffle is true then we sample all rows from our dataset randomly

In [5]:

folds = KFold(10,shuffle=True)
splits = folds.split(X)
for tr_ind,va_ind in splits:
    print(va_ind)

[ 1  3 44 49 50 54 55 78 79 98]
[20 21 24 31 40 58 60 73 74 76]
[11 33 35 37 47 61 63 67 85 94]
[14 39 45 48 57 65 83 87 92 97]
[ 6 12 26 27 32 36 75 80 90 93]
[ 4 23 28 30 42 53 66 71 84 89]
[15 34 51 56 59 68 69 70 95 99]
[ 7  8 13 29 52 62 64 88 91 96]
[ 2  5 16 17 18 38 41 43 46 81]
[ 0  9 10 19 22 25 72 77 82 86]


# training models in crosvalidation using GridsearchCV

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn.datasets import *

In [21]:
dataset = sklearn.datasets.california_housing.fetch_california_housing()

In [29]:
X = dataset.data
y = dataset.target

X.shape, y.shape

((20640, 8), (20640,))

In [34]:
X_tr, X_te, y_tr, y_te = sklearn.model_selection.train_test_split(X,y, random_state=1234)

X_tr.shape, y_tr.shape, X_te.shape, y_te.shape

((15480, 8), (15480,), (5160, 8), (5160,))

In [45]:
from sklearn import ensemble
from sklearn.ensemble import *

rf = sklearn.ensemble.RandomForestRegressor()

space = {"max_depth":[5,10,None], "max_features":["auto",0.5]}

In [49]:
rf_grid = sklearn.model_selection.GridSearchCV(rf, param_grid = space)

In [51]:
rf_grid.fit(X_tr, y_tr)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [5, 10, None]

In [53]:
rf_grid.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=0.5, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [54]:
rf_grid.cv_results_

{'mean_fit_time': array([0.21097239, 0.11494096, 0.3792491 , 0.20135824, 0.58915599,
        0.33011969]),
 'std_fit_time': array([0.00138101, 0.00125028, 0.00246428, 0.00113853, 0.00445221,
        0.00324104]),
 'mean_score_time': array([0.00402459, 0.0036904 , 0.00643118, 0.00665863, 0.01355084,
        0.01404572]),
 'std_score_time': array([2.91338811e-04, 7.12618558e-05, 1.29067986e-04, 1.81564406e-04,
        8.53458622e-04, 1.53478986e-04]),
 'param_max_depth': masked_array(data=[5, 5, 10, 10, None, None],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_features': masked_array(data=['auto', 0.5, 'auto', 0.5, 'auto', 0.5],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 5, 'max_features': 'auto'},
  {'max_depth': 5, 'max_features': 0.5},
  {'max_depth': 10, 'max_features': 'auto'},
  {'max_depth': 10, 'max_fea


## Finding indices with `GroupKFold`

In [11]:
from sklearn.model_selection import GroupKFold

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 3, 4])
groups = np.array([0, 0, 2, 2])
group_kfold = GroupKFold(n_splits=2)
group_kfold.get_n_splits(X, y, groups)

2

In [14]:
print(group_kfold)

GroupKFold(n_splits=2)


In [17]:
for train_index, test_index in group_kfold.split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train, X_test, y_train, y_test)
    print("\n")

TRAIN: [0 1] TEST: [2 3]
[[1 2]
 [3 4]] [[5 6]
 [7 8]] [1 2] [3 4]


TRAIN: [2 3] TEST: [0 1]
[[5 6]
 [7 8]] [[1 2]
 [3 4]] [3 4] [1 2]


