<img src="../../img/mldlc2.png" width="900">

## KFold and StratifiedKFold train test split

In [3]:
from sklearn.model_selection import KFold
import numpy as np

X = np.array([[1, 2], [3, 4], [4, 5], [4, 4], [5, 6], [6, 7], [8, 9], [9, 10], [11, 12]])
Y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])

# 3 Fold split5
kf = KFold(n_splits=3, shuffle=True, random_state=101)
for train_indices, test_indices in kf.split(X, Y):
   print("train indices:", train_indices, "test indices:", test_indices)
   X_train, X_test = X[train_indices], X[test_indices]
   Y_train, Y_test = Y[train_indices], Y[test_indices]

train indices: [1 3 4 5 6 8] test indices: [0 2 7]
train indices: [0 1 2 5 6 7] test indices: [3 4 8]
train indices: [0 2 3 4 7 8] test indices: [1 5 6]


In [4]:
#StratifiedKFold distributes the target labels within fold in same ratio in which they appear in main dataset

from sklearn.model_selection import StratifiedKFold

# TBD: Use StratifiedKFold to split X and Y into 3 fold train test set and verify that all the three target labels (0, 1, 2) are present in each fold

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=101)
print("Where printout is Y label of either [0 1 2]")
for train_indices, test_indices in skf.split(X, Y):
   print("train indices:", np.bincount(Y[train_indices]), "test indices:", np.bincount(Y[test_indices]))
   X_train, X_test = X[train_indices], X[test_indices]
   Y_train, Y_test = Y[train_indices], Y[test_indices]

Where printout is Y label of either [0 1 2]
train indices: [2 2 2] test indices: [1 1 1]
train indices: [2 2 2] test indices: [1 1 1]
train indices: [2 2 2] test indices: [1 1 1]


## GridSearch CV (Search for a best set of hyperparams for a given model)

In [5]:
# Load Boston housing dataset

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

X, Y = load_boston(return_X_y=True)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=101)

model = KNeighborsRegressor(n_neighbors=3)
model.fit(X_train, Y_train)
Y_hat = model.predict(X_test)
print(mean_squared_error(Y_test, Y_hat))

51.9764391951006


In [6]:
import numpy as np
from sklearn.model_selection import GridSearchCV

def grid_summary(grid_result):
    print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print(f"{mean},{stdev} with: {param}")  

## TBD Using GridSearchCV check which value of param n_neighbors [2,3,4,5,6,7,8] gives the best results

parameters = {'n_neighbors':[2,3,4,5,6,7,8]}
gs = GridSearchCV(KNeighborsRegressor(), parameters)
gs.fit(X, Y)
grid_summary(gs)

model = KNeighborsRegressor(n_neighbors=8)
model.fit(X_train, Y_train)
Y_hat = model.predict(X_test)
print(mean_squared_error(Y_test, Y_hat))

Best: -0.2893603144554894 using {'n_neighbors': 8}
-0.472560526236681,0.6315156892215512 with: {'n_neighbors': 2}
-0.3805458677224257,0.5212934092505351 with: {'n_neighbors': 3}
-0.320685224628445,0.4690431891197771 with: {'n_neighbors': 4}
-0.31501646812514134,0.4401304871369981 with: {'n_neighbors': 5}
-0.32567751357459207,0.42432013741134633 with: {'n_neighbors': 6}
-0.30533542609085224,0.411918977909927 with: {'n_neighbors': 7}
-0.2893603144554894,0.41702218187230644 with: {'n_neighbors': 8}
53.535024606299196


In [7]:
## TBD Using GridSearchCV check which value of param combination n_neighbors [2,3,4,5,6,7,8], p [1, 2] gives the best result

parameters = {'n_neighbors':[2,3,4,5,6,7,8], 'p':[1,2]}
gs2 = GridSearchCV(KNeighborsRegressor(), parameters)
gs2.fit(X, Y)
grid_summary(gs2)

model = KNeighborsRegressor(n_neighbors=4, p=1)
model.fit(X_train, Y_train)
Y_hat = model.predict(X_test)
print(mean_squared_error(Y_test, Y_hat))

Best: -0.18954795135784916 using {'n_neighbors': 4, 'p': 1}
-0.22813976586145773,0.37412693921633206 with: {'n_neighbors': 2, 'p': 1}
-0.472560526236681,0.6315156892215512 with: {'n_neighbors': 2, 'p': 2}
-0.19870846489933208,0.27283333973737306 with: {'n_neighbors': 3, 'p': 1}
-0.3805458677224257,0.5212934092505351 with: {'n_neighbors': 3, 'p': 2}
-0.18954795135784916,0.2697094749115022 with: {'n_neighbors': 4, 'p': 1}
-0.320685224628445,0.4690431891197771 with: {'n_neighbors': 4, 'p': 2}
-0.21680588831552852,0.27043478274620825 with: {'n_neighbors': 5, 'p': 1}
-0.31501646812514134,0.4401304871369981 with: {'n_neighbors': 5, 'p': 2}
-0.23357953948492266,0.27525142963576954 with: {'n_neighbors': 6, 'p': 1}
-0.32567751357459207,0.42432013741134633 with: {'n_neighbors': 6, 'p': 2}
-0.21520341623298894,0.2713440687332639 with: {'n_neighbors': 7, 'p': 1}
-0.30533542609085224,0.411918977909927 with: {'n_neighbors': 7, 'p': 2}
-0.21537623850471013,0.2718811820617129 with: {'n_neighbors': 8, 

## Gridsearch across different algorithms

## TBD Create a ML pipeline that selects the best model-param combination among given set of models and params
* LinearRegression, No params
* KNeighborsRegressor, params: {n_neighbors : [4,5,6], p: [1,2]}
* XGBoost, params : {}


In [26]:
X, Y = load_boston(return_X_y=True)

# [(model_1, param_1), (model_2, param_2), ...]
def model_param_optimiser(models):
    searches = []
    top = 0
    for model in models:
        gs = GridSearchCV(model[0], model[1], scoring='neg_mean_squared_error')
        gs.fit(X_train, Y_train)
        searches.append(gs)
    for i in range(len(searches)):
        if (abs(searches[i].best_score_) < abs(searches[top].best_score_)):
            top = i
    print("Top Score:" + str(searches[top].best_score_))
    print("Model:" + str(models[top][0]))
    print("Params:" + str(searches[top].best_params_))

In [27]:
from sklearn.ensemble import GradientBoostingRegressor

no_param = {}
parameters_knn = {'n_neighbors':[4,5,6], 'p':[1,2]}
parameters_xgboost = {'n_estimators': [50,100,200], 'max_depth': [2,3,4,5,6,7], 'subsample': [0.9, 1.0, 1.1] }
models = [(LinearRegression(), no_param), (KNeighborsRegressor(), parameters_knn), (GradientBoostingRegressor(), parameters_xgboost)]

model_param_optimiser(models)

Top Score:-9.342108246852757
Model:GradientBoostingRegressor()
Params:{'max_depth': 3, 'n_estimators': 200, 'subsample': 0.9}


In [28]:
from sklearn.pipeline import  Pipeline
pipe  = Pipeline([('model', LinearRegression())])
param_grid = [
              {'model' : [LinearRegression()]},
              {'model' : [KNeighborsRegressor()], 'model__n_neighbors': [2,3,4], 'model__p': [1,2]},
              {'model' : [GradientBoostingRegressor()], 'model__n_estimators': [50,100,200], 'model__max_depth': [2,3,4,5,6,7] }
            ]

grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3, scoring = 'neg_mean_squared_error')

res = grid.fit(X_train, Y_train)
grid_summary(res)

Best: -11.147465290168133 using {'model': GradientBoostingRegressor(n_estimators=200), 'model__max_depth': 3, 'model__n_estimators': 200}
-23.27512074085658,5.893867833178617 with: {'model': LinearRegression()}
-37.98679633795775,8.188123345748794 with: {'model': KNeighborsRegressor(), 'model__n_neighbors': 2, 'model__p': 1}
-43.791816231304416,6.737895664076707 with: {'model': KNeighborsRegressor(), 'model__n_neighbors': 2, 'model__p': 2}
-34.8176881362052,8.340147227383387 with: {'model': KNeighborsRegressor(), 'model__n_neighbors': 3, 'model__p': 1}
-40.206412369750076,9.996662292923505 with: {'model': KNeighborsRegressor(), 'model__n_neighbors': 3, 'model__p': 2}
-35.03843062585927,9.585274537229507 with: {'model': KNeighborsRegressor(), 'model__n_neighbors': 4, 'model__p': 1}
-39.84197515675124,11.012985445863649 with: {'model': KNeighborsRegressor(), 'model__n_neighbors': 4, 'model__p': 2}
-16.466610086916226,2.8767255760957626 with: {'model': GradientBoostingRegressor(n_estimato