# Custom Fold for Cross-Validation

In [None]:
def custom_cv_2folds(X):
    n = X.shape[0]
    i = 1
    while i<=2:
        idx = np.arange(n * (i - 1) / 2, n * i / 2, dtype=int)
        yield idx, idx
        i += 1
custom_cv = custom_cv_2folds(X)
cross_val_score(clf, X, y, cv=custom_cv)

# Grid Search + Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA

In [None]:
pipe=Pipeline([
          ('reduce_dim', PCA()), 
          ('clf', SVC())     
    ])

Notice the use of __ for grid search using a pipeline:

In [None]:
param_grid = {
    'reduce_dim__n_components':[2, 5, 10],
    'clf__C':[0.1, 10, 100]
}

In [None]:
grid_search = GridSearchCV(pipe, param_grid=param_grid)

Even more parameter options:

In [None]:
param_grid = dict(reduce_dim=['passthrough', PCA(5), PCA(10)],
                   clf=[SVC(), LogisticRegression()],
                   clf__C=[0.1, 10, 100])

# Hyperparameter Optimization (Sequential)

Loading Preprocessed Data (check the house prices notebook):

In [13]:
train = pd.read_csv('./data/houses_preprocessed_train.csv').iloc[:, 1:]
y = pd.read_csv('./data/houses_log_y.csv')['SalePrice']

## XGBoost

In [37]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold

In [None]:
def find_nrounds_xgb(model, X, y, metrics='rmse',
                     cv_folds=5, early_stopping_rounds=50):
    
    xgtrain = xgb.DMatrix(X, y)
    
    params = model.get_xgb_params()
    
    cvresult = xgb.cv(params, xgtrain, metrics=metrics, 
                      num_boost_round=params['n_estimators'],
                      early_stopping_rounds=early_stopping_rounds)
    
    # Setting optimal number of estimators
    n_rounds_optimal = cvresult.shape[0]
    model.set_params(n_estimators=n_rounds_optimal)
    
    print(cvresult.iloc[-1, :])
    print(f"n_estimators:{n_rounds_optimal}")
    
    return model

In [None]:
def grid_search(estimator, X, y, params, scoring, cv=4, random=True,
                n_iter=150, n_jobs=6):
    
    if random:
        random_search = RandomizedSearchCV(estimator, 
                                           param_distributions=params,
                                           n_iter=n_iter, n_jobs=n_jobs, 
                                           cv=cv, scoring=scoring,
                                           verbose=3, random_state=340)
    
    else:
        random_search = GridSearchCV(estimator, param_grid=params, 
                                       n_jobs=n_jobs, cv=cv,
                                       scoring='neg_mean_absolute_error',
                                       verbose=3)

    random_search.fit(X, y)
    
    return random_search

Very similar to the procedure used below for Lightgbm. Check the house prediction notebook (on tabular-data-techniques) for more details. The order of hyperparameter optimization used was the following:  
- best number of estimators for a baseline model
- max_depth and min_child_weight  
- gamma  
- subsample and colsample_bytree
- adjust number of estimator and learning rate (usually multiplying by 10 and dividing by 10 respectively).

## LightGBM

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, KFold

import lightgbm as lgb

In [32]:
def find_nrounds_lgb(model, X, y, metrics='rmse',
                     cv_folds=5, early_stopping_rounds=50):
    
    lgbtrain = lgb.Dataset(X, y)
    
    params = model.get_params()
    
    # ATTENTION: we need to set stratified to False for regression
    cvresult = lgb.cv(params, lgbtrain, metrics=metrics, 
                      num_boost_round=params['n_estimators'],
                      early_stopping_rounds=early_stopping_rounds,
                      stratified=False)
    
    # Setting optimal number of estimators
    key = list(cvresult.keys())[0]
    n_rounds_optimal = len(cvresult[key])
    model.set_params(n_estimators=n_rounds_optimal)
    
    print(cvresult[key][-1])
    print(f"n_estimators:{n_rounds_optimal}")
    
    return model

In [42]:
def grid_search(estimator, X, y, params, scoring, cv=4, random=True,
                n_iter=150, n_jobs=6):
    
    if random:
        random_search = RandomizedSearchCV(estimator, 
                                           param_distributions=params,
                                           n_iter=n_iter, n_jobs=n_jobs, 
                                           cv=cv, scoring=scoring,
                                           verbose=3, random_state=340)
    
    else:
        random_search = GridSearchCV(estimator, param_grid=params, 
                                       n_jobs=n_jobs, cv=cv,
                                       scoring='neg_mean_absolute_error',
                                       verbose=3)

    random_search.fit(X, y)
    
    return random_search

LightGBM uses leaf-wise tree growth algorithm. For this reason, limiting the number of leaves instead of max_depth might be more appropriate. Like we did when tuning XGBoost, we will start by finding the number of estimators using cross validation and early stopping.

In [37]:
model = lgb.LGBMRegressor(learning_rate=0.1, n_estimators=1000,
                          num_leaves=31, min_child_weight=1)

In [38]:
model = find_nrounds_lgb(model, train, y)

0.12324168104250295
n_estimators:74


First, we find the **number of leaves**. The dataset is quite small, so we will start searching on low values. Let's vary **maximum depth** as well and see if that helps:

In [56]:
param_test = {
 'max_depth':range(3,10,2),
 'num_leaves':range(1,41,2)
}

results = grid_search(model, train, y, param_test,
                      scoring='neg_mean_squared_error', 
                      random=False)

Fitting 4 folds for each of 80 candidates, totalling 320 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    3.1s
[Parallel(n_jobs=6)]: Done 116 tasks      | elapsed:    7.2s
[Parallel(n_jobs=6)]: Done 276 tasks      | elapsed:   17.6s
[Parallel(n_jobs=6)]: Done 320 out of 320 | elapsed:   22.3s finished


In [58]:
print(f"MSE: {-results.best_score_}")
print(results.best_params_)

MSE: 0.08009788273854973
{'max_depth': 9, 'num_leaves': 13}


In [59]:
model.set_params(**results.best_params_);

Instead of min_child_weight like in XGBoost, let's tune **min_data_in_leaf**:

In [62]:
param_test = {
 'min_data_in_leaf':range(5,100,5)
}

results = grid_search(model, train, y, param_test,
                      scoring='neg_mean_squared_error', 
                      random=False)

Fitting 4 folds for each of 19 candidates, totalling 76 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    1.4s
[Parallel(n_jobs=6)]: Done  76 out of  76 | elapsed:    4.2s finished


In [63]:
print(f"MSE: {-results.best_score_}")
print(results.best_params_)

MSE: 0.08009788273854973
{'min_data_in_leaf': 20}


In [64]:
model.set_params(**results.best_params_);

Unlike XGBoost, we don't have a **gamma** parameter on LightGBM. The similar parameter here is **min_split_gain**:

In [65]:
param_test = {
 'min_split_gain':[i/10.0 for i in range(0,51)]
}

results = grid_search(model, train, y, param_test,
                      scoring='neg_mean_squared_error', 
                      random=False)

Fitting 4 folds for each of 51 candidates, totalling 204 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.8s
[Parallel(n_jobs=6)]: Done 116 tasks      | elapsed:    4.1s
[Parallel(n_jobs=6)]: Done 204 out of 204 | elapsed:    7.1s finished


In [66]:
print(f"MSE: {-results.best_score_}")
print(results.best_params_)

MSE: 0.08009788273854973
{'min_split_gain': 0.0}


In [67]:
model.set_params(**results.best_params_);

In [68]:
model

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=9,
              min_child_samples=20, min_child_weight=1, min_data_in_leaf=20,
              min_split_gain=0.0, n_estimators=74, n_jobs=-1, num_leaves=13,
              objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
              silent=True, subsample=1.0, subsample_for_bin=200000,
              subsample_freq=0)

Finally, let's **colsample_bytree** and **subsample**:

In [74]:
param_test = {
 'subsample':[i/10.0 for i in range(5,10)],
 'colsample_bytree':[i/10.0 for i in range(1,10)]
}

results = grid_search(model, train, y, param_test,
                      scoring='neg_mean_squared_error', 
                      random=False)

Fitting 4 folds for each of 45 candidates, totalling 180 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    0.7s
[Parallel(n_jobs=6)]: Done 116 tasks      | elapsed:    4.7s
[Parallel(n_jobs=6)]: Done 180 out of 180 | elapsed:    8.1s finished


In [75]:
print(f"MSE: {-results.best_score_}")
print(results.best_params_)

MSE: 0.07821223535152377
{'colsample_bytree': 0.3, 'subsample': 0.5}


In [76]:
model.set_params(**results.best_params_);

Reducing the learning_rate and increasing the number of estimators:

In [81]:
model.set_params(**{'learning_rate':0.01, 'n_estimators':720})

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.3,
              importance_type='split', learning_rate=0.01, max_depth=9,
              min_child_samples=20, min_child_weight=1, min_data_in_leaf=20,
              min_split_gain=0.0, n_estimators=720, n_jobs=-1, num_leaves=13,
              objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
              silent=True, subsample=0.5, subsample_for_bin=200000,
              subsample_freq=0)

Now evaluating the final performance:

In [82]:
kfolds = KFold(n_splits=10)
np.mean(np.sqrt(-cross_val_score(model, train, y, 
                                 scoring="neg_mean_squared_error",
                                 cv=kfolds)))

0.11346970518757978

## CatBoost

https://www.coursera.org/learn/competitive-data-science/lecture/75oIn/catboost-1

# Hyperparameter Optimization (Random)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, KFold

import lightgbm as lgb

In [None]:
def find_nrounds_lgb(model, X, y, metrics='rmse',
                     cv_folds=5, early_stopping_rounds=50):
    
    lgbtrain = lgb.Dataset(X, y)
    
    params = model.get_params()
    
    # ATTENTION: we need to set stratified to False for regression
    cvresult = lgb.cv(params, lgbtrain, metrics=metrics, 
                      num_boost_round=params['n_estimators'],
                      early_stopping_rounds=early_stopping_rounds,
                      stratified=False)
    
    # Setting optimal number of estimators
    key = list(cvresult.keys())[0]
    n_rounds_optimal = len(cvresult[key])
    model.set_params(n_estimators=n_rounds_optimal)
    
    print(cvresult[key][-1])
    print(f"n_estimators:{n_rounds_optimal}")
    
    return model

In [None]:
def grid_search(estimator, X, y, params, scoring, cv=4, random=True,
                n_iter=150, n_jobs=6):
    
    if random:
        random_search = RandomizedSearchCV(estimator, 
                                           param_distributions=params,
                                           n_iter=n_iter, n_jobs=n_jobs, 
                                           cv=cv, scoring=scoring,
                                           verbose=3, random_state=340)
    
    else:
        random_search = GridSearchCV(estimator, param_grid=params, 
                                       n_jobs=n_jobs, cv=cv,
                                       scoring='neg_mean_absolute_error',
                                       verbose=3)

    random_search.fit(X, y)
    
    return random_search

Loading Preprocessed Data (check the house prices notebook):

In [16]:
train = pd.read_csv('./data/houses_preprocessed_train.csv').iloc[:, 1:]
y = pd.read_csv('./data/houses_log_y.csv')['SalePrice']

## LightGBM

Defining the base model and finding the number of estimators:

In [17]:
model = lgb.LGBMRegressor(learning_rate=0.1, n_estimators=1000,
                          num_leaves=31, min_child_weight=1)

In [18]:
model = find_nrounds_lgb(model, train, y)

Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


0.12324168104250295
n_estimators:74


Parameter Grid for Random Search:

In [35]:
param_test = {
 'num_leaves':range(2,42,2),
 'subsample': np.linspace(0.5, 1, 100),
 'colsample_bytree': np.linspace(0.5, 1, 100),
 'min_data_in_leaf':range(4,52,2),
 'min_split_gain':np.linspace(0.0, 0.5, 100)
}

results = grid_search(model, train, y, param_test,
                      scoring='neg_mean_squared_error', 
                      random=True, n_iter=500)

Fitting 4 folds for each of 500 candidates, totalling 2000 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    2.8s
[Parallel(n_jobs=6)]: Done 116 tasks      | elapsed:   17.9s
[Parallel(n_jobs=6)]: Done 276 tasks      | elapsed:   46.0s
[Parallel(n_jobs=6)]: Done 500 tasks      | elapsed:  1.4min
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  2.3min
[Parallel(n_jobs=6)]: Done 1140 tasks      | elapsed:  3.4min
[Parallel(n_jobs=6)]: Done 1556 tasks      | elapsed:  4.9min
[Parallel(n_jobs=6)]: Done 2000 out of 2000 | elapsed:  6.4min finished


In [36]:
print(f"MSE: {-results.best_score_}")
print(results.best_params_)

MSE: 0.013705892792869795
{'subsample': 0.6212121212121212, 'num_leaves': 34, 'min_split_gain': 0.005050505050505051, 'min_data_in_leaf': 34, 'colsample_bytree': 0.5050505050505051}


Using best hyperparameters, reducing learning rate and increasing n_estimators proportionally:

In [37]:
model.set_params(**results.best_params_);

In [38]:
model.set_params(**{'learning_rate':0.01, 'n_estimators':740})

LGBMRegressor(boosting_type='gbdt', class_weight=None,
              colsample_bytree=0.5050505050505051, importance_type='split',
              learning_rate=0.01, max_depth=-1, min_child_samples=20,
              min_child_weight=1, min_data_in_leaf=34,
              min_split_gain=0.005050505050505051, n_estimators=740, n_jobs=-1,
              num_leaves=34, objective=None, random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=0.6212121212121212,
              subsample_for_bin=200000, subsample_freq=0)

Final model performance (very close, but slightly worse than the sequential approach):

In [39]:
kfolds = KFold(n_splits=10)
np.mean(np.sqrt(-cross_val_score(model, train, y, 
                                 scoring="neg_mean_squared_error",
                                 cv=kfolds)))

0.11596549007755608

# Hyperparameter Optimization (Bayesian)

https://roamanalytics.com/2016/09/15/optimizing-the-hyperparameter-of-which-hyperparameter-optimizer-to-use/  

https://papers.nips.cc/paper/4522-practical-bayesian-optimization-of-machine-learning-algorithms.pdf   

https://github.com/lmassaron/kaggledays-2019-gbdt   

https://www.quora.com/What-methods-do-you-prefer-when-performing-hyperparameter-optimization

https://github.com/zygmuntz/hyperband