In [1]:
import pandas as pd
import time
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore")
import pprint
# Skopt functions
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, DeltaYStopper
from skopt.space import Real, Integer
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold, StratifiedKFold

In [2]:
train = pd.read_csv('train.csv',index_col=0)
test = pd.read_csv('test.csv',index_col=0)

In [3]:
# feature augmentation based on lgbm feature importance
gen_feats = [(1, 5),
             (0, 1),
             (2, 1),
             (1, 3),
             (2, 5),
             (0, 2),
             (0, 5),
             (4, 1),
             (5, 4),
             (3, 5),
             (0, 4)]
for i in range(6):
    for j in range(6):
        if (i,j) in gen_feats:
            train['gen_{}_{}'.format(i,j)] = train.iloc[:,i]/train.iloc[:,j]
            test['gen_{}_{}'.format(i,j)] = test.iloc[:,i]/test.iloc[:,j]

In [4]:
# get training features
features = [col for col in train.columns if col != 'label']
num_folds = 10
folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=5001)

In [5]:
num_folds = 10
folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=5001)
# folds = KFold(n_splits= num_folds, shuffle=True, random_state=5001)

In [6]:
# Reporting util for different optimizers
def report_perf(optimizer, X, y, title="model", callbacks=None):
    """
    A wrapper for measuring time and performances of different optmizers
    
    optimizer = a sklearn or a skopt optimizer
    X = the training set 
    y = our target
    title = a string label for the experiment
    """
    start = time.time()
    
    if callbacks is not None:
        optimizer.fit(X, y, callback=callbacks)
    else:
        optimizer.fit(X, y)
        
    d=pd.DataFrame(optimizer.cv_results_)
    best_score = optimizer.best_score_
    best_score_std = d.iloc[optimizer.best_index_].std_test_score
    best_params = optimizer.best_params_
    
    print((title + " took %.2f seconds,  candidates checked: %d, best CV score: %.3f "
           + u"\u00B1"+" %.3f") % (time.time() - start, 
                                   len(optimizer.cv_results_['params']),
                                   best_score,
                                   best_score_std))    
    print('Best parameters:')
    pprint.pprint(best_params)
    print()
    return best_params

In [7]:
scoring = make_scorer(roc_auc_score, greater_is_better=True)
cv_strategy = list(folds.split(train[features], train['label']))

params = {
    'boosting_type': 'gbdt', 
    'objective': 'binary', 
    'metric': 'auc',
    'device_type': 'cpu',
    'random_state': 5001,
    'verbose': -1,
    }

clf = lgb.LGBMClassifier(**params)

In [8]:
# search space
search_spaces = {
    'learning_rate': Real(0.001, 1.0, 'log-uniform'),
    'n_estimators': Integer(30, 500),
    'num_leaves': Integer(2, 100),
    'max_depth': Integer(-1, 10),
    'subsample': Real(0.2, 1.0, 'uniform'),
    'subsample_freq': Integer(1, 10),
    'colsample_bytree': Real(0.2, 1.0, 'uniform'),
    'reg_lambda': Real(1e-5, 10.0, 'log-uniform'),
    'reg_alpha': Real(1e-5, 10.0, 'log-uniform'),
    'min_child_samples': Integer(1, 15),
    'min_child_weight': Real(1e-5, 10.0, 'log-uniform'),
   }

# bayesian optimizer
opt = BayesSearchCV(estimator=clf,                                    
                    search_spaces=search_spaces,                      
                    scoring=scoring,                           
                    cv=cv_strategy,                                           
                    n_iter=100,
                    n_points=3,
                    n_jobs=-1,
                    iid=False,
                    return_train_score=False,                         
                    refit=False,                                      
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=5001)

In [9]:
# running the optimizer
overdone_control = DeltaYStopper(delta=0.0001)
time_limit_control = DeadlineStopper(total_time=60 * 60 * 0.5)

best_params = report_perf(opt, train[features], train['label'],'LightGBM_Classifier', 
                          callbacks=[overdone_control, time_limit_control])

LightGBM_Classifier took 13.80 seconds,  candidates checked: 21, best CV score: 0.933 ± 0.082
Best parameters:
OrderedDict([('colsample_bytree', 0.5656296878622898),
             ('learning_rate', 0.0266903682740756),
             ('max_depth', 0),
             ('min_child_samples', 2),
             ('min_child_weight', 0.06250203857075702),
             ('n_estimators', 130),
             ('num_leaves', 57),
             ('reg_alpha', 0.0465945527577266),
             ('reg_lambda', 3.8101081958287514e-05),
             ('subsample', 0.6042971686279393),
             ('subsample_freq', 6)])



In [10]:
# best params from search
params = {
    'boosting_type': 'gbdt', 
    'objective': 'binary', 
    'metric': 'auc',
    'device_type': 'cpu',
    'random_state': 5001,
    'verbose': -1,
    'learning_rate': 0.0266903682740756,
    'n_estimators': 130,
    'num_leaves': 57,
    'max_depth': -1,
    'subsample': 0.6042971686279393,
    'subsample_freq': 6,
    'colsample_bytree': 0.5656296878622898,
    'reg_lambda': 3.8101081958287514e-05,
    'reg_alpha': 0.0465945527577266,
    'min_child_samples': 2,
    'min_child_weight': 0.06250203857075702,
    }