In [1]:
import pandas as pd
import numpy as np
import warnings
import time
warnings.filterwarnings("ignore")
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score

In [2]:
# Load the training table. The CSV location can be overridden by setting the
# APPLICATION_TRAIN_CSV environment variable; when it is not set, the original
# hard-coded local path is used, so existing runs behave exactly as before.
import os
from sklearn.preprocessing import LabelEncoder
APPLICATION_TRAIN_CSV = os.environ.get(
    'APPLICATION_TRAIN_CSV',
    '/Users/fairy/Documents/data science/kaggle_summer18/code/input/application_train.csv',
)
application_train = pd.read_csv(APPLICATION_TRAIN_CSV)
def label_encoder(input_df, encoder_dict=None):
    """Label-encode every object-dtype column of a dataframe for LightGBM.

    Missing values in a categorical column are replaced by the string 'NULL'
    before encoding, so they receive their own integer code.

    Args:
        input_df: DataFrame to encode. It is modified in place (the encoded
            columns overwrite the originals) and also returned.
        encoder_dict: Optional mapping of column name -> fitted encoder. A
            column that already has an entry is transformed with that encoder
            (so a second dataframe can be encoded consistently with the
            first); any other categorical column gets a freshly fitted
            LabelEncoder, which is added to the mapping. When None, a new
            mapping is created.

    Returns:
        A tuple ``(input_df, categorical_feats, encoder_dict)`` where
        ``categorical_feats`` is the list of encoded column names and
        ``encoder_dict`` holds the encoder used for each of them.
    """
    # Previously the encoders were discarded and `encoder_dict` was returned
    # untouched (None by default), so the encoding could never be re-applied.
    if encoder_dict is None:
        encoder_dict = {}
    # Label encode categoricals
    categorical_feats = input_df.columns[input_df.dtypes == 'object']
    for feat in categorical_feats:
        values = input_df[feat].fillna('NULL')
        encoder = encoder_dict.get(feat)
        if encoder is None:
            encoder = LabelEncoder()
            encoder.fit(values)
            encoder_dict[feat] = encoder
        input_df[feat] = encoder.transform(values)
    return input_df, categorical_feats.tolist(), encoder_dict
# Encode the categorical columns, then separate the label column from the features.
application_train, categorical_feats, encoder_dict = label_encoder(input_df=application_train)
y = application_train['TARGET']
X = application_train.drop(columns=['TARGET'])

### Step 1: parameters to be tuned
**Note**: parameter values must be valid for LightGBM, e.g. 'num_leaves' needs to be an integer and 'feature_fraction' should be between 0 and 1.

In [3]:
def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
    """Objective function for the optimiser: best mean CV AUC for one parameter set.

    The eight arguments are the continuous values proposed by the optimiser.
    They are coerced into values LightGBM accepts (integers for `num_leaves`
    and `max_depth`, fractions clipped to [0, 1], non-negative L1/L2 terms)
    and evaluated with `lgb.cv`.

    NOTE(review): `train_data`, `n_folds` and `random_seed` are read from the
    enclosing (global) scope; no cell visible here defines them at module
    level, so confirm they exist before calling this function directly.

    Returns:
        The maximum of the 'auc-mean' series reported by `lgb.cv`.
    """
    params = {'application':'binary','num_iterations':4000, 'learning_rate':0.05, 'early_stopping_round':100, 'metric':'auc'}
    # int(...) guarantees a plain Python int even if round() hands back a
    # float-like value; this matches the nested objective inside
    # bayes_parameter_opt_lgb.
    params["num_leaves"] = int(round(num_leaves))
    params['feature_fraction'] = max(min(feature_fraction, 1), 0)
    params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
    params['max_depth'] = int(round(max_depth))
    params['lambda_l1'] = max(lambda_l1, 0)
    params['lambda_l2'] = max(lambda_l2, 0)
    params['min_split_gain'] = min_split_gain
    params['min_child_weight'] = min_child_weight
    cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=True, verbose_eval =200, metrics=['auc'])
    return max(cv_result['auc-mean'])

### Step 2: Set the range for each parameter
**Gentle reminder**: try to make the range as narrow as possible

In [4]:
# Build the optimiser around lgb_eval. Each entry is the (lower, upper) bound
# searched for the lgb_eval argument of the same name.
lgbBO = BayesianOptimization(
    lgb_eval,
    {
        'num_leaves': (35, 65),
        'feature_fraction': (0.1, 0.9),
        'bagging_fraction': (0.8, 1),
        'max_depth': (5, 8.99),
        'lambda_l1': (0, 5),
        'lambda_l2': (0, 3),
        'min_split_gain': (0.001, 0.1),
        'min_child_weight': (5, 50),
    },
    random_state=0,
)

### Step 3: Bayesian Optimization: Maximize

In [7]:
#lgbBO.maximize(init_points=init_round, n_iter=opt_round)

### Step 4: Get the parameters

In [8]:
# lgbBO.res['max']['max_params']

### Putting it all together
**Note**: It is just a demo. To get a better result, you should increase initial rounds, optimization rounds and n_estimators

In [5]:
# Separate the label column from the feature columns.
y = application_train['TARGET']
X = application_train.drop(columns=['TARGET'])
def bayes_parameter_opt_lgb(X, y, init_round=15, opt_round=25, n_folds=5, random_seed=6, n_estimators=10000, learning_rate=0.05, output_process=False):
    """Tune LightGBM hyper-parameters with Bayesian optimisation.

    Each candidate parameter set is scored by the best mean AUC that
    `lgb.cv` reports for it, and the optimiser maximises that score.

    Args:
        X: Feature data passed to `lgb.Dataset` as `data`.
        y: Labels passed to `lgb.Dataset` as `label`.
        init_round: Passed to `maximize` as `init_points`.
        opt_round: Passed to `maximize` as `n_iter`.
        n_folds: Number of folds used by `lgb.cv`.
        random_seed: Seed passed to `lgb.cv`. The optimiser itself is always
            created with `random_state=0`.
        n_estimators: Upper limit on boosting rounds ('num_iterations');
            early stopping after 100 rounds is also configured.
        learning_rate: LightGBM learning rate.
        output_process: When truthy, the optimiser's evaluated points are
            written to "bayes_opt_result.csv".

    Returns:
        dict: the best parameter set found, in the form that was actually
        given to LightGBM, i.e. `num_leaves` and `max_depth` are integers.
        Previously the raw optimiser values were returned, so those two came
        back as floats.

    NOTE(review): `categorical_feats` is read from the enclosing (global)
    scope and is not a parameter; it must be defined before calling.
    NOTE(review): LightGBM's parameter docs state that `bagging_fraction`
    only takes effect when `bagging_freq` is non-zero; `bagging_freq` is not
    set here, so the tuned `bagging_fraction` may have no influence. Confirm
    before relying on it.
    """
    # prepare data
    train_data = lgb.Dataset(data=X, label=y, categorical_feature = categorical_feats, free_raw_data=False)

    def _sanitize(raw_params):
        """Coerce raw optimiser values into values LightGBM accepts."""
        cleaned = dict(raw_params)
        cleaned['num_leaves'] = int(round(cleaned['num_leaves']))
        cleaned['feature_fraction'] = max(min(cleaned['feature_fraction'], 1), 0)
        cleaned['bagging_fraction'] = max(min(cleaned['bagging_fraction'], 1), 0)
        cleaned['max_depth'] = int(round(cleaned['max_depth']))
        cleaned['lambda_l1'] = max(cleaned['lambda_l1'], 0)
        cleaned['lambda_l2'] = max(cleaned['lambda_l2'], 0)
        return cleaned

    # parameters
    def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
        """Score one candidate: best mean CV AUC for the sanitised parameters."""
        params = {'application':'binary','num_iterations': n_estimators, 'learning_rate':learning_rate, 'early_stopping_round':100, 'metric':'auc'}
        params.update(_sanitize({
            'num_leaves': num_leaves,
            'feature_fraction': feature_fraction,
            'bagging_fraction': bagging_fraction,
            'max_depth': max_depth,
            'lambda_l1': lambda_l1,
            'lambda_l2': lambda_l2,
            'min_split_gain': min_split_gain,
            'min_child_weight': min_child_weight,
        }))
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=True, verbose_eval =200, metrics=['auc'])
        return max(cv_result['auc-mean'])

    # range
    lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (35, 65),
                                            'feature_fraction': (0.1, 0.9),
                                            'bagging_fraction': (0.8, 1),
                                            'max_depth': (5, 8.99),
                                            'lambda_l1': (0, 5),
                                            'lambda_l2': (0, 3),
                                            'min_split_gain': (0.001, 0.1),
                                            'min_child_weight': (5, 50)}, random_state=0)
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)

    # output optimization process
    if output_process:
        lgbBO.points_to_csv("bayes_opt_result.csv")

    # return best parameters, sanitised the same way the objective saw them
    # NOTE(review): `res['max']['max_params']` and `points_to_csv` are the
    # accessors this notebook was written against; confirm they exist in the
    # installed bayes_opt version.
    return _sanitize(lgbBO.res['max']['max_params'])

# Short demo run: 5 initialisation points followed by 10 optimisation iterations.
opt_params = bayes_parameter_opt_lgb(
    X,
    y,
    init_round=5,
    opt_round=10,
    n_folds=5,
    random_seed=0,
    n_estimators=10000,
    learning_rate=0.02,
)

[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   bagging_fraction |   feature_fraction |   lambda_l1 |   lambda_l2 |   max_depth |   min_child_weight |   min_split_gain |   num_leaves | 
[200]	cv_agg's auc: 0.750692 + 0.00285883
[400]	cv_agg's auc: 0.75686 + 0.00247826
[600]	cv_agg's auc: 0.758796 + 0.00219121
[800]	cv_agg's auc: 0.75983 + 0.00201034
[1000]	cv_agg's auc: 0.760414 + 0.00200279
[1200]	cv_agg's auc: 0.760633 + 0.00198488
    1 | 03m48s | [35m   0.76068[0m | [32m            0.9583[0m | [32m            0.6167[0m | [32m     4.8931[0m | [32m     1.9198[0m | [32m     5.3476[0m | [32m           32.7936[0m | [32m          0.0272[0m | [32m     51.4644[0m | 
[200]	cv_agg's auc: 0.751518 + 0.002753
[400]	cv_agg's auc: 0.757189 + 0.00235968
[600]	cv_agg's auc: 0.759099 + 0.00214105
[

In [2]:
# NOTE(review): the recorded output of this cell is a NameError, i.e. it was
# executed in a kernel where `opt_params` had not been assigned. Run the
# optimisation cell (which assigns `opt_params`) to completion first.
print(opt_params)

NameError: name 'opt_params' is not defined