In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold,cross_val_score
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings('ignore')
from bayes_opt import BayesianOptimization
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import gc
gc.enable()

In [2]:
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    print (categorical_columns)
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

print('Read data and test')
data = pd.read_csv('./input/application_train.csv')
test = pd.read_csv('./input/application_test.csv')
print('Shapes : ', data.shape, test.shape)

y = data['TARGET']
del data['TARGET']
gc.collect()
all_data = pd.concat((data, test)).reset_index(drop=True)

inc_by_org = all_data[['AMT_INCOME_TOTAL', 'ORGANIZATION_TYPE']].groupby('ORGANIZATION_TYPE').median()['AMT_INCOME_TOTAL']
all_data['NEW_INC_BY_ORG'] = all_data['ORGANIZATION_TYPE'].map(inc_by_org)

for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
    all_data[bin_feature], uniques = pd.factorize(all_data[bin_feature])
all_data, cat_cols = one_hot_encoder(all_data)
print all_data.shape
# Some simple new features (percentages)
#all_data['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
n_data = data.shape[0]

all_data['DAYS_EMPLOYED_PERC'] = all_data['DAYS_EMPLOYED'] / all_data['DAYS_BIRTH']
all_data['INCOME_CREDIT_PERC'] = all_data['AMT_INCOME_TOTAL'] / all_data['AMT_CREDIT']
all_data['INCOME_PER_PERSON'] = all_data['AMT_INCOME_TOTAL'] / all_data['CNT_FAM_MEMBERS']
all_data['ANNUITY_INCOME_PERC'] = all_data['AMT_ANNUITY'] / all_data['AMT_INCOME_TOTAL']
all_data['PAYMENT_RATE'] = all_data['AMT_CREDIT'] / all_data['AMT_ANNUITY']

print all_data.shape
del data,test
gc.collect()

bureau = pd.read_csv('./input/bureau.csv')
buro_bal = pd.read_csv('./input/bureau_balance.csv')
buro_bal, bb_cat = one_hot_encoder(buro_bal)
bureau, bureau_cat = one_hot_encoder(bureau)
bb_aggregations = {'MONTHS_BALANCE': ['min', 'max','mean']}
for col in bb_cat:
    bb_aggregations[col] = ['mean']
    bb_agg = buro_bal.groupby('SK_ID_BUREAU').agg(bb_aggregations)
bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
bb_agg['BB_COUNT'] = buro_bal.groupby('SK_ID_BUREAU').size()
print bb_agg.shape
bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
bureau.drop(columns= 'SK_ID_BUREAU', inplace= True)
del buro_bal,bb_agg
gc.collect()

num_aggregations = {
    'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
    'CREDIT_DAY_OVERDUE': ['max', 'mean'],
    'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
    'AMT_CREDIT_MAX_OVERDUE': ['mean'],
    'CNT_CREDIT_PROLONG': ['sum','mean'],
    'AMT_CREDIT_SUM': ['max','mean', 'sum'],
    'AMT_CREDIT_SUM_DEBT': ['max','mean', 'sum'],
    'AMT_CREDIT_SUM_OVERDUE': ['mean'],
    'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
    'DAYS_CREDIT_UPDATE': ['min','max','mean'],
    'AMT_ANNUITY': ['max', 'mean'],
    'MONTHS_BALANCE_MIN': ['min'],
    'MONTHS_BALANCE_MAX': ['max'],
    'MONTHS_BALANCE_MEAN': ['mean'],
    'BB_COUNT': ['mean', 'sum']
}
cat_aggregations = {}
for cat in bureau_cat: cat_aggregations[cat] = ['mean']
for cat in bb_cat: cat_aggregations[cat + "_MEAN"] = ['mean']
bureau_agg = bureau.groupby('SK_ID_CURR').agg(dict(num_aggregations, **cat_aggregations))
bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])
bureau_agg['BURO_COUNT'] = bureau.groupby('SK_ID_CURR').size()
print bureau_agg.shape

active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
active_agg.columns = pd.Index(['ACT_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
active_agg['ACT_COUNT'] = active.groupby('SK_ID_CURR').size()
bureau_agg = bureau_agg.reset_index().join(active_agg, how='left', on='SK_ID_CURR')
print bureau_agg.shape
del active, active_agg
gc.collect()

closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
closed_agg = closed.groupby('SK_ID_CURR').agg(dict({'DAYS_ENDDATE_FACT': ['min','max','mean']},**num_aggregations))
closed_agg.columns = pd.Index(['CLS_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
closed_agg['CLS_COUNT'] = closed.groupby('SK_ID_CURR').size()
bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
print bureau_agg.shape
del closed, closed_agg, bureau
gc.collect()

all_data = all_data.merge(bureau_agg, how='left', on='SK_ID_CURR')
print all_data.shape
del bureau_agg
gc.collect()

prev = pd.read_csv('./input/previous_application.csv')
prev, cat_cols = one_hot_encoder(prev, nan_as_category= True)
# Days 365.243 values -> nan
prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
# Add feature: value ask / value received percentage
prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
# Previous applications numeric features
num_aggregations = {
    'AMT_ANNUITY': ['min','max', 'mean'],
    'AMT_APPLICATION': ['min','max', 'mean'],
    'AMT_CREDIT': ['min','max', 'mean'],
    'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
    'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
    'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
    'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
    'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
    'DAYS_DECISION': ['min', 'max', 'mean'],
    'SELLERPLACE_AREA': ['min', 'max', 'mean'],
    'DAYS_FIRST_DRAWING': ['min', 'max', 'mean'],
    'DAYS_FIRST_DUE': ['min', 'max', 'mean'],
    'DAYS_LAST_DUE_1ST_VERSION': ['min', 'max', 'mean'],
    'DAYS_LAST_DUE': ['min', 'max', 'mean'],
    'DAYS_TERMINATION': ['min', 'max', 'mean'],
    'CNT_PAYMENT': ['mean', 'sum']
}
# Previous applications categorical features
cat_aggregations = {}
for cat in cat_cols:
    cat_aggregations[cat] = ['mean']

prev_agg = prev.groupby('SK_ID_CURR').agg(dict(num_aggregations, **cat_aggregations))
prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
prev_agg['PREV_COUNT'] = prev.groupby('SK_ID_CURR').size()
print prev_agg.shape
# Previous Applications: Approved Applications - only numerical features
approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
approved_agg.columns = pd.Index(['APR_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
approved_agg['APR_COUNT'] = approved.groupby('SK_ID_CURR').size()
prev_agg = prev_agg.reset_index().join(approved_agg, how='left', on='SK_ID_CURR')
print prev_agg.shape
# Previous Applications: Refused Applications - only numerical features
refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
refused_agg.columns = pd.Index(['REF_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
refused_agg['REF_COUNT'] = refused.groupby('SK_ID_CURR').size()
prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
print prev_agg.shape
del refused, refused_agg, approved, approved_agg, prev
gc.collect()

all_data = all_data.merge(prev_agg, how='left', on='SK_ID_CURR')
print all_data.shape
del prev_agg
gc.collect()

pos = pd.read_csv('./input/POS_CASH_balance.csv')
pos, cat_cols = one_hot_encoder(pos, nan_as_category= True)
# Features
aggregations = {
    'MONTHS_BALANCE': ['min','max', 'mean'],
    'CNT_INSTALMENT': ['sum', 'mean'],
    'SK_DPD': ['sum','max','min','mean'],
    'SK_DPD_DEF': ['sum','max', 'min','mean']
}
for cat in cat_cols:
    aggregations[cat] = ['mean']

pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
# Count pos cash accounts
pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
print pos_agg.shape
del pos
gc.collect()

all_data = all_data.join(pos_agg, how='left', on='SK_ID_CURR')
print all_data.shape
del pos_agg
gc.collect()

ins = pd.read_csv('./input/installments_payments.csv')
ins, cat_cols = one_hot_encoder(ins, nan_as_category= True)
# Percentage and difference paid in each installment (amount paid and installment value)
ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
# Days past due and days before due (no negative values)
ins['DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
ins['DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
ins['DPD'] = ins['DPD'].apply(lambda x: x if x > 0 else 0)
ins['DBD'] = ins['DBD'].apply(lambda x: x if x > 0 else 0)
# Features: Perform aggregations
aggregations = {
    'NUM_INSTALMENT_VERSION': ['nunique'],
    'DPD': ['max', 'mean', 'sum'],
    'DBD': ['max', 'mean', 'sum'],
    'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
    'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
    'AMT_INSTALMENT': ['max', 'mean', 'sum'],
    'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
    'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum'],
    'DAYS_INSTALMENT': ['max', 'mean', 'sum']
}
for cat in cat_cols:
    aggregations[cat] = ['mean']
ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
ins_agg.columns = pd.Index(['INS_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
# Count installments accounts
ins_agg['INS_COUNT'] = ins.groupby('SK_ID_CURR').size()
print ins_agg.shape
del ins
gc.collect()

all_data = all_data.join(ins_agg, how='left', on='SK_ID_CURR')
print all_data.shape
del ins_agg
gc.collect()

cc = pd.read_csv('./input/credit_card_balance.csv')
cc, cat_cols = one_hot_encoder(cc, nan_as_category= True)
# Features: Perform aggregations
num_aggregations = {
    'MONTHS_BALANCE': ['min','max', 'mean'],
    'AMT_BALANCE': ['min','max', 'mean','sum'],
    'AMT_CREDIT_LIMIT_ACTUAL': ['min','max', 'mean','sum','var'],
    'AMT_DRAWINGS_ATM_CURRENT': ['min','max', 'mean','sum'],
    'AMT_DRAWINGS_CURRENT': ['min','max', 'mean','sum'],
    'AMT_DRAWINGS_OTHER_CURRENT': ['min','max', 'mean','sum'],
    'AMT_DRAWINGS_POS_CURRENT': ['min','max', 'mean','sum'],
    'AMT_INST_MIN_REGULARITY': ['min','max', 'mean','sum','var'],
    'AMT_PAYMENT_CURRENT':['min','max', 'mean','sum'],
    'AMT_PAYMENT_TOTAL_CURRENT':['min','max', 'mean','sum'],
    'AMT_RECEIVABLE_PRINCIPAL':['min','max', 'mean','sum','var'],
    'AMT_RECIVABLE':['min','max', 'mean','sum','var'],
    'AMT_TOTAL_RECEIVABLE':['min','max', 'mean','sum','var'],
    'CNT_DRAWINGS_ATM_CURRENT':['mean','sum'],
    'CNT_DRAWINGS_CURRENT':['mean','sum'],
    'CNT_DRAWINGS_OTHER_CURRENT':['mean','sum'],
    'CNT_DRAWINGS_POS_CURRENT':['mean','sum'],
    'CNT_INSTALMENT_MATURE_CUM':['mean','sum'],
    'SK_DPD':['max', 'mean', 'sum'],
    'SK_DPD_DEF':['max', 'mean', 'sum']
}
cat_aggregations = {}
for cat in cat_cols:
    cat_aggregations[cat] = ['mean']
# General aggregations
cc_agg = cc.groupby('SK_ID_CURR').agg(dict(num_aggregations,**cat_aggregations))
cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
# Count credit card lines
cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()

#cc_agg['INSTALLMENTS_PER_LOAN'] = cc_agg['CNT_INSTALMENT_MATURE_CUM_SUM'] / cc_agg['CC_COUNT']
del cc
print cc_agg.shape
gc.collect()

all_data = all_data.join(cc_agg, how='left', on='SK_ID_CURR')
print all_data.shape
del cc_agg
gc.collect()

Read data and test
('Shapes : ', (307511, 122), (48744, 121))
['NAME_CONTRACT_TYPE', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']
(356255, 255)
(356255, 260)
['STATUS']
['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']
(817395, 13)
(305811, 67)
(305811, 100)
(305811, 135)
(356255, 394)
['NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'FLAG_LAST_APPL_PER_CONTRACT', 'NAME_CASH_LOAN_PURPOSE', 'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON', 'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE', 'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE', 'NAME_SELLER_INDUSTRY', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION']
(338857, 208)
(338857, 258)
(338857, 307)
(356255, 700)
['NAME_CONTRACT_STATUS']
(337252, 24)
(356255, 724)
[]
(339587, 29)
(356255, 75

7

In [3]:
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})

drop_col = missing_data[missing_data['Missing Ratio']==100]
missing_data = missing_data[missing_data['Missing Ratio']<100]

all_data.drop(drop_col.index.values,1,inplace=True)

In [3]:
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})

drop_col = missing_data[missing_data['Missing Ratio']==100]
missing_data = missing_data[missing_data['Missing Ratio']<100]

all_data.drop(drop_col.index.values,1,inplace=True)

for i in missing_data.index.values:
    all_data[i] = all_data[i].fillna(-999999999)

all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})

missing_data

all_data[all_data==np.inf]=999999999

In [4]:
data = all_data[:n_data]
test = all_data[n_data:]
#del avg_buro, avg_prev,avg_pos,avg_cc_bal,avg_inst
del all_data
gc.collect()
print data.shape
print test.shape

exclude_fea = []
exclude_fea.append('SK_ID_CURR') 
exclude_fea

(307511, 819)
(48744, 819)


['SK_ID_CURR']

In [5]:
folds = KFold(n_splits=5, shuffle=True, random_state=1700)
oof_preds = np.zeros(data.shape[0])
sub_preds = np.zeros(test.shape[0])
feature_importance_df = pd.DataFrame()
feats = [f for f in data.columns if f not in exclude_fea]

In [6]:
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(data)):
    trn_x, trn_y = data[feats].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = data[feats].iloc[val_idx], y.iloc[val_idx]
    
    clf = LogisticRegression(solver='sag',max_iter=1000,verbose=10,n_jobs=-1,C=10,random_state=1700)

    clf.fit(trn_x, trn_y)
    
    oof_preds[val_idx] = clf.predict_proba(val_x)[:, 1]
    sub_preds += clf.predict_proba(test[feats])[:, 1] / folds.n_splits
    print('Fold %2d Train AUC : %.6f' % (n_fold + 1, roc_auc_score(trn_y, clf.predict_proba(trn_x)[:, 1])))
    print('Fold %2d Test AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    del clf, trn_x, trn_y, val_x, val_y
    gc.collect()
    if n_fold == 0:
        break
    
print('Full AUC score %.6f' % roc_auc_score(y, oof_preds)) 


max_iter reached after 1447 seconds


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 24.1min


KeyboardInterrupt: 

In [8]:
def lgbcv(num_leaves, colsample_bytree,subsample,reg_alpha,reg_lambda,min_split_gain,min_child_weight):
    folds = KFold(n_splits=5, shuffle=True, random_state=1700)
    feats = [f for f in data.columns if f not in exclude_fea]
    oof_preds = np.zeros(data.shape[0])
    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(data)):
        trn_x, trn_y = data[feats].iloc[trn_idx], y.iloc[trn_idx]
        val_x, val_y = data[feats].iloc[val_idx], y.iloc[val_idx]

        clf = LGBMClassifier(
        n_estimators=10000,
        learning_rate=0.05,
        num_leaves=int(num_leaves),
        colsample_bytree=max(min(colsample_bytree, 1), 0),
        subsample=max(min(subsample, 1), 0),
        max_depth=8,
        reg_alpha=max(reg_alpha, 0),
        reg_lambda=max(reg_lambda, 0),
        min_split_gain=max(min_split_gain, 0),
        min_child_weight=max(min_child_weight, 0),
        random_state=1700,
        )
        clf.fit(trn_x, trn_y, 
            eval_set= [(trn_x, trn_y), (val_x, val_y)], 
            eval_metric='auc', verbose=100, early_stopping_rounds=100  #30
           )

        oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
        del clf, trn_x, trn_y, val_x, val_y
        gc.collect()
    full_auc = roc_auc_score(y, oof_preds)
    print('Full AUC score %.6f' % full_auc) 
    return(full_auc)

In [9]:
lgbBO = BayesianOptimization(
        lgbcv,
        {'num_leaves': (20, 100),
        'colsample_bytree': (0.5, 1.0),
        'subsample':(0.5,1.0),
        'reg_alpha':(0.001,10),
        'reg_lambda':(0.001,10),
        'min_split_gain':(0.0001,10),
        'min_child_weight':(20,120)}
    )

In [10]:
lgbBO.maximize(init_points=2,n_iter=15)

[31mInitialization[0m
[94m----------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   min_child_weight |   min_split_gain |   num_leaves |   reg_alpha |   reg_lambda |   subsample | 
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.790898	valid_1's auc: 0.771298
[200]	training's auc: 0.809141	valid_1's auc: 0.783415
[300]	training's auc: 0.814748	valid_1's auc: 0.786341
[400]	training's auc: 0.81748	valid_1's auc: 0.787469
[500]	training's auc: 0.819464	valid_1's auc: 0.788358
[600]	training's auc: 0.820843	valid_1's auc: 0.788806
[700]	training's auc: 0.821864	valid_1's auc: 0.788962
[800]	training's auc: 0.822801	valid_1's auc: 0.789174
[900]	training's auc: 0.823705	valid_1's auc: 0.789364
[1000]	training's auc: 0.824642	valid_1's auc: 0.789726
[1100]	training's auc: 0.824906	valid_1's auc: 0.789708

Early stopping, best iteration is:
[387]	training's auc: 0.807875	valid_1's auc: 0.789754
Fold  2 AUC : 0.789754
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.791622	valid_1's auc: 0.77343
[200]	training's auc: 0.80586	valid_1's auc: 0.783218
[300]	training's auc: 0.808545	valid_1's auc: 0.785186
[400]	training's auc: 0.809688	valid_1's auc: 0.786002
[500]	training's auc: 0.809912	valid_1's auc: 0.786153
Early stopping, best iteration is:
[442]	training's auc: 0.809904	valid_1's auc: 0.786159
Fold  3 AUC : 0.786153
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.791617	valid_1's auc: 0.774153
[200]	training's auc: 0.806038	valid_1's auc: 0.784385
[300]	training's auc: 0.808722	valid_1's auc: 0.786473
[400]	training's auc: 0.809237	valid_1's auc: 0.786783
Early stopping, best iteration is:
[335]	training's auc: 0.809194	valid_1's auc: 0.786799
Fold  4 AUC : 0.786799
Training until validation scores don't imp

[700]	training's auc: 0.897999	valid_1's auc: 0.79581
Early stopping, best iteration is:
[610]	training's auc: 0.889925	valid_1's auc: 0.79633
Fold  2 AUC : 0.796330
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.816266	valid_1's auc: 0.777684
[200]	training's auc: 0.842446	valid_1's auc: 0.787825
[300]	training's auc: 0.857172	valid_1's auc: 0.789998
[400]	training's auc: 0.869414	valid_1's auc: 0.791082
[500]	training's auc: 0.880152	valid_1's auc: 0.79132
[600]	training's auc: 0.88969	valid_1's auc: 0.791373
Early stopping, best iteration is:
[589]	training's auc: 0.888937	valid_1's auc: 0.791433
Fold  3 AUC : 0.791433
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.815968	valid_1's auc: 0.777774
[200]	training's auc: 0.842581	valid_1's auc: 0.788499
[300]	training's auc: 0.85693	valid_1's auc: 0.791017
[400]	training's auc: 0.869361	valid_1's auc: 0.791582
[500]	training's auc: 0.879847	valid_1's auc: 0.

Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.800262	valid_1's auc: 0.771963
[200]	training's auc: 0.82821	valid_1's auc: 0.785056
[300]	training's auc: 0.845556	valid_1's auc: 0.788722
[400]	training's auc: 0.860672	valid_1's auc: 0.790755
[500]	training's auc: 0.873508	valid_1's auc: 0.791733
[600]	training's auc: 0.885496	valid_1's auc: 0.791851
Early stopping, best iteration is:
[548]	training's auc: 0.879293	valid_1's auc: 0.791942
Fold  1 AUC : 0.791942
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.798403	valid_1's auc: 0.777946
[200]	training's auc: 0.827326	valid_1's auc: 0.790531
[300]	training's auc: 0.844646	valid_1's auc: 0.793879
[400]	training's auc: 0.858799	valid_1's auc: 0.795587
[500]	training's auc: 0.871574	valid_1's auc: 0.795953
[600]	training's auc: 0.883237	valid_1's auc: 0.796335
[700]	training's auc: 0.894244	valid_1's auc: 0.796303
Early stopping, best iteration is:
[608]	traini

[200]	training's auc: 0.806705	valid_1's auc: 0.785065
[300]	training's auc: 0.819436	valid_1's auc: 0.789643
[400]	training's auc: 0.829244	valid_1's auc: 0.791402
[500]	training's auc: 0.837522	valid_1's auc: 0.792132
[600]	training's auc: 0.845067	valid_1's auc: 0.792764
[700]	training's auc: 0.852202	valid_1's auc: 0.793399
[800]	training's auc: 0.858625	valid_1's auc: 0.793588
[900]	training's auc: 0.864715	valid_1's auc: 0.793669
Early stopping, best iteration is:
[831]	training's auc: 0.860524	valid_1's auc: 0.793738
Fold  4 AUC : 0.793738
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.784168	valid_1's auc: 0.765619
[200]	training's auc: 0.807559	valid_1's auc: 0.780592
[300]	training's auc: 0.820025	valid_1's auc: 0.785538
[400]	training's auc: 0.830176	valid_1's auc: 0.788019
[500]	training's auc: 0.838719	valid_1's auc: 0.789129
[600]	training's auc: 0.846344	valid_1's auc: 0.789795
[700]	training's auc: 0.853291	valid_1's auc: 0.790337

KeyboardInterrupt: 

In [11]:
lgbBO.res['max']

{'max_params': {'colsample_bytree': 0.5802478232647209,
  'min_child_weight': 20.33692569448104,
  'min_split_gain': 0.7649917884568475,
  'num_leaves': 21.152379900895014,
  'reg_alpha': 9.756996480309178,
  'reg_lambda': 8.776694575019713,
  'subsample': 0.9370453331986632},
 'max_val': 0.793068381666508}

In [None]:
colsample_bytree |   min_child_weight |   min_split_gain |   num_leaves |   reg_alpha |   reg_lambda |   subsample | 