In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Lift pandas' display truncation so frames are rendered with every column
# and every row (None disables the corresponding limit).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold

In [2]:
import gc

In [6]:
def train_model(train_, val_x_, val_y_, test_, y_, folds_):
    """Train one LightGBM classifier per CV fold and blend the fold models.

    For every split produced by ``folds_`` a classifier is fitted with early
    stopping on the fold's validation part. Each fold model then scores the
    out-of-fold rows, the external hold-out set and the test set; hold-out and
    test predictions are averaged over the folds.

    Parameters
    ----------
    train_ : pandas.DataFrame or array-like
        Training feature matrix. Every column is passed to the model.
    val_x_ : pandas.DataFrame
        Hold-out features; must contain every column of ``test_`` except the
        id/prediction columns.
    val_y_ : array-like
        Hold-out labels, row-aligned with ``val_x_``.
    test_ : pandas.DataFrame
        Test features plus an ``SK_ID_CURR`` column. It is not modified.
    y_ : array-like
        Training labels, row-aligned with ``train_``.
    folds_ : object
        Splitter exposing ``split(X)`` and ``n_splits`` (e.g. ``KFold``).

    Returns
    -------
    oof_preds : numpy.ndarray
        Out-of-fold positive-class probability for each training row.
    submission : pandas.DataFrame
        New frame with columns ``SK_ID_CURR`` and ``TARGET`` (blended test
        predictions).
    feature_importance_df : pandas.DataFrame
        One row per (feature, fold) with columns ``feature``, ``importance``
        and ``fold``.
    """
    # 'TARGET' is excluded as well so that a test frame which already carries
    # predictions from an earlier run can be passed in again.
    feats_ = [f_ for f_ in test_.columns if f_ not in ('SK_ID_CURR', 'TARGET')]

    # Built once instead of on every fold. Labels are flattened to 1-D so the
    # classifier is not handed a column vector (which triggers sklearn's
    # DataConversionWarning).
    train_df = pd.DataFrame(train_)
    y_arr = np.asarray(y_).ravel()
    train_feature_names = list(train_df.columns)
    # NOTE(review): the model is fitted on every column of train_, while
    # predictions use test_'s columns minus the id. This assumes both describe
    # the same features in the same order - confirm against the data prep.

    oof_preds = np.zeros(train_df.shape[0])
    val_preds = np.zeros(val_x_.shape[0])
    sub_preds = np.zeros(test_.shape[0])
    fold_importances = []

    for n_fold, (trn_idx, val_idx) in enumerate(folds_.split(train_df)):
        trn_x, trn_y = train_df.iloc[trn_idx], y_arr[trn_idx]
        val_x, val_y = train_df.iloc[val_idx], y_arr[val_idx]

        # NOTE(review): LightGBM >= 4.0 no longer accepts `silent` here nor
        # `verbose` / `early_stopping_rounds` in fit(); switch to callbacks
        # (lightgbm.early_stopping, lightgbm.log_evaluation) when upgrading.
        clf = LGBMClassifier(
            n_estimators = 4000,
            learning_rate = 0.03,
            num_leaves = 30,
            colsample_bytree = .8,
            subsample = .9,
            max_depth = 7,
            reg_alpha = .1,
            min_split_gain = .01,
            min_child_weight = 2,
            silent = -1,
            verbose = -1
            )
        clf.fit(trn_x, trn_y,
            eval_set = [(trn_x, trn_y), (val_x, val_y)],
            eval_metric = 'auc', verbose = 100, early_stopping_rounds = 100)

        # Score all three sets with the same (best) number of trees.
        best_iter = clf.best_iteration_
        oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=best_iter)[:, 1]
        val_preds += clf.predict_proba(val_x_[feats_], num_iteration=best_iter)[:, 1] / folds_.n_splits
        sub_preds += clf.predict_proba(test_[feats_], num_iteration=best_iter)[:, 1] / folds_.n_splits

        fold_importances.append(pd.DataFrame({
            'feature': train_feature_names,
            'importance': clf.feature_importances_,
            'fold': n_fold + 1,
        }))

        # AUC depends only on the ranking of the scores, so the partial blend
        # (folds seen so far, already divided by n_splits) is scored directly.
        print('fold %2d validate AUC score %.6f'%(n_fold + 1, roc_auc_score(val_y_, val_preds)))
        print('fold %2d AUC %.6f'%(n_fold+1, roc_auc_score(val_y, oof_preds[val_idx])))
        del clf, trn_x, trn_y, val_x, val_y
        gc.collect()

    print('validate AUC score %.6f'%roc_auc_score(val_y_, val_preds))
    print('full AUC score %.6f'%roc_auc_score(y_arr, oof_preds))

    # Concatenate once instead of growing a frame inside the loop.
    if fold_importances:
        feature_importance_df = pd.concat(fold_importances, axis=0, ignore_index=True)
    else:
        feature_importance_df = pd.DataFrame(columns=['feature', 'importance', 'fold'])

    # Build the submission in a fresh frame so the caller's test_ is left
    # untouched and the function can be called again with the same inputs.
    submission = test_[['SK_ID_CURR']].copy()
    submission['TARGET'] = sub_preds

    return oof_preds, submission, feature_importance_df

In [4]:
# Read the five CSV inputs in a fixed order: training features, test features,
# training labels, hold-out features, hold-out labels.
train, test, y, val_x, val_y = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train_.csv',
        '../data/test_.csv',
        '../data/y_.csv',
        '../data/val_x_.csv',
        '../data/val_y_.csv',
    )
)

# Shuffled 5-fold splitter with a fixed seed so the folds are reproducible.
folds = KFold(random_state=0, shuffle=True, n_splits=5)

In [7]:
# Run the cross-validated training; labels are handed over as flat 1-D arrays.
oof_preds, test_preds, feature_importance_df = train_model(
    train_=train,
    val_x_=val_x,
    val_y_=val_y['TARGET'].values.ravel(),
    test_=test,
    y_=y['0'].values.ravel(),
    folds_=folds,
)

# Persist the blended test predictions without the index column.
test_preds.to_csv('../data/lgbm_submission.csv', index=False)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.978845	valid_1's auc: 0.977379
[200]	training's auc: 0.981584	valid_1's auc: 0.97931
[300]	training's auc: 0.983147	valid_1's auc: 0.980027
[400]	training's auc: 0.984374	valid_1's auc: 0.980381
[500]	training's auc: 0.985452	valid_1's auc: 0.980552
[600]	training's auc: 0.986463	valid_1's auc: 0.98061
[700]	training's auc: 0.987409	valid_1's auc: 0.98064
[800]	training's auc: 0.988266	valid_1's auc: 0.980657
Early stopping, best iteration is:
[789]	training's auc: 0.988182	valid_1's auc: 0.980667
fold  1 AUC 0.980667


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.978787	valid_1's auc: 0.977786
[200]	training's auc: 0.981607	valid_1's auc: 0.979625
[300]	training's auc: 0.983175	valid_1's auc: 0.980256
[400]	training's auc: 0.984426	valid_1's auc: 0.980524
[500]	training's auc: 0.98554	valid_1's auc: 0.9807
[600]	training's auc: 0.986581	valid_1's auc: 0.980784
[700]	training's auc: 0.987504	valid_1's auc: 0.980842
[800]	training's auc: 0.988361	valid_1's auc: 0.980893
[900]	training's auc: 0.989139	valid_1's auc: 0.98095
[1000]	training's auc: 0.989875	valid_1's auc: 0.980997
[1100]	training's auc: 0.99051	valid_1's auc: 0.981019
[1200]	training's auc: 0.991151	valid_1's auc: 0.981016
Early stopping, best iteration is:
[1135]	training's auc: 0.990736	valid_1's auc: 0.981029
fold  2 AUC 0.981029


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.978577	valid_1's auc: 0.977899
[200]	training's auc: 0.981496	valid_1's auc: 0.97969
[300]	training's auc: 0.983044	valid_1's auc: 0.98035
[400]	training's auc: 0.9843	valid_1's auc: 0.980661
[500]	training's auc: 0.985384	valid_1's auc: 0.980842
[600]	training's auc: 0.986412	valid_1's auc: 0.980924
[700]	training's auc: 0.987371	valid_1's auc: 0.980968
[800]	training's auc: 0.988203	valid_1's auc: 0.980994
[900]	training's auc: 0.988984	valid_1's auc: 0.98102
[1000]	training's auc: 0.989713	valid_1's auc: 0.981038
[1100]	training's auc: 0.990409	valid_1's auc: 0.981075
[1200]	training's auc: 0.990999	valid_1's auc: 0.981084
[1300]	training's auc: 0.99161	valid_1's auc: 0.981099
[1400]	training's auc: 0.992183	valid_1's auc: 0.98112
Early stopping, best iteration is:
[1354]	training's auc: 0.991907	valid_1's auc: 0.981127
fold  3 AUC 0.981127


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.978752	valid_1's auc: 0.977937
[200]	training's auc: 0.981548	valid_1's auc: 0.979921
[300]	training's auc: 0.983123	valid_1's auc: 0.980525
[400]	training's auc: 0.984365	valid_1's auc: 0.980828
[500]	training's auc: 0.985459	valid_1's auc: 0.980983
[600]	training's auc: 0.98647	valid_1's auc: 0.981049
[700]	training's auc: 0.987401	valid_1's auc: 0.98108
[800]	training's auc: 0.988223	valid_1's auc: 0.981104
[900]	training's auc: 0.989017	valid_1's auc: 0.981109
[1000]	training's auc: 0.989741	valid_1's auc: 0.981137
[1100]	training's auc: 0.990433	valid_1's auc: 0.981155
[1200]	training's auc: 0.991103	valid_1's auc: 0.981161
[1300]	training's auc: 0.991665	valid_1's auc: 0.981178
[1400]	training's auc: 0.992236	valid_1's auc: 0.981185
[1500]	training's auc: 0.99277	valid_1's auc: 0.981193
Early stopping, best iteration is:
[1411]	training's auc: 0.992297	valid_1's auc: 0.981201
fold  4 AUC 0.9812

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.978839	valid_1's auc: 0.977665
[200]	training's auc: 0.981637	valid_1's auc: 0.979559
[300]	training's auc: 0.983193	valid_1's auc: 0.980186
[400]	training's auc: 0.984432	valid_1's auc: 0.980465
[500]	training's auc: 0.985519	valid_1's auc: 0.980586
[600]	training's auc: 0.986527	valid_1's auc: 0.980655
[700]	training's auc: 0.987459	valid_1's auc: 0.980705
[800]	training's auc: 0.988292	valid_1's auc: 0.98073
[900]	training's auc: 0.989095	valid_1's auc: 0.980744
[1000]	training's auc: 0.989784	valid_1's auc: 0.980744
[1100]	training's auc: 0.990468	valid_1's auc: 0.980757
[1200]	training's auc: 0.991098	valid_1's auc: 0.980763
[1300]	training's auc: 0.991693	valid_1's auc: 0.980768
Early stopping, best iteration is:
[1289]	training's auc: 0.99163	valid_1's auc: 0.980778
fold  5 AUC 0.980778
validate AUC score 0.720602
full AUC score 0.980958
