# Raiffeisen - Classification LightGBM

In [1]:
# !pip3 install lightgbm

In [2]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

import itertools

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, validation_curve, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils.multiclass import unique_labels

pd.options.display.max_columns = 400

## Functions

In [3]:
# Plot learning curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw training and validation score curves against training-set size.

    Opens a new pyplot figure, runs ``learning_curve`` on ``estimator`` and
    plots, for each training size, the mean score across folds with a band of
    +/- one standard deviation (red = training, green = validation).

    Parameters
    ----------
    estimator : object forwarded to ``learning_curve``.
    title : str, figure title.
    X, y : data and target forwarded to ``learning_curve``.
    ylim : optional ``(ymin, ymax)`` pair applied to the y axis.
    cv : cross-validation argument forwarded to ``learning_curve``.
    n_jobs : parallelism forwarded to ``learning_curve``.
    train_sizes : training-set sizes forwarded to ``learning_curve``.

    Returns
    -------
    The ``matplotlib.pyplot`` module, so the caller can keep working on the
    current figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One entry per curve: (mean per size, std per size, colour, legend label).
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1), "r", "Training score"),
        (np.mean(val_scores, axis=1), np.std(val_scores, axis=1), "g", "Validation score"),
    ]

    plt.grid()
    # Bands first, then lines, in the same order for both curves.
    for centre, spread, colour, _ in curves:
        plt.fill_between(sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)
    for centre, _, colour, label in curves:
        plt.plot(sizes, centre, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

In [4]:
# Plot validation curve
def plot_validation_curve(estimator, title, X, y, param_name, param_range, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5), xscale='linear'):
    """Plot training and validation scores over a range of one hyper-parameter.

    Runs ``validation_curve`` for ``param_name`` over ``param_range`` and draws,
    on the current pyplot axes, the mean score across folds with a band of
    +/- one standard deviation (red = training, green = validation).

    Parameters
    ----------
    estimator : object forwarded to ``validation_curve``.
    title : str or None, title for the plot; skipped when falsy.
    X, y : data and target forwarded to ``validation_curve``.
    param_name : str, name of the hyper-parameter being varied.
    param_range : sequence of values tried for ``param_name``; also the x axis.
    ylim : optional y-axis limits passed to ``plt.ylim``.
    cv : cross-validation argument forwarded to ``validation_curve``.
    n_jobs : parallelism forwarded to ``validation_curve``.
    train_sizes : unused; kept so existing callers do not break.
    xscale : x-axis scale passed to ``plt.xscale`` (e.g. ``'log'``).
    """
    # Everything after (estimator, X, y) is passed by keyword: positionally the
    # sixth argument of validation_curve is ``groups``, so ``cv`` used to be
    # swallowed as group labels; newer scikit-learn also makes param_name and
    # param_range keyword-only.
    train_scores, test_scores = validation_curve(
        estimator, X, y,
        param_name=param_name, param_range=param_range,
        cv=cv, n_jobs=n_jobs)

    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.plot(param_range, train_mean, color='r', marker='o', markersize=5, label='Training score')
    plt.fill_between(param_range, train_mean + train_std, train_mean - train_std, alpha=0.15, color='r')
    plt.plot(param_range, test_mean, color='g', linestyle='--', marker='s', markersize=5, label='Validation score')
    plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, alpha=0.15, color='g')

    if title:
        plt.title(title)
    plt.grid()
    plt.xscale(xscale)
    plt.legend(loc='best')
    plt.xlabel('Parameter')
    plt.ylabel('Score')
    plt.ylim(ylim)

In [5]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues,
                          figsize=(5,5)):
    """
    Plot the confusion matrix of ``y_pred`` against ``y_true`` as a heat map.

    Parameters
    ----------
    y_true, y_pred : true and predicted labels; they must be integer codes
        because they are used to index into ``classes``.
    classes : sequence of display names, indexed by label code.
    normalize : when True, each row is divided by its sum so cells show the
        fraction of that true class; rows with no samples stay at zero.
    title : plot title; a default depending on ``normalize`` is used when falsy.
    cmap : matplotlib colormap for the heat map.
    figsize : figure size passed to ``plt.subplots``.

    Returns
    -------
    The matplotlib ``Axes`` holding the plot.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Only use the labels that appear in the data, and hand the same label
    # list to confusion_matrix so rows/columns and tick names cannot drift.
    labels = unique_labels(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    # np.asarray lets callers pass a plain list of names as well as an ndarray.
    classes = np.asarray(classes)[labels]

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A label seen only in y_pred has an all-zero row; dividing by 1
        # instead of 0 keeps that row at 0.0 rather than NaN.
        cm = cm.astype('float') / np.where(row_totals == 0, 1, row_totals)

    n_rows, n_cols = cm.shape
    fig, ax = plt.subplots(figsize=figsize)
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(n_cols),
           yticks=np.arange(n_rows),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label',
           # Limits follow the matrix size (previously fixed to a 2x2 grid);
           # the y axis is inverted so row 0 is drawn at the top.
           xlim=(-0.5, n_cols - 0.5),
           ylim=(n_rows - 0.5, -0.5))

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate every cell; text turns white on cells darker than half the max.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(n_rows):
        for j in range(n_cols):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

## Load data

In [6]:
%%time

# Read the merged dataset; the first CSV column is used as the DataFrame index.
df = pd.read_csv('RFB - Merged Data.csv', index_col=0)

# Show (rows, columns) as a quick sanity check of the load.
df.shape

CPU times: user 16 s, sys: 7 s, total: 23 s
Wall time: 18.6 s


(555844, 316)

## Feature selection

In [7]:
# Cash-loan (CL_*) columns. Not used by the active selection below; kept for
# the alternative approach of dropping these columns instead of whitelisting:
#   ds = df[df.DAYS_SINCE_FIRST_ACCOUNT_OPEN < 180].drop(cl_columns, axis=1)
cl_columns = [
    'CL_DB_TXN_CNT_m1',
    'CL_DB_TXN_AMT_m1',
    'CL_CR_TXN_CNT_m1',
    'CL_CR_TXN_AMT_m1',
    'CL_EOM_BAL_AMT_m1',
    'CL_AVG_BAL_AMT_m1',
    'CL_ACC_USED_CNT_m1',
    'CL_TOT_TXN_CNT_m1',
    'CL_TOT_TXN_AMT_m1',
    'CL_ACCT_OPENED_CNT',
    'CL_ACCT_CLOSED_CNT',
    'CL_MATURITY_MTH',
    'CL_MAX_TENURE_MTH',
    'CL_MIN_TENURE_MTH',
    'CL_MTH_SINCE_LAST_OPEN',
    'CL_FLG_EVER',
    'CL_FLG_m1',
    'CL_ACCT_CNT_m1',
    'CL_ACCT_OPEN_CNT_m1',
    'CL_ACCT_CLOSE_CNT_m1',
    'CL_MTH_SINCE_LAST_OPEN_MISSING',
    'CL_MATURITY_MTH_MISSING'
]

# Whitelist of columns carried into the modelling frame: the two key columns
# (CUSTOMER_RK, month), the target (cash_loan) and the candidate features.
incl_columns = [
    'CUSTOMER_RK',
    'FIRST_OPEN_PRODUCT',
    'CITY',
    'MOBILE_FLG',
    'EMAIL_FLG',
    'IS_SEGMENT_CD',
    'BRANCH_CD',
    'REGION_CD',
    'GEO_REGION_CD',
    'AGE_AT_ANALYSIS_MTH_CNT',
    'GENDER_CD',
    'DC_MIX',
    'CC_MIX',
    'ROL_flg_m1',
    'MOB_ACTIVE',
    'BizSeg',
    'month',
    'month_od',
    'month_do',
    'mBanking_logs_m1',
    'eBanking_logs_m1',
    'MARITAL_STATUS',
    'Education_level',
    'RESIDENCE_STATUS',
    'EMPLOYMENT_TYPE',
    'cpi_consumer',
    'cpi_excluding',
    'cash_loan',
    'grad',
    'F',
    'P',
    'R',
    'broj_prijava',
    'broj_obveznika'
]

# Keep only rows observed fewer than 180 days after the first account opening.
opened_within_180_days = df['DAYS_SINCE_FIRST_ACCOUNT_OPEN'] < 180

# Select rows and columns in a single .loc call and take an explicit copy:
# later cells write to columns of `ds`, and chained indexing (df[mask][cols])
# leaves it ambiguous whether those writes target a copy of a slice.
ds = df.loc[opened_within_180_days, incl_columns].copy()
ds.shape

(219403, 34)

In [8]:
# Move the (customer, month) key out of the columns and into a two-level row
# index, so the remaining columns are only features plus the target.
ds = ds.set_index(['CUSTOMER_RK', 'month'])
ds.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,FIRST_OPEN_PRODUCT,CITY,MOBILE_FLG,EMAIL_FLG,IS_SEGMENT_CD,BRANCH_CD,REGION_CD,GEO_REGION_CD,AGE_AT_ANALYSIS_MTH_CNT,GENDER_CD,DC_MIX,CC_MIX,ROL_flg_m1,MOB_ACTIVE,BizSeg,month_od,month_do,mBanking_logs_m1,eBanking_logs_m1,MARITAL_STATUS,Education_level,RESIDENCE_STATUS,EMPLOYMENT_TYPE,cpi_consumer,cpi_excluding,cash_loan,grad,F,P,R,broj_prijava,broj_obveznika
CUSTOMER_RK,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
427759,201506,LEN,beograd-novi beograd,0,0,INDV,161,Grad,BEOGRAD 1,62.0,F,Unknown,Unknown,0,4.0,Webovci,201503,201609,0,0,Married,2. Elementary School,Ownership,Unemployed,1.9,0.6,nije u prvih 6,beograd-novi beograd,163.517,18.304,6.614,122769,100725
427759,201505,LEN,beograd-novi beograd,0,0,INDV,161,Grad,BEOGRAD 1,62.0,F,Unknown,Unknown,0,3.0,Webovci,201503,201609,0,0,Married,2. Elementary School,Ownership,Unemployed,1.5,0.1,nije u prvih 6,beograd-novi beograd,163.517,18.304,6.614,122769,100725
427759,201507,LEN,beograd-novi beograd,0,0,INDV,161,Grad,BEOGRAD 1,62.0,F,Unknown,Unknown,0,5.0,Webovci,201503,201609,0,0,Married,2. Elementary School,Ownership,Unemployed,1.0,0.5,nije u prvih 6,beograd-novi beograd,163.517,18.304,6.614,122769,100725
427759,201503,LEN,beograd-novi beograd,0,0,INDV,161,Grad,BEOGRAD 1,62.0,F,Unknown,Unknown,0,1.0,Webovci,201503,201609,0,0,Married,2. Elementary School,Renting,Unemployed,1.9,0.0,nije u prvih 6,beograd-novi beograd,163.517,18.304,6.614,122769,100725
427759,201508,LEN,beograd-novi beograd,0,0,INDV,161,Grad,BEOGRAD 1,63.0,F,Unknown,Unknown,0,6.0,Webovci,201503,201609,0,0,Married,2. Elementary School,Ownership,Unemployed,2.1,0.2,nije u prvih 6,beograd-novi beograd,163.517,18.304,6.614,122769,100725


## Feature transformation

In [9]:
# Columns replaced by integer codes, in the order they were originally encoded.
# NOTE(review): broj_prijava and broj_obveznika look numeric in ds.head();
# label-encoding them replaces the values with their rank among the distinct
# values — confirm that this is intended.
label_encoded_columns = [
    'FIRST_OPEN_PRODUCT',
    'CITY',
    'IS_SEGMENT_CD',
    'REGION_CD',
    'GEO_REGION_CD',
    'GENDER_CD',
    'DC_MIX',
    'CC_MIX',
    'BizSeg',
    'MARITAL_STATUS',
    'Education_level',
    'RESIDENCE_STATUS',
    'EMPLOYMENT_TYPE',
    'cash_loan',
    'grad',
    'broj_prijava',
    'broj_obveznika',
]

# One fitted encoder per column. Refitting a single shared encoder overwrote
# its classes_ each time, so the code -> original value mapping (including the
# target's) could not be recovered afterwards.
label_encoders = {}
for column in label_encoded_columns:
    le = LabelEncoder()
    # Bracket assignment instead of attribute assignment (ds.CITY = ...), which
    # only works for existing columns whose names are valid identifiers.
    ds[column] = le.fit_transform(ds[column])
    label_encoders[column] = le

In [10]:
# Target: the label-encoded cash_loan column.
y = ds['cash_loan']
# Features: every other column of ds, in their existing order.
X = ds.loc[:, ds.columns != 'cash_loan']

In [11]:
# Rescale every feature with a min-max scaler fitted on X itself.
# After this line X holds the scaler's output rather than the original frame,
# which is why the feature-importance cell reads column names from ds.
# NOTE(review): the scaler is fitted before the train/test split, so the
# held-out rows contribute to the fitted min/max.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [12]:
# Hold out 20% of the rows for the final evaluation; the fixed seed makes the
# split repeatable.
# NOTE(review): ds is keyed by (CUSTOMER_RK, month) and ds.head() shows the
# same customer on several rows, so a row-level random split can put one
# customer in both train and test — confirm this matches the intended use,
# otherwise split by customer.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

## LightGBM

In [13]:
# Base LightGBM classifier; the grid below varies tree size, learning rate and
# number of boosting rounds on top of these fixed settings.
# NOTE(review): both this estimator and GridSearchCV request n_jobs=-1, which
# may oversubscribe the CPU while the search is running — verify.
estimator = lgb.LGBMClassifier(
    boosting_type='gbdt',
    max_bin=80,
    feature_fraction=0.7,
    bagging_fraction=0.75,
    bagging_freq=5,
    n_jobs=-1,
    tree_learner='voting',
    # Feature and row sub-sampling are enabled above, so a fixed seed is
    # needed for the search to be repeatable.
    random_state=2
)

# NOTE(review): in the recorded run the fold scores for max_depth=25 and
# max_depth=50 are identical for otherwise equal settings, so that axis
# appears to double the search time without changing any model — consider
# smaller depths or dropping it.
param_grid = {
    'max_depth': [25, 50],
    'num_leaves': [25, 50],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators':  [250, 500]
}

# NOTE(review): for single-label classification micro-averaged F1 equals plain
# accuracy, which is dominated by the majority class here (the weakest
# candidates already score ~0.961) — confirm this is the intended metric.
clf = GridSearchCV(
    estimator,
    param_grid,
    scoring='f1_micro',
    verbose=4,
    n_jobs=-1,
    cv=3
)

clf.fit(X_train, y_train)

best_estimator = clf.best_estimator_
best_score = clf.best_score_
best_params = clf.best_params_

# best_score_ is a fraction in [0, 1], so it is printed without a percent sign.
print("Best score: %.4f" % best_score)
print("Best params:", best_params)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] n_estimators=250, num_leaves=25, learning_rate=0.1, max_depth=25 
[CV] n_estimators=250, num_leaves=25, learning_rate=0.1, max_depth=25 
[CV] n_estimators=250, num_leaves=25, learning_rate=0.1, max_depth=25 
[CV] n_estimators=250, num_leaves=50, learning_rate=0.1, max_depth=25 
[CV] n_estimators=250, num_leaves=50, learning_rate=0.1, max_depth=25 
[CV] n_estimators=250, num_leaves=50, learning_rate=0.1, max_depth=25 
[CV] n_estimators=500, num_leaves=25, learning_rate=0.1, max_depth=25 
[CV] n_estimators=500, num_leaves=25, learning_rate=0.1, max_depth=25 
[CV] n_estimators=500, num_leaves=25, learning_rate=0.1, max_depth=25 
[CV] n_estimators=500, num_leaves=50, learning_rate=0.1, max_depth=25 
[CV] n_estimators=500, num_leaves=50, learning_rate=0.1, max_depth=25 
[CV] n_estimators=500, num_leaves=50, learning_rate=0.1, max_depth=25 
[CV] n_estimators=250, num_leaves=25, learning_rate=0.1, max_depth=50 
[CV] n_estimator

  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=25, learning_rate=0.1, max_depth=25, score=0.9947870376700623, total= 4.4min
[CV] n_estimators=250, num_leaves=50, learning_rate=0.1, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=25, learning_rate=0.1, max_depth=50, score=0.9947870376700623, total= 5.2min
[CV] n_estimators=250, num_leaves=50, learning_rate=0.1, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=25, learning_rate=0.1, max_depth=50, score=0.9946673959116702, total= 5.5min
[CV] n_estimators=500, num_leaves=25, learning_rate=0.1, max_depth=50 


  if diff:
  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=25, learning_rate=0.1, max_depth=25, score=0.9946673959116702, total= 6.4min
[CV] n_estimators=500, num_leaves=25, learning_rate=0.1, max_depth=50 


  if diff:


[CV]  n_estimators=250, num_leaves=25, learning_rate=0.1, max_depth=50, score=0.9948723207876116, total= 6.4min
[CV] n_estimators=500, num_leaves=25, learning_rate=0.1, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=25, learning_rate=0.1, max_depth=25, score=0.9948723207876116, total= 6.7min
[CV] n_estimators=500, num_leaves=50, learning_rate=0.1, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=50, learning_rate=0.1, max_depth=25, score=0.9961543720516852, total=10.0min
[CV] n_estimators=500, num_leaves=50, learning_rate=0.1, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=50, learning_rate=0.1, max_depth=50, score=0.9961543720516852, total=10.3min
[CV] n_estimators=500, num_leaves=50, learning_rate=0.1, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=50, learning_rate=0.1, max_depth=25, score=0.9964791139673207, total=10.9min
[CV] n_estimators=250, num_leaves=25, learning_rate=0.01, max_depth=25 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=50, learning_rate=0.1, max_depth=25, score=0.9962567941749564, total=11.2min
[CV] n_estimators=250, num_leaves=25, learning_rate=0.01, max_depth=25 


  if diff:
  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=25, learning_rate=0.1, max_depth=25, score=0.9965816640459424, total=11.6min
[CV] n_estimators=250, num_leaves=25, learning_rate=0.01, max_depth=25 


  if diff:


[CV]  n_estimators=500, num_leaves=25, learning_rate=0.1, max_depth=25, score=0.9963936555684693, total=11.8min
[CV] n_estimators=250, num_leaves=50, learning_rate=0.01, max_depth=25 


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=25, learning_rate=0.1, max_depth=25, score=0.9964106245513281, total=12.7min
[CV] n_estimators=250, num_leaves=50, learning_rate=0.01, max_depth=25 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=50, learning_rate=0.1, max_depth=50, score=0.9964791139673207, total=10.3min
[CV] n_estimators=250, num_leaves=50, learning_rate=0.01, max_depth=25 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=25, learning_rate=0.01, max_depth=25, score=0.988001640801258, total= 5.6min
[CV] n_estimators=500, num_leaves=25, learning_rate=0.01, max_depth=25 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=50, learning_rate=0.1, max_depth=50, score=0.9962567941749564, total=11.9min
[CV] n_estimators=500, num_leaves=25, learning_rate=0.01, max_depth=25 


  if diff:
  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=25, learning_rate=0.01, max_depth=25, score=0.9880866919632174, total= 5.7min
[CV] n_estimators=500, num_leaves=25, learning_rate=0.01, max_depth=25 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=25, learning_rate=0.01, max_depth=25, score=0.9883947494359746, total= 6.3min
[CV] n_estimators=500, num_leaves=50, learning_rate=0.01, max_depth=25 


  if diff:


[CV]  n_estimators=500, num_leaves=25, learning_rate=0.1, max_depth=50, score=0.9963936555684693, total=12.0min
[CV] n_estimators=500, num_leaves=50, learning_rate=0.01, max_depth=25 


  if diff:
  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=25, learning_rate=0.1, max_depth=50, score=0.9964106245513281, total=12.4min
[CV] n_estimators=500, num_leaves=50, learning_rate=0.01, max_depth=25 


  if diff:


[CV]  n_estimators=500, num_leaves=25, learning_rate=0.1, max_depth=50, score=0.9965816640459424, total=12.7min
[CV] n_estimators=250, num_leaves=25, learning_rate=0.01, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=50, learning_rate=0.01, max_depth=25, score=0.9901722841320845, total=10.3min
[CV] n_estimators=250, num_leaves=25, learning_rate=0.01, max_depth=50 


  if diff:
  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=50, learning_rate=0.1, max_depth=25, score=0.9972311478772133, total=23.4min
[CV] n_estimators=250, num_leaves=25, learning_rate=0.01, max_depth=50 


  if diff:


[CV]  n_estimators=500, num_leaves=50, learning_rate=0.1, max_depth=25, score=0.9968380392424967, total=23.8min
[CV] n_estimators=250, num_leaves=50, learning_rate=0.01, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=50, learning_rate=0.01, max_depth=25, score=0.9898988172557599, total=11.6min
[CV] n_estimators=250, num_leaves=50, learning_rate=0.01, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=50, learning_rate=0.1, max_depth=25, score=0.9967866543602365, total=24.9min
[CV] n_estimators=250, num_leaves=50, learning_rate=0.01, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=25, learning_rate=0.01, max_depth=50, score=0.988001640801258, total= 5.8min
[CV] n_estimators=500, num_leaves=25, learning_rate=0.01, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=50, learning_rate=0.01, max_depth=25, score=0.9897959183673469, total=12.4min
[CV] n_estimators=500, num_leaves=25, learning_rate=0.01, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=50, learning_rate=0.1, max_depth=50, score=0.9968380392424967, total=21.3min
[CV] n_estimators=500, num_leaves=25, learning_rate=0.01, max_depth=50 


  if diff:
  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=25, learning_rate=0.01, max_depth=50, score=0.9883947494359746, total= 6.3min
[CV] n_estimators=500, num_leaves=50, learning_rate=0.01, max_depth=50 


  if diff:


[CV]  n_estimators=500, num_leaves=25, learning_rate=0.01, max_depth=25, score=0.9896253503794353, total=12.0min
[CV] n_estimators=500, num_leaves=50, learning_rate=0.01, max_depth=50 


  if diff:
  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=25, learning_rate=0.01, max_depth=25, score=0.9898646338962194, total=12.4min
[CV] n_estimators=500, num_leaves=50, learning_rate=0.01, max_depth=50 


  if diff:


[CV]  n_estimators=250, num_leaves=25, learning_rate=0.01, max_depth=50, score=0.9880866919632174, total= 6.2min
[CV] n_estimators=250, num_leaves=25, learning_rate=0.001, max_depth=25 


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=25, learning_rate=0.01, max_depth=25, score=0.9894369808224798, total=13.3min
[CV] n_estimators=250, num_leaves=25, learning_rate=0.001, max_depth=25 


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=50, learning_rate=0.1, max_depth=50, score=0.9967866543602365, total=21.3min
[CV] n_estimators=250, num_leaves=25, learning_rate=0.001, max_depth=25 


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=50, learning_rate=0.1, max_depth=50, score=0.9972311478772133, total=24.7min
[CV] n_estimators=250, num_leaves=50, learning_rate=0.001, max_depth=25 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=50, learning_rate=0.01, max_depth=50, score=0.9901722841320845, total=11.4min
[CV] n_estimators=250, num_leaves=50, learning_rate=0.001, max_depth=25 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=25, learning_rate=0.001, max_depth=25, score=0.9613557120393792, total= 5.7min
[CV] n_estimators=250, num_leaves=50, learning_rate=0.001, max_depth=25 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=50, learning_rate=0.01, max_depth=50, score=0.9898988172557599, total=11.4min
[CV] n_estimators=500, num_leaves=25, learning_rate=0.001, max_depth=25 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=50, learning_rate=0.01, max_depth=50, score=0.9897959183673469, total=11.3min
[CV] n_estimators=500, num_leaves=25, learning_rate=0.001, max_depth=25 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=25, learning_rate=0.001, max_depth=25, score=0.9613557120393792, total= 5.8min
[CV] n_estimators=500, num_leaves=25, learning_rate=0.001, max_depth=25 


  if diff:
  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=25, learning_rate=0.01, max_depth=50, score=0.9896253503794353, total=12.1min
[CV] n_estimators=500, num_leaves=50, learning_rate=0.001, max_depth=25 


  if diff:


[CV]  n_estimators=250, num_leaves=25, learning_rate=0.001, max_depth=25, score=0.9613885755307148, total= 5.5min
[CV] n_estimators=500, num_leaves=50, learning_rate=0.001, max_depth=25 


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=25, learning_rate=0.01, max_depth=50, score=0.9898646338962194, total=11.4min
[CV] n_estimators=500, num_leaves=50, learning_rate=0.001, max_depth=25 


  if diff:
  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=50, learning_rate=0.01, max_depth=25, score=0.9916763519518699, total=23.0min
[CV] n_estimators=250, num_leaves=25, learning_rate=0.001, max_depth=50 


  if diff:


[CV]  n_estimators=500, num_leaves=25, learning_rate=0.01, max_depth=50, score=0.9894369808224798, total=12.4min
[CV] n_estimators=250, num_leaves=25, learning_rate=0.001, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=50, learning_rate=0.01, max_depth=25, score=0.9917447186709509, total=23.7min
[CV] n_estimators=250, num_leaves=25, learning_rate=0.001, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=50, learning_rate=0.01, max_depth=25, score=0.9916931596759306, total=24.8min
[CV] n_estimators=250, num_leaves=50, learning_rate=0.001, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=50, learning_rate=0.001, max_depth=25, score=0.9613557120393792, total=10.1min
[CV] n_estimators=250, num_leaves=50, learning_rate=0.001, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=50, learning_rate=0.001, max_depth=25, score=0.9613885755307148, total=10.3min
[CV] n_estimators=250, num_leaves=50, learning_rate=0.001, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=50, learning_rate=0.001, max_depth=25, score=0.9613557120393792, total=11.1min
[CV] n_estimators=500, num_leaves=25, learning_rate=0.001, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=25, learning_rate=0.001, max_depth=50, score=0.9613557120393792, total= 5.6min
[CV] n_estimators=500, num_leaves=25, learning_rate=0.001, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=25, learning_rate=0.001, max_depth=50, score=0.9613557120393792, total= 6.0min
[CV] n_estimators=500, num_leaves=25, learning_rate=0.001, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=25, learning_rate=0.001, max_depth=25, score=0.9613557120393792, total=11.2min
[CV] n_estimators=500, num_leaves=50, learning_rate=0.001, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=25, learning_rate=0.001, max_depth=50, score=0.9613885755307148, total= 5.8min
[CV] n_estimators=500, num_leaves=50, learning_rate=0.001, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=25, learning_rate=0.001, max_depth=25, score=0.9613885755307148, total=11.4min
[CV] n_estimators=500, num_leaves=50, learning_rate=0.001, max_depth=50 


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=25, learning_rate=0.001, max_depth=25, score=0.9613557120393792, total=12.1min


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=50, learning_rate=0.01, max_depth=50, score=0.9917447186709509, total=22.9min


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=50, learning_rate=0.01, max_depth=50, score=0.9916763519518699, total=24.1min


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=50, learning_rate=0.01, max_depth=50, score=0.9916931596759306, total=23.5min


[Parallel(n_jobs=-1)]: Done  60 out of  72 | elapsed: 54.1min remaining: 10.8min
  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=50, learning_rate=0.001, max_depth=50, score=0.9613557120393792, total= 9.8min


  if diff:
  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=25, learning_rate=0.001, max_depth=50, score=0.9613557120393792, total= 9.5min


  if diff:


[CV]  n_estimators=250, num_leaves=50, learning_rate=0.001, max_depth=50, score=0.9613557120393792, total=10.2min


  if diff:
  if diff:


[CV]  n_estimators=250, num_leaves=50, learning_rate=0.001, max_depth=50, score=0.9613885755307148, total=10.4min


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=25, learning_rate=0.001, max_depth=50, score=0.9613557120393792, total=10.2min


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=25, learning_rate=0.001, max_depth=50, score=0.9613885755307148, total=10.1min


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=50, learning_rate=0.001, max_depth=25, score=0.9613557120393792, total=19.7min


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=50, learning_rate=0.001, max_depth=25, score=0.9613557120393792, total=19.9min


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=50, learning_rate=0.001, max_depth=25, score=0.9613885755307148, total=18.8min


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=50, learning_rate=0.001, max_depth=50, score=0.9613885755307148, total=10.4min


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=50, learning_rate=0.001, max_depth=50, score=0.9613557120393792, total=11.6min


  if diff:
  if diff:


[CV]  n_estimators=500, num_leaves=50, learning_rate=0.001, max_depth=50, score=0.9613557120393792, total=11.5min


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed: 59.6min finished


Best score: 0.9970 %
Best params: {'n_estimators': 500, 'num_leaves': 50, 'learning_rate': 0.1, 'max_depth': 25}


### Cross validation score

In [14]:
%%time

scores = cross_val_score(best_estimator, X_train, y_train, cv=10, n_jobs=-1)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


CV accuracy: 0.998 +/- 0.000
CPU times: user 4.58 s, sys: 892 ms, total: 5.47 s
Wall time: 14min 35s


### Learning curve

In [None]:
%%time

title = "Learning Curves"
cv = 10
plot_learning_curve(best_estimator, title, X_train, y_train, cv=cv, n_jobs=-1);

### Test predictions

In [None]:
# Predict class labels for the held-out split with the best estimator found by the grid search.
y_pred = best_estimator.predict(X_test)

In [None]:
# Both values below are fractions in [0, 1]; each is scaled by 100 before being
# printed with a percent sign (the first line used to print the raw fraction
# followed by '%').
score = best_estimator.score(X_test, y_test)
print("Score: %.2f %%" % (score * 100.0))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f %%" % (accuracy * 100.0))

### Plot confusion matrix

In [None]:
# Display names indexed by the encoded target value (0 -> 'NP', 1 -> 'CL').
# NOTE(review): the codes come from label-encoding cash_loan; confirm that
# code 0 really is the "no cash loan" class before trusting the tick labels.
class_names = np.array(['NP', 'CL'])

# Plot non-normalized confusion matrix
plot_confusion_matrix(y_test, y_pred, classes=class_names,
                      title='Cash Loan')

plt.show()

### Feature importance

In [None]:
# Feature names in the same order as the columns of X (every column of ds
# except the target).
feature_list = list(ds.loc[:, ds.columns != 'cash_loan'].columns)

# `rf` is not defined anywhere in this notebook; the fitted search object is
# `clf`, whose best_estimator_ is the selected LightGBM model.
importances = list(clf.best_estimator_.feature_importances_)

# Pair each feature with its importance and list the most important first.
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Plain loop instead of a list comprehension used only for its side effect.
for feature, importance in feature_importances:
    print('Variable: {:20} Importance: {}'.format(feature, importance))