# Setup

In [1]:
import os
import sys 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE 
import warnings
warnings.simplefilter('ignore', UserWarning)

import gc
gc.enable()

In [3]:
# notebook settings
pd.options.display.max_rows = 30
pd.options.display.float_format = '{:.4f}'.format

df = pd.read_csv('application_train.csv')
test = pd.read_csv('application_test.csv')
bureau = pd.read_csv('bureau.csv')
bureau_bal = pd.read_csv('bureau_balance.csv')
cc_bal = pd.read_csv('credit_card_balance.csv')
install_pmts = pd.read_csv('installments_payments.csv')
pos_cash_bal = pd.read_csv('POS_CASH_balance.csv')
prev_app = pd.read_csv('previous_application.csv')
print("app_train data holds {} obs and {} variables".format(df.shape[0], df.shape[1]))
print("app_test data holds {} obs and {} variables".format(test.shape[0], test.shape[1]))
print("bureau data holds {} obs and {} variables".format(bureau.shape[0], bureau.shape[1]))
print("bureau_bal data holds {} obs and {} variables".format(bureau_bal.shape[0], bureau_bal.shape[1]))
print("cc_bal data holds {} obs and {} variables".format(cc_bal.shape[0], cc_bal.shape[1]))
print("install_pmts data holds {} obs and {} variables".format(install_pmts.shape[0], install_pmts.shape[1]))
print("pos_cash_bal data holds {} obs and {} variables".format(pos_cash_bal.shape[0], pos_cash_bal.shape[1]))
print("prev_app data holds {} obs and {} variables".format(prev_app.shape[0], prev_app.shape[1]))

app_train data holds 307511 obs and 122 variables
app_test data holds 48744 obs and 121 variables
bureau data holds 1716428 obs and 17 variables
bureau_bal data holds 27299925 obs and 3 variables
cc_bal data holds 3840312 obs and 23 variables
install_pmts data holds 13605401 obs and 8 variables
pos_cash_bal data holds 10001358 obs and 8 variables
prev_app data holds 1670214 obs and 37 variables


In [3]:
int_features = prev_app.iloc[:, 2:].select_dtypes('int64').columns.values
float_features = prev_app.iloc[:, 2:].select_dtypes('float64').columns.values
str_features = prev_app.iloc[:, 2:].select_dtypes('O').columns.values

print('# of integer features:', len(int_features))
print('# of float features:', len(float_features))
print('# of str features:', len(str_features))

# of integer features: 4
# of float features: 15
# of str features: 16


In [4]:
for i in str_features:
    print(prev_app[i].value_counts(dropna=False) / len(bureau) * 100)

Cash loans        43.5528
Consumer loans    42.4807
Revolving loans   11.2538
XNA                0.0202
Name: NAME_CONTRACT_TYPE, dtype: float64
TUESDAY     14.8633
WEDNESDAY   14.8570
MONDAY      14.7724
FRIDAY      14.6844
THURSDAY    14.5126
SATURDAY    14.0193
SUNDAY       9.5985
Name: WEEKDAY_APPR_PROCESS_START, dtype: float64
Y   96.8138
N    0.4938
Name: FLAG_LAST_APPL_PER_CONTRACT, dtype: float64
XAP                                53.7547
XNA                                39.4959
Repairs                             1.3846
Other                               0.9093
Urgent needs                        0.4901
Buying a used car                   0.1683
Building a house or an annex        0.1569
Everyday expenses                   0.1408
Medicine                            0.1267
Payments on other loans             0.1125
Education                           0.0916
Journey                             0.0722
Purchase of electronic equipment    0.0618
Buying a new car                 

In [5]:
prev_app['APPROVED'] = prev_app['NAME_CONTRACT_STATUS']=='Approved'
prev_app['REFUSED'] = prev_app['NAME_CONTRACT_STATUS']=='Refused'
prev_app['HIGH_INTEREST'] = prev_app['NAME_YIELD_GROUP']=='high'
prev_app['LOW_INTEREST'] = prev_app['NAME_YIELD_GROUP'].isin(['low_normal','low_action'])
prev_app['NEW'] = prev_app['NAME_CLIENT_TYPE'].isin(['New'])
prev_app['REPEAT'] = prev_app['NAME_CLIENT_TYPE'].isin(['Repeater','Refreshed'])

In [6]:
prev_app2=prev_app
df2 = df[['SK_ID_CURR','TARGET']]
prev_app2 = prev_app2.merge(df2, on = ['SK_ID_CURR'], how = 'left')
correlations = prev_app2.corr()['TARGET'].sort_values()

print('Most Positive Correlations:\n', correlations.tail(10))
print('\nMost Negative Correlations:\n', correlations.head(10))

Most Positive Correlations:
 NEW                         0.0057
HIGH_INTEREST               0.0166
DAYS_TERMINATION            0.0170
DAYS_LAST_DUE               0.0175
DAYS_LAST_DUE_1ST_VERSION   0.0180
RATE_INTEREST_PRIVILEGED    0.0286
CNT_PAYMENT                 0.0305
DAYS_DECISION               0.0399
REFUSED                     0.0545
TARGET                      1.0000
Name: TARGET, dtype: float64

Most Negative Correlations:
 APPROVED                  -0.0492
LOW_INTEREST              -0.0344
DAYS_FIRST_DRAWING        -0.0312
HOUR_APPR_PROCESS_START   -0.0278
RATE_DOWN_PAYMENT         -0.0261
AMT_DOWN_PAYMENT          -0.0169
AMT_ANNUITY               -0.0149
DAYS_FIRST_DUE            -0.0067
REPEAT                    -0.0058
AMT_APPLICATION           -0.0056
Name: TARGET, dtype: float64


In [7]:
import gc
gc.collect

B = prev_app
B2 = prev_app.SK_ID_CURR

# Feature: Total # of Pre App
grp = B[['SK_ID_CURR', 'SK_ID_PREV']].groupby(by = ['SK_ID_CURR'])['SK_ID_PREV'].count().reset_index().rename(index=str, columns={'SK_ID_PREV': 'PRE_APP_COUNT'})
B2 = B2.to_frame().merge(grp, on = ['SK_ID_CURR'], how = 'left')

# Feature: Total # of Approved Loans
grp = B[['SK_ID_CURR', 'APPROVED']].groupby(by = ['SK_ID_CURR'])['APPROVED'].sum().reset_index().rename(index=str, columns={'APPROVED': 'APPROVED_LOAN_COUNT'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

# Feature: Total # of REFUSED Loans
grp = B[['SK_ID_CURR', 'REFUSED']].groupby(by = ['SK_ID_CURR'])['REFUSED'].sum().reset_index().rename(index=str, columns={'REFUSED': 'REFUSED_LOAN_COUNT'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

# Feature: Total # of LOW INTEREST Loans
grp = B[['SK_ID_CURR', 'LOW_INTEREST']].groupby(by = ['SK_ID_CURR'])['LOW_INTEREST'].sum().reset_index().rename(index=str, columns={'LOW_INTEREST': 'LOW_INTEREST_COUNT'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

# Feature: Total # of HIGH INTEREST Loans
grp = B[['SK_ID_CURR', 'HIGH_INTEREST']].groupby(by = ['SK_ID_CURR'])['HIGH_INTEREST'].sum().reset_index().rename(index=str, columns={'HIGH_INTEREST': 'HIGH_INTEREST_COUNT'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

# Feature: Total # of NEW Loans
grp = B[['SK_ID_CURR', 'NEW']].groupby(by = ['SK_ID_CURR'])['NEW'].sum().reset_index().rename(index=str, columns={'NEW': 'NEW_LOAN_COUNT'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

# Feature: Total # of REPEATING Loans
grp = B[['SK_ID_CURR', 'REPEAT']].groupby(by = ['SK_ID_CURR'])['REPEAT'].sum().reset_index().rename(index=str, columns={'REPEAT': 'REPEAT_LOAN_COUNT'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

# Feature: Total # of REJECT REASONS
grp = B[['SK_ID_CURR', 'CODE_REJECT_REASON']].groupby(by = ['SK_ID_CURR'])['CODE_REJECT_REASON'].nunique().reset_index().rename(index=str, columns={'CODE_REJECT_REASON': 'REJECT_REASON_COUNT'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

# Feature: Total # of Product Combinations
grp = B[['SK_ID_CURR', 'PRODUCT_COMBINATION']].groupby(by = ['SK_ID_CURR'])['PRODUCT_COMBINATION'].nunique().reset_index().rename(index=str, columns={'PRODUCT_COMBINATION': 'TOTAL_PRODUCT_COMBO'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

B2['APPROVED_RATIO'] = B2['APPROVED_LOAN_COUNT']/B2['PRE_APP_COUNT']
B2['REFUSED_RATIO'] = B2['REFUSED_LOAN_COUNT']/B2['PRE_APP_COUNT']
B2['APPROVE_REFUSE_RATIO'] = B2['APPROVED_LOAN_COUNT']/B2['REFUSED_LOAN_COUNT']

B2['LOW_INT_RATIO'] = B2['LOW_INTEREST_COUNT']/B2['PRE_APP_COUNT']
B2['HIGH_INT_RATIO'] = B2['HIGH_INTEREST_COUNT']/B2['PRE_APP_COUNT']
B2['LOW_HIGH_INT_RATIO'] = B2['LOW_INTEREST_COUNT']/B2['HIGH_INTEREST_COUNT']

B2['NEW_RATIO'] = B2['NEW_LOAN_COUNT']/B2['PRE_APP_COUNT']
B2['REPEAT_RATIO'] = B2['REPEAT_LOAN_COUNT']/B2['PRE_APP_COUNT']
B2['NEW_REPEAT_RATIO'] = B2['NEW_LOAN_COUNT']/B2['REPEAT_LOAN_COUNT']

gc.collect()
print(B2.shape)

(1670214, 19)


In [8]:
# Feature: Avg Payment CNT
grp = B[['SK_ID_CURR', 'CNT_PAYMENT']].groupby(by = ['SK_ID_CURR'])['CNT_PAYMENT'].mean().reset_index().rename(index=str, columns={'CNT_PAYMENT': 'AVG_PMT_CNT'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

# Feature: Total Payment CNT
grp = B[['SK_ID_CURR', 'CNT_PAYMENT']].groupby(by = ['SK_ID_CURR'])['CNT_PAYMENT'].sum().reset_index().rename(index=str, columns={'CNT_PAYMENT': 'TOTAL_PMT_CNT'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

# Feature: Avg Days Of Previous APP Decision
grp = B[['SK_ID_CURR', 'DAYS_DECISION']].groupby(by = ['SK_ID_CURR'])['DAYS_DECISION'].mean().reset_index().rename(index=str, columns={'DAYS_DECISION': 'AVG_DAYS_OF_PRE_DEC'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

# Feature: CLosest Day of Previous APP Decision
grp = B[['SK_ID_CURR', 'DAYS_DECISION']].groupby(by = ['SK_ID_CURR'])['DAYS_DECISION'].max().reset_index().rename(index=str, columns={'DAYS_DECISION': 'MIN_DAYS_OF_PRE_DEC'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

# Feature: Avg Interest Priviledged
grp = B[['SK_ID_CURR', 'RATE_INTEREST_PRIVILEGED']].groupby(by = ['SK_ID_CURR'])['RATE_INTEREST_PRIVILEGED'].mean().reset_index().rename(index=str, columns={'RATE_INTEREST_PRIVILEGED': 'AVG_INT'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

# Feature: Avg Interest DOWN PMT RATE
grp = B[['SK_ID_CURR', 'RATE_DOWN_PAYMENT']].groupby(by = ['SK_ID_CURR'])['RATE_DOWN_PAYMENT'].mean().reset_index().rename(index=str, columns={'RATE_DOWN_PAYMENT': 'AVG_DOWN_PMT'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

# Feature: MAX Approved Credit
grp = B[['SK_ID_CURR', 'AMT_CREDIT', 'APPROVED']].groupby(by = ['SK_ID_CURR']).apply(lambda x: x[x['APPROVED']==1]['AMT_CREDIT'].max()).reset_index().rename(index=int, columns={0: 'MAX_APPROVED_CREDIT'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

gc.collect()
print(B2.shape)

(1670214, 26)


In [9]:
B3 = B2.groupby('SK_ID_CURR').first().reset_index()
B3 = B3.replace([np.inf, -np.inf], np.nan)
print(B3.shape)
#B3.head().T

(338857, 26)


In [10]:
df2 = df[['SK_ID_CURR','TARGET']]
B4 = df2.merge(B3, on = ['SK_ID_CURR'], how = 'left')
correlations = B4.corr()['TARGET'].sort_values()

print('Most Positive Correlations:\n', correlations.tail(10))
print('\nMost Negative Correlations:\n', correlations.head(10))

Most Positive Correlations:
 AVG_PMT_CNT           0.0277
AVG_INT               0.0311
HIGH_INTEREST_COUNT   0.0369
HIGH_INT_RATIO        0.0376
NEW_LOAN_COUNT        0.0460
AVG_DAYS_OF_PRE_DEC   0.0469
REJECT_REASON_COUNT   0.0636
REFUSED_LOAN_COUNT    0.0645
REFUSED_RATIO         0.0777
TARGET                1.0000
Name: TARGET, dtype: float64

Most Negative Correlations:
 APPROVE_REFUSE_RATIO   -0.0812
APPROVED_RATIO         -0.0635
LOW_INT_RATIO          -0.0544
LOW_HIGH_INT_RATIO     -0.0431
LOW_INTEREST_COUNT     -0.0351
AVG_DOWN_PMT           -0.0336
APPROVED_LOAN_COUNT    -0.0316
MAX_APPROVED_CREDIT    -0.0276
REPEAT_RATIO           -0.0214
SK_ID_CURR             -0.0021
Name: TARGET, dtype: float64


In [11]:
df = df.merge(B3, on = ['SK_ID_CURR'], how = 'left')
print(df.shape)
#df.head()

(307511, 147)


In [12]:
int_features = bureau.iloc[:, 2:].select_dtypes('int64').columns.values
float_features = bureau.iloc[:, 2:].select_dtypes('float64').columns.values
str_features = bureau.iloc[:, 2:].select_dtypes('O').columns.values

print('# of integer features:', len(int_features))
print('# of float features:', len(float_features))
print('# of str features:', len(str_features))

# of integer features: 4
# of float features: 8
# of str features: 3


In [13]:
bureau.iloc[:, 2:].select_dtypes('O').describe()

Unnamed: 0,CREDIT_ACTIVE,CREDIT_CURRENCY,CREDIT_TYPE
count,1716428,1716428,1716428
unique,4,4,15
top,Closed,currency 1,Consumer credit
freq,1079273,1715020,1251615


In [14]:
for i in str_features:
    print(bureau[i].value_counts(dropna=False) / len(bureau) * 100)

Closed     62.8790
Active     36.7395
Sold        0.3803
Bad debt    0.0012
Name: CREDIT_ACTIVE, dtype: float64
currency 1   99.9180
currency 2    0.0713
currency 3    0.0101
currency 4    0.0006
Name: CREDIT_CURRENCY, dtype: float64
Consumer credit                                72.9197
Credit card                                    23.4321
Car loan                                        1.6132
Mortgage                                        1.0715
Microloan                                       0.7232
Loan for business development                   0.1151
Another type of loan                            0.0593
Unknown type of loan                            0.0323
Loan for working capital replenishment          0.0273
Cash loan (non-earmarked)                       0.0033
Real estate loan                                0.0016
Loan for the purchase of equipment              0.0011
Loan for purchase of shares (margin lending)    0.0002
Interbank credit                                0.0

In [15]:
bureau['ACTIVE_LOAN'] = bureau['CREDIT_ACTIVE']=='Active'
bureau['NON_ACTIVE_LOAN'] = bureau['CREDIT_ACTIVE']!='Active'
#bureau=bureau.drop(['CREDIT_CURRENCY'],axis=1)
bureau['CONSUMER_CREDIT'] = bureau['CREDIT_TYPE'].isin(['Consumer credit','Credit card','Car loan','Mortgage'])
#bureau.loc[~bureau['CREDIT_TYPE'].isin(['Consumer credit']), 'CREDIT_TYPE'] = "Business credit"
bureau['BUSINESS_CREDIT'] = bureau['CONSUMER_CREDIT']==False

In [16]:
def plot_triaxes(df_in):
    fig, axes = plt.subplots(len(df_in.columns)//2, 2, figsize=(12, 12))
    i = 0
    for triaxis in axes:
        for axis in triaxis:
            df_in.hist(column = df_in.columns[i], bins = 60, ax=axis)
            i = i+1

#plot_triaxes(bureau.iloc[:, 2:].select_dtypes('float64'))
print(bureau.iloc[:, 2:].select_dtypes('float64').describe())

       DAYS_CREDIT_ENDDATE  DAYS_ENDDATE_FACT  AMT_CREDIT_MAX_OVERDUE  \
count         1610875.0000       1082775.0000             591940.0000   
mean              510.5174         -1017.4371               3825.4177   
std              4994.2198           714.0106             206031.6062   
min            -42060.0000        -42023.0000                  0.0000   
25%             -1138.0000         -1489.0000                  0.0000   
50%              -330.0000          -897.0000                  0.0000   
75%               474.0000          -425.0000                  0.0000   
max             31199.0000             0.0000          115987185.0000   

       AMT_CREDIT_SUM  AMT_CREDIT_SUM_DEBT  AMT_CREDIT_SUM_LIMIT  \
count    1716415.0000         1458759.0000          1124648.0000   
mean      354994.5919          137085.1200             6229.5150   
std      1149811.3440          677401.1310            45032.0315   
min            0.0000        -4705600.3200          -586406.1150   
25

Remove nagative values DAYS_CREDIT_ENDDATE because we don't need duration of nonactive loans and we already have  DAYS_ENDDATE_FACT for closed loans.

Remove negative amounts for all of these AMT data. 

In [17]:
#bureau.mask(bureau.sub(bureau.mean()).div(bureau.std()).abs().gt(3))
import warnings
warnings.filterwarnings('ignore')
bureau['DAYS_CREDIT_ENDDATE'][bureau['DAYS_CREDIT_ENDDATE'] < 0] = np.nan
bureau['AMT_CREDIT_SUM_OVERDUE'][bureau['AMT_CREDIT_SUM_OVERDUE'] < 0] = np.nan
bureau['AMT_ANNUITY'][bureau['AMT_ANNUITY'] < 0] = np.nan
bureau['AMT_CREDIT_SUM_LIMIT'][bureau['AMT_CREDIT_SUM_LIMIT'] < 0] = np.nan
bureau['AMT_CREDIT_SUM_DEBT'][bureau['AMT_CREDIT_SUM_DEBT'] < 0] = np.nan
bureau['AMT_CREDIT_SUM'][bureau['AMT_CREDIT_SUM'] < 0] = np.nan
bureau['AMT_CREDIT_MAX_OVERDUE'][bureau['AMT_CREDIT_MAX_OVERDUE'] < 0] = np.nan
print(bureau.iloc[:, 2:].select_dtypes('float64').describe())

       DAYS_CREDIT_ENDDATE  DAYS_ENDDATE_FACT  AMT_CREDIT_MAX_OVERDUE  \
count          603486.0000       1082775.0000             591940.0000   
mean             3106.7260         -1017.4371               3825.4177   
std              7384.7634           714.0106             206031.6062   
min                 0.0000        -42023.0000                  0.0000   
25%               323.0000         -1489.0000                  0.0000   
50%               790.0000          -897.0000                  0.0000   
75%              1386.0000          -425.0000                  0.0000   
max             31199.0000             0.0000          115987185.0000   

       AMT_CREDIT_SUM  AMT_CREDIT_SUM_DEBT  AMT_CREDIT_SUM_LIMIT  \
count    1716415.0000         1450341.0000          1124297.0000   
mean      354994.5919          137926.1187             6238.8775   
std      1149811.3440          679233.8310            45022.3336   
min            0.0000               0.0000                0.0000   
25

In [18]:
print(bureau.iloc[:, 2:].select_dtypes('int64').describe())
def plot_triaxes(df_in):
    fig, axes = plt.subplots(len(df_in.columns)//2, 2, figsize=(12, 12))
    i = 0
    for triaxis in axes:
        for axis in triaxis:
            df_in.hist(column = df_in.columns[i], bins = 60, ax=axis)
            i = i+1
#plot_triaxes(bureau.iloc[:, 2:].select_dtypes('int64'))

       DAYS_CREDIT  CREDIT_DAY_OVERDUE  CNT_CREDIT_PROLONG  DAYS_CREDIT_UPDATE
count 1716428.0000        1716428.0000        1716428.0000        1716428.0000
mean    -1142.1077              0.8182              0.0064           -593.7483
std       795.1649             36.5444              0.0962            720.7473
min     -2922.0000              0.0000              0.0000         -41947.0000
25%     -1666.0000              0.0000              0.0000           -908.0000
50%      -987.0000              0.0000              0.0000           -395.0000
75%      -474.0000              0.0000              0.0000            -33.0000
max         0.0000           2792.0000              9.0000            372.0000


Removed positive values for DAYS_CREDIT_UPDATE

In [19]:
bureau['DAYS_CREDIT_UPDATE'][bureau['DAYS_CREDIT_UPDATE'] > 0] = np.nan
print(bureau.iloc[:, 2:].select_dtypes('int64').describe())

       DAYS_CREDIT  CREDIT_DAY_OVERDUE  CNT_CREDIT_PROLONG
count 1716428.0000        1716428.0000        1716428.0000
mean    -1142.1077              0.8182              0.0064
std       795.1649             36.5444              0.0962
min     -2922.0000              0.0000              0.0000
25%     -1666.0000              0.0000              0.0000
50%      -987.0000              0.0000              0.0000
75%      -474.0000              0.0000              0.0000
max         0.0000           2792.0000              9.0000


In [20]:
import gc
gc.collect

B = bureau
B2 = bureau.SK_ID_CURR

# Feature: Total # of Bureau Loans
grp = B[['SK_ID_CURR', 'SK_ID_BUREAU']].groupby(by = ['SK_ID_CURR'])['SK_ID_BUREAU'].count().reset_index().rename(index=str, columns={'SK_ID_BUREAU': 'BUREAU_LOAN_COUNT'})
B2 = B2.to_frame().merge(grp, on = ['SK_ID_CURR'], how = 'left')

# Feature: Total # of Active Bureau Loans
grp = B[['SK_ID_CURR', 'ACTIVE_LOAN']].groupby(by = ['SK_ID_CURR'])['ACTIVE_LOAN'].sum().reset_index().rename(index=str, columns={'ACTIVE_LOAN': 'ACTIVE_LOAN_COUNT'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

# Feature: Total # of Business Bureau Loans
grp = B[['SK_ID_CURR', 'BUSINESS_CREDIT']].groupby(by = ['SK_ID_CURR'])['BUSINESS_CREDIT'].sum().reset_index().rename(index=str, columns={'BUSINESS_CREDIT': 'BUSINESS_LOAN_COUNT'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

# Feature: Total # of Loan Types
grp = B[['SK_ID_CURR', 'CREDIT_TYPE']].groupby(by = ['SK_ID_CURR'])['CREDIT_TYPE'].nunique().reset_index().rename(index=str, columns={'CREDIT_TYPE': 'BUREAU_LOAN_TYPES_COUNT'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

# Feature: Avg # Loans Per Loan Types
B2['AVERAGE_LOAN_TYPE'] = B2['BUREAU_LOAN_COUNT']/B2['BUREAU_LOAN_TYPES_COUNT']

# Feature: % Active Loans
B2['ACTIVE_PERCENT'] = B2['ACTIVE_LOAN_COUNT']/B2['BUREAU_LOAN_COUNT']

# Feature: % Business Loans
B2['BUSINESS_PERCENT'] = B2['BUSINESS_LOAN_COUNT']/B2['BUREAU_LOAN_COUNT']

# Feature: Avg Duration of Active Loans
grp = B[['SK_ID_CURR', 'DAYS_CREDIT_ENDDATE']].groupby(by = ['SK_ID_CURR'])['DAYS_CREDIT_ENDDATE'].mean().reset_index().rename(index=str, columns={'DAYS_CREDIT_ENDDATE': 'AVG_ACTIVE_DURATION'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

# Feature: Avg Days Past of Closed Loans
grp = B[['SK_ID_CURR', 'DAYS_ENDDATE_FACT']].groupby(by = ['SK_ID_CURR'])['DAYS_ENDDATE_FACT'].mean().reset_index().rename(index=str, columns={'DAYS_ENDDATE_FACT': 'AVG_DAYS_PAST_CLOSED_LOAN'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

# Feature : Debt Over Credit Limit Ratio
B['AMT_CREDIT_SUM_DEBT'] = B['AMT_CREDIT_SUM_DEBT'].fillna(0)
B['AMT_CREDIT_SUM'] = B['AMT_CREDIT_SUM'].fillna(0)

grp1 = B[['SK_ID_CURR', 'AMT_CREDIT_SUM_DEBT']].groupby(by = ['SK_ID_CURR'])['AMT_CREDIT_SUM_DEBT'].sum().reset_index().rename( index = str, columns = { 'AMT_CREDIT_SUM_DEBT': 'TOTAL_CUSTOMER_DEBT'})
grp2 = B[['SK_ID_CURR', 'AMT_CREDIT_SUM']].groupby(by = ['SK_ID_CURR'])['AMT_CREDIT_SUM'].sum().reset_index().rename( index = str, columns = { 'AMT_CREDIT_SUM': 'TOTAL_CUSTOMER_CREDIT'})

B2 = B2.merge(grp1, on = ['SK_ID_CURR'], how = 'left')
B2 = B2.merge(grp2, on = ['SK_ID_CURR'], how = 'left')

gc.collect()
B2['TOTAL_DEBT_CREDIT_RATIO'] = B2['TOTAL_CUSTOMER_DEBT']/B2['TOTAL_CUSTOMER_CREDIT']

# Feature : Overdue Over Debt Ratio

B['AMT_CREDIT_SUM_OVERDUE'] = B['AMT_CREDIT_SUM_OVERDUE'].fillna(0)

grp1 = B[['SK_ID_CURR', 'AMT_CREDIT_SUM_OVERDUE']].groupby(by = ['SK_ID_CURR'])['AMT_CREDIT_SUM_OVERDUE'].sum().reset_index().rename( index = str, columns = { 'AMT_CREDIT_SUM_OVERDUE': 'TOTAL_CUSTOMER_OVERDUE'})

B2 = B2.merge(grp1, on = ['SK_ID_CURR'], how = 'left')

B2['TOTAL_OVERDUE_DEBT_RATIO'] = B2['TOTAL_CUSTOMER_OVERDUE']/B2['TOTAL_CUSTOMER_DEBT']

#B2['DEBT_CREDIT_RATIO'][B2['DEBT_CREDIT_RATIO'] > 10] = np.nan

# Feature: Total # of Credits Prolonged
grp = B[['SK_ID_CURR', 'CNT_CREDIT_PROLONG']].groupby(by = ['SK_ID_CURR'])['CNT_CREDIT_PROLONG'].count().reset_index().rename(index=str, columns={'CNT_CREDIT_PROLONG': 'TOTAL_CREDIT_PROLONG'})
B2 = B2.merge(grp, on = ['SK_ID_CURR'], how = 'left')

gc.collect()
print(B2.shape)

(1716428, 16)


In [21]:
# Feature: Total # of Bureau Loans
grp1 = B[['SK_ID_CURR', 'CNT_CREDIT_PROLONG', 'ACTIVE_LOAN']].groupby(by = ['SK_ID_CURR']).apply(lambda x: x[x['ACTIVE_LOAN']==1]['CNT_CREDIT_PROLONG'].count()).reset_index().rename(index=int, columns={0: 'ACTIVE_CREDIT_PROLONG'})
grp2 = B[['SK_ID_CURR', 'SK_ID_BUREAU', 'CREDIT_ACTIVE']].groupby(by = ['SK_ID_CURR']).apply(lambda x: x[x['CREDIT_ACTIVE']=='Bad debt']['SK_ID_BUREAU'].count()).reset_index().rename(index=int, columns={0: 'TOTAL_BAD_DEBT'})
grp3 = B[['SK_ID_CURR', 'SK_ID_BUREAU', 'CREDIT_ACTIVE']].groupby(by = ['SK_ID_CURR']).apply(lambda x: x[x['CREDIT_ACTIVE']=='Sold']['SK_ID_BUREAU'].count()).reset_index().rename(index=int, columns={0: 'TOTAL_SOLD_DEBT'})

In [22]:
B2 = B2.merge(grp1, on = ['SK_ID_CURR'], how = 'left')
B2 = B2.merge(grp2, on = ['SK_ID_CURR'], how = 'left')
B2 = B2.merge(grp3, on = ['SK_ID_CURR'], how = 'left')

B2['BAD_PERCENT'] = B2['TOTAL_BAD_DEBT']/B2['BUREAU_LOAN_COUNT']
B2['SOLD_PERCENT'] = B2['TOTAL_SOLD_DEBT']/B2['BUREAU_LOAN_COUNT']
gc.collect()
print(B2.shape)

(1716428, 21)


In [23]:
B3 = B2.groupby('SK_ID_CURR').first().reset_index()
B3 = B3.replace([np.inf, -np.inf], np.nan)
print(B3.shape)
#B3.head()

(305811, 21)


In [24]:
B4 = df[['SK_ID_CURR','TARGET']]
B4 = B4.merge(B3, on = ['SK_ID_CURR'], how = 'left')
correlations = B4.corr()['TARGET'].sort_values(ascending=False)
correlations

TARGET                       1.0000
TOTAL_DEBT_CREDIT_RATIO      0.0920
ACTIVE_PERCENT               0.0774
ACTIVE_LOAN_COUNT            0.0671
ACTIVE_CREDIT_PROLONG        0.0671
AVG_DAYS_PAST_CLOSED_LOAN    0.0532
BUSINESS_PERCENT             0.0348
BUSINESS_LOAN_COUNT          0.0321
AVG_ACTIVE_DURATION          0.0170
SOLD_PERCENT                 0.0165
TOTAL_CUSTOMER_OVERDUE       0.0133
TOTAL_SOLD_DEBT              0.0121
TOTAL_CUSTOMER_DEBT          0.0071
BAD_PERCENT                  0.0046
BUREAU_LOAN_TYPES_COUNT      0.0046
TOTAL_CREDIT_PROLONG         0.0041
BUREAU_LOAN_COUNT            0.0041
TOTAL_BAD_DEBT               0.0040
TOTAL_OVERDUE_DEBT_RATIO     0.0009
SK_ID_CURR                  -0.0021
AVERAGE_LOAN_TYPE           -0.0061
TOTAL_CUSTOMER_CREDIT       -0.0141
Name: TARGET, dtype: float64

In [25]:
#B5 = B2[['SK_ID_CURR','ACTIVE_LOAN_COUNT','BUSINESS_LOAN_COUNT','ACTIVE_PERCENT','BUSINESS_PERCENT',
#          'AVG_ACTIVE_DURATION','AVG_DAYS_PAST_CLOSED_LOAN','DEBT_CREDIT_RATIO'
#         ]]
df = df.merge(B3, on = ['SK_ID_CURR'], how = 'left')
print(df.shape)
#df.head()

(307511, 167)


In [26]:
df['DAYS_EMPLOYED_ANOM'] = df['DAYS_EMPLOYED'] == 365243
df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace({365243: np.nan})
df['DAYS_EMPLOYED_ANOM'] = df['DAYS_EMPLOYED_ANOM'].astype(int)
df['APP_CREDIT_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
df['CURRENT_CREDIT_INCOME_RATIO'] = df['TOTAL_CUSTOMER_CREDIT'] / df['AMT_INCOME_TOTAL']
df['CURRENT_DEBT_INCOME_RATIO'] = df['TOTAL_CUSTOMER_DEBT'] / df['AMT_INCOME_TOTAL']
df['ANNUITY_INCOME_RATIO'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
df['CREDIT_TERM'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
df['DAYS_EMPLOYED_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
df['APP_CREDIT_MAX_APPROVED_RATIO'] = df['AMT_CREDIT'] / df['MAX_APPROVED_CREDIT']
df = df.replace([np.inf, -np.inf], np.nan)

In [27]:
correlations = df.corr()['TARGET'].sort_values()

print('Most Positive Correlations:\n', correlations.tail(15))
print('\nMost Negative Correlations:\n', correlations.head(15))

Most Positive Correlations:
 DAYS_ID_PUBLISH               0.0515
AVG_DAYS_PAST_CLOSED_LOAN     0.0532
DAYS_LAST_PHONE_CHANGE        0.0552
REGION_RATING_CLIENT          0.0589
REGION_RATING_CLIENT_W_CITY   0.0609
REJECT_REASON_COUNT           0.0636
REFUSED_LOAN_COUNT            0.0645
ACTIVE_CREDIT_PROLONG         0.0671
ACTIVE_LOAN_COUNT             0.0671
DAYS_EMPLOYED                 0.0750
ACTIVE_PERCENT                0.0774
REFUSED_RATIO                 0.0777
DAYS_BIRTH                    0.0782
TOTAL_DEBT_CREDIT_RATIO       0.0920
TARGET                        1.0000
Name: TARGET, dtype: float64

Most Negative Correlations:
 EXT_SOURCE_3                 -0.1789
EXT_SOURCE_2                 -0.1605
EXT_SOURCE_1                 -0.1553
APPROVE_REFUSE_RATIO         -0.0812
DAYS_EMPLOYED_RATIO          -0.0680
APPROVED_RATIO               -0.0635
LOW_INT_RATIO                -0.0544
DAYS_EMPLOYED_ANOM           -0.0460
FLOORSMAX_AVG                -0.0440
FLOORSMAX_MEDI          

In [28]:
df.iloc[:, 2:].select_dtypes('object').describe().T.sort_values(by='unique', ascending=False)

Unnamed: 0,count,unique,top,freq
ORGANIZATION_TYPE,307511,58,Business Entity Type 3,67992
OCCUPATION_TYPE,211120,18,Laborers,55186
NAME_INCOME_TYPE,307511,8,Working,158774
NAME_TYPE_SUITE,306219,7,Unaccompanied,248526
WEEKDAY_APPR_PROCESS_START,307511,7,TUESDAY,53901
WALLSMATERIAL_MODE,151170,7,Panel,66040
NAME_FAMILY_STATUS,307511,6,Married,196432
NAME_HOUSING_TYPE,307511,6,House / apartment,272868
NAME_EDUCATION_TYPE,307511,5,Secondary / secondary special,218391
FONDKAPREMONT_MODE,97216,4,reg oper account,73830


In [29]:
# Dropping feature with > 10 unique items
# df2=df.drop(['ORGANIZATION_TYPE'], axis=1)
# df2=df.drop(['OCCUPATION_TYPE'], axis=1)

In [30]:
# Look at the share of records missing by feature, descending
missing=df.isnull().mean().sort_values(ascending = False)

print('\nMost Missing:\n', missing.head(20))


Most Missing:
 AVG_INT                    0.9850
COMMONAREA_AVG             0.6987
COMMONAREA_MEDI            0.6987
COMMONAREA_MODE            0.6987
NONLIVINGAPARTMENTS_MEDI   0.6943
NONLIVINGAPARTMENTS_AVG    0.6943
NONLIVINGAPARTMENTS_MODE   0.6943
FONDKAPREMONT_MODE         0.6839
LIVINGAPARTMENTS_MODE      0.6835
LIVINGAPARTMENTS_MEDI      0.6835
LIVINGAPARTMENTS_AVG       0.6835
FLOORSMIN_MODE             0.6785
FLOORSMIN_AVG              0.6785
FLOORSMIN_MEDI             0.6785
APPROVE_REFUSE_RATIO       0.6739
YEARS_BUILD_MODE           0.6650
YEARS_BUILD_AVG            0.6650
YEARS_BUILD_MEDI           0.6650
OWN_CAR_AGE                0.6599
LANDAREA_MODE              0.5938
dtype: float64


In [31]:
#Remove any feature with more than x% records missing
# missing_records_thresh = 1-.7

# df2 = df.copy(deep=True)
# df2.dropna(thresh= missing_records_thresh * df2.shape[0],
#           inplace = True,
#           axis = 1)

# print("\n\nOriginal dataset had %3.0f features. \n"
#       "After removing features with missing data for %3.0f percent\n"
#       "or more records . %3.0f features remain."%(df.shape[1], (1-missing_records_thresh)*100, df2.shape[1]))

In [6]:
new_features = pd.read_csv('feature_matrix_spec.csv')

In [16]:
df_key=df[['SK_ID_CURR','TARGET']]
df2=df_key.merge(new_features.drop(['TARGET'], axis = 1), on = ['SK_ID_CURR'], how = 'left')

# Process and Evaluate

In [17]:
# Divide features into numerical and categorical
gc.collect()
int_features = df2.iloc[:, 2:].select_dtypes('int64').columns.values
float_features = df2.iloc[:, 2:].select_dtypes('float64').columns.values
str_features = df2.iloc[:, 2:].select_dtypes('O').columns.values
print(df2.shape)
print('# of integer features:', len(int_features))
print('# of float features:', len(float_features))
print('# of str features:', len(str_features))
num_features = np.hstack([int_features, float_features])
cat_features = str_features

(307511, 885)
# of integer features: 39
# of float features: 802
# of str features: 42


In [18]:
# from sklearn.model_selection import train_test_split


# positives = df.TARGET == 1
# negatives = ~positives
# neg_df = df.loc[negatives]
# balanced_df = pd.concat([neg_df.sample(sum(positives)),df.loc[positives]])
# y = balanced_df.TARGET
# X = balanced_df.drop('TARGET',axis=1)

# X_train_raw, X_test_raw, y_train, y_test = train_test_split(
#     X, y, test_size=0.10, random_state=42, stratify=y)
# X_train_raw, X_dev_raw, y_train, y_dev = train_test_split(
#     X_train_raw, y_train, test_size=1/9., random_state=42, 
#     stratify=y_train)

In [19]:
from sklearn.model_selection import train_test_split

X = df2.drop(['TARGET','SK_ID_CURR'], axis = 1)
y = df2.TARGET

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42, stratify=y)


X_train_raw, X_dev_raw, y_train, y_dev = train_test_split(
    X_train_raw, y_train,
    test_size=1/9.,
    random_state=42,
    stratify=y_train
)


print('\nBalance of positive and negative classes of train:')
print(y_train.value_counts(normalize=True) * 100)

print('\nX_train shape:', X_train_raw.shape)
print('X_dev shape:', X_dev_raw.shape)
print('X_test shape:', X_test_raw.shape)


Balance of positive and negative classes of train:
0   91.9271
1    8.0729
Name: TARGET, dtype: float64

X_train shape: (246008, 883)
X_dev shape: (30751, 883)
X_test shape: (30752, 883)


In [20]:
# Using pipeline to feature scale numerical values, and hot encode categorical values
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

num_transformer = Pipeline(
    steps = [
        #('imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler()), 
        ('imputer2', IterativeImputer(initial_strategy='median',n_nearest_features=30))
    ]
)

cat_transformer = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=True))
    ]
)

preprocessor = ColumnTransformer(
    transformers = [
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ], 
    sparse_threshold=0.9
)


In [21]:
# Tranform the training data, and use the information from the training data to transform test data so we don't contaminate it. 
preprocessor.fit(X_train_raw)
X_train = preprocessor.transform(X_train_raw)
X_dev = preprocessor.transform(X_dev_raw)
X_test = preprocessor.transform(X_test_raw)
gc.collect
print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_dev shape:', X_dev.shape)
print('y_dev shape:', y_dev.shape)
print('X_test shape: ', X_test.shape)
print('y_test shape: ', y_test.shape)

X_train shape: (246008, 1201)
y_train shape: (246008,)
X_dev shape: (30751, 1201)
y_dev shape: (30751,)
X_test shape:  (30752, 1201)
y_test shape:  (30752,)


## Evaluated different hyperparameters

In [25]:
%%time 
import time
from lightgbm import LGBMClassifier as LGBM
from sklearn.metrics import roc_auc_score

best_AUC = 0.5
best_n_estimators = 100
best_num_leaves = 10
best_reg_lambda = 0.5

lambda_values = [i*0.1 for i in range(1,11)]

# Creating loops to iterate through all possible values of hyperparameters:

for n_estimators in [500,600,700,800,900]:
    for num_leaves in [10,15,20,25,30,40]:
        for reg_lambda in lambda_values:
            start_time = time.time()

            clf = LGBM(n_estimators=n_estimators, boosting_type='gbdt', metric='binary_logloss', objective = 'binary',
                       class_weight = 'balanced', num_leaves=num_leaves, max_depth =-1,
                       learning_rate = 0.05, reg_alpha = 0.1, reg_lambda = reg_lambda, 
                       subsample = 0.8, n_jobs = -1, random_state = 50)
            
            clf.fit(X_train, y_train)
            #pred = clf.predict(X_dev)
            AUC = roc_auc_score(y_dev,  clf.predict_proba(X_dev)[:,1])

            end_time = time.time()

            if AUC > best_AUC:
                best_AUC = AUC
                best_n_estimators = n_estimators
                best_num_leaves = num_leaves
                best_reg_lambda = reg_lambda

            gc.collect()
            print("AUC = ", round(AUC,5), 
                  ",","n_estimators = ", n_estimators, 
                  ",","num_leaves = ", num_leaves, 
                  ",","reg_lambda = ", reg_lambda, 
                  ",", "time = ", round(end_time-start_time,0))

print("\n", "The Best AUC = ", best_AUC, 
      "\n", "The Best n_estimators = ", best_n_estimators, 
      "\n", "The Best num_leaves = ", best_num_leaves,
      "\n", "The Best reg_lambda =:", best_reg_lambda)

AUC =  0.78723 , n_estimators =  500 , num_leaves =  10 , reg_lambda =  0.1 , time =  110.0
AUC =  0.78701 , n_estimators =  500 , num_leaves =  10 , reg_lambda =  0.2 , time =  91.0
AUC =  0.7868 , n_estimators =  500 , num_leaves =  10 , reg_lambda =  0.30000000000000004 , time =  104.0
AUC =  0.7868 , n_estimators =  500 , num_leaves =  10 , reg_lambda =  0.4 , time =  89.0
AUC =  0.78685 , n_estimators =  500 , num_leaves =  10 , reg_lambda =  0.5 , time =  89.0
AUC =  0.78709 , n_estimators =  500 , num_leaves =  10 , reg_lambda =  0.6000000000000001 , time =  89.0
AUC =  0.78704 , n_estimators =  500 , num_leaves =  10 , reg_lambda =  0.7000000000000001 , time =  109.0
AUC =  0.78703 , n_estimators =  500 , num_leaves =  10 , reg_lambda =  0.8 , time =  148.0
AUC =  0.78698 , n_estimators =  500 , num_leaves =  10 , reg_lambda =  0.9 , time =  175.0
AUC =  0.78704 , n_estimators =  500 , num_leaves =  10 , reg_lambda =  1.0 , time =  161.0
AUC =  0.78718 , n_estimators =  500 , n

AUC =  0.78709 , n_estimators =  600 , num_leaves =  20 , reg_lambda =  0.7000000000000001 , time =  125.0
AUC =  0.78755 , n_estimators =  600 , num_leaves =  20 , reg_lambda =  0.8 , time =  125.0
AUC =  0.78873 , n_estimators =  600 , num_leaves =  20 , reg_lambda =  0.9 , time =  126.0
AUC =  0.7876 , n_estimators =  600 , num_leaves =  20 , reg_lambda =  1.0 , time =  129.0
AUC =  0.78697 , n_estimators =  600 , num_leaves =  25 , reg_lambda =  0.1 , time =  135.0
AUC =  0.78724 , n_estimators =  600 , num_leaves =  25 , reg_lambda =  0.2 , time =  135.0
AUC =  0.78713 , n_estimators =  600 , num_leaves =  25 , reg_lambda =  0.30000000000000004 , time =  137.0
AUC =  0.7872 , n_estimators =  600 , num_leaves =  25 , reg_lambda =  0.4 , time =  136.0
AUC =  0.78771 , n_estimators =  600 , num_leaves =  25 , reg_lambda =  0.5 , time =  136.0
AUC =  0.78763 , n_estimators =  600 , num_leaves =  25 , reg_lambda =  0.6000000000000001 , time =  136.0
AUC =  0.78814 , n_estimators =  600

AUC =  0.78602 , n_estimators =  700 , num_leaves =  40 , reg_lambda =  0.2 , time =  190.0
AUC =  0.78473 , n_estimators =  700 , num_leaves =  40 , reg_lambda =  0.30000000000000004 , time =  185.0
AUC =  0.78412 , n_estimators =  700 , num_leaves =  40 , reg_lambda =  0.4 , time =  189.0
AUC =  0.7864 , n_estimators =  700 , num_leaves =  40 , reg_lambda =  0.5 , time =  193.0
AUC =  0.78524 , n_estimators =  700 , num_leaves =  40 , reg_lambda =  0.6000000000000001 , time =  206.0
AUC =  0.78475 , n_estimators =  700 , num_leaves =  40 , reg_lambda =  0.7000000000000001 , time =  186.0
AUC =  0.78544 , n_estimators =  700 , num_leaves =  40 , reg_lambda =  0.8 , time =  186.0
AUC =  0.78473 , n_estimators =  700 , num_leaves =  40 , reg_lambda =  0.9 , time =  186.0
AUC =  0.78532 , n_estimators =  700 , num_leaves =  40 , reg_lambda =  1.0 , time =  190.0
AUC =  0.78784 , n_estimators =  800 , num_leaves =  10 , reg_lambda =  0.1 , time =  120.0
AUC =  0.78737 , n_estimators =  80

AUC =  0.78708 , n_estimators =  900 , num_leaves =  15 , reg_lambda =  0.7000000000000001 , time =  155.0
AUC =  0.7865 , n_estimators =  900 , num_leaves =  15 , reg_lambda =  0.8 , time =  172.0
AUC =  0.78667 , n_estimators =  900 , num_leaves =  15 , reg_lambda =  0.9 , time =  166.0
AUC =  0.78757 , n_estimators =  900 , num_leaves =  15 , reg_lambda =  1.0 , time =  183.0
AUC =  0.78614 , n_estimators =  900 , num_leaves =  20 , reg_lambda =  0.1 , time =  193.0
AUC =  0.78598 , n_estimators =  900 , num_leaves =  20 , reg_lambda =  0.2 , time =  196.0
AUC =  0.78573 , n_estimators =  900 , num_leaves =  20 , reg_lambda =  0.30000000000000004 , time =  184.0
AUC =  0.78559 , n_estimators =  900 , num_leaves =  20 , reg_lambda =  0.4 , time =  175.0
AUC =  0.78679 , n_estimators =  900 , num_leaves =  20 , reg_lambda =  0.5 , time =  188.0
AUC =  0.78673 , n_estimators =  900 , num_leaves =  20 , reg_lambda =  0.6000000000000001 , time =  182.0
AUC =  0.78673 , n_estimators =  90

# Result based on best hyper parameter

In [None]:
%%time 
from lightgbm import LGBMClassifier as LGBM
X, y = X_train, y_train
X2,y2 = X_dev, y_dev

mod = LGBM(n_estimators=600, boosting_type='gbdt', metric='binary_logloss', objective = 'binary',
                       class_weight = 'balanced', num_leaves=20, max_depth =-1,
                       learning_rate = 0.05, reg_alpha = 0.1, reg_lambda = 0.3, 
                       subsample = 0.8, n_jobs = -1, random_state = 50)
mod.fit(X, y)

gc.collect()
print('Dev AUC: %.4f' %  
      roc_auc_score(y2, mod.predict_proba(X2)[:,1]))

In [None]:
%%time 
from lightgbm import LGBMClassifier as LGBM
X, y = X_train, y_train
X2,y2 = X_test, y_test

mod = LGBM(n_estimators=600, boosting_type='gbdt', metric='binary_logloss', objective = 'binary',
                       class_weight = 'balanced', num_leaves=20, max_depth =-1,
                       learning_rate = 0.05, reg_alpha = 0.1, reg_lambda = 0.3, 
                       subsample = 0.8, n_jobs = -1, random_state = 50)
mod.fit(X, y)

gc.collect()
print('Dev AUC: %.4f' %  
      roc_auc_score(y2, mod.predict_proba(X2)[:,1]))