# [LTFS DataScience Finhack an Online Hackathon](https://datahack.analyticsvidhya.com/contest/ltfs-datascience-finhack-an-online-hackathon/)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pprint
from datetime import datetime
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from time import time
from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper

import catboost as cgb
from sklearn.metrics import make_scorer
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from bayes_opt import BayesianOptimization
import lightgbm as lgb
import gc
import warnings
warnings.filterwarnings('ignore')

## Notebook  Content
1. [Load data](#0)
1. [Data cleaning and preprocessing](#1) <br>    
1. [CATBOOST](#2)
1. [LGBM](#3)

<a id =0>  </a> </br>
## <span style ="color:blue"> <span style='background:orange'>  **1.Load data** 

In [3]:
train = pd.read_csv('data/train.csv') # load train data

In [4]:
train.shape

(233154, 41)

In [5]:
train.head()

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,...,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default
0,420825,50578,58400,89.55,67,22807,45,1441,01-01-84,Salaried,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0
1,537409,47145,65550,73.23,67,22807,45,1502,31-07-85,Self employed,...,0,0,1991,0,0,1,1yrs 11mon,1yrs 11mon,0,1
2,417566,53278,61360,89.63,67,22807,45,1497,24-08-85,Self employed,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0
3,624493,57513,66113,88.48,67,22807,45,1501,30-12-93,Self employed,...,0,0,31,0,0,0,0yrs 8mon,1yrs 3mon,1,1
4,539055,52378,60300,88.39,67,22807,45,1495,09-12-77,Self employed,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,1,1


In [6]:
test = pd.read_csv('data/test_bqCt9Pv.csv') # Load test data

In [7]:
test.shape

(112392, 40)

In [8]:
test.head()

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,...,SEC.CURRENT.BALANCE,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES
0,655269,53478,63558,86.54,67,22807,45,1497,01-01-74,Salaried,...,0,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0
1,723482,55513,63163,89.45,67,22807,45,1497,20-05-85,Self employed,...,0,0,0,5605,0,1,0,0yrs 8mon,1yrs 0mon,1
2,758529,65282,84320,79.93,78,23135,86,2071,14-10-95,Salaried,...,0,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0
3,763449,46905,63896,76.58,78,17014,45,2070,01-06-73,Self employed,...,0,0,0,0,0,0,0,2yrs 5mon,2yrs 5mon,0
4,708663,51428,63896,86.08,78,17014,45,2069,01-06-72,Salaried,...,0,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0


In [9]:
train.columns

Index(['UniqueID', 'disbursed_amount', 'asset_cost', 'ltv', 'branch_id',
       'supplier_id', 'manufacturer_id', 'Current_pincode_ID', 'Date.of.Birth',
       'Employment.Type', 'DisbursalDate', 'State_ID', 'Employee_code_ID',
       'MobileNo_Avl_Flag', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag',
       'Driving_flag', 'Passport_flag', 'PERFORM_CNS.SCORE',
       'PERFORM_CNS.SCORE.DESCRIPTION', 'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS',
       'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT',
       'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS', 'SEC.ACTIVE.ACCTS',
       'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT',
       'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT',
       'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
       'AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH', 'NO.OF_INQUIRIES',
       'loan_default'],
      dtype='object')

<a id=1>  </a> <br>
## <span style ="color:blue"> <span style='background:orange'> 2. Data cleaning and Preprocessing

## <span style='background:yellow'> Variables processings </span>
* Date.of.Birth to Age in <span style='background:yellow'>years
* DisbursalDate to DisbursalTime in <span style='background:yellow'>days
* AVERAGE.ACCT.AGE  to AVERAGE.ACCT.AGE in <span style='background:yellow'>days
* CREDIT.HISTORY.LENGTH to CREDIT.HISTORY.LENGTH in <span style='background:yellow'>days
  

target = train['loan_default']
train = train.drop(['loan_default'],1)


In [10]:
vars = ['Date.of.Birth', 'DisbursalDate', 'AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH']

In [11]:
# Date.of.Birth to Age in months
train['Date.of.Birth'] = pd.to_datetime(train['Date.of.Birth'])
test['Date.of.Birth'] = pd.to_datetime(test['Date.of.Birth'])
date = datetime.now()
def age(dob):
    agee = (date - dob).days
    return round(agee/365,0)


train['Age']= train['Date.of.Birth'].apply(age)
test['Age'] = test['Date.of.Birth'].apply(age)

In [12]:
# DisbursalDate to DisbursalTime in days
train['DisbursalDate'] = pd.to_datetime(train['DisbursalDate'])
test['DisbursalDate'] = pd.to_datetime(test['DisbursalDate'])
date = datetime.now()
def dtime(dob):
    time = (date - dob).days
    return time

train['DisbursalTime']= train['DisbursalDate'].apply(dtime)
test['DisbursalTime'] = test['DisbursalDate'].apply(dtime)

In [13]:
# AVERAGE.ACCT.AGE to AVERAGE.ACCT.AGE in days,and 
# CREDIT.HISTORY.LENGTH to CREDIT.HISTORY.LENGTH in days

def todays(value):
    A = [float(s) for s in re.findall(r'-?\d+\.?\d*', value)]
    return round(A[0]*365 + A[1]/12*365,0)

train['AVERAGE.ACCT.AGE'] = train['AVERAGE.ACCT.AGE'].apply(todays)
train['CREDIT.HISTORY.LENGTH'] = train['CREDIT.HISTORY.LENGTH'].apply(todays)
test['AVERAGE.ACCT.AGE'] = test['AVERAGE.ACCT.AGE'].apply(todays)
test['CREDIT.HISTORY.LENGTH'] = test['CREDIT.HISTORY.LENGTH'].apply(todays)

In [14]:
train = train.drop(['Date.of.Birth','DisbursalDate'],1)
train.head()

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Employment.Type,State_ID,...,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default,Age,DisbursalTime
0,420825,50578,58400,89.55,67,22807,45,1441,Salaried,6,...,0,0,0,0,0.0,0.0,0,0,36.0,512
1,537409,47145,65550,73.23,67,22807,45,1502,Self employed,6,...,1991,0,0,1,700.0,700.0,0,1,34.0,310
2,417566,53278,61360,89.63,67,22807,45,1497,Self employed,6,...,0,0,0,0,0.0,0.0,0,0,34.0,571
3,624493,57513,66113,88.48,67,22807,45,1501,Self employed,6,...,31,0,0,0,243.0,456.0,1,1,26.0,280
4,539055,52378,60300,88.39,67,22807,45,1495,Self employed,6,...,0,0,0,0,0.0,0.0,1,1,42.0,310


In [15]:
test = test.drop(['Date.of.Birth','DisbursalDate'],1)
test.head()

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Employment.Type,State_ID,...,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,Age,DisbursalTime
0,655269,53478,63558,86.54,67,22807,45,1497,Salaried,6,...,0,0,0,0,0,0.0,0.0,0,46.0,509
1,723482,55513,63163,89.45,67,22807,45,1497,Self employed,6,...,0,5605,0,1,0,243.0,365.0,1,34.0,255
2,758529,65282,84320,79.93,78,23135,86,2071,Salaried,4,...,0,0,0,0,0,0.0,0.0,0,24.0,246
3,763449,46905,63896,76.58,78,17014,45,2070,Self employed,4,...,0,0,0,0,0,882.0,882.0,0,47.0,246
4,708663,51428,63896,86.08,78,17014,45,2069,Salaried,4,...,0,0,0,0,0,0.0,0.0,0,48.0,258


### <span style='background:yellow'>Remove unwanted variables </span><br>
`UniqueID` `MobileNo_Avl_Flag`

In [16]:
train = train.drop(['UniqueID', 'MobileNo_Avl_Flag' ], 1)
test = test.drop(['UniqueID', 'MobileNo_Avl_Flag' ], 1)

### <span style='background:yellow'> Check for null values

In [17]:
train[train.isnull().any(axis=1)]

Unnamed: 0,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Employment.Type,State_ID,Employee_code_ID,...,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default,Age,DisbursalTime
87,52428,67405,81.60,78,17014,45,2099,,4,1646,...,0,0,0,0,0.0,0.0,0,0,21.0,315
88,51653,63896,86.08,78,17014,45,2079,,4,1646,...,0,0,0,0,0.0,0.0,0,0,20.0,277
91,49488,63306,83.72,78,17014,45,2069,,4,1646,...,0,0,0,0,0.0,0.0,0,0,21.0,289
99,40884,59313,70.81,78,17014,45,2099,,4,1646,...,0,0,0,0,0.0,0.0,0,0,21.0,318
125,49683,62577,83.10,78,17014,45,2099,,4,1646,...,0,0,0,0,0.0,0.0,0,0,22.0,307
813,17850,97311,19.53,11,22976,51,5969,,3,1464,...,0,0,0,0,0.0,0.0,0,0,22.0,297
850,49303,68885,74.04,11,15893,86,5969,,3,1464,...,0,0,0,0,1916.0,1916.0,0,0,36.0,311
874,56013,80906,71.69,11,24654,49,5940,,3,1464,...,0,0,0,0,0.0,0.0,0,0,19.0,357
1249,51003,65606,79.26,20,23502,45,6188,,5,785,...,0,0,0,0,0.0,0.0,0,0,21.0,280
1275,45549,73104,63.61,20,14158,45,6207,,5,785,...,0,0,0,0,0.0,0.0,0,0,20.0,277


In [17]:
test[test.isnull().any(axis=1)]

Unnamed: 0,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Employment.Type,State_ID,Employee_code_ID,...,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,Age,DisbursalTime
9,51653,67445,81.55,78,17014,45,2078,,4,1646,...,0,0,0,0,0,0.0,0.0,0,21.0,257
273,51803,70611,75.06,11,21475,45,5941,,3,1464,...,0,0,0,0,0,548.0,548.0,0,37.0,250
468,36127,68230,54.23,20,14004,86,6187,,5,785,...,0,269767,0,2,0,335.0,912.0,0,20.0,508
473,41694,59251,71.73,20,14158,45,6207,,5,785,...,0,0,0,0,0,0.0,0.0,0,19.0,244
481,51958,69129,76.67,20,14158,45,6183,,5,785,...,0,0,0,0,0,0.0,0.0,0,22.0,248
597,38439,66361,58.77,63,16309,45,7083,,10,998,...,0,0,0,0,0,0.0,0.0,0,20.0,254
605,47645,65000,74.62,63,16309,45,7093,,10,998,...,0,8128,0,1,0,213.0,335.0,0,22.0,233
610,61713,74740,83.62,63,16309,45,7108,,10,998,...,0,0,0,0,0,0.0,0.0,0,31.0,324
660,44705,80672,58.26,63,23473,86,7123,,10,703,...,0,3680,0,0,0,243.0,243.0,0,19.0,245
670,78579,102297,78.20,63,21879,86,7100,,10,413,...,0,0,0,0,0,0.0,0.0,0,19.0,255


### <span style='background:yellow'> Check value count of variables which has null values

In [18]:
train['Employment.Type'].value_counts()

Self employed    127635
Salaried          97858
Name: Employment.Type, dtype: int64

In [19]:
test['Employment.Type'].value_counts()

Self employed    59794
Salaried         49155
Name: Employment.Type, dtype: int64

### <span style='background:yellow'> Replace nan with mode of variable

In [20]:
train= train.fillna({"Employment.Type": "Self employed"})
test= test.fillna({"Employment.Type": "Self employed"})

In [21]:
train[train.isnull().any(axis=1)]

Unnamed: 0,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Employment.Type,State_ID,Employee_code_ID,...,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default,Age,DisbursalTime


In [22]:
test[test.isnull().any(axis=1)]

Unnamed: 0,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Employment.Type,State_ID,Employee_code_ID,...,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,Age,DisbursalTime


In [23]:
train.head()

Unnamed: 0,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Employment.Type,State_ID,Employee_code_ID,...,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default,Age,DisbursalTime
0,50578,58400,89.55,67,22807,45,1441,Salaried,6,1998,...,0,0,0,0,0.0,0.0,0,0,36.0,512
1,47145,65550,73.23,67,22807,45,1502,Self employed,6,1998,...,1991,0,0,1,700.0,700.0,0,1,34.0,310
2,53278,61360,89.63,67,22807,45,1497,Self employed,6,1998,...,0,0,0,0,0.0,0.0,0,0,34.0,571
3,57513,66113,88.48,67,22807,45,1501,Self employed,6,1998,...,31,0,0,0,243.0,456.0,1,1,26.0,280
4,52378,60300,88.39,67,22807,45,1495,Self employed,6,1998,...,0,0,0,0,0.0,0.0,1,1,42.0,310


In [24]:
test.head()

Unnamed: 0,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Employment.Type,State_ID,Employee_code_ID,...,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,Age,DisbursalTime
0,53478,63558,86.54,67,22807,45,1497,Salaried,6,1998,...,0,0,0,0,0,0.0,0.0,0,46.0,509
1,55513,63163,89.45,67,22807,45,1497,Self employed,6,1998,...,0,5605,0,1,0,243.0,365.0,1,34.0,255
2,65282,84320,79.93,78,23135,86,2071,Salaried,4,1646,...,0,0,0,0,0,0.0,0.0,0,24.0,246
3,46905,63896,76.58,78,17014,45,2070,Self employed,4,1646,...,0,0,0,0,0,882.0,882.0,0,47.0,246
4,51428,63896,86.08,78,17014,45,2069,Salaried,4,1646,...,0,0,0,0,0,0.0,0.0,0,48.0,258


## <span style='background:yellow'> Deal with categorical variable </span>  <br>
* Employment.Type

* PERFORM_CNS.SCORE.DESCRIPTION


In [25]:
def label_encoder(df, encoder_dict=None):
    # Label encode categoricals
    categorical_feats = df.columns[df.dtypes == 'object']
    for feat in categorical_feats:
        encoder = LabelEncoder()
        df[feat] = encoder.fit_transform(df[feat].fillna('NULL'))
    return df, categorical_feats.tolist(), encoder_dict


In [26]:
train, categorical_feats, encoder_dict = label_encoder(train)
test, categorical_feats, encoder_dict = label_encoder(test)

In [27]:
train.head()

Unnamed: 0,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Employment.Type,State_ID,Employee_code_ID,...,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default,Age,DisbursalTime
0,50578,58400,89.55,67,22807,45,1441,0,6,1998,...,0,0,0,0,0.0,0.0,0,0,36.0,512
1,47145,65550,73.23,67,22807,45,1502,1,6,1998,...,1991,0,0,1,700.0,700.0,0,1,34.0,310
2,53278,61360,89.63,67,22807,45,1497,1,6,1998,...,0,0,0,0,0.0,0.0,0,0,34.0,571
3,57513,66113,88.48,67,22807,45,1501,1,6,1998,...,31,0,0,0,243.0,456.0,1,1,26.0,280
4,52378,60300,88.39,67,22807,45,1495,1,6,1998,...,0,0,0,0,0.0,0.0,1,1,42.0,310


In [28]:
train.columns

Index(['disbursed_amount', 'asset_cost', 'ltv', 'branch_id', 'supplier_id',
       'manufacturer_id', 'Current_pincode_ID', 'Employment.Type', 'State_ID',
       'Employee_code_ID', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag',
       'Driving_flag', 'Passport_flag', 'PERFORM_CNS.SCORE',
       'PERFORM_CNS.SCORE.DESCRIPTION', 'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS',
       'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT',
       'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS', 'SEC.ACTIVE.ACCTS',
       'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT',
       'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT',
       'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
       'AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH', 'NO.OF_INQUIRIES',
       'loan_default', 'Age', 'DisbursalTime'],
      dtype='object')

In [29]:
test.columns

Index(['disbursed_amount', 'asset_cost', 'ltv', 'branch_id', 'supplier_id',
       'manufacturer_id', 'Current_pincode_ID', 'Employment.Type', 'State_ID',
       'Employee_code_ID', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag',
       'Driving_flag', 'Passport_flag', 'PERFORM_CNS.SCORE',
       'PERFORM_CNS.SCORE.DESCRIPTION', 'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS',
       'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT',
       'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS', 'SEC.ACTIVE.ACCTS',
       'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT',
       'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT',
       'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
       'AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH', 'NO.OF_INQUIRIES', 'Age',
       'DisbursalTime'],
      dtype='object')

In [59]:
train_y = train[['loan_default']]
train_X = train.drop('loan_default', axis =1)

# Feature Engineering

In [31]:
cat_features = ['branch_id', 'supplier_id','manufacturer_id', 'Current_pincode_ID', 'Employment.Type', 'State_ID',\
'Employee_code_ID', 'Aadhar_flag', 'PAN_flag','VoterID_flag','Employment.Type', 'PERFORM_CNS.SCORE.DESCRIPTION']

In [32]:
def feature_eng(df):
    df['amount_divide'] = df['PRI.OVERDUE.ACCTS'] / df['disbursed_amount']
    df_cat = df[cat_features]
    df_num = df.drop(cat_features, axis=1)
    df['mean'] = df_num.mean(axis=1)
    df['min'] = df_num.min(axis=1)
    df['max'] = df_num.max(axis=1)
    df['std'] = df_num.std(axis=1)
    df['sum_by_len'] = df.sum(axis=1)/len(df)
    
    df= df.fillna(0)
    df = df.replace(np.inf, 0)
    return df

train_X = feature_eng(train_X)
test = feature_eng(test)



In [33]:
train_X.head()

Unnamed: 0,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Employment.Type,State_ID,Employee_code_ID,...,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,Age,DisbursalTime,amount_divide,mean,min,max,std,sum_by_len
0,50578,58400,89.55,67,22807,45,1441,0,6,1998,...,0.0,0,36.0,512,0.0,3914.841071,0.0,58400.0,14324.077226,0.911983
1,47145,65550,73.23,67,22807,45,1502,1,6,1998,...,700.0,0,34.0,310,2.1e-05,8753.758215,0.0,65550.0,19418.496253,1.566615
2,53278,61360,89.63,67,22807,45,1497,1,6,1998,...,0.0,0,34.0,571,0.0,4119.0225,0.0,61360.0,15066.362358,0.953503
3,57513,66113,88.48,67,22807,45,1501,1,6,1998,...,456.0,1,26.0,280,0.0,4466.41,0.0,66113.0,16239.589157,1.022138
4,52378,60300,88.39,67,22807,45,1495,1,6,1998,...,0.0,1,42.0,310,0.0,4039.978214,0.0,60300.0,14810.720205,0.93802


In [34]:
test_X = test


<a id =2>  </a> </br>

# <span style ="color:blue"> <span style='background:orange'>  3. CatBoost 

## Hyper Parameter tunning 

In [None]:
# Reporting util for different optimizers
def report_perf(optimizer, X, y, title, callbacks=None):
    """
    A wrapper for measuring time and performances of different optmizers
    
    optimizer = a sklearn or a skopt optimizer
    X = the training set 
    y = our target
    title = a string label for the experiment
    """
    start = time()
    if callbacks:
        optimizer.fit(X, y, callback=callbacks)
    else:
        optimizer.fit(X, y)
    d=pd.DataFrame(optimizer.cv_results_)
    best_score = optimizer.best_score_
    best_score_std = d.iloc[optimizer.best_index_].std_test_score
    best_params = optimizer.best_params_
    print((title + " took %.2f seconds,  candidates checked: %d, best CV score: %.3f "
           +u"\u00B1"+" %.3f") % (time() - start, 
                                  len(optimizer.cv_results_['params']),
                                  best_score,
                                  best_score_std))    
    print('Best parameters:')
    pprint.pprint(best_params)
    print()
    return best_params

from sklearn.metrics import make_scorer
roc_auc = make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

clf = cgb.CatBoostClassifier(thread_count=2,
                         loss_function='Logloss',
                        
                         od_type = 'Iter',
                         verbose= False
                        )


# Defining your search space
search_spaces = {'iterations': Integer(10, 1000),
                 'depth': Integer(1, 8),
                 'learning_rate': Real(0.01, 1.0, 'log-uniform'),
                 'random_strength': Real(1e-9, 10, 'log-uniform'),
                 'bagging_temperature': Real(0.0, 1.0),
                 'border_count': Integer(1, 255),
                 'l2_leaf_reg': Integer(2, 30),
                 'scale_pos_weight':Real(0.01, 1.0, 'uniform')}

# Setting up BayesSearchCV
opt = BayesSearchCV(clf,
                    search_spaces,
                    scoring=roc_auc,
                    cv=skf,
                    n_iter=100,
                    n_jobs=1,  # use just 1 job with CatBoost in order to avoid segmentation fault
                    return_train_score=False,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=42)


best_params = report_perf(opt, train_X, train_y,'CatBoost', 
                          callbacks=[VerboseCallback(100), 
                                     DeadlineStopper(60*10)])

In [None]:
X_train,X_valid,y_train,y_valid = train_test_split(train_X,train_y,train_size=0.98,random_state=1236)

tuned_model = cgb.CatBoostClassifier(**best_params, loss_function= 'Logloss', eval_metric='AUC',use_best_model=True,random_seed=77)

In [None]:
tuned_model.fit(X_train,y_train,cat_features= cat_features,eval_set=(X_valid,y_valid))

In [None]:
pred1 = tuned_model.predict_proba(test_X)
preds1= pred1[:,1]


submission = pd.read_csv('sample_submission_24jSKY6.csv')

submission['loan_default'] = preds

submission.to_csv('tunned.csv', index = False)

In [35]:


X_train,X_valid,y_train,y_valid = train_test_split(train_X,train_y,train_size=0.98,random_state=1777)

In [36]:
model = cgb.CatBoostClassifier(iterations=300, learning_rate=0.05, l2_leaf_reg=11.7, depth=9, \
                           rsm=0.4, loss_function= 'Logloss', eval_metric='AUC',use_best_model=True,random_seed=7777)

In [37]:
model.fit(X_train,y_train,cat_features= cat_features,eval_set=(X_valid,y_valid))

0:	test: 0.6220999	best: 0.6220999 (0)	total: 251ms	remaining: 1m 15s
1:	test: 0.6307948	best: 0.6307948 (1)	total: 467ms	remaining: 1m 9s
2:	test: 0.6349776	best: 0.6349776 (2)	total: 677ms	remaining: 1m 6s
3:	test: 0.6314208	best: 0.6349776 (2)	total: 882ms	remaining: 1m 5s
4:	test: 0.6298615	best: 0.6349776 (2)	total: 1.14s	remaining: 1m 7s
5:	test: 0.6321991	best: 0.6349776 (2)	total: 1.36s	remaining: 1m 6s
6:	test: 0.6300621	best: 0.6349776 (2)	total: 1.46s	remaining: 1m
7:	test: 0.6343518	best: 0.6349776 (2)	total: 1.7s	remaining: 1m 2s
8:	test: 0.6364143	best: 0.6364143 (8)	total: 1.9s	remaining: 1m 1s
9:	test: 0.6365265	best: 0.6365265 (9)	total: 2.12s	remaining: 1m 1s
10:	test: 0.6359972	best: 0.6365265 (9)	total: 2.31s	remaining: 1m
11:	test: 0.6366654	best: 0.6366654 (11)	total: 2.54s	remaining: 1m
12:	test: 0.6390988	best: 0.6390988 (12)	total: 2.75s	remaining: 1m
13:	test: 0.6392928	best: 0.6392928 (13)	total: 3.02s	remaining: 1m 1s
14:	test: 0.6398597	best: 0.6398597 (14)

<catboost.core.CatBoostClassifier at 0x1a224e99e8>

In [38]:
pred = model.predict_proba(test_X)
preds= pred[:,1]

In [39]:
submission = pd.read_csv('data/sample_submission_24jSKY6.csv')

submission['loan_default'] = preds

submission.to_csv('submission.csv', index = False)

<a id =3>  </a> </br>

# <span style ="color:blue"> <span style='background:orange'>  4. LightGBM 

In [41]:
def bayes_parameter_opt_lgb(X, y, init_round=15, opt_round=25, n_folds=5, random_seed=6, n_estimators=10000, learning_rate=0.01, output_process=False):
    # prepare data
    train_data = lgb.Dataset(data=X, label=y, free_raw_data=False)
    # parameters
    def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
        params = {'application':'binary','num_iterations': n_estimators, 'learning_rate':learning_rate, 'early_stopping_round':100, 'metric':'auc'}
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['lambda_l1'] = max(lambda_l1, 0)
        params['lambda_l2'] = max(lambda_l2, 0)
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=True, verbose_eval =200, metrics=['auc'])
        return max(cv_result['auc-mean'])
    # range 
    lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (110, 150),
                                            'feature_fraction': (0.1, 0.9),
                                            'bagging_fraction': (0.8, 1),
                                            'max_depth': (20, 50),
                                            'lambda_l1': (0, 5),
                                            'lambda_l2': (0, 3),
                                            'min_split_gain': (0.001, 0.1),
                                            'min_child_weight': (15, 45)}, random_state=0)
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)
    
    # output optimization process
    if output_process==True: lgbBO.points_to_csv("bayes_opt_result.csv")
    
    # return best parameters
    return lgbBO

opt_params = bayes_parameter_opt_lgb(train_X, train_y, init_round=5, opt_round=10, n_folds=3, random_seed=6, n_estimators=100, learning_rate=0.05)
    

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.6671  [0m | [0m 0.9098  [0m | [0m 0.6722  [0m | [0m 3.014   [0m | [0m 1.635   [0m | [0m 32.71   [0m | [0m 34.38   [0m | [0m 0.04432 [0m | [0m 145.7   [0m |
| [95m 2       [0m | [95m 0.6681  [0m | [95m 0.9927  [0m | [95m 0.4068  [0m | [95m 3.959   [0m | [95m 1.587   [0m | [95m 37.04   [0m | [95m 42.77   [0m | [95m 0.008033[0m | [95m 113.5   [0m |
| [0m 3       [0m | [0m 0.6664  [0m | [0m 0.804   [0m | [0m 0.7661  [0m | [0m 3.891   [0m | [0m 2.61    [0m | [0m 49.36   [0m | [0m 38.97   [0m | [0m 0.04669 [0m | [0m 141.2   [0m |
| [0m 4       [0m | [0m 0.667   [0m | [0m 0.8237  [0m | [0m 0.6119  [0m | [0m 0.7168  [0m | [0m 2.834   [0m | [0m 35.66   [0m 

In [48]:
param = {
    'num_leaves': 110,
    'feature_fraction': 0.3209,
    'bagging_fraction': 0.9595 ,
    'bagging_freq': 5,
    'max_depth':50,
    'lambda_l1': 1.244,
    'lambda_l2':1.21,
    'min_split_gain': 0.07919,
    'min_child_weight': 44.9 ,
     'objective': 'binary',
    'boosting_type': 'gbdt',
    'verbose': 1,
    'metric': 'auc',
    'is_unbalance': True,
    'boost_from_average': False,
}

In [49]:
nfold = 5
target = 'loan_default'
predictors = train_X.columns.values.tolist()

In [50]:
gc.collect()

1010

In [60]:
skf = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=2021)

oof = np.zeros(len(train_X))
predictions = np.zeros(len(test_X))

i = 1
for train_index, valid_index in skf.split(train_y, train_y.values):
    print("fold {}".format(i))
    xg_train = lgb.Dataset(train_X.iloc[train_index][predictors].values,
                           label=train_y.iloc[train_index][target].values,
                           feature_name=predictors,
                           free_raw_data = False
                           )
    xg_valid = lgb.Dataset(train_X.iloc[valid_index][predictors].values,
                           label=train_y.iloc[valid_index][target].values,
                           feature_name=predictors,
                           free_raw_data = False
                           )   

    
    clf = lgb.train(param, xg_train, 5000, valid_sets = [xg_valid], verbose_eval=10, early_stopping_rounds = 50)
    oof[valid_index] = clf.predict(train_X.iloc[valid_index][predictors].values, num_iteration=clf.best_iteration) 
    
    predictions += clf.predict(test_X[predictors], num_iteration=clf.best_iteration) / nfold
    i = i + 1

print("\n\nCV AUC: {:<0.2f}".format(metrics.roc_auc_score(train_y.values, oof)))

fold 1
Training until validation scores don't improve for 50 rounds.
[10]	valid_0's auc: 0.65451
[20]	valid_0's auc: 0.659341
[30]	valid_0's auc: 0.664284
[40]	valid_0's auc: 0.66718
[50]	valid_0's auc: 0.669461
[60]	valid_0's auc: 0.670528
[70]	valid_0's auc: 0.67085
[80]	valid_0's auc: 0.671107
[90]	valid_0's auc: 0.671856
[100]	valid_0's auc: 0.672342
[110]	valid_0's auc: 0.672045
[120]	valid_0's auc: 0.671694
[130]	valid_0's auc: 0.671801
[140]	valid_0's auc: 0.671728
[150]	valid_0's auc: 0.671468
Early stopping, best iteration is:
[100]	valid_0's auc: 0.672342
fold 2
Training until validation scores don't improve for 50 rounds.
[10]	valid_0's auc: 0.656203
[20]	valid_0's auc: 0.660324
[30]	valid_0's auc: 0.66494
[40]	valid_0's auc: 0.66715
[50]	valid_0's auc: 0.669442
[60]	valid_0's auc: 0.670318
[70]	valid_0's auc: 0.671273
[80]	valid_0's auc: 0.671713
[90]	valid_0's auc: 0.672238
[100]	valid_0's auc: 0.67248
[110]	valid_0's auc: 0.672443
[120]	valid_0's auc: 0.672408
[130]	valid

In [61]:
submission['loan_default'] = predictions

In [62]:
submission.head()

Unnamed: 0,UniqueID,loan_default
0,655269,0.540072
1,723482,0.547089
2,758529,0.506757
3,763449,0.508455
4,708663,0.578548
