In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from collections import Counter
import math
import random
from tqdm import tqdm

#%matplotlib inline

In [2]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import preprocessing
from category_encoders import target_encoder, TargetEncoder
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from imblearn.over_sampling import RandomOverSampler, SMOTE
import catboost as cb

pd.set_option('display.max_columns', 200)

Using TensorFlow backend.


In [3]:
def label_encoding_func(df_name, df_col_name):
    '''
    Integer-encode a single dataframe column with sklearn's LabelEncoder.

    usage: dataframe[column_name] = label_encoding_func(dataframe, column_name)

    df_name: dataframe that holds the column
    df_col_name: name of the column to encode
    returns: array with one integer code per row of the column
    '''
    column_values = df_name[df_col_name]
    # learn the classes and encode the same values in one call
    return preprocessing.LabelEncoder().fit_transform(column_values)

def do_one_hot_encoding(df_name, df_column_name, suffix=''):
    '''
    Replace one column of a dataframe by its one-hot (dummy) columns.

    usage: dataframe = do_one_hot_encoding(dataframe, column_name, suffix)

    The dummy columns are named after the category values. `suffix` is passed
    to DataFrame.join as `lsuffix`, so it is only applied to columns of the
    original frame whose names collide with a dummy column.

    returns: a new dataframe without `df_column_name`; the input is not modified
    '''
    dummy_columns = pd.get_dummies(df_name[df_column_name])
    return (
        df_name
        .join(dummy_columns, lsuffix=suffix)
        .drop(columns=df_column_name)
    )

#function to perform target encoding later on
def perform_target_encoding(columns, X, Y, X_Val, X_Test):
    '''
    Target-encode each column in `columns` with its own TargetEncoder
    (smoothing=3) fitted on (X, Y).

    The listed columns of X, X_Val and X_Test are overwritten in place and the
    three frames are returned in that order.

    NOTE(review): X and Y are handed to the encoder as-is; presumably they must
    be row-aligned (same index) for the encoding to be meaningful - confirm
    against the installed category_encoders version.
    '''
    for col in tqdm(columns):
        encoder = TargetEncoder(cols=[col], smoothing=3)
        encoder.fit(X, Y)

        # the training frame is transformed with its target passed along,
        # validation and test frames without one
        X[col] = encoder.transform(X, Y)[col]
        X_Val[col] = encoder.transform(X_Val)[col]
        X_Test[col] = encoder.transform(X_Test)[col]

    return X, X_Val, X_Test

### Load Feature Engineered Datasets

In [3]:
# Folder and file names of the feature-engineered inputs
path = '../../feature_engineering_eda_data/'
train_file = 'train_feature_engineered_V2.csv'
test_file = 'test_feature_engineered_V2.csv'

# Read train, test and the submission template from the same folder
train_df, test_df, sample_submission = (
    pd.read_csv(path + file_name)
    for file_name in (train_file, test_file, 'sample_submission_24jSKY6.csv')
)

In [5]:
# Preview the first rows of the training frame (includes the loan_default target)
train_df.head()

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,DisbursalDate,State_ID,Employee_code_ID,MobileNo_Avl_Flag,Aadhar_flag,PAN_flag,VoterID_flag,Driving_flag,Passport_flag,PERFORM_CNS.SCORE,PERFORM_CNS.SCORE.DESCRIPTION,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRI.DISBURSED.AMOUNT,SEC.NO.OF.ACCTS,SEC.ACTIVE.ACCTS,SEC.OVERDUE.ACCTS,SEC.CURRENT.BALANCE,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,NO.OF_INQUIRIES,loan_default,F1_Manual_LTV,F2_difference_asset_disbursed,F3_avg_primary_sanctioned_amt,F4_avg_primary_disbursed_amt,F5_ratio_primary_active_overdue,F6_age_at_disbursal,F7.1_DOB_Y,F7.2_DOB_M,F7.3_DOB_D,F8.1_DisDate_Y,F8.2_DisDate_M,F8.3_DisDate_D,F9.1_AVG.ACCT.AGE_Y,F9.2_AVG.ACCT.AGE_M,F10.1_CREDIT.HIST_Y,F10.2_CREDIT.HIST_M
0,420825,50578,58400,89.55,67,22807,45,1441,01-01-84,Salaried,03-08-18,6,1998,1,1,0,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,86.606164,7822,0.0,0.0,0.0,34.610959,1984,1,1,2018,8,3,0,0,0,0
1,537409,47145,65550,73.23,67,22807,45,1502,31-07-85,Self employed,26-09-18,6,1998,1,1,0,0,0,0,598,I-Medium Risk,1,1,1,27600,50200,50200,0,0,0,0,0,0,1991,0,0,1,0,1,71.922197,18405,50200.0,50200.0,1.0,33.178082,1985,7,31,2018,9,26,1,11,1,11
2,417566,53278,61360,89.63,67,22807,45,1497,24-08-85,Self employed,01-08-18,6,1998,1,1,0,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,86.828553,8082,0.0,0.0,0.0,32.958904,1985,8,24,2018,8,1,0,0,0,0
3,624493,57513,66113,88.48,67,22807,45,1501,30-12-93,Self employed,26-10-18,6,1998,1,1,0,0,0,0,305,L-Very High Risk,3,0,0,0,0,0,0,0,0,0,0,0,31,0,0,0,1,1,86.991968,8600,0.0,0.0,0.0,24.838356,1993,12,30,2018,10,26,0,8,1,3
4,539055,52378,60300,88.39,67,22807,45,1495,09-12-77,Self employed,26-09-18,6,1998,1,1,0,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,86.862355,7922,0.0,0.0,0.0,40.824658,1977,12,9,2018,9,26,0,0,0,0


In [6]:
# Preview the first rows of the test frame
test_df.head()

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,DisbursalDate,State_ID,Employee_code_ID,MobileNo_Avl_Flag,Aadhar_flag,PAN_flag,VoterID_flag,Driving_flag,Passport_flag,PERFORM_CNS.SCORE,PERFORM_CNS.SCORE.DESCRIPTION,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRI.DISBURSED.AMOUNT,SEC.NO.OF.ACCTS,SEC.ACTIVE.ACCTS,SEC.OVERDUE.ACCTS,SEC.CURRENT.BALANCE,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,NO.OF_INQUIRIES,F1_Manual_LTV,F2_difference_asset_disbursed,F3_avg_primary_sanctioned_amt,F4_avg_primary_disbursed_amt,F5_ratio_primary_active_overdue,F6_age_at_disbursal,F7.1_DOB_Y,F7.2_DOB_M,F7.3_DOB_D,F8.1_DisDate_Y,F8.2_DisDate_M,F8.3_DisDate_D,F9.1_AVG.ACCT.AGE_Y,F9.2_AVG.ACCT.AGE_M,F10.1_CREDIT.HIST_Y,F10.2_CREDIT.HIST_M
0,655269,53478,63558,86.54,67,22807,45,1497,01-01-74,Salaried,03-11-18,6,1998,1,1,0,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,84.14047,10080,0.0,0.0,0.0,44.868493,1974,1,1,2018,11,3,0,0,0,0
1,723482,55513,63163,89.45,67,22807,45,1497,20-05-85,Self employed,20-11-18,6,1998,1,1,0,0,0,0,749,C-Very Low Risk,2,1,0,43898,48780,48780,0,0,0,0,0,0,5605,0,1,0,1,87.888479,7650,24390.0,24390.0,0.0,33.526027,1985,5,20,2018,11,20,0,8,1,0
2,758529,65282,84320,79.93,78,23135,86,2071,14-10-95,Salaried,29-11-18,4,1646,1,1,1,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,77.421727,19038,0.0,0.0,0.0,23.142466,1995,10,14,2018,11,29,0,0,0,0
3,763449,46905,63896,76.58,78,17014,45,2070,01-06-73,Self employed,29-11-18,4,1646,1,1,0,0,0,0,14,Not Scored: Only a Guarantor,1,1,1,132480,255000,255000,0,0,0,0,0,0,0,0,0,0,0,73.408351,16991,255000.0,255000.0,1.0,45.526027,1973,6,1,2018,11,29,2,5,2,5
4,708663,51428,63896,86.08,78,17014,45,2069,01-06-72,Salaried,17-11-18,4,1646,1,1,0,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,80.487041,12468,0.0,0.0,0.0,46.493151,1972,6,1,2018,11,17,0,0,0,0


### Combine Train and Test

In [7]:
# Every column except the target and the row identifier is a feature.
# Index.difference returns the names sorted, so both feature frames end up
# in the same, sorted column order.
X_train = train_df.loc[:, train_df.columns.difference(['loan_default', 'UniqueID'])]
X_test = test_df.loc[:, X_train.columns]
Y = train_df.loc[:, 'loan_default']

### Concat train and test for common preprocessing
# the keys become an outer index level used later to split the frame again
concat_df = pd.concat(
    [X_train, X_test],
    keys=['train', 'test'],
)

In [8]:
# Preview the combined frame; the outer index level is the 'train'/'test' key
concat_df.head()

Unnamed: 0,Unnamed: 1,Aadhar_flag,Current_pincode_ID,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,Date.of.Birth,DisbursalDate,Driving_flag,Employee_code_ID,Employment.Type,F10.1_CREDIT.HIST_Y,F10.2_CREDIT.HIST_M,F1_Manual_LTV,F2_difference_asset_disbursed,F3_avg_primary_sanctioned_amt,F4_avg_primary_disbursed_amt,F5_ratio_primary_active_overdue,F6_age_at_disbursal,F7.1_DOB_Y,F7.2_DOB_M,F7.3_DOB_D,F8.1_DisDate_Y,F8.2_DisDate_M,F8.3_DisDate_D,F9.1_AVG.ACCT.AGE_Y,F9.2_AVG.ACCT.AGE_M,MobileNo_Avl_Flag,NEW.ACCTS.IN.LAST.SIX.MONTHS,NO.OF_INQUIRIES,PAN_flag,PERFORM_CNS.SCORE,PERFORM_CNS.SCORE.DESCRIPTION,PRI.ACTIVE.ACCTS,PRI.CURRENT.BALANCE,PRI.DISBURSED.AMOUNT,PRI.NO.OF.ACCTS,PRI.OVERDUE.ACCTS,PRI.SANCTIONED.AMOUNT,PRIMARY.INSTAL.AMT,Passport_flag,SEC.ACTIVE.ACCTS,SEC.CURRENT.BALANCE,SEC.DISBURSED.AMOUNT,SEC.INSTAL.AMT,SEC.NO.OF.ACCTS,SEC.OVERDUE.ACCTS,SEC.SANCTIONED.AMOUNT,State_ID,VoterID_flag,asset_cost,branch_id,disbursed_amount,ltv,manufacturer_id,supplier_id
train,0,1,1441,0,01-01-84,03-08-18,0,1998,Salaried,0,0,86.606164,7822,0.0,0.0,0.0,34.610959,1984,1,1,2018,8,3,0,0,1,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,58400,67,50578,89.55,45,22807
train,1,1,1502,1,31-07-85,26-09-18,0,1998,Self employed,1,11,71.922197,18405,50200.0,50200.0,1.0,33.178082,1985,7,31,2018,9,26,1,11,1,0,0,0,598,I-Medium Risk,1,27600,50200,1,1,50200,1991,0,0,0,0,0,0,0,0,6,0,65550,67,47145,73.23,45,22807
train,2,1,1497,0,24-08-85,01-08-18,0,1998,Self employed,0,0,86.828553,8082,0.0,0.0,0.0,32.958904,1985,8,24,2018,8,1,0,0,1,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,61360,67,53278,89.63,45,22807
train,3,1,1501,0,30-12-93,26-10-18,0,1998,Self employed,1,3,86.991968,8600,0.0,0.0,0.0,24.838356,1993,12,30,2018,10,26,0,8,1,0,1,0,305,L-Very High Risk,0,0,0,3,0,0,31,0,0,0,0,0,0,0,0,6,0,66113,67,57513,88.48,45,22807
train,4,1,1495,0,09-12-77,26-09-18,0,1998,Self employed,0,0,86.862355,7922,0.0,0.0,0.0,40.824658,1977,12,9,2018,9,26,0,0,1,0,1,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,60300,67,52378,88.39,45,22807


In [9]:
#count the missing values in each column (nothing is filled here); transposed for a wide display
concat_df.isna().sum().reset_index().T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52
index,Aadhar_flag,Current_pincode_ID,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,Date.of.Birth,DisbursalDate,Driving_flag,Employee_code_ID,Employment.Type,F10.1_CREDIT.HIST_Y,F10.2_CREDIT.HIST_M,F1_Manual_LTV,F2_difference_asset_disbursed,F3_avg_primary_sanctioned_amt,F4_avg_primary_disbursed_amt,F5_ratio_primary_active_overdue,F6_age_at_disbursal,F7.1_DOB_Y,F7.2_DOB_M,F7.3_DOB_D,F8.1_DisDate_Y,F8.2_DisDate_M,F8.3_DisDate_D,F9.1_AVG.ACCT.AGE_Y,F9.2_AVG.ACCT.AGE_M,MobileNo_Avl_Flag,NEW.ACCTS.IN.LAST.SIX.MONTHS,NO.OF_INQUIRIES,PAN_flag,PERFORM_CNS.SCORE,PERFORM_CNS.SCORE.DESCRIPTION,PRI.ACTIVE.ACCTS,PRI.CURRENT.BALANCE,PRI.DISBURSED.AMOUNT,PRI.NO.OF.ACCTS,PRI.OVERDUE.ACCTS,PRI.SANCTIONED.AMOUNT,PRIMARY.INSTAL.AMT,Passport_flag,SEC.ACTIVE.ACCTS,SEC.CURRENT.BALANCE,SEC.DISBURSED.AMOUNT,SEC.INSTAL.AMT,SEC.NO.OF.ACCTS,SEC.OVERDUE.ACCTS,SEC.SANCTIONED.AMOUNT,State_ID,VoterID_flag,asset_cost,branch_id,disbursed_amount,ltv,manufacturer_id,supplier_id
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### More Features

In [10]:
# Composite string key of the form "<branch_id>-<Employee_code_ID>"
concat_df['employee_id_branch_id'] = (
    concat_df['branch_id'].astype(str) + "-" + concat_df['Employee_code_ID'].astype(str)
)

# Age buckets with edges at 20, 25, ..., 50 plus open-ended first and last buckets,
# labelled 1..8 in increasing age order
bins = [-np.inf, *range(20, 55, 5), np.inf]
labels = list(range(1, 9))

# store the bucket labels as a plain array rather than a categorical column
concat_df['F6_age_bins'] = np.asarray(
    pd.cut(concat_df['F6_age_at_disbursal'], bins=bins, labels=labels).values
)

In [11]:
# NOTE: columns are created in a fixed order because later cells refer to
# columns by position; keep the order when editing.

# Durations expressed in days, using 365 days per year and 30 days per month
for days_col, years_col, months_col in [
    ('F10.3_CREDIT.HIST_DAYS', 'F10.1_CREDIT.HIST_Y', 'F10.2_CREDIT.HIST_M'),
    ('F9.3_AVG.ACCT.AGE_DAYS', 'F9.1_AVG.ACCT.AGE_Y', 'F9.2_AVG.ACCT.AGE_M'),
]:
    concat_df[days_col] = (concat_df[years_col] * 365) + (concat_df[months_col] * 30)

# Ratio features; a zero denominator produces NaN or +/-inf, which is cleaned up
# in the NA / inf handling cell that follows
concat_df['BalancePerActiveAccount'] = concat_df['PRI.CURRENT.BALANCE'].div(concat_df['PRI.ACTIVE.ACCTS'])
concat_df['PRI.NoOfInstallmentsLeft'] = concat_df['PRI.CURRENT.BALANCE'].div(concat_df['PRIMARY.INSTAL.AMT'])

concat_df['Disbursed_CurrentBalance_Diff'] = concat_df['disbursed_amount'].sub(concat_df['PRI.CURRENT.BALANCE'])

#combine primary and secondary values
for total_col, primary_col, secondary_col in [
    ('TotalInstallAmt', 'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT'),
    ('TotalDisbAmt', 'PRI.DISBURSED.AMOUNT', 'SEC.DISBURSED.AMOUNT'),
    ('TotalCurrentBalance', 'PRI.CURRENT.BALANCE', 'SEC.CURRENT.BALANCE'),
    ('TotalActiveAccts', 'PRI.ACTIVE.ACCTS', 'SEC.ACTIVE.ACCTS'),
    ('TotalOverdueAccts', 'PRI.OVERDUE.ACCTS', 'SEC.OVERDUE.ACCTS'),
    ('TotalAccts', 'PRI.NO.OF.ACCTS', 'SEC.NO.OF.ACCTS'),
    ('TotalSancAmt', 'PRI.SANCTIONED.AMOUNT', 'SEC.SANCTIONED.AMOUNT'),
]:
    concat_df[total_col] = concat_df[primary_col] + concat_df[secondary_col]

### Dealing with NAs and Label Encoding Categorical features

In [12]:
#### replace nulls as a new category
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: the in-place call works on an intermediate Series, pandas
# warns about that pattern, and with Copy-on-Write it no longer updates
# concat_df at all.
concat_df['Employment.Type'] = concat_df['Employment.Type'].fillna('NA')

# every remaining missing value (e.g. 0/0 in the ratio features) becomes 0
concat_df = concat_df.fillna(0)

### replace -inf and +inf with 0

#filling infs
for i in concat_df.columns.values:
    # build the mask once per column instead of re-evaluating the comparison
    inf_rows = concat_df[i].isin([np.inf, -np.inf])
    if inf_rows.any():
        # report which columns contained infinite values
        print(i)
        concat_df.loc[inf_rows, i] = 0

PRI.NoOfInstallmentsLeft


In [13]:
def label_encode_apply(df):
    '''
    Label-encode one column of the global concat_df when its dtype is object.

    df: one row of concat_df.dtypes.reset_index(); 'index' holds the column
        name and 0 holds that column's dtype
    '''
    column_name, column_dtype = df['index'], df[0]
    if not (column_dtype == object):
        return
    concat_df[column_name] = label_encoding_func(concat_df, column_name)

# visit every (column name, dtype) pair; the helper's return values are discarded
_ = concat_df.dtypes.reset_index().apply(label_encode_apply, axis=1)
print('Done')

Done


### Split Train Test

In [14]:
# Select each part again via the outer index key that was set when concatenating
X_train, X_test = (concat_df.loc[part] for part in ('train', 'test'))

### Yet More Features

In [15]:
### this function is for running inside cv

def generate_summed_features(train, val, test, variable=''):
    '''
    Add group-wise sums of `variable` as new columns (meant to run inside CV).

    The sums are computed on `train` only, per pincode, state, supplier and
    branch, and left-merged onto train, val and test. Each new column is named
    `variable` plus the suffix listed below. Groups that do not occur in
    `train` yield missing values, and every missing value in the merged frames
    is replaced by 0.

    returns: (train, val, test) as new frames; the inputs are not modified
    '''
    # (grouping key, suffix of the new column) in the order the columns are appended
    group_specs = (
        ('Current_pincode_ID', '_sum_pincode_F11.1'),
        ('State_ID', '_sum_state_F11.2'),
        ('supplier_id', '_sum_supplier_F11.3'),
        ('branch_id', '_sum_branch_F11.4'),
    )

    # lookup tables built from the training fold only
    lookups = [
        (key, tag, train.groupby(key)[variable].sum().reset_index())
        for key, tag in group_specs
    ]

    def _attach(frame):
        # append one summed column per grouping key, then zero-fill the gaps
        for key, tag, totals in lookups:
            frame = pd.merge(frame, totals, how='left', on=key, suffixes=('', tag))
        return frame.fillna(0)

    return _attach(train), _attach(val), _attach(test)


#### this function is for running inside cv

def generate_averaged_features(train, val, test, variable=''):
    '''
    Add group-wise means of `variable` as new columns (meant to run inside CV).

    The means are computed on `train` only, per pincode, branch, employee,
    employee-branch key and supplier, and left-merged onto train, val and test.
    Each new column is named `variable` plus the suffix listed below. Groups
    that do not occur in `train` yield missing values, and every missing value
    in the merged frames is replaced by 0.

    returns: (train, val, test) as new frames; the inputs are not modified
    '''
    # (grouping key, suffix of the new column) in the order the columns are appended
    merge_plan = [
        ('Current_pincode_ID', '_mean_pincode_F13.1'),
        ('branch_id', '_mean_branch_F13.2'),
        ('Employee_code_ID', '_mean_employeeid_F13.3'),
        ('employee_id_branch_id', '_mean_employee_id_branch_id_F13.4'),
        ('supplier_id', '_mean_supplier_id_F13.5'),
    ]

    # lookup tables built from the training fold only
    group_means = {
        key: train.groupby(key)[variable].mean().reset_index()
        for key, _ in merge_plan
    }

    enriched = []
    for frame in (train, val, test):
        for key, tag in merge_plan:
            frame = pd.merge(frame, group_means[key], how='left', on=key, suffixes=('', tag))
        enriched.append(frame.fillna(0))

    train, val, test = enriched
    return train, val, test

### Specify columns to target-encode inside CV and categorical columns for CatBoost

In [16]:
# Columns that are target-encoded inside every CV fold, in encoding order
cols_to_target_encode = [
    'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
    'F6_age_bins',
    'manufacturer_id',
    'State_ID',
    'Employment.Type',
    'PRI.OVERDUE.ACCTS',
    'PRI.ACTIVE.ACCTS',
    'F7.1_DOB_Y',
    'PERFORM_CNS.SCORE.DESCRIPTION',
    'SEC.NO.OF.ACCTS',
    'NO.OF_INQUIRIES',
    'NEW.ACCTS.IN.LAST.SIX.MONTHS',
]

In [17]:
# Print "<position> <column name>" pairs so columns can be referred to by position
for position, column_name in enumerate(X_train.columns):
    print(position, column_name, end=" || ")

0 Aadhar_flag || 1 Current_pincode_ID || 2 DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS || 3 Date.of.Birth || 4 DisbursalDate || 5 Driving_flag || 6 Employee_code_ID || 7 Employment.Type || 8 F10.1_CREDIT.HIST_Y || 9 F10.2_CREDIT.HIST_M || 10 F1_Manual_LTV || 11 F2_difference_asset_disbursed || 12 F3_avg_primary_sanctioned_amt || 13 F4_avg_primary_disbursed_amt || 14 F5_ratio_primary_active_overdue || 15 F6_age_at_disbursal || 16 F7.1_DOB_Y || 17 F7.2_DOB_M || 18 F7.3_DOB_D || 19 F8.1_DisDate_Y || 20 F8.2_DisDate_M || 21 F8.3_DisDate_D || 22 F9.1_AVG.ACCT.AGE_Y || 23 F9.2_AVG.ACCT.AGE_M || 24 MobileNo_Avl_Flag || 25 NEW.ACCTS.IN.LAST.SIX.MONTHS || 26 NO.OF_INQUIRIES || 27 PAN_flag || 28 PERFORM_CNS.SCORE || 29 PERFORM_CNS.SCORE.DESCRIPTION || 30 PRI.ACTIVE.ACCTS || 31 PRI.CURRENT.BALANCE || 32 PRI.DISBURSED.AMOUNT || 33 PRI.NO.OF.ACCTS || 34 PRI.OVERDUE.ACCTS || 35 PRI.SANCTIONED.AMOUNT || 36 PRIMARY.INSTAL.AMT || 37 Passport_flag || 38 SEC.ACTIVE.ACCTS || 39 SEC.CURRENT.BALANCE || 40 SEC.DISBU

In [18]:
# Positions (in X_train's column order) of the columns handed to CatBoost as
# categorical features. Names are taken from the column listing printed above;
# that listing is truncated, so the names marked "presumably" are derived from
# the sorted column order and the feature-creation order - verify before editing.
categorical_col_indices = [
    0,   # Aadhar_flag
    1,   # Current_pincode_ID
    17,  # F7.2_DOB_M
    18,  # F7.3_DOB_D
    5,   # Driving_flag
    8,   # F10.1_CREDIT.HIST_Y
    9,   # F10.2_CREDIT.HIST_M
    37,  # Passport_flag
    46,  # presumably VoterID_flag
    6,   # Employee_code_ID
    52,  # presumably supplier_id
    48,  # presumably branch_id
    63,  # presumably TotalActiveAccts
    64,  # presumably TotalOverdueAccts
]

### Model with 10 Fold CV

In [19]:
# 10 shuffled stratified folds; the fixed seed makes the split repeatable
strf_split = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [20]:
train_preds_list_oof_semi_stacking = []
val_auc_scores = []      # one validation AUC per fold
test_preds_list = []     # one vector of test probabilities per fold

# out-of-fold probability for every training row, filled in fold by fold
all_train_predictions = np.zeros([X_train.shape[0]])

cv_counter = 1

# Variables whose group-wise means / sums (computed on the training fold only)
# are appended as features. The order determines the order of the generated
# columns, so keep it stable.
averaged_feature_variables = [
    'ltv', 'PRI.CURRENT.BALANCE', 'disbursed_amount', 'asset_cost',
    'PRIMARY.INSTAL.AMT', 'PERFORM_CNS.SCORE', 'F4_avg_primary_disbursed_amt',
    'F2_difference_asset_disbursed', 'PRI.NO.OF.ACCTS', 'F6_age_at_disbursal',
    'TotalCurrentBalance', 'TotalInstallAmt',
]
summed_feature_variables = [
    'PRI.ACTIVE.ACCTS', 'PRI.OVERDUE.ACCTS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
    'NO.OF_INQUIRIES', 'TotalActiveAccts', 'TotalOverdueAccts',
]

# CatBoost settings are the same for every fold, so they are built once
params = {
        'eval_metric': 'AUC',
        'learning_rate': 0.01,
        'random_seed': 12321,
        'l2_leaf_reg': 15,
        'bootstrap_type': 'Bernoulli',
        #'bagging_temperature': 0.3,
        'subsample': 0.5,
        'max_depth': 8,
        'feature_border_type': 'MinEntropy',
        'thread_count': 4, 
        'objective': 'CrossEntropy',
        #'min_data_in_leaf': 100,
        'task_type': 'GPU',
        'od_type': 'Iter'
    }

for train_idx, val_idx in strf_split.split(X_train, Y):
    print("***************** ", cv_counter, " *****************", end="\n\n")
    
    t_x = X_train.iloc[train_idx]
    v_x = X_train.iloc[val_idx]
    
    # The feature generators below merge, so t_x / v_x come back with a fresh
    # 0..n-1 index. Give the targets the same index; otherwise the target
    # encoder receives X and y whose indexes do not match (silent misalignment
    # or an error, depending on the category_encoders version).
    t_y = Y.iloc[train_idx].reset_index(drop=True)
    v_y = Y.iloc[val_idx].reset_index(drop=True)
    
    test_x = X_test.copy()

    print('Generating New Features: ')
    #generate new features
    for feature_variable in averaged_feature_variables:
        t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, feature_variable)
    
    for feature_variable in summed_feature_variables:
        t_x, v_x, test_x = generate_summed_features(t_x, v_x, test_x, feature_variable)
    
    
    print('Target Encoding: ')
    t_x, v_x, test_x = perform_target_encoding(cols_to_target_encode, t_x, t_y, v_x, test_x)
    
    
    print("Train Shape: ", t_x.shape)
    print("Val Shape: ", v_x.shape)
    print("Test Shape: ", test_x.shape, end="\n\n")
    
    dtrain = cb.Pool(t_x, label=t_y, cat_features=categorical_col_indices)
    dvalid = cb.Pool(v_x, label=v_y, cat_features=categorical_col_indices)
    dtest = cb.Pool(test_x, cat_features=categorical_col_indices)
        
    # train with early stopping monitored on this fold's validation pool
    model = cb.train(dtrain=dtrain, params = params, num_boost_round=8000, eval_set=[dvalid], early_stopping_rounds=500, 
        verbose_eval=200) 
    
    # column 1 of the probability output is used as the positive-class score
    val_preds = model.predict(dvalid, prediction_type='Probability')
    val_score = roc_auc_score(v_y, val_preds[:,1])
    
    print(val_score)
    
    val_auc_scores.append(val_score)
    
        
    test_preds = model.predict(dtest, prediction_type='Probability')
    test_preds_list.append(test_preds[:,1])
    
    # merges keep the row order, so positions in val_idx still match val_preds
    all_train_predictions[val_idx] = val_preds[:,1]
    
    cv_counter+=1
    
    print("============"*8, end="\n\n")

*****************  1  *****************

Generating New Features: 


  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:16<00:00,  1.50s/it]


Train Shape:  (209837, 151)
Val Shape:  (23317, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6180100	test: 0.6252246	best: 0.6252246 (0)	total: 68.5ms	remaining: 9m 7s
200:	learn: 0.6702284	test: 0.6701829	best: 0.6701829 (200)	total: 14.2s	remaining: 9m 12s
400:	learn: 0.6809058	test: 0.6767584	best: 0.6767845 (399)	total: 28.3s	remaining: 8m 57s
600:	learn: 0.6875666	test: 0.6795236	best: 0.6795236 (600)	total: 42.2s	remaining: 8m 39s
800:	learn: 0.6932673	test: 0.6814446	best: 0.6814446 (800)	total: 56.6s	remaining: 8m 28s
1000:	learn: 0.6978463	test: 0.6825634	best: 0.6825634 (1000)	total: 1m 10s	remaining: 8m 12s
1200:	learn: 0.7024555	test: 0.6834118	best: 0.6834118 (1200)	total: 1m 24s	remaining: 7m 59s
1400:	learn: 0.7074522	test: 0.6843370	best: 0.6843370 (1400)	total: 1m 38s	remaining: 7m 44s
1600:	learn: 0.7123762	test: 0.6849563	best: 0.6849563 (1600)	total: 1m 52s	remaining: 7m 29s
1800:	learn: 0.7170762	test: 0.6854098	best: 0.6854280 (1794)	total: 2m 6s	remaining: 7m 15s

  0%|          | 0/12 [00:00<?, ?it/s]

Generating New Features: 
Target Encoding: 


100%|██████████| 12/12 [00:16<00:00,  1.45s/it]


Train Shape:  (209838, 151)
Val Shape:  (23316, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6102888	test: 0.6153273	best: 0.6153273 (0)	total: 69.8ms	remaining: 9m 18s
200:	learn: 0.6690076	test: 0.6680637	best: 0.6680637 (200)	total: 13.9s	remaining: 8m 59s
400:	learn: 0.6809852	test: 0.6750785	best: 0.6750785 (400)	total: 27.9s	remaining: 8m 49s
600:	learn: 0.6883347	test: 0.6780111	best: 0.6780111 (600)	total: 41.8s	remaining: 8m 35s
800:	learn: 0.6940524	test: 0.6795644	best: 0.6795644 (800)	total: 55.7s	remaining: 8m 20s
1000:	learn: 0.6986986	test: 0.6806423	best: 0.6806423 (1000)	total: 1m 9s	remaining: 8m 8s
1200:	learn: 0.7036362	test: 0.6815943	best: 0.6815943 (1200)	total: 1m 23s	remaining: 7m 54s
1400:	learn: 0.7086558	test: 0.6824106	best: 0.6824106 (1400)	total: 1m 37s	remaining: 7m 40s
1600:	learn: 0.7138055	test: 0.6831146	best: 0.6831146 (1600)	total: 1m 51s	remaining: 7m 26s
1800:	learn: 0.7183977	test: 0.6835742	best: 0.6835742 (1800)	total: 2m 5s	remaining: 7m 12s


  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:16<00:00,  1.50s/it]


Train Shape:  (209838, 151)
Val Shape:  (23316, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6151246	test: 0.6113947	best: 0.6113947 (0)	total: 70.2ms	remaining: 9m 21s
200:	learn: 0.6697018	test: 0.6589148	best: 0.6589148 (200)	total: 14s	remaining: 9m 2s
400:	learn: 0.6812059	test: 0.6661153	best: 0.6661153 (400)	total: 28.2s	remaining: 8m 53s
600:	learn: 0.6882752	test: 0.6691767	best: 0.6691822 (598)	total: 42.5s	remaining: 8m 43s
800:	learn: 0.6936615	test: 0.6710375	best: 0.6710375 (800)	total: 56.9s	remaining: 8m 31s
1000:	learn: 0.6986732	test: 0.6721749	best: 0.6721749 (1000)	total: 1m 11s	remaining: 8m 17s
1200:	learn: 0.7035113	test: 0.6731873	best: 0.6731885 (1199)	total: 1m 25s	remaining: 8m 3s
1400:	learn: 0.7085789	test: 0.6738767	best: 0.6738767 (1400)	total: 1m 39s	remaining: 7m 50s
1600:	learn: 0.7136098	test: 0.6745226	best: 0.6745348 (1598)	total: 1m 54s	remaining: 7m 36s
1800:	learn: 0.7184945	test: 0.6751563	best: 0.6751576 (1798)	total: 2m 8s	remaining: 7m 21s
20

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:16<00:00,  1.45s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6212036	test: 0.6221878	best: 0.6221878 (0)	total: 69.8ms	remaining: 9m 18s
200:	learn: 0.6689574	test: 0.6612124	best: 0.6612124 (200)	total: 15.2s	remaining: 9m 49s
400:	learn: 0.6802546	test: 0.6686067	best: 0.6686067 (400)	total: 29.9s	remaining: 9m 25s
600:	learn: 0.6877683	test: 0.6717388	best: 0.6717388 (600)	total: 44.5s	remaining: 9m 8s
800:	learn: 0.6933571	test: 0.6738038	best: 0.6738043 (799)	total: 59.2s	remaining: 8m 52s
1000:	learn: 0.6981043	test: 0.6751288	best: 0.6751288 (1000)	total: 1m 13s	remaining: 8m 35s
1200:	learn: 0.7028378	test: 0.6762169	best: 0.6762198 (1198)	total: 1m 28s	remaining: 8m 20s
1400:	learn: 0.7077841	test: 0.6770426	best: 0.6770478 (1398)	total: 1m 43s	remaining: 8m 6s
1600:	learn: 0.7128736	test: 0.6778115	best: 0.6778134 (1599)	total: 1m 57s	remaining: 7m 50s
1800:	learn: 0.7176502	test: 0.6783609	best: 0.6783609 (1800)	total: 2m 11s	remaining: 7m 34s

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:17<00:00,  1.54s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6207829	test: 0.6270255	best: 0.6270255 (0)	total: 74.4ms	remaining: 9m 55s
200:	learn: 0.6694131	test: 0.6617967	best: 0.6617967 (200)	total: 14.4s	remaining: 9m 19s
400:	learn: 0.6808850	test: 0.6668075	best: 0.6668075 (400)	total: 29.8s	remaining: 9m 25s
600:	learn: 0.6883122	test: 0.6692252	best: 0.6692252 (600)	total: 44.5s	remaining: 9m 7s
800:	learn: 0.6940648	test: 0.6708897	best: 0.6708897 (800)	total: 59.4s	remaining: 8m 54s
1000:	learn: 0.6990148	test: 0.6721312	best: 0.6721312 (1000)	total: 1m 13s	remaining: 8m 34s
1200:	learn: 0.7037815	test: 0.6729922	best: 0.6729922 (1200)	total: 1m 27s	remaining: 8m 16s
1400:	learn: 0.7090565	test: 0.6737573	best: 0.6737693 (1396)	total: 1m 41s	remaining: 7m 59s
1600:	learn: 0.7141429	test: 0.6743441	best: 0.6743517 (1592)	total: 1m 56s	remaining: 7m 44s
1800:	learn: 0.7189742	test: 0.6747607	best: 0.6747607 (1800)	total: 2m 10s	remaining: 7m 29

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:16<00:00,  1.46s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6150211	test: 0.6154024	best: 0.6154024 (0)	total: 74.5ms	remaining: 9m 56s
200:	learn: 0.6683766	test: 0.6611182	best: 0.6611182 (200)	total: 14.1s	remaining: 9m 6s
400:	learn: 0.6803663	test: 0.6692896	best: 0.6692896 (400)	total: 28.7s	remaining: 9m 3s
600:	learn: 0.6875448	test: 0.6725307	best: 0.6725458 (599)	total: 42.9s	remaining: 8m 48s
800:	learn: 0.6933505	test: 0.6747508	best: 0.6747508 (800)	total: 57s	remaining: 8m 32s
1000:	learn: 0.6982534	test: 0.6761056	best: 0.6761141 (998)	total: 1m 11s	remaining: 8m 20s
1200:	learn: 0.7030725	test: 0.6773573	best: 0.6773605 (1199)	total: 1m 25s	remaining: 8m 3s
1400:	learn: 0.7082370	test: 0.6784204	best: 0.6784212 (1397)	total: 1m 39s	remaining: 7m 49s
1600:	learn: 0.7133025	test: 0.6792787	best: 0.6792872 (1599)	total: 1m 54s	remaining: 7m 36s
1800:	learn: 0.7181502	test: 0.6801305	best: 0.6801305 (1800)	total: 2m 8s	remaining: 7m 22s
2000

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:16<00:00,  1.52s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6190936	test: 0.6258475	best: 0.6258475 (0)	total: 74.9ms	remaining: 9m 59s
200:	learn: 0.6682376	test: 0.6711459	best: 0.6711459 (200)	total: 14.3s	remaining: 9m 14s
400:	learn: 0.6792615	test: 0.6791327	best: 0.6791327 (400)	total: 28.7s	remaining: 9m 3s
600:	learn: 0.6866322	test: 0.6826566	best: 0.6826566 (600)	total: 43.2s	remaining: 8m 52s
800:	learn: 0.6924866	test: 0.6847713	best: 0.6847713 (800)	total: 58.1s	remaining: 8m 42s
1000:	learn: 0.6973137	test: 0.6861113	best: 0.6861135 (999)	total: 1m 12s	remaining: 8m 27s
1200:	learn: 0.7020875	test: 0.6870045	best: 0.6870070 (1197)	total: 1m 27s	remaining: 8m 13s
1400:	learn: 0.7071172	test: 0.6878608	best: 0.6878608 (1400)	total: 1m 41s	remaining: 7m 58s
1600:	learn: 0.7122749	test: 0.6885542	best: 0.6885663 (1597)	total: 1m 56s	remaining: 7m 43s
1800:	learn: 0.7171618	test: 0.6890279	best: 0.6890367 (1798)	total: 2m 10s	remaining: 7m 29s

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:16<00:00,  1.43s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6092481	test: 0.6123896	best: 0.6123896 (0)	total: 77.4ms	remaining: 10m 19s
200:	learn: 0.6699985	test: 0.6711140	best: 0.6711140 (200)	total: 14.4s	remaining: 9m 20s
400:	learn: 0.6802455	test: 0.6777788	best: 0.6777788 (400)	total: 29.3s	remaining: 9m 16s
600:	learn: 0.6871257	test: 0.6812608	best: 0.6812608 (600)	total: 43.8s	remaining: 8m 59s
800:	learn: 0.6928342	test: 0.6834083	best: 0.6834083 (800)	total: 57.8s	remaining: 8m 39s
1000:	learn: 0.6977021	test: 0.6849452	best: 0.6849452 (1000)	total: 1m 11s	remaining: 8m 22s
1200:	learn: 0.7021787	test: 0.6859623	best: 0.6859623 (1200)	total: 1m 25s	remaining: 8m 5s
1400:	learn: 0.7072778	test: 0.6868327	best: 0.6868386 (1398)	total: 1m 40s	remaining: 7m 52s
1600:	learn: 0.7123730	test: 0.6875980	best: 0.6876197 (1596)	total: 1m 54s	remaining: 7m 37s
1800:	learn: 0.7170531	test: 0.6882201	best: 0.6882201 (1800)	total: 2m 8s	remaining: 7m 23

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:17<00:00,  1.55s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6106674	test: 0.6116008	best: 0.6116008 (0)	total: 73ms	remaining: 9m 43s
200:	learn: 0.6706093	test: 0.6604221	best: 0.6604221 (200)	total: 14s	remaining: 9m 4s
400:	learn: 0.6815394	test: 0.6676425	best: 0.6676505 (399)	total: 28.2s	remaining: 8m 55s
600:	learn: 0.6885951	test: 0.6710136	best: 0.6710136 (600)	total: 42.5s	remaining: 8m 43s
800:	learn: 0.6940038	test: 0.6728531	best: 0.6728531 (800)	total: 57.1s	remaining: 8m 32s
1000:	learn: 0.6987375	test: 0.6741627	best: 0.6741627 (1000)	total: 1m 11s	remaining: 8m 19s
1200:	learn: 0.7033435	test: 0.6751811	best: 0.6751896 (1198)	total: 1m 25s	remaining: 8m 6s
1400:	learn: 0.7082403	test: 0.6761177	best: 0.6761177 (1400)	total: 1m 40s	remaining: 7m 53s
1600:	learn: 0.7132990	test: 0.6769377	best: 0.6769462 (1599)	total: 1m 54s	remaining: 7m 38s
1800:	learn: 0.7178195	test: 0.6774698	best: 0.6774760 (1790)	total: 2m 9s	remaining: 7m 24s
2000

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:17<00:00,  1.54s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6103095	test: 0.6204197	best: 0.6204197 (0)	total: 75.4ms	remaining: 10m 3s
200:	learn: 0.6687248	test: 0.6687883	best: 0.6687883 (200)	total: 14.2s	remaining: 9m 9s
400:	learn: 0.6802566	test: 0.6752101	best: 0.6752101 (400)	total: 28.4s	remaining: 8m 59s
600:	learn: 0.6873034	test: 0.6776230	best: 0.6776230 (600)	total: 42.7s	remaining: 8m 45s
800:	learn: 0.6930853	test: 0.6792793	best: 0.6792924 (797)	total: 57.2s	remaining: 8m 33s
1000:	learn: 0.6979759	test: 0.6802434	best: 0.6802434 (1000)	total: 1m 11s	remaining: 8m 17s
1200:	learn: 0.7027317	test: 0.6810288	best: 0.6810305 (1198)	total: 1m 25s	remaining: 8m 1s
1400:	learn: 0.7077017	test: 0.6816406	best: 0.6816474 (1390)	total: 1m 39s	remaining: 7m 47s
1600:	learn: 0.7129839	test: 0.6824879	best: 0.6824922 (1599)	total: 1m 53s	remaining: 7m 33s
1800:	learn: 0.7177700	test: 0.6829760	best: 0.6829760 (1800)	total: 2m 7s	remaining: 7m 19s


In [21]:
# Overall CV score: the unweighted mean of the per-fold validation AUCs
print("CV Score: ", np.mean(val_auc_scores))

CV Score:  0.6842249174664217


In [22]:
### Combine all CV preds for test
# Stack the per-fold test predictions as columns (one column per fold) and
# average across folds to get a single score per test row.
test_preds_cv = pd.DataFrame(np.asarray(test_preds_list).T).mean(axis=1).values

In [23]:
# Write the fold-averaged test predictions into the submission template.
# The assignment is positional (test_preds_cv is a plain array).
# NOTE(review): this assumes sample_submission lists the rows in the same order as test_df - confirm.
sample_submission['loan_default'] = test_preds_cv
# Optional alternative kept for reference: submit percentile ranks instead of raw probabilities
#sample_submission['loan_default'] = sample_submission['loan_default'].rank(pct=True)
sample_submission.head()

Unnamed: 0,UniqueID,loan_default
0,655269,0.299433
1,723482,0.345351
2,758529,0.344433
3,763449,0.372676
4,708663,0.301643


In [24]:
# Out-of-fold predictions for the training rows, stored under the target's
# column name next to each row's UniqueID
train_oof_preds = train_df[['UniqueID']].assign(loan_default=all_train_predictions)
train_oof_preds.head()

Unnamed: 0,UniqueID,loan_default
0,420825,0.424848
1,537409,0.319894
2,417566,0.384729
3,624493,0.440392
4,539055,0.310727


In [25]:
# Save the submission file without the dataframe index
sample_submission.to_csv('cb2.csv', index=False)
# Disabled: also save the out-of-fold training predictions
#train_oof_preds.to_csv('cb2_Train.csv', index=False)