In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from collections import Counter
import math
import random
from tqdm import tqdm

#%matplotlib inline

In [2]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import preprocessing
from category_encoders import target_encoder, TargetEncoder
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from imblearn.over_sampling import RandomOverSampler, SMOTE
import catboost as cb

pd.set_option('display.max_columns', 200)

Using TensorFlow backend.


In [3]:
def label_encoding_func(df_name, df_col_name):
    '''
    usage: dataframe[column_name] = label_encoding_function(dataframe, column_name)
    '''
    le = preprocessing.LabelEncoder()
    le.fit(df_name[df_col_name])
    return le.transform(df_name[df_col_name])

def do_one_hot_encoding(df_name, df_column_name, suffix=''):
    '''
    usage: dataframe[column_name] = do_one_hot_encoding(dataframe, column_name, suffix_for_column_name)
    '''
    x = pd.get_dummies(df_name[df_column_name])
    df_name = df_name.join(x, lsuffix=suffix)
    df_name = df_name.drop(df_column_name, axis=1) 
    return df_name

#function for perform target encoding later on
def perform_target_encoding(columns, X, Y, X_Val, X_Test):
    for i in tqdm(columns):
        target_enc = TargetEncoder(cols=[i], smoothing=3)
        target_enc_fit = target_enc.fit(X, Y)
        X[i] = target_enc.transform(X, Y)[i]
        X_Val[i] = target_enc.transform(X_Val)[i]
        X_Test[i] = target_enc.transform(X_Test)[i]
        
    return X, X_Val, X_Test

### Load Feature Engineered Datasets

In [2]:
path = '../../feature_engineering_eda_data/' 
train_file = 'train_feature_engineered_V2.csv'
test_file = 'test_feature_engineered_V2.csv'

train_df = pd.read_csv(path+train_file)
test_df = pd.read_csv(path+test_file)
sample_submission = pd.read_csv(path+'sample_submission_24jSKY6.csv')

In [5]:
train_df.head()

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,DisbursalDate,State_ID,Employee_code_ID,MobileNo_Avl_Flag,Aadhar_flag,PAN_flag,VoterID_flag,Driving_flag,Passport_flag,PERFORM_CNS.SCORE,PERFORM_CNS.SCORE.DESCRIPTION,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRI.DISBURSED.AMOUNT,SEC.NO.OF.ACCTS,SEC.ACTIVE.ACCTS,SEC.OVERDUE.ACCTS,SEC.CURRENT.BALANCE,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,NO.OF_INQUIRIES,loan_default,F1_Manual_LTV,F2_difference_asset_disbursed,F3_avg_primary_sanctioned_amt,F4_avg_primary_disbursed_amt,F5_ratio_primary_active_overdue,F6_age_at_disbursal,F7.1_DOB_Y,F7.2_DOB_M,F7.3_DOB_D,F8.1_DisDate_Y,F8.2_DisDate_M,F8.3_DisDate_D,F9.1_AVG.ACCT.AGE_Y,F9.2_AVG.ACCT.AGE_M,F10.1_CREDIT.HIST_Y,F10.2_CREDIT.HIST_M
0,420825,50578,58400,89.55,67,22807,45,1441,01-01-84,Salaried,03-08-18,6,1998,1,1,0,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,86.606164,7822,0.0,0.0,0.0,34.610959,1984,1,1,2018,8,3,0,0,0,0
1,537409,47145,65550,73.23,67,22807,45,1502,31-07-85,Self employed,26-09-18,6,1998,1,1,0,0,0,0,598,I-Medium Risk,1,1,1,27600,50200,50200,0,0,0,0,0,0,1991,0,0,1,0,1,71.922197,18405,50200.0,50200.0,1.0,33.178082,1985,7,31,2018,9,26,1,11,1,11
2,417566,53278,61360,89.63,67,22807,45,1497,24-08-85,Self employed,01-08-18,6,1998,1,1,0,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,86.828553,8082,0.0,0.0,0.0,32.958904,1985,8,24,2018,8,1,0,0,0,0
3,624493,57513,66113,88.48,67,22807,45,1501,30-12-93,Self employed,26-10-18,6,1998,1,1,0,0,0,0,305,L-Very High Risk,3,0,0,0,0,0,0,0,0,0,0,0,31,0,0,0,1,1,86.991968,8600,0.0,0.0,0.0,24.838356,1993,12,30,2018,10,26,0,8,1,3
4,539055,52378,60300,88.39,67,22807,45,1495,09-12-77,Self employed,26-09-18,6,1998,1,1,0,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,86.862355,7922,0.0,0.0,0.0,40.824658,1977,12,9,2018,9,26,0,0,0,0


In [6]:
test_df.head()

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,DisbursalDate,State_ID,Employee_code_ID,MobileNo_Avl_Flag,Aadhar_flag,PAN_flag,VoterID_flag,Driving_flag,Passport_flag,PERFORM_CNS.SCORE,PERFORM_CNS.SCORE.DESCRIPTION,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRI.DISBURSED.AMOUNT,SEC.NO.OF.ACCTS,SEC.ACTIVE.ACCTS,SEC.OVERDUE.ACCTS,SEC.CURRENT.BALANCE,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,NO.OF_INQUIRIES,F1_Manual_LTV,F2_difference_asset_disbursed,F3_avg_primary_sanctioned_amt,F4_avg_primary_disbursed_amt,F5_ratio_primary_active_overdue,F6_age_at_disbursal,F7.1_DOB_Y,F7.2_DOB_M,F7.3_DOB_D,F8.1_DisDate_Y,F8.2_DisDate_M,F8.3_DisDate_D,F9.1_AVG.ACCT.AGE_Y,F9.2_AVG.ACCT.AGE_M,F10.1_CREDIT.HIST_Y,F10.2_CREDIT.HIST_M
0,655269,53478,63558,86.54,67,22807,45,1497,01-01-74,Salaried,03-11-18,6,1998,1,1,0,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,84.14047,10080,0.0,0.0,0.0,44.868493,1974,1,1,2018,11,3,0,0,0,0
1,723482,55513,63163,89.45,67,22807,45,1497,20-05-85,Self employed,20-11-18,6,1998,1,1,0,0,0,0,749,C-Very Low Risk,2,1,0,43898,48780,48780,0,0,0,0,0,0,5605,0,1,0,1,87.888479,7650,24390.0,24390.0,0.0,33.526027,1985,5,20,2018,11,20,0,8,1,0
2,758529,65282,84320,79.93,78,23135,86,2071,14-10-95,Salaried,29-11-18,4,1646,1,1,1,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,77.421727,19038,0.0,0.0,0.0,23.142466,1995,10,14,2018,11,29,0,0,0,0
3,763449,46905,63896,76.58,78,17014,45,2070,01-06-73,Self employed,29-11-18,4,1646,1,1,0,0,0,0,14,Not Scored: Only a Guarantor,1,1,1,132480,255000,255000,0,0,0,0,0,0,0,0,0,0,0,73.408351,16991,255000.0,255000.0,1.0,45.526027,1973,6,1,2018,11,29,2,5,2,5
4,708663,51428,63896,86.08,78,17014,45,2069,01-06-72,Salaried,17-11-18,4,1646,1,1,0,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,80.487041,12468,0.0,0.0,0.0,46.493151,1972,6,1,2018,11,17,0,0,0,0


### Combine Train and Test

In [7]:
X_train = train_df[train_df.columns.difference(['loan_default', 'UniqueID'])]
X_test = test_df[train_df.columns.difference(['loan_default', 'UniqueID'])]
Y = train_df['loan_default']

### Concat train and test for common preprocessing
concat_df = pd.concat([X_train, X_test], keys=['train', 'test'])

In [8]:
concat_df.head()

Unnamed: 0,Unnamed: 1,Aadhar_flag,Current_pincode_ID,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,Date.of.Birth,DisbursalDate,Driving_flag,Employee_code_ID,Employment.Type,F10.1_CREDIT.HIST_Y,F10.2_CREDIT.HIST_M,F1_Manual_LTV,F2_difference_asset_disbursed,F3_avg_primary_sanctioned_amt,F4_avg_primary_disbursed_amt,F5_ratio_primary_active_overdue,F6_age_at_disbursal,F7.1_DOB_Y,F7.2_DOB_M,F7.3_DOB_D,F8.1_DisDate_Y,F8.2_DisDate_M,F8.3_DisDate_D,F9.1_AVG.ACCT.AGE_Y,F9.2_AVG.ACCT.AGE_M,MobileNo_Avl_Flag,NEW.ACCTS.IN.LAST.SIX.MONTHS,NO.OF_INQUIRIES,PAN_flag,PERFORM_CNS.SCORE,PERFORM_CNS.SCORE.DESCRIPTION,PRI.ACTIVE.ACCTS,PRI.CURRENT.BALANCE,PRI.DISBURSED.AMOUNT,PRI.NO.OF.ACCTS,PRI.OVERDUE.ACCTS,PRI.SANCTIONED.AMOUNT,PRIMARY.INSTAL.AMT,Passport_flag,SEC.ACTIVE.ACCTS,SEC.CURRENT.BALANCE,SEC.DISBURSED.AMOUNT,SEC.INSTAL.AMT,SEC.NO.OF.ACCTS,SEC.OVERDUE.ACCTS,SEC.SANCTIONED.AMOUNT,State_ID,VoterID_flag,asset_cost,branch_id,disbursed_amount,ltv,manufacturer_id,supplier_id
train,0,1,1441,0,01-01-84,03-08-18,0,1998,Salaried,0,0,86.606164,7822,0.0,0.0,0.0,34.610959,1984,1,1,2018,8,3,0,0,1,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,58400,67,50578,89.55,45,22807
train,1,1,1502,1,31-07-85,26-09-18,0,1998,Self employed,1,11,71.922197,18405,50200.0,50200.0,1.0,33.178082,1985,7,31,2018,9,26,1,11,1,0,0,0,598,I-Medium Risk,1,27600,50200,1,1,50200,1991,0,0,0,0,0,0,0,0,6,0,65550,67,47145,73.23,45,22807
train,2,1,1497,0,24-08-85,01-08-18,0,1998,Self employed,0,0,86.828553,8082,0.0,0.0,0.0,32.958904,1985,8,24,2018,8,1,0,0,1,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,61360,67,53278,89.63,45,22807
train,3,1,1501,0,30-12-93,26-10-18,0,1998,Self employed,1,3,86.991968,8600,0.0,0.0,0.0,24.838356,1993,12,30,2018,10,26,0,8,1,0,1,0,305,L-Very High Risk,0,0,0,3,0,0,31,0,0,0,0,0,0,0,0,6,0,66113,67,57513,88.48,45,22807
train,4,1,1495,0,09-12-77,26-09-18,0,1998,Self employed,0,0,86.862355,7922,0.0,0.0,0.0,40.824658,1977,12,9,2018,9,26,0,0,1,0,1,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,60300,67,52378,88.39,45,22807


In [9]:
#filling NAs with 0
concat_df.isna().sum(axis=0).reset_index().T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52
index,Aadhar_flag,Current_pincode_ID,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,Date.of.Birth,DisbursalDate,Driving_flag,Employee_code_ID,Employment.Type,F10.1_CREDIT.HIST_Y,F10.2_CREDIT.HIST_M,F1_Manual_LTV,F2_difference_asset_disbursed,F3_avg_primary_sanctioned_amt,F4_avg_primary_disbursed_amt,F5_ratio_primary_active_overdue,F6_age_at_disbursal,F7.1_DOB_Y,F7.2_DOB_M,F7.3_DOB_D,F8.1_DisDate_Y,F8.2_DisDate_M,F8.3_DisDate_D,F9.1_AVG.ACCT.AGE_Y,F9.2_AVG.ACCT.AGE_M,MobileNo_Avl_Flag,NEW.ACCTS.IN.LAST.SIX.MONTHS,NO.OF_INQUIRIES,PAN_flag,PERFORM_CNS.SCORE,PERFORM_CNS.SCORE.DESCRIPTION,PRI.ACTIVE.ACCTS,PRI.CURRENT.BALANCE,PRI.DISBURSED.AMOUNT,PRI.NO.OF.ACCTS,PRI.OVERDUE.ACCTS,PRI.SANCTIONED.AMOUNT,PRIMARY.INSTAL.AMT,Passport_flag,SEC.ACTIVE.ACCTS,SEC.CURRENT.BALANCE,SEC.DISBURSED.AMOUNT,SEC.INSTAL.AMT,SEC.NO.OF.ACCTS,SEC.OVERDUE.ACCTS,SEC.SANCTIONED.AMOUNT,State_ID,VoterID_flag,asset_cost,branch_id,disbursed_amount,ltv,manufacturer_id,supplier_id
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### More Features

In [10]:
concat_df['employee_id_branch_id'] = concat_df['branch_id'].apply(str)+"-"+concat_df['Employee_code_ID'].apply(str)

bins = [-np.inf, 20, 25, 30, 35, 40, 45, 50, np.inf]
labels = [1,2,3,4,5,6,7,8]

concat_df['F6_age_bins'] = np.asarray(pd.cut(concat_df['F6_age_at_disbursal'], bins=bins, labels=labels).values)

In [11]:
concat_df['F10.3_CREDIT.HIST_DAYS'] = (concat_df['F10.1_CREDIT.HIST_Y'] * 365) +\
                                        (concat_df['F10.2_CREDIT.HIST_M'] * 30)

concat_df['F9.3_AVG.ACCT.AGE_DAYS'] = (concat_df['F9.1_AVG.ACCT.AGE_Y'] * 365) +\
                                        (concat_df['F9.2_AVG.ACCT.AGE_M'] * 30)

concat_df['BalancePerActiveAccount'] = concat_df['PRI.CURRENT.BALANCE']/concat_df['PRI.ACTIVE.ACCTS']

concat_df['PRI.NoOfInstallmentsLeft'] = concat_df['PRI.CURRENT.BALANCE']/concat_df['PRIMARY.INSTAL.AMT']

concat_df['Disbursed_CurrentBalance_Diff'] = concat_df['disbursed_amount'] - concat_df['PRI.CURRENT.BALANCE']

#combine primary and secondary values
concat_df['TotalInstallAmt'] = concat_df['PRIMARY.INSTAL.AMT'] + concat_df['SEC.INSTAL.AMT']
concat_df['TotalDisbAmt'] = concat_df['PRI.DISBURSED.AMOUNT'] + concat_df['SEC.DISBURSED.AMOUNT']
concat_df['TotalCurrentBalance'] = concat_df['PRI.CURRENT.BALANCE'] + concat_df['SEC.CURRENT.BALANCE']
concat_df['TotalActiveAccts'] = concat_df['PRI.ACTIVE.ACCTS'] + concat_df['SEC.ACTIVE.ACCTS']
concat_df['TotalOverdueAccts'] = concat_df['PRI.OVERDUE.ACCTS'] + concat_df['SEC.OVERDUE.ACCTS']
concat_df['TotalAccts'] = concat_df['PRI.NO.OF.ACCTS'] + concat_df['SEC.NO.OF.ACCTS']
concat_df['TotalSancAmt'] = concat_df['PRI.SANCTIONED.AMOUNT'] + concat_df['SEC.SANCTIONED.AMOUNT']

### Dealing with NAs and Label Encoding Categorical features

In [12]:
#### replace nulls as a new category
concat_df['Employment.Type'].fillna('NA', inplace=True)

concat_df.fillna(0, inplace=True)

### replace -inf and +inf with 0

#filling infs
for i in concat_df.columns.values:
    if (len(concat_df.loc[concat_df[i] == np.inf, i]) != 0)or(len(concat_df.loc[concat_df[i] == -np.inf, i]) != 0):
        print(i)
        concat_df.loc[concat_df[i] == np.inf, i] = 0
        concat_df.loc[concat_df[i] == -np.inf, i] = 0

PRI.NoOfInstallmentsLeft


In [13]:
def label_encode_apply(df):
    if df[0] == object:
        concat_df[df['index']] = label_encoding_func(concat_df, df['index'])
        
_ = concat_df.dtypes.reset_index().apply(label_encode_apply, axis=1)
print('Done')

Done


### Split Train Test

In [14]:
X_train = concat_df.loc['train']
X_test = concat_df.loc['test']

### Yet More Features

In [15]:
### this function is for running inside cv

def generate_summed_features(train, val, test, variable=''):
    '''
    function to generate new features inside cv
    '''
    pincode = train.groupby('Current_pincode_ID')[variable].sum().reset_index()
    state = train.groupby('State_ID')[variable].sum().reset_index()
    supplier = train.groupby('supplier_id')[variable].sum().reset_index()
    branch = train.groupby('branch_id')[variable].sum().reset_index()
    
    list_of_dfs = [train, val, test]
    
    for i in range(len(list_of_dfs)):
        list_of_dfs[i] = pd.merge(list_of_dfs[i], pincode, how='left', on='Current_pincode_ID', 
         suffixes=('', '_sum_pincode_F11.1'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], state, how='left', on='State_ID', 
         suffixes=('', '_sum_state_F11.2'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], supplier, how='left', on='supplier_id', 
         suffixes=('', '_sum_supplier_F11.3'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], branch, how='left', on='branch_id', 
         suffixes=('', '_sum_branch_F11.4'))
        
        list_of_dfs[i].fillna(0, inplace=True)
        
    train, val, test = list_of_dfs[0], list_of_dfs[1], list_of_dfs[2]    
    return train, val, test  


#### this function is for running inside cv

def generate_averaged_features(train, val, test, variable=''):
    '''
    function to generate new features inside cv
    '''
    pincode = train.groupby('Current_pincode_ID')[variable].mean().reset_index()
    branch = train.groupby('branch_id')[variable].mean().reset_index()
    employee_code_id = train.groupby('Employee_code_ID')[variable].mean().reset_index()
    employee_code_id_branch_id = train.groupby('employee_id_branch_id')[variable].mean().reset_index()
    supplier_id = train.groupby('supplier_id')[variable].mean().reset_index()

    list_of_dfs = [train, val, test]
    
    for i in range(len(list_of_dfs)):
        list_of_dfs[i] = pd.merge(list_of_dfs[i], pincode, how='left', on='Current_pincode_ID', 
         suffixes=('', '_mean_pincode_F13.1'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], branch, how='left', on='branch_id', 
         suffixes=('', '_mean_branch_F13.2'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], employee_code_id, how='left', on='Employee_code_ID', 
         suffixes=('', '_mean_employeeid_F13.3'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], employee_code_id_branch_id, how='left',on='employee_id_branch_id', 
         suffixes=('', '_mean_employee_id_branch_id_F13.4'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], supplier_id, how='left', on='supplier_id', 
         suffixes=('', '_mean_supplier_id_F13.5'))
        
        list_of_dfs[i].fillna(0, inplace=True)
        
    train, val, test = list_of_dfs[0], list_of_dfs[1], list_of_dfs[2]    
    return train, val, test  

### Specify cols to TE inside CV and categorical cols for catboost

In [16]:
cols_to_target_encode = ['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'F6_age_bins',
                        'manufacturer_id', 'State_ID', 'Employment.Type', 'PRI.OVERDUE.ACCTS',
                        'PRI.ACTIVE.ACCTS', 'F7.1_DOB_Y', 'PERFORM_CNS.SCORE.DESCRIPTION', 'SEC.NO.OF.ACCTS',
                        'NO.OF_INQUIRIES', 'NEW.ACCTS.IN.LAST.SIX.MONTHS']

In [17]:
for i,j in enumerate(X_train.columns):
    print(i," --> ", j, end=" || ")

0  -->  Aadhar_flag || 1  -->  Current_pincode_ID || 2  -->  DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS || 3  -->  Date.of.Birth || 4  -->  DisbursalDate || 5  -->  Driving_flag || 6  -->  Employee_code_ID || 7  -->  Employment.Type || 8  -->  F10.1_CREDIT.HIST_Y || 9  -->  F10.2_CREDIT.HIST_M || 10  -->  F1_Manual_LTV || 11  -->  F2_difference_asset_disbursed || 12  -->  F3_avg_primary_sanctioned_amt || 13  -->  F4_avg_primary_disbursed_amt || 14  -->  F5_ratio_primary_active_overdue || 15  -->  F6_age_at_disbursal || 16  -->  F7.1_DOB_Y || 17  -->  F7.2_DOB_M || 18  -->  F7.3_DOB_D || 19  -->  F8.1_DisDate_Y || 20  -->  F8.2_DisDate_M || 21  -->  F8.3_DisDate_D || 22  -->  F9.1_AVG.ACCT.AGE_Y || 23  -->  F9.2_AVG.ACCT.AGE_M || 24  -->  MobileNo_Avl_Flag || 25  -->  NEW.ACCTS.IN.LAST.SIX.MONTHS || 26  -->  NO.OF_INQUIRIES || 27  -->  PAN_flag || 28  -->  PERFORM_CNS.SCORE || 29  -->  PERFORM_CNS.SCORE.DESCRIPTION || 30  -->  PRI.ACTIVE.ACCTS || 31  -->  PRI.CURRENT.BALANCE || 32  -->  PRI.DI

In [18]:
categorical_col_indices = [0, 1, 17, 18, 5, 8, 9, 37, 46, 6, 52, 48, 64]

### Model with 10 Fold CV

In [19]:
strf_split = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [20]:
train_preds_list_oof_semi_stacking = []
val_auc_scores = []
test_preds_list = []

all_train_predictions = np.zeros([X_train.shape[0]])

cv_counter = 1

for train_idx, val_idx in strf_split.split(X_train, Y):
    print("***************** ", cv_counter, " *****************", end="\n\n")
    
    t_x = X_train.iloc[train_idx]
    v_x = X_train.iloc[val_idx]
    
    t_y = Y[train_idx]
    v_y = Y[val_idx]
    
    test_x = X_test.copy()

    print('Generating New Features: ')
    #generate new features
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'ltv')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'PRI.CURRENT.BALANCE')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'disbursed_amount')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'asset_cost')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'PRIMARY.INSTAL.AMT')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'PERFORM_CNS.SCORE')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'F4_avg_primary_disbursed_amt')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'F2_difference_asset_disbursed')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'PRI.NO.OF.ACCTS')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'F6_age_at_disbursal')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'TotalCurrentBalance')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'TotalInstallAmt')
    
    t_x, v_x, test_x = generate_summed_features(t_x, v_x, test_x, 'PRI.ACTIVE.ACCTS')
    t_x, v_x, test_x = generate_summed_features(t_x, v_x, test_x, 'PRI.OVERDUE.ACCTS')
    t_x, v_x, test_x = generate_summed_features(t_x, v_x, test_x, 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS')
    t_x, v_x, test_x = generate_summed_features(t_x, v_x, test_x, 'NO.OF_INQUIRIES')
    t_x, v_x, test_x = generate_summed_features(t_x, v_x, test_x, 'TotalActiveAccts')
    t_x, v_x, test_x = generate_summed_features(t_x, v_x, test_x, 'TotalOverdueAccts')
    
    
    print('Target Encoding: ')
    t_x, v_x, test_x = perform_target_encoding(cols_to_target_encode, t_x, t_y, v_x, test_x)
    
    
    print("Train Shape: ", t_x.shape)
    print("Val Shape: ", v_x.shape)
    print("Test Shape: ", test_x.shape, end="\n\n")
    
    params = {
            'eval_metric': 'AUC',
            'learning_rate': 0.01,
            'random_seed': 12321,
            'l2_leaf_reg': 15,
            'bootstrap_type': 'Bernoulli',
            #'bagging_temperature': 0.3,
            'subsample': 0.5,
            'max_depth': 8,
            'feature_border_type': 'MinEntropy',
            'thread_count': 4, 
            'objective': 'CrossEntropy',
            #'min_data_in_leaf': 100,
            'task_type': 'GPU',
            'od_type': 'Iter',
            'allow_writing_files': False,
            'boosting_type': 'Plain'
        }

    #print(t_x.iloc[:,categorical_col_indices])
    
    dtrain = cb.Pool(t_x, label=t_y, cat_features=categorical_col_indices)
    dvalid = cb.Pool(v_x, label=v_y, cat_features=categorical_col_indices)
    dtest = cb.Pool(test_x, cat_features=categorical_col_indices)
        
    model = cb.train(dtrain=dtrain, params = params, num_boost_round=8000, eval_set=[dvalid], early_stopping_rounds=500, 
        verbose_eval=200) 
    
    val_preds = model.predict(dvalid, prediction_type='Probability')
    val_score = roc_auc_score(v_y, val_preds[:,1])
    
    print(val_score)
    
    val_auc_scores.append(val_score)
    
        
    test_preds = model.predict(dtest, prediction_type='Probability')
    test_preds_list.append(test_preds[:,1])
    
    all_train_predictions[val_idx] = val_preds[:,1]
    
    cv_counter+=1
    
    print("============"*8, end="\n\n")
    del t_x, v_x, test_x, model
    gc.collect()

*****************  1  *****************

Generating New Features: 


  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:13<00:00,  1.25s/it]


Train Shape:  (209837, 151)
Val Shape:  (23317, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6244942	test: 0.6338793	best: 0.6338793 (0)	total: 64.9ms	remaining: 8m 39s
200:	learn: 0.6699936	test: 0.6698439	best: 0.6698439 (200)	total: 13.2s	remaining: 8m 30s
400:	learn: 0.6803455	test: 0.6759052	best: 0.6759052 (400)	total: 26.5s	remaining: 8m 22s
600:	learn: 0.6876432	test: 0.6792884	best: 0.6792884 (600)	total: 39.9s	remaining: 8m 11s
800:	learn: 0.6931514	test: 0.6809577	best: 0.6809577 (800)	total: 53.2s	remaining: 7m 57s
1000:	learn: 0.6979028	test: 0.6820707	best: 0.6820707 (1000)	total: 1m 6s	remaining: 7m 43s
1200:	learn: 0.7025376	test: 0.6830864	best: 0.6830902 (1199)	total: 1m 19s	remaining: 7m 29s
1400:	learn: 0.7073649	test: 0.6840283	best: 0.6840283 (1400)	total: 1m 32s	remaining: 7m 15s
1600:	learn: 0.7125248	test: 0.6848480	best: 0.6848480 (1600)	total: 1m 45s	remaining: 7m 2s
1800:	learn: 0.7171521	test: 0.6853914	best: 0.6853914 (1800)	total: 1m 59s	remaining: 6m 50s

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:13<00:00,  1.13s/it]


Train Shape:  (209838, 151)
Val Shape:  (23316, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6102888	test: 0.6153273	best: 0.6153273 (0)	total: 68.4ms	remaining: 9m 7s
200:	learn: 0.6689389	test: 0.6684332	best: 0.6684332 (200)	total: 13.2s	remaining: 8m 33s
400:	learn: 0.6808148	test: 0.6752353	best: 0.6752353 (400)	total: 26.6s	remaining: 8m 24s
600:	learn: 0.6883148	test: 0.6780842	best: 0.6780887 (598)	total: 40.1s	remaining: 8m 13s
800:	learn: 0.6940496	test: 0.6797667	best: 0.6797692 (798)	total: 53.5s	remaining: 8m
1000:	learn: 0.6988723	test: 0.6807849	best: 0.6807849 (1000)	total: 1m 6s	remaining: 7m 47s
1200:	learn: 0.7037901	test: 0.6817763	best: 0.6817763 (1200)	total: 1m 20s	remaining: 7m 33s
1400:	learn: 0.7088982	test: 0.6825740	best: 0.6825740 (1400)	total: 1m 33s	remaining: 7m 20s
1600:	learn: 0.7140447	test: 0.6832923	best: 0.6832993 (1594)	total: 1m 46s	remaining: 7m 7s
1800:	learn: 0.7186891	test: 0.6838377	best: 0.6838405 (1782)	total: 2m	remaining: 6m 53s
2000:	le

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:12<00:00,  1.12s/it]


Train Shape:  (209838, 151)
Val Shape:  (23316, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6220968	test: 0.6183953	best: 0.6183953 (0)	total: 66.4ms	remaining: 8m 51s
200:	learn: 0.6695985	test: 0.6589599	best: 0.6589599 (200)	total: 13.3s	remaining: 8m 34s
400:	learn: 0.6809765	test: 0.6658550	best: 0.6658600 (398)	total: 26.9s	remaining: 8m 29s
600:	learn: 0.6883375	test: 0.6691656	best: 0.6691656 (600)	total: 40.9s	remaining: 8m 23s
800:	learn: 0.6938695	test: 0.6708945	best: 0.6709011 (799)	total: 54.5s	remaining: 8m 9s
1000:	learn: 0.6988212	test: 0.6721921	best: 0.6721921 (1000)	total: 1m 7s	remaining: 7m 55s
1200:	learn: 0.7033997	test: 0.6730093	best: 0.6730138 (1199)	total: 1m 21s	remaining: 7m 40s
1400:	learn: 0.7082689	test: 0.6738732	best: 0.6738740 (1397)	total: 1m 34s	remaining: 7m 27s
1600:	learn: 0.7134794	test: 0.6746529	best: 0.6746589 (1598)	total: 1m 48s	remaining: 7m 14s
1800:	learn: 0.7183807	test: 0.6752089	best: 0.6752117 (1799)	total: 2m 2s	remaining: 7m 2s
2

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:13<00:00,  1.19s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6166383	test: 0.6209838	best: 0.6209838 (0)	total: 65.7ms	remaining: 8m 45s
200:	learn: 0.6691271	test: 0.6615986	best: 0.6615986 (200)	total: 13.3s	remaining: 8m 37s
400:	learn: 0.6807185	test: 0.6688839	best: 0.6688839 (400)	total: 27s	remaining: 8m 32s
600:	learn: 0.6880913	test: 0.6719526	best: 0.6719526 (600)	total: 40.3s	remaining: 8m 16s
800:	learn: 0.6936498	test: 0.6739376	best: 0.6739446 (799)	total: 54s	remaining: 8m 5s
1000:	learn: 0.6984635	test: 0.6753186	best: 0.6753186 (1000)	total: 1m 7s	remaining: 7m 49s
1200:	learn: 0.7032313	test: 0.6763625	best: 0.6763716 (1196)	total: 1m 20s	remaining: 7m 35s
1400:	learn: 0.7081072	test: 0.6774299	best: 0.6774299 (1400)	total: 1m 33s	remaining: 7m 20s
1600:	learn: 0.7131472	test: 0.6782510	best: 0.6782561 (1594)	total: 1m 47s	remaining: 7m 7s
1800:	learn: 0.7179919	test: 0.6789840	best: 0.6789840 (1800)	total: 2m	remaining: 6m 53s
2000:	le

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:13<00:00,  1.25s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6274616	test: 0.6292401	best: 0.6292401 (0)	total: 64.8ms	remaining: 8m 38s
200:	learn: 0.6694487	test: 0.6618757	best: 0.6618757 (200)	total: 13.1s	remaining: 8m 28s
400:	learn: 0.6807715	test: 0.6667009	best: 0.6667122 (396)	total: 26.4s	remaining: 8m 20s
600:	learn: 0.6880373	test: 0.6691010	best: 0.6691010 (600)	total: 39.6s	remaining: 8m 7s
800:	learn: 0.6939225	test: 0.6707104	best: 0.6707104 (800)	total: 53s	remaining: 7m 56s
1000:	learn: 0.6989170	test: 0.6720231	best: 0.6720231 (1000)	total: 1m 6s	remaining: 7m 44s
1200:	learn: 0.7037336	test: 0.6728841	best: 0.6728854 (1198)	total: 1m 19s	remaining: 7m 31s
1400:	learn: 0.7091086	test: 0.6736824	best: 0.6736842 (1399)	total: 1m 32s	remaining: 7m 17s
1600:	learn: 0.7141840	test: 0.6741962	best: 0.6742040 (1592)	total: 1m 46s	remaining: 7m 4s
1800:	learn: 0.7192296	test: 0.6745498	best: 0.6745498 (1800)	total: 1m 59s	remaining: 6m 50s
20

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:14<00:00,  1.26s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6236358	test: 0.6236250	best: 0.6236250 (0)	total: 72.3ms	remaining: 9m 38s
200:	learn: 0.6682518	test: 0.6612312	best: 0.6612312 (200)	total: 13.3s	remaining: 8m 37s
400:	learn: 0.6800122	test: 0.6693476	best: 0.6693476 (400)	total: 26.8s	remaining: 8m 27s
600:	learn: 0.6873857	test: 0.6728401	best: 0.6728428 (599)	total: 40s	remaining: 8m 12s
800:	learn: 0.6932629	test: 0.6750777	best: 0.6750777 (800)	total: 53.5s	remaining: 8m
1000:	learn: 0.6981879	test: 0.6766108	best: 0.6766108 (1000)	total: 1m 6s	remaining: 7m 47s
1200:	learn: 0.7030538	test: 0.6777807	best: 0.6777947 (1198)	total: 1m 19s	remaining: 7m 32s
1400:	learn: 0.7082963	test: 0.6787973	best: 0.6787973 (1400)	total: 1m 33s	remaining: 7m 18s
1600:	learn: 0.7135230	test: 0.6797041	best: 0.6797051 (1597)	total: 1m 46s	remaining: 7m 4s
1800:	learn: 0.7184491	test: 0.6805517	best: 0.6805517 (1800)	total: 1m 59s	remaining: 6m 51s
2000:

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:13<00:00,  1.16s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6218741	test: 0.6276419	best: 0.6276419 (0)	total: 70.2ms	remaining: 9m 21s
200:	learn: 0.6683664	test: 0.6713974	best: 0.6713974 (200)	total: 13.1s	remaining: 8m 28s
400:	learn: 0.6790777	test: 0.6789909	best: 0.6789909 (400)	total: 26.6s	remaining: 8m 23s
600:	learn: 0.6865049	test: 0.6825891	best: 0.6825891 (600)	total: 40.1s	remaining: 8m 13s
800:	learn: 0.6924367	test: 0.6845918	best: 0.6845918 (800)	total: 53.4s	remaining: 7m 59s
1000:	learn: 0.6974672	test: 0.6859041	best: 0.6859041 (1000)	total: 1m 6s	remaining: 7m 48s
1200:	learn: 0.7022553	test: 0.6869625	best: 0.6869725 (1196)	total: 1m 20s	remaining: 7m 33s
1400:	learn: 0.7073578	test: 0.6879377	best: 0.6879377 (1400)	total: 1m 33s	remaining: 7m 20s
1600:	learn: 0.7125669	test: 0.6888199	best: 0.6888259 (1597)	total: 1m 46s	remaining: 7m 6s
1800:	learn: 0.7174982	test: 0.6893206	best: 0.6893206 (1800)	total: 1m 59s	remaining: 6m 52s

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:13<00:00,  1.15s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6092477	test: 0.6123891	best: 0.6123891 (0)	total: 73.6ms	remaining: 9m 48s
200:	learn: 0.6696516	test: 0.6707405	best: 0.6707405 (200)	total: 13.2s	remaining: 8m 31s
400:	learn: 0.6800690	test: 0.6779333	best: 0.6779333 (400)	total: 26.5s	remaining: 8m 22s
600:	learn: 0.6869588	test: 0.6812674	best: 0.6812674 (600)	total: 39.8s	remaining: 8m 9s
800:	learn: 0.6927359	test: 0.6834219	best: 0.6834219 (800)	total: 53s	remaining: 7m 56s
1000:	learn: 0.6977316	test: 0.6848001	best: 0.6848001 (1000)	total: 1m 6s	remaining: 7m 44s
1200:	learn: 0.7022155	test: 0.6858209	best: 0.6858209 (1200)	total: 1m 19s	remaining: 7m 30s
1400:	learn: 0.7073240	test: 0.6867415	best: 0.6867415 (1400)	total: 1m 32s	remaining: 7m 17s
1600:	learn: 0.7123931	test: 0.6876037	best: 0.6876107 (1599)	total: 1m 46s	remaining: 7m 4s
1800:	learn: 0.7170539	test: 0.6882125	best: 0.6882125 (1800)	total: 1m 59s	remaining: 6m 52s
20

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:13<00:00,  1.14s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6170139	test: 0.6166282	best: 0.6166282 (0)	total: 71.7ms	remaining: 9m 33s
200:	learn: 0.6703567	test: 0.6596574	best: 0.6596574 (200)	total: 13.4s	remaining: 8m 41s
400:	learn: 0.6814033	test: 0.6671480	best: 0.6671480 (400)	total: 26.8s	remaining: 8m 27s
600:	learn: 0.6885217	test: 0.6705267	best: 0.6705267 (600)	total: 40.1s	remaining: 8m 13s
800:	learn: 0.6942319	test: 0.6723585	best: 0.6723585 (800)	total: 53.6s	remaining: 8m 1s
1000:	learn: 0.6991138	test: 0.6736081	best: 0.6736081 (1000)	total: 1m 6s	remaining: 7m 45s
1200:	learn: 0.7036986	test: 0.6746431	best: 0.6746431 (1200)	total: 1m 20s	remaining: 7m 33s
1400:	learn: 0.7086923	test: 0.6756575	best: 0.6756597 (1399)	total: 1m 33s	remaining: 7m 21s
1600:	learn: 0.7136459	test: 0.6764646	best: 0.6764689 (1599)	total: 1m 47s	remaining: 7m 8s
1800:	learn: 0.7182604	test: 0.6769197	best: 0.6769197 (1800)	total: 2m	remaining: 6m 55s
2000

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:13<00:00,  1.15s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6103095	test: 0.6204197	best: 0.6204197 (0)	total: 70.7ms	remaining: 9m 25s
200:	learn: 0.6694430	test: 0.6691679	best: 0.6691679 (200)	total: 13.5s	remaining: 8m 44s
400:	learn: 0.6798427	test: 0.6745695	best: 0.6745695 (400)	total: 26.8s	remaining: 8m 28s
600:	learn: 0.6872532	test: 0.6774501	best: 0.6774501 (600)	total: 40.4s	remaining: 8m 16s
800:	learn: 0.6928539	test: 0.6790784	best: 0.6790784 (800)	total: 53.7s	remaining: 8m 2s
1000:	learn: 0.6979720	test: 0.6803040	best: 0.6803077 (999)	total: 1m 7s	remaining: 7m 48s
1200:	learn: 0.7027355	test: 0.6810215	best: 0.6810291 (1196)	total: 1m 20s	remaining: 7m 35s
1400:	learn: 0.7079271	test: 0.6817729	best: 0.6817729 (1400)	total: 1m 34s	remaining: 7m 23s
1600:	learn: 0.7131497	test: 0.6823972	best: 0.6823972 (1600)	total: 1m 47s	remaining: 7m 10s
1800:	learn: 0.7179447	test: 0.6828255	best: 0.6828255 (1800)	total: 2m 1s	remaining: 6m 58s
2

In [21]:
print("CV Score: ", np.mean(val_auc_scores))

CV Score:  0.6843123961641993


In [22]:
### Combine all CV preds for test
test_preds_cv = pd.DataFrame(np.asarray(test_preds_list).T).mean(axis=1).values

In [23]:
sample_submission['loan_default'] = test_preds_cv
#sample_submission['loan_default'] = sample_submission['loan_default'].rank(pct=True)
sample_submission.head()

Unnamed: 0,UniqueID,loan_default
0,655269,0.295996
1,723482,0.343816
2,758529,0.34126
3,763449,0.371873
4,708663,0.295992


In [24]:
train_oof_preds = train_df[['UniqueID', 'loan_default']].copy()
train_oof_preds['loan_default'] = all_train_predictions
train_oof_preds.head()

Unnamed: 0,UniqueID,loan_default
0,420825,0.413942
1,537409,0.347843
2,417566,0.400205
3,624493,0.422569
4,539055,0.317342


In [25]:
sample_submission.to_csv('cb3.csv', index=False)
#train_oof_preds.to_csv('cb3_TRAIN.csv', index=False)

Download Directly Without Commiting