In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from collections import Counter
import math
import random
from tqdm import tqdm

#%matplotlib inline

In [2]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import preprocessing
from category_encoders import target_encoder, TargetEncoder
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from imblearn.over_sampling import RandomOverSampler, SMOTE
import catboost as cb

pd.set_option('display.max_columns', 200)

Using TensorFlow backend.


In [3]:
def label_encoding_func(df_name, df_col_name):
    '''
    usage: dataframe[column_name] = label_encoding_function(dataframe, column_name)
    '''
    le = preprocessing.LabelEncoder()
    le.fit(df_name[df_col_name])
    return le.transform(df_name[df_col_name])

def do_one_hot_encoding(df_name, df_column_name, suffix=''):
    '''
    usage: dataframe[column_name] = do_one_hot_encoding(dataframe, column_name, suffix_for_column_name)
    '''
    x = pd.get_dummies(df_name[df_column_name])
    df_name = df_name.join(x, lsuffix=suffix)
    df_name = df_name.drop(df_column_name, axis=1) 
    return df_name

#function for perform target encoding later on
def perform_target_encoding(columns, X, Y, X_Val, X_Test):
    for i in tqdm(columns):
        target_enc = TargetEncoder(cols=[i], smoothing=3)
        target_enc_fit = target_enc.fit(X, Y)
        X[i] = target_enc.transform(X, Y)[i]
        X_Val[i] = target_enc.transform(X_Val)[i]
        X_Test[i] = target_enc.transform(X_Test)[i]
        
    return X, X_Val, X_Test

### Load Feature Engineered Datasets

In [2]:
path = '../../feature_engineering_eda_data/' 
train_file = 'train_feature_engineered_V2.csv'
test_file = 'test_feature_engineered_V2.csv'

train_df = pd.read_csv(path+train_file)
test_df = pd.read_csv(path+test_file)
sample_submission = pd.read_csv(path+'sample_submission_24jSKY6.csv')

In [3]:
train_df.head()

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,...,F7.1_DOB_Y,F7.2_DOB_M,F7.3_DOB_D,F8.1_DisDate_Y,F8.2_DisDate_M,F8.3_DisDate_D,F9.1_AVG.ACCT.AGE_Y,F9.2_AVG.ACCT.AGE_M,F10.1_CREDIT.HIST_Y,F10.2_CREDIT.HIST_M
0,420825,50578,58400,89.55,67,22807,45,1441,01-01-84,Salaried,...,1984,1,1,2018,8,3,0,0,0,0
1,537409,47145,65550,73.23,67,22807,45,1502,31-07-85,Self employed,...,1985,7,31,2018,9,26,1,11,1,11
2,417566,53278,61360,89.63,67,22807,45,1497,24-08-85,Self employed,...,1985,8,24,2018,8,1,0,0,0,0
3,624493,57513,66113,88.48,67,22807,45,1501,30-12-93,Self employed,...,1993,12,30,2018,10,26,0,8,1,3
4,539055,52378,60300,88.39,67,22807,45,1495,09-12-77,Self employed,...,1977,12,9,2018,9,26,0,0,0,0


In [4]:
test_df.head()

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,...,F7.1_DOB_Y,F7.2_DOB_M,F7.3_DOB_D,F8.1_DisDate_Y,F8.2_DisDate_M,F8.3_DisDate_D,F9.1_AVG.ACCT.AGE_Y,F9.2_AVG.ACCT.AGE_M,F10.1_CREDIT.HIST_Y,F10.2_CREDIT.HIST_M
0,655269,53478,63558,86.54,67,22807,45,1497,01-01-74,Salaried,...,1974,1,1,2018,11,3,0,0,0,0
1,723482,55513,63163,89.45,67,22807,45,1497,20-05-85,Self employed,...,1985,5,20,2018,11,20,0,8,1,0
2,758529,65282,84320,79.93,78,23135,86,2071,14-10-95,Salaried,...,1995,10,14,2018,11,29,0,0,0,0
3,763449,46905,63896,76.58,78,17014,45,2070,01-06-73,Self employed,...,1973,6,1,2018,11,29,2,5,2,5
4,708663,51428,63896,86.08,78,17014,45,2069,01-06-72,Salaried,...,1972,6,1,2018,11,17,0,0,0,0


### Combine Train and Test

In [7]:
X_train = train_df[train_df.columns.difference(['loan_default', 'UniqueID'])]
X_test = test_df[train_df.columns.difference(['loan_default', 'UniqueID'])]
Y = train_df['loan_default']

### Concat train and test for common preprocessing
concat_df = pd.concat([X_train, X_test], keys=['train', 'test'])

In [8]:
concat_df.head()

Unnamed: 0,Unnamed: 1,Aadhar_flag,Current_pincode_ID,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,Date.of.Birth,DisbursalDate,Driving_flag,Employee_code_ID,Employment.Type,F10.1_CREDIT.HIST_Y,F10.2_CREDIT.HIST_M,F1_Manual_LTV,F2_difference_asset_disbursed,F3_avg_primary_sanctioned_amt,F4_avg_primary_disbursed_amt,F5_ratio_primary_active_overdue,F6_age_at_disbursal,F7.1_DOB_Y,F7.2_DOB_M,F7.3_DOB_D,F8.1_DisDate_Y,F8.2_DisDate_M,F8.3_DisDate_D,F9.1_AVG.ACCT.AGE_Y,F9.2_AVG.ACCT.AGE_M,MobileNo_Avl_Flag,NEW.ACCTS.IN.LAST.SIX.MONTHS,NO.OF_INQUIRIES,PAN_flag,PERFORM_CNS.SCORE,PERFORM_CNS.SCORE.DESCRIPTION,PRI.ACTIVE.ACCTS,PRI.CURRENT.BALANCE,PRI.DISBURSED.AMOUNT,PRI.NO.OF.ACCTS,PRI.OVERDUE.ACCTS,PRI.SANCTIONED.AMOUNT,PRIMARY.INSTAL.AMT,Passport_flag,SEC.ACTIVE.ACCTS,SEC.CURRENT.BALANCE,SEC.DISBURSED.AMOUNT,SEC.INSTAL.AMT,SEC.NO.OF.ACCTS,SEC.OVERDUE.ACCTS,SEC.SANCTIONED.AMOUNT,State_ID,VoterID_flag,asset_cost,branch_id,disbursed_amount,ltv,manufacturer_id,supplier_id
train,0,1,1441,0,01-01-84,03-08-18,0,1998,Salaried,0,0,86.606164,7822,0.0,0.0,0.0,34.610959,1984,1,1,2018,8,3,0,0,1,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,58400,67,50578,89.55,45,22807
train,1,1,1502,1,31-07-85,26-09-18,0,1998,Self employed,1,11,71.922197,18405,50200.0,50200.0,1.0,33.178082,1985,7,31,2018,9,26,1,11,1,0,0,0,598,I-Medium Risk,1,27600,50200,1,1,50200,1991,0,0,0,0,0,0,0,0,6,0,65550,67,47145,73.23,45,22807
train,2,1,1497,0,24-08-85,01-08-18,0,1998,Self employed,0,0,86.828553,8082,0.0,0.0,0.0,32.958904,1985,8,24,2018,8,1,0,0,1,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,61360,67,53278,89.63,45,22807
train,3,1,1501,0,30-12-93,26-10-18,0,1998,Self employed,1,3,86.991968,8600,0.0,0.0,0.0,24.838356,1993,12,30,2018,10,26,0,8,1,0,1,0,305,L-Very High Risk,0,0,0,3,0,0,31,0,0,0,0,0,0,0,0,6,0,66113,67,57513,88.48,45,22807
train,4,1,1495,0,09-12-77,26-09-18,0,1998,Self employed,0,0,86.862355,7922,0.0,0.0,0.0,40.824658,1977,12,9,2018,9,26,0,0,1,0,1,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,60300,67,52378,88.39,45,22807


In [9]:
#filling NAs with 0
concat_df.isna().sum(axis=0).reset_index().T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52
index,Aadhar_flag,Current_pincode_ID,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,Date.of.Birth,DisbursalDate,Driving_flag,Employee_code_ID,Employment.Type,F10.1_CREDIT.HIST_Y,F10.2_CREDIT.HIST_M,F1_Manual_LTV,F2_difference_asset_disbursed,F3_avg_primary_sanctioned_amt,F4_avg_primary_disbursed_amt,F5_ratio_primary_active_overdue,F6_age_at_disbursal,F7.1_DOB_Y,F7.2_DOB_M,F7.3_DOB_D,F8.1_DisDate_Y,F8.2_DisDate_M,F8.3_DisDate_D,F9.1_AVG.ACCT.AGE_Y,F9.2_AVG.ACCT.AGE_M,MobileNo_Avl_Flag,NEW.ACCTS.IN.LAST.SIX.MONTHS,NO.OF_INQUIRIES,PAN_flag,PERFORM_CNS.SCORE,PERFORM_CNS.SCORE.DESCRIPTION,PRI.ACTIVE.ACCTS,PRI.CURRENT.BALANCE,PRI.DISBURSED.AMOUNT,PRI.NO.OF.ACCTS,PRI.OVERDUE.ACCTS,PRI.SANCTIONED.AMOUNT,PRIMARY.INSTAL.AMT,Passport_flag,SEC.ACTIVE.ACCTS,SEC.CURRENT.BALANCE,SEC.DISBURSED.AMOUNT,SEC.INSTAL.AMT,SEC.NO.OF.ACCTS,SEC.OVERDUE.ACCTS,SEC.SANCTIONED.AMOUNT,State_ID,VoterID_flag,asset_cost,branch_id,disbursed_amount,ltv,manufacturer_id,supplier_id
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### More Features

In [10]:
concat_df['employee_id_branch_id'] = concat_df['branch_id'].apply(str)+"-"+concat_df['Employee_code_ID'].apply(str)

bins = [-np.inf, 20, 25, 30, 35, 40, 45, 50, np.inf]
labels = [1,2,3,4,5,6,7,8]

concat_df['F6_age_bins'] = np.asarray(pd.cut(concat_df['F6_age_at_disbursal'], bins=bins, labels=labels).values)

In [11]:
concat_df['F10.3_CREDIT.HIST_DAYS'] = (concat_df['F10.1_CREDIT.HIST_Y'] * 365) +\
                                        (concat_df['F10.2_CREDIT.HIST_M'] * 30)

concat_df['F9.3_AVG.ACCT.AGE_DAYS'] = (concat_df['F9.1_AVG.ACCT.AGE_Y'] * 365) +\
                                        (concat_df['F9.2_AVG.ACCT.AGE_M'] * 30)

concat_df['BalancePerActiveAccount'] = concat_df['PRI.CURRENT.BALANCE']/concat_df['PRI.ACTIVE.ACCTS']

concat_df['PRI.NoOfInstallmentsLeft'] = concat_df['PRI.CURRENT.BALANCE']/concat_df['PRIMARY.INSTAL.AMT']

concat_df['Disbursed_CurrentBalance_Diff'] = concat_df['disbursed_amount'] - concat_df['PRI.CURRENT.BALANCE']

#combine primary and secondary values
concat_df['TotalInstallAmt'] = concat_df['PRIMARY.INSTAL.AMT'] + concat_df['SEC.INSTAL.AMT']
concat_df['TotalDisbAmt'] = concat_df['PRI.DISBURSED.AMOUNT'] + concat_df['SEC.DISBURSED.AMOUNT']
concat_df['TotalCurrentBalance'] = concat_df['PRI.CURRENT.BALANCE'] + concat_df['SEC.CURRENT.BALANCE']
concat_df['TotalActiveAccts'] = concat_df['PRI.ACTIVE.ACCTS'] + concat_df['SEC.ACTIVE.ACCTS']
concat_df['TotalOverdueAccts'] = concat_df['PRI.OVERDUE.ACCTS'] + concat_df['SEC.OVERDUE.ACCTS']
concat_df['TotalAccts'] = concat_df['PRI.NO.OF.ACCTS'] + concat_df['SEC.NO.OF.ACCTS']
concat_df['TotalSancAmt'] = concat_df['PRI.SANCTIONED.AMOUNT'] + concat_df['SEC.SANCTIONED.AMOUNT']

### Dealing with NAs and Label Encoding Categorical features

In [12]:
#### replace nulls as a new category
concat_df['Employment.Type'].fillna('NA', inplace=True)

concat_df.fillna(0, inplace=True)

### replace -inf and +inf with 0

#filling infs
for i in concat_df.columns.values:
    if (len(concat_df.loc[concat_df[i] == np.inf, i]) != 0)or(len(concat_df.loc[concat_df[i] == -np.inf, i]) != 0):
        print(i)
        concat_df.loc[concat_df[i] == np.inf, i] = 0
        concat_df.loc[concat_df[i] == -np.inf, i] = 0

PRI.NoOfInstallmentsLeft


In [13]:
def label_encode_apply(df):
    if df[0] == object:
        concat_df[df['index']] = label_encoding_func(concat_df, df['index'])
        
_ = concat_df.dtypes.reset_index().apply(label_encode_apply, axis=1)
print('Done')

Done


### Split Train Test

In [14]:
X_train = concat_df.loc['train']
X_test = concat_df.loc['test']

### Yet More Features

In [15]:
### this function is for running inside cv

def generate_summed_features(train, val, test, variable=''):
    '''
    function to generate new features inside cv
    '''
    pincode = train.groupby('Current_pincode_ID')[variable].sum().reset_index()
    state = train.groupby('State_ID')[variable].sum().reset_index()
    supplier = train.groupby('supplier_id')[variable].sum().reset_index()
    branch = train.groupby('branch_id')[variable].sum().reset_index()
    
    list_of_dfs = [train, val, test]
    
    for i in range(len(list_of_dfs)):
        list_of_dfs[i] = pd.merge(list_of_dfs[i], pincode, how='left', on='Current_pincode_ID', 
         suffixes=('', '_sum_pincode_F11.1'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], state, how='left', on='State_ID', 
         suffixes=('', '_sum_state_F11.2'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], supplier, how='left', on='supplier_id', 
         suffixes=('', '_sum_supplier_F11.3'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], branch, how='left', on='branch_id', 
         suffixes=('', '_sum_branch_F11.4'))
        
        list_of_dfs[i].fillna(0, inplace=True)
        
    train, val, test = list_of_dfs[0], list_of_dfs[1], list_of_dfs[2]    
    return train, val, test  


#### this function is for running inside cv

def generate_averaged_features(train, val, test, variable=''):
    '''
    function to generate new features inside cv
    '''
    pincode = train.groupby('Current_pincode_ID')[variable].mean().reset_index()
    branch = train.groupby('branch_id')[variable].mean().reset_index()
    employee_code_id = train.groupby('Employee_code_ID')[variable].mean().reset_index()
    employee_code_id_branch_id = train.groupby('employee_id_branch_id')[variable].mean().reset_index()
    supplier_id = train.groupby('supplier_id')[variable].mean().reset_index()

    list_of_dfs = [train, val, test]
    
    for i in range(len(list_of_dfs)):
        list_of_dfs[i] = pd.merge(list_of_dfs[i], pincode, how='left', on='Current_pincode_ID', 
         suffixes=('', '_mean_pincode_F13.1'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], branch, how='left', on='branch_id', 
         suffixes=('', '_mean_branch_F13.2'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], employee_code_id, how='left', on='Employee_code_ID', 
         suffixes=('', '_mean_employeeid_F13.3'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], employee_code_id_branch_id, how='left',on='employee_id_branch_id', 
         suffixes=('', '_mean_employee_id_branch_id_F13.4'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], supplier_id, how='left', on='supplier_id', 
         suffixes=('', '_mean_supplier_id_F13.5'))
        
        list_of_dfs[i].fillna(0, inplace=True)
        
    train, val, test = list_of_dfs[0], list_of_dfs[1], list_of_dfs[2]    
    return train, val, test  

### Specify cols to TE inside CV

In [16]:
cols_to_target_encode = ['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'F6_age_bins',
                        'manufacturer_id', 'State_ID', 'Employment.Type', 'PRI.OVERDUE.ACCTS',
                        'PRI.ACTIVE.ACCTS', 'F7.1_DOB_Y', 'PERFORM_CNS.SCORE.DESCRIPTION', 'SEC.NO.OF.ACCTS',
                        'NO.OF_INQUIRIES', 'NEW.ACCTS.IN.LAST.SIX.MONTHS']

### Model with 10 Fold CV

In [17]:
strf_split = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [18]:
train_preds_list_oof_semi_stacking = []
val_auc_scores = []
test_preds_list = []

all_train_predictions = np.zeros([X_train.shape[0]])

cv_counter = 1

for train_idx, val_idx in strf_split.split(X_train, Y):
    print("***************** ", cv_counter, " *****************", end="\n\n")
    
    t_x = X_train.iloc[train_idx]
    v_x = X_train.iloc[val_idx]
    
    t_y = Y[train_idx]
    v_y = Y[val_idx]
    
    test_x = X_test.copy()

    print('Generating New Features: ')
    #generate new features
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'ltv')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'PRI.CURRENT.BALANCE')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'disbursed_amount')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'asset_cost')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'PRIMARY.INSTAL.AMT')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'PERFORM_CNS.SCORE')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'F4_avg_primary_disbursed_amt')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'F2_difference_asset_disbursed')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'PRI.NO.OF.ACCTS')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'F6_age_at_disbursal')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'TotalCurrentBalance')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'TotalInstallAmt')
    
    t_x, v_x, test_x = generate_summed_features(t_x, v_x, test_x, 'PRI.ACTIVE.ACCTS')
    t_x, v_x, test_x = generate_summed_features(t_x, v_x, test_x, 'PRI.OVERDUE.ACCTS')
    t_x, v_x, test_x = generate_summed_features(t_x, v_x, test_x, 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS')
    t_x, v_x, test_x = generate_summed_features(t_x, v_x, test_x, 'NO.OF_INQUIRIES')
    t_x, v_x, test_x = generate_summed_features(t_x, v_x, test_x, 'TotalActiveAccts')
    t_x, v_x, test_x = generate_summed_features(t_x, v_x, test_x, 'TotalOverdueAccts')
    
    
    print('Target Encoding: ')
    t_x, v_x, test_x = perform_target_encoding(cols_to_target_encode, t_x, t_y, v_x, test_x)
    
    
    print("Train Shape: ", t_x.shape)
    print("Val Shape: ", v_x.shape)
    print("Test Shape: ", test_x.shape, end="\n\n")
    
    params = {
            'eval_metric': 'AUC',
            'learning_rate': 0.01,
            'random_seed': 12321,
            'l2_leaf_reg': 15,
            'bootstrap_type': 'Bernoulli',
            #'bagging_temperature': 0.3,
            'subsample': 0.5,
            'max_depth': 8,
            'feature_border_type': 'MinEntropy',
            'thread_count': 4, 
            'objective': 'CrossEntropy',
            #'min_data_in_leaf': 100,
            'task_type': 'GPU',
            'od_type': 'Iter'
        }

    dtrain = cb.Pool(t_x, label=t_y)
    dvalid = cb.Pool(v_x, label=v_y)
        
    model = cb.train(dtrain=dtrain, params = params, num_boost_round=8000, eval_set=[dvalid], early_stopping_rounds=500, 
        verbose_eval=200) 
    
    val_preds = model.predict(v_x, prediction_type='Probability')
    val_score = roc_auc_score(v_y, val_preds[:,1])
    
    print(val_score)
    
    val_auc_scores.append(val_score)
    
        
    test_preds = model.predict(test_x, prediction_type='Probability')
    test_preds_list.append(test_preds[:,1])
    
    all_train_predictions[val_idx] = val_preds[:,1]
    
    cv_counter+=1
    
    print("============"*8, end="\n\n")

*****************  1  *****************

Generating New Features: 


  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:13<00:00,  1.21s/it]


Train Shape:  (209837, 151)
Val Shape:  (23317, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6200511	test: 0.6234094	best: 0.6234094 (0)	total: 28.5ms	remaining: 3m 48s
200:	learn: 0.6651630	test: 0.6612206	best: 0.6612206 (200)	total: 4.71s	remaining: 3m 2s
400:	learn: 0.6788492	test: 0.6693286	best: 0.6693286 (400)	total: 9.33s	remaining: 2m 56s
600:	learn: 0.6878679	test: 0.6737022	best: 0.6737022 (600)	total: 13.6s	remaining: 2m 47s
800:	learn: 0.6949486	test: 0.6761867	best: 0.6761902 (796)	total: 18s	remaining: 2m 41s
1000:	learn: 0.7009902	test: 0.6780118	best: 0.6780118 (1000)	total: 22.4s	remaining: 2m 36s
1200:	learn: 0.7067945	test: 0.6793547	best: 0.6793727 (1198)	total: 26.8s	remaining: 2m 31s
1400:	learn: 0.7126751	test: 0.6805259	best: 0.6805259 (1400)	total: 31.1s	remaining: 2m 26s
1600:	learn: 0.7182318	test: 0.6815043	best: 0.6815057 (1599)	total: 35.5s	remaining: 2m 22s
1800:	learn: 0.7234354	test: 0.6823063	best: 0.6823101 (1799)	total: 39.8s	remaining: 2m 16s
2000:

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:13<00:00,  1.16s/it]


Train Shape:  (209838, 151)
Val Shape:  (23316, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6197666	test: 0.6247470	best: 0.6247470 (0)	total: 24.3ms	remaining: 3m 14s
200:	learn: 0.6648333	test: 0.6599276	best: 0.6599276 (200)	total: 4.25s	remaining: 2m 45s
400:	learn: 0.6785788	test: 0.6677468	best: 0.6677468 (400)	total: 8.59s	remaining: 2m 42s
600:	learn: 0.6878975	test: 0.6714896	best: 0.6714991 (599)	total: 13s	remaining: 2m 39s
800:	learn: 0.6951603	test: 0.6737181	best: 0.6737181 (800)	total: 17.1s	remaining: 2m 34s
1000:	learn: 0.7012167	test: 0.6752277	best: 0.6752277 (1000)	total: 21.3s	remaining: 2m 29s
1200:	learn: 0.7070400	test: 0.6766563	best: 0.6766563 (1200)	total: 25.6s	remaining: 2m 24s
1400:	learn: 0.7130347	test: 0.6780432	best: 0.6780432 (1400)	total: 29.8s	remaining: 2m 20s
1600:	learn: 0.7187302	test: 0.6792809	best: 0.6792935 (1597)	total: 34s	remaining: 2m 15s
1800:	learn: 0.7240337	test: 0.6801735	best: 0.6801820 (1799)	total: 38.3s	remaining: 2m 11s
2000:	

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:13<00:00,  1.18s/it]


Train Shape:  (209838, 151)
Val Shape:  (23316, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6248910	test: 0.6158977	best: 0.6158977 (0)	total: 22.9ms	remaining: 3m 3s
200:	learn: 0.6654758	test: 0.6506472	best: 0.6506472 (200)	total: 4.15s	remaining: 2m 41s
400:	learn: 0.6791622	test: 0.6594041	best: 0.6594041 (400)	total: 8.4s	remaining: 2m 39s
600:	learn: 0.6882991	test: 0.6635870	best: 0.6635870 (600)	total: 12.6s	remaining: 2m 35s
800:	learn: 0.6956261	test: 0.6661237	best: 0.6661237 (800)	total: 16.8s	remaining: 2m 31s
1000:	learn: 0.7017831	test: 0.6679093	best: 0.6679093 (1000)	total: 21.1s	remaining: 2m 27s
1200:	learn: 0.7074297	test: 0.6693172	best: 0.6693172 (1200)	total: 25.2s	remaining: 2m 22s
1400:	learn: 0.7134356	test: 0.6705768	best: 0.6705768 (1400)	total: 29.4s	remaining: 2m 18s
1600:	learn: 0.7190811	test: 0.6716033	best: 0.6716086 (1599)	total: 33.7s	remaining: 2m 14s
1800:	learn: 0.7244341	test: 0.6723019	best: 0.6723120 (1799)	total: 37.9s	remaining: 2m 10s
2000

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:12<00:00,  1.11s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6185319	test: 0.6156911	best: 0.6156911 (0)	total: 23.1ms	remaining: 3m 4s
200:	learn: 0.6654585	test: 0.6530107	best: 0.6530107 (200)	total: 4.16s	remaining: 2m 41s
400:	learn: 0.6792197	test: 0.6619990	best: 0.6619990 (400)	total: 8.33s	remaining: 2m 37s
600:	learn: 0.6881024	test: 0.6666924	best: 0.6666924 (600)	total: 12.5s	remaining: 2m 33s
800:	learn: 0.6953368	test: 0.6697260	best: 0.6697260 (800)	total: 16.6s	remaining: 2m 29s
1000:	learn: 0.7010471	test: 0.6714876	best: 0.6714876 (1000)	total: 20.8s	remaining: 2m 25s
1200:	learn: 0.7066501	test: 0.6730037	best: 0.6730037 (1200)	total: 25.1s	remaining: 2m 22s
1400:	learn: 0.7125169	test: 0.6743263	best: 0.6743263 (1400)	total: 29.2s	remaining: 2m 17s
1600:	learn: 0.7181862	test: 0.6754381	best: 0.6754381 (1600)	total: 33.4s	remaining: 2m 13s
1800:	learn: 0.7235355	test: 0.6764484	best: 0.6764484 (1800)	total: 37.8s	remaining: 2m 9s
2000

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:12<00:00,  1.12s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6241571	test: 0.6174167	best: 0.6174167 (0)	total: 23.8ms	remaining: 3m 10s
200:	learn: 0.6655173	test: 0.6495858	best: 0.6495858 (200)	total: 4.39s	remaining: 2m 50s
400:	learn: 0.6794738	test: 0.6572013	best: 0.6572013 (400)	total: 8.8s	remaining: 2m 46s
600:	learn: 0.6887871	test: 0.6611588	best: 0.6611588 (600)	total: 13.1s	remaining: 2m 41s
800:	learn: 0.6962254	test: 0.6635902	best: 0.6635902 (800)	total: 17.4s	remaining: 2m 36s
1000:	learn: 0.7021936	test: 0.6650304	best: 0.6650304 (1000)	total: 21.9s	remaining: 2m 33s
1200:	learn: 0.7079751	test: 0.6662468	best: 0.6662468 (1200)	total: 26.5s	remaining: 2m 29s
1400:	learn: 0.7138248	test: 0.6674808	best: 0.6674896 (1398)	total: 30.8s	remaining: 2m 25s
1600:	learn: 0.7194169	test: 0.6684750	best: 0.6684750 (1600)	total: 35s	remaining: 2m 19s
1800:	learn: 0.7248656	test: 0.6692920	best: 0.6692920 (1800)	total: 39.2s	remaining: 2m 14s
2000:

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:12<00:00,  1.07s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6222034	test: 0.6136330	best: 0.6136330 (0)	total: 24.7ms	remaining: 3m 17s
200:	learn: 0.6647982	test: 0.6504371	best: 0.6504371 (200)	total: 4.45s	remaining: 2m 52s
400:	learn: 0.6786403	test: 0.6605702	best: 0.6605727 (398)	total: 8.85s	remaining: 2m 47s
600:	learn: 0.6877613	test: 0.6657453	best: 0.6657453 (600)	total: 13.1s	remaining: 2m 40s
800:	learn: 0.6950016	test: 0.6688274	best: 0.6688290 (798)	total: 17.6s	remaining: 2m 37s
1000:	learn: 0.7016847	test: 0.6710682	best: 0.6710682 (1000)	total: 22s	remaining: 2m 34s
1200:	learn: 0.7074810	test: 0.6727944	best: 0.6727993 (1199)	total: 26.4s	remaining: 2m 29s
1400:	learn: 0.7135319	test: 0.6743779	best: 0.6743779 (1400)	total: 30.6s	remaining: 2m 23s
1600:	learn: 0.7194160	test: 0.6756426	best: 0.6756521 (1599)	total: 34.7s	remaining: 2m 18s
1800:	learn: 0.7248396	test: 0.6768769	best: 0.6768769 (1800)	total: 38.9s	remaining: 2m 13s
2000

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:11<00:00,  1.05s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6243123	test: 0.6195177	best: 0.6195177 (0)	total: 22.5ms	remaining: 3m
200:	learn: 0.6641751	test: 0.6596697	best: 0.6596697 (200)	total: 4.07s	remaining: 2m 38s
400:	learn: 0.6776375	test: 0.6688667	best: 0.6688667 (400)	total: 8.19s	remaining: 2m 35s
600:	learn: 0.6871091	test: 0.6737795	best: 0.6737795 (600)	total: 12.2s	remaining: 2m 30s
800:	learn: 0.6944698	test: 0.6768059	best: 0.6768059 (800)	total: 16.3s	remaining: 2m 26s
1000:	learn: 0.7006592	test: 0.6787988	best: 0.6787988 (1000)	total: 20.4s	remaining: 2m 22s
1200:	learn: 0.7065260	test: 0.6803018	best: 0.6803018 (1200)	total: 24.6s	remaining: 2m 19s
1400:	learn: 0.7126419	test: 0.6817004	best: 0.6817025 (1399)	total: 28.7s	remaining: 2m 14s
1600:	learn: 0.7183993	test: 0.6830257	best: 0.6830292 (1599)	total: 32.8s	remaining: 2m 10s
1800:	learn: 0.7238208	test: 0.6840580	best: 0.6840580 (1800)	total: 36.8s	remaining: 2m 6s
2000:	l

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:12<00:00,  1.07s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6201248	test: 0.6258004	best: 0.6258004 (0)	total: 24.2ms	remaining: 3m 13s
200:	learn: 0.6648856	test: 0.6631112	best: 0.6631112 (200)	total: 4.06s	remaining: 2m 37s
400:	learn: 0.6774234	test: 0.6712472	best: 0.6712588 (399)	total: 8.06s	remaining: 2m 32s
600:	learn: 0.6868684	test: 0.6762663	best: 0.6762663 (600)	total: 12.2s	remaining: 2m 30s
800:	learn: 0.6941569	test: 0.6788850	best: 0.6788850 (800)	total: 16.4s	remaining: 2m 27s
1000:	learn: 0.7005360	test: 0.6805381	best: 0.6805381 (1000)	total: 20.7s	remaining: 2m 25s
1200:	learn: 0.7064373	test: 0.6817257	best: 0.6817257 (1200)	total: 24.9s	remaining: 2m 20s
1400:	learn: 0.7124090	test: 0.6828483	best: 0.6828535 (1395)	total: 29s	remaining: 2m 16s
1600:	learn: 0.7180558	test: 0.6839795	best: 0.6839876 (1599)	total: 33.3s	remaining: 2m 12s
1800:	learn: 0.7232532	test: 0.6847014	best: 0.6847030 (1799)	total: 37.4s	remaining: 2m 8s
2000:

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:12<00:00,  1.07s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6255999	test: 0.6164228	best: 0.6164228 (0)	total: 24.8ms	remaining: 3m 18s
200:	learn: 0.6662043	test: 0.6521844	best: 0.6521844 (200)	total: 4.14s	remaining: 2m 40s
400:	learn: 0.6796287	test: 0.6613175	best: 0.6613175 (400)	total: 8.41s	remaining: 2m 39s
600:	learn: 0.6886749	test: 0.6661730	best: 0.6661730 (600)	total: 12.5s	remaining: 2m 33s
800:	learn: 0.6957005	test: 0.6689369	best: 0.6689369 (800)	total: 16.6s	remaining: 2m 29s
1000:	learn: 0.7017830	test: 0.6707254	best: 0.6707254 (1000)	total: 20.7s	remaining: 2m 24s
1200:	learn: 0.7075810	test: 0.6721906	best: 0.6721906 (1199)	total: 24.9s	remaining: 2m 20s
1400:	learn: 0.7133887	test: 0.6734710	best: 0.6734710 (1400)	total: 29s	remaining: 2m 16s
1600:	learn: 0.7190565	test: 0.6745028	best: 0.6745028 (1600)	total: 33s	remaining: 2m 12s
1800:	learn: 0.7243270	test: 0.6754175	best: 0.6754175 (1800)	total: 37.2s	remaining: 2m 8s
2000:	l

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:12<00:00,  1.06s/it]


Train Shape:  (209839, 151)
Val Shape:  (23315, 151)
Test Shape:  (112392, 151)

0:	learn: 0.6209542	test: 0.6114960	best: 0.6114960 (0)	total: 25.1ms	remaining: 3m 20s
200:	learn: 0.6655903	test: 0.6561079	best: 0.6561079 (200)	total: 4.23s	remaining: 2m 44s
400:	learn: 0.6784110	test: 0.6634596	best: 0.6634596 (400)	total: 8.32s	remaining: 2m 37s
600:	learn: 0.6878439	test: 0.6680203	best: 0.6680203 (600)	total: 12.4s	remaining: 2m 32s
800:	learn: 0.6951737	test: 0.6708962	best: 0.6708962 (800)	total: 16.4s	remaining: 2m 27s
1000:	learn: 0.7012811	test: 0.6728638	best: 0.6728638 (1000)	total: 20.7s	remaining: 2m 24s
1200:	learn: 0.7071299	test: 0.6743004	best: 0.6743004 (1200)	total: 24.9s	remaining: 2m 20s
1400:	learn: 0.7130023	test: 0.6757160	best: 0.6757160 (1400)	total: 29s	remaining: 2m 16s
1600:	learn: 0.7187612	test: 0.6771457	best: 0.6771457 (1600)	total: 33.2s	remaining: 2m 12s
1800:	learn: 0.7242668	test: 0.6782585	best: 0.6782585 (1800)	total: 37.3s	remaining: 2m 8s
2000:

In [19]:
print("CV Score: ", np.mean(val_auc_scores))

CV Score:  0.6825354419818798


In [20]:
### Combine all CV preds for test
test_preds_cv = pd.DataFrame(np.asarray(test_preds_list).T).mean(axis=1).values

In [21]:
sample_submission['loan_default'] = test_preds_cv
#sample_submission['loan_default'] = sample_submission['loan_default'].rank(pct=True)
sample_submission.head()

Unnamed: 0,UniqueID,loan_default
0,655269,0.275
1,723482,0.36038
2,758529,0.318846
3,763449,0.380955
4,708663,0.299178


In [22]:
train_oof_preds = train_df[['UniqueID', 'loan_default']].copy()
train_oof_preds['loan_default'] = all_train_predictions
train_oof_preds.head()

Unnamed: 0,UniqueID,loan_default
0,420825,0.410926
1,537409,0.372199
2,417566,0.407234
3,624493,0.45718
4,539055,0.272453


In [23]:
sample_submission.to_csv('cb1.csv', index=False)
#train_oof_preds.to_csv('cb1_TRAIN.csv', index=False)