In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from collections import Counter
import math
import random
from tqdm import tqdm
from datetime import datetime, timedelta, date

#%matplotlib inline

In [2]:
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import preprocessing
from category_encoders import target_encoder, TargetEncoder
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from imblearn.over_sampling import RandomOverSampler, SMOTE
import catboost as cb

pd.set_option('display.max_columns', 200)

Using TensorFlow backend.


In [3]:
def label_encoding_func(df_name, df_col_name):
    '''
    usage: dataframe[column_name] = label_encoding_function(dataframe, column_name)
    '''
    le = preprocessing.LabelEncoder()
    le.fit(df_name[df_col_name])
    return le.transform(df_name[df_col_name])

def do_one_hot_encoding(df_name, df_column_name, suffix=''):
    '''
    usage: dataframe[column_name] = do_one_hot_encoding(dataframe, column_name, suffix_for_column_name)
    '''
    x = pd.get_dummies(df_name[df_column_name])
    df_name = df_name.join(x, lsuffix=suffix)
    df_name = df_name.drop(df_column_name, axis=1) 
    return df_name

#function for perform target encoding later on
def perform_target_encoding(columns, X, Y, X_Val, X_Test):
    for i in tqdm(columns):
        target_enc = TargetEncoder(cols=[i], smoothing=3)
        target_enc_fit = target_enc.fit(X, Y)
        X[i] = target_enc.transform(X, Y)[i]
        X_Val[i] = target_enc.transform(X_Val)[i]
        X_Test[i] = target_enc.transform(X_Test)[i]
        
    return X, X_Val, X_Test

### Load Feature Engineered Datasets

In [2]:
path = '../../feature_engineering_eda_data/' 
train_file = 'train_feature_engineered_V2.csv'
test_file = 'test_feature_engineered_V2.csv'

train_df = pd.read_csv(path+train_file)
test_df = pd.read_csv(path+test_file)
sample_submission = pd.read_csv(path+'sample_submission_24jSKY6.csv')

### Combine Train and Test

In [7]:
X_train = train_df[train_df.columns.difference(['loan_default', 'UniqueID'])]
X_test = test_df[train_df.columns.difference(['loan_default', 'UniqueID'])]
Y = train_df['loan_default']

### Concat train and test for common preprocessing
concat_df = pd.concat([X_train, X_test], keys=['train', 'test'])

In [8]:
concat_df.head()

Unnamed: 0,Unnamed: 1,Aadhar_flag,Current_pincode_ID,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,Date.of.Birth,DisbursalDate,Driving_flag,Employee_code_ID,Employment.Type,F10.1_CREDIT.HIST_Y,F10.2_CREDIT.HIST_M,F1_Manual_LTV,F2_difference_asset_disbursed,F3_avg_primary_sanctioned_amt,F4_avg_primary_disbursed_amt,F5_ratio_primary_active_overdue,F6_age_at_disbursal,F7.1_DOB_Y,F7.2_DOB_M,F7.3_DOB_D,F8.1_DisDate_Y,F8.2_DisDate_M,F8.3_DisDate_D,F9.1_AVG.ACCT.AGE_Y,F9.2_AVG.ACCT.AGE_M,MobileNo_Avl_Flag,NEW.ACCTS.IN.LAST.SIX.MONTHS,NO.OF_INQUIRIES,PAN_flag,PERFORM_CNS.SCORE,PERFORM_CNS.SCORE.DESCRIPTION,PRI.ACTIVE.ACCTS,PRI.CURRENT.BALANCE,PRI.DISBURSED.AMOUNT,PRI.NO.OF.ACCTS,PRI.OVERDUE.ACCTS,PRI.SANCTIONED.AMOUNT,PRIMARY.INSTAL.AMT,Passport_flag,SEC.ACTIVE.ACCTS,SEC.CURRENT.BALANCE,SEC.DISBURSED.AMOUNT,SEC.INSTAL.AMT,SEC.NO.OF.ACCTS,SEC.OVERDUE.ACCTS,SEC.SANCTIONED.AMOUNT,State_ID,VoterID_flag,asset_cost,branch_id,disbursed_amount,ltv,manufacturer_id,supplier_id
train,0,1,1441,0,01-01-84,03-08-18,0,1998,Salaried,0,0,86.606164,7822,0.0,0.0,0.0,34.610959,1984,1,1,2018,8,3,0,0,1,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,58400,67,50578,89.55,45,22807
train,1,1,1502,1,31-07-85,26-09-18,0,1998,Self employed,1,11,71.922197,18405,50200.0,50200.0,1.0,33.178082,1985,7,31,2018,9,26,1,11,1,0,0,0,598,I-Medium Risk,1,27600,50200,1,1,50200,1991,0,0,0,0,0,0,0,0,6,0,65550,67,47145,73.23,45,22807
train,2,1,1497,0,24-08-85,01-08-18,0,1998,Self employed,0,0,86.828553,8082,0.0,0.0,0.0,32.958904,1985,8,24,2018,8,1,0,0,1,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,61360,67,53278,89.63,45,22807
train,3,1,1501,0,30-12-93,26-10-18,0,1998,Self employed,1,3,86.991968,8600,0.0,0.0,0.0,24.838356,1993,12,30,2018,10,26,0,8,1,0,1,0,305,L-Very High Risk,0,0,0,3,0,0,31,0,0,0,0,0,0,0,0,6,0,66113,67,57513,88.48,45,22807
train,4,1,1495,0,09-12-77,26-09-18,0,1998,Self employed,0,0,86.862355,7922,0.0,0.0,0.0,40.824658,1977,12,9,2018,9,26,0,0,1,0,1,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,60300,67,52378,88.39,45,22807


In [9]:
#filling NAs with 0
concat_df.isna().sum(axis=0).reset_index().T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52
index,Aadhar_flag,Current_pincode_ID,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,Date.of.Birth,DisbursalDate,Driving_flag,Employee_code_ID,Employment.Type,F10.1_CREDIT.HIST_Y,F10.2_CREDIT.HIST_M,F1_Manual_LTV,F2_difference_asset_disbursed,F3_avg_primary_sanctioned_amt,F4_avg_primary_disbursed_amt,F5_ratio_primary_active_overdue,F6_age_at_disbursal,F7.1_DOB_Y,F7.2_DOB_M,F7.3_DOB_D,F8.1_DisDate_Y,F8.2_DisDate_M,F8.3_DisDate_D,F9.1_AVG.ACCT.AGE_Y,F9.2_AVG.ACCT.AGE_M,MobileNo_Avl_Flag,NEW.ACCTS.IN.LAST.SIX.MONTHS,NO.OF_INQUIRIES,PAN_flag,PERFORM_CNS.SCORE,PERFORM_CNS.SCORE.DESCRIPTION,PRI.ACTIVE.ACCTS,PRI.CURRENT.BALANCE,PRI.DISBURSED.AMOUNT,PRI.NO.OF.ACCTS,PRI.OVERDUE.ACCTS,PRI.SANCTIONED.AMOUNT,PRIMARY.INSTAL.AMT,Passport_flag,SEC.ACTIVE.ACCTS,SEC.CURRENT.BALANCE,SEC.DISBURSED.AMOUNT,SEC.INSTAL.AMT,SEC.NO.OF.ACCTS,SEC.OVERDUE.ACCTS,SEC.SANCTIONED.AMOUNT,State_ID,VoterID_flag,asset_cost,branch_id,disbursed_amount,ltv,manufacturer_id,supplier_id
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### More Features

In [10]:
concat_df['employee_id_branch_id'] = concat_df['branch_id'].apply(str)+"-"+concat_df['Employee_code_ID'].apply(str)

bins = [-np.inf, 20, 25, 30, 35, 40, 45, 50, np.inf]
labels = [1,2,3,4,5,6,7,8]

concat_df['F6_age_bins'] = np.asarray(pd.cut(concat_df['F6_age_at_disbursal'], bins=bins, labels=labels).values)

In [11]:
concat_df['F10.3_CREDIT.HIST_DAYS'] = (concat_df['F10.1_CREDIT.HIST_Y'] * 365) +\
                                        (concat_df['F10.2_CREDIT.HIST_M'] * 30)

concat_df['F9.3_AVG.ACCT.AGE_DAYS'] = (concat_df['F9.1_AVG.ACCT.AGE_Y'] * 365) +\
                                        (concat_df['F9.2_AVG.ACCT.AGE_M'] * 30)

concat_df['BalancePerActiveAccount'] = concat_df['PRI.CURRENT.BALANCE']/concat_df['PRI.ACTIVE.ACCTS']

concat_df['PRI.NoOfInstallmentsLeft'] = concat_df['PRI.CURRENT.BALANCE']/concat_df['PRIMARY.INSTAL.AMT']

concat_df['Disbursed_CurrentBalance_Diff'] = concat_df['disbursed_amount'] - concat_df['PRI.CURRENT.BALANCE']

#combine primary and secondary values
concat_df['TotalInstallAmt'] = concat_df['PRIMARY.INSTAL.AMT'] + concat_df['SEC.INSTAL.AMT']
concat_df['TotalDisbAmt'] = concat_df['PRI.DISBURSED.AMOUNT'] + concat_df['SEC.DISBURSED.AMOUNT']
concat_df['TotalCurrentBalance'] = concat_df['PRI.CURRENT.BALANCE'] + concat_df['SEC.CURRENT.BALANCE']
concat_df['TotalActiveAccts'] = concat_df['PRI.ACTIVE.ACCTS'] + concat_df['SEC.ACTIVE.ACCTS']
concat_df['TotalOverdueAccts'] = concat_df['PRI.OVERDUE.ACCTS'] + concat_df['SEC.OVERDUE.ACCTS']
concat_df['TotalAccts'] = concat_df['PRI.NO.OF.ACCTS'] + concat_df['SEC.NO.OF.ACCTS']
concat_df['TotalSancAmt'] = concat_df['PRI.SANCTIONED.AMOUNT'] + concat_df['SEC.SANCTIONED.AMOUNT']

In [12]:
def string_to_datetime_delta(string_val):
    date_obj = datetime.strptime(string_val, '%d-%m-%y').date()
    delta = date(2019, 1, 1) - date_obj
    return delta.days

concat_df['DaysFromDisbursalDate'] = concat_df['DisbursalDate'].apply(string_to_datetime_delta)

### Dealing with NAs and Label Encoding Categorical features

In [13]:
#### replace nulls as a new category
concat_df['Employment.Type'].fillna('NA', inplace=True)

concat_df.fillna(0, inplace=True)

### replace -inf and +inf with 0

#filling infs
for i in concat_df.columns.values:
    if (len(concat_df.loc[concat_df[i] == np.inf, i]) != 0)or(len(concat_df.loc[concat_df[i] == -np.inf, i]) != 0):
        print(i)
        concat_df.loc[concat_df[i] == np.inf, i] = 0
        concat_df.loc[concat_df[i] == -np.inf, i] = 0

PRI.NoOfInstallmentsLeft


In [14]:
def label_encode_apply(df):
    if df[0] == object:
        concat_df[df['index']] = label_encoding_func(concat_df, df['index'])
        
_ = concat_df.dtypes.reset_index().apply(label_encode_apply, axis=1)
print('Done')

Done


### Split Train Test

In [15]:
X_train = concat_df.loc['train']
X_test = concat_df.loc['test']

### Yet More Features

In [16]:
### this function is for running inside cv

def generate_summed_features(train, val, test, variable=''):
    '''
    function to generate new features inside cv
    '''
    pincode = train.groupby('Current_pincode_ID')[variable].sum().reset_index()
    state = train.groupby('State_ID')[variable].sum().reset_index()
    supplier = train.groupby('supplier_id')[variable].sum().reset_index()
    branch = train.groupby('branch_id')[variable].sum().reset_index()
    manufacturer = train.groupby('manufacturer_id')[variable].sum().reset_index()
    
    
    list_of_dfs = [train, val, test]
    
    for i in range(len(list_of_dfs)):
        list_of_dfs[i] = pd.merge(list_of_dfs[i], pincode, how='left', on='Current_pincode_ID', 
         suffixes=('', '_sum_pincode_F11.1'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], state, how='left', on='State_ID', 
         suffixes=('', '_sum_state_F11.2'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], supplier, how='left', on='supplier_id', 
         suffixes=('', '_sum_supplier_F11.3'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], branch, how='left', on='branch_id', 
         suffixes=('', '_sum_branch_F11.4'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], manufacturer, how='left', on='manufacturer_id', 
         suffixes=('', '_sum_manufacturer_F11.5'))
        
        list_of_dfs[i].fillna(0, inplace=True)
        
    train, val, test = list_of_dfs[0], list_of_dfs[1], list_of_dfs[2]    
    return train, val, test  


#### this function is for running inside cv

def generate_averaged_features(train, val, test, variable=''):
    '''
    function to generate new features inside cv
    '''
    pincode = train.groupby('Current_pincode_ID')[variable].mean().reset_index()
    branch = train.groupby('branch_id')[variable].mean().reset_index()
    employee_code_id = train.groupby('Employee_code_ID')[variable].mean().reset_index()
    employee_code_id_branch_id = train.groupby('employee_id_branch_id')[variable].mean().reset_index()
    supplier_id = train.groupby('supplier_id')[variable].mean().reset_index()
    manufacturer_id = train.groupby('manufacturer_id')[variable].mean().reset_index()

    list_of_dfs = [train, val, test]
    
    for i in range(len(list_of_dfs)):
        list_of_dfs[i] = pd.merge(list_of_dfs[i], pincode, how='left', on='Current_pincode_ID', 
         suffixes=('', '_mean_pincode_F13.1'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], branch, how='left', on='branch_id', 
         suffixes=('', '_mean_branch_F13.2'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], employee_code_id, how='left', on='Employee_code_ID', 
         suffixes=('', '_mean_employeeid_F13.3'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], employee_code_id_branch_id, how='left',on='employee_id_branch_id', 
         suffixes=('', '_mean_employee_id_branch_id_F13.4'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], supplier_id, how='left', on='supplier_id', 
         suffixes=('', '_mean_supplier_id_F13.5'))
        list_of_dfs[i] = pd.merge(list_of_dfs[i], manufacturer_id, how='left', on='manufacturer_id', 
         suffixes=('', '_mean_supplier_id_F13.6'))
        
        list_of_dfs[i].fillna(0, inplace=True)
        
    train, val, test = list_of_dfs[0], list_of_dfs[1], list_of_dfs[2]    
    return train, val, test  

### Specify cols to TE inside CV and categorical cols for catboost

In [17]:
cols_to_target_encode = ['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'F6_age_bins',
                        'manufacturer_id', 'State_ID', 'Employment.Type', 'PRI.OVERDUE.ACCTS',
                        'PRI.ACTIVE.ACCTS', 'F7.1_DOB_Y', 'PERFORM_CNS.SCORE.DESCRIPTION', 'SEC.NO.OF.ACCTS',
                        'NO.OF_INQUIRIES', 'NEW.ACCTS.IN.LAST.SIX.MONTHS']

In [18]:
for i,j in enumerate(X_train.columns):
    print(i," --> ", j, end=" || ")

0  -->  Aadhar_flag || 1  -->  Current_pincode_ID || 2  -->  DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS || 3  -->  Date.of.Birth || 4  -->  DisbursalDate || 5  -->  Driving_flag || 6  -->  Employee_code_ID || 7  -->  Employment.Type || 8  -->  F10.1_CREDIT.HIST_Y || 9  -->  F10.2_CREDIT.HIST_M || 10  -->  F1_Manual_LTV || 11  -->  F2_difference_asset_disbursed || 12  -->  F3_avg_primary_sanctioned_amt || 13  -->  F4_avg_primary_disbursed_amt || 14  -->  F5_ratio_primary_active_overdue || 15  -->  F6_age_at_disbursal || 16  -->  F7.1_DOB_Y || 17  -->  F7.2_DOB_M || 18  -->  F7.3_DOB_D || 19  -->  F8.1_DisDate_Y || 20  -->  F8.2_DisDate_M || 21  -->  F8.3_DisDate_D || 22  -->  F9.1_AVG.ACCT.AGE_Y || 23  -->  F9.2_AVG.ACCT.AGE_M || 24  -->  MobileNo_Avl_Flag || 25  -->  NEW.ACCTS.IN.LAST.SIX.MONTHS || 26  -->  NO.OF_INQUIRIES || 27  -->  PAN_flag || 28  -->  PERFORM_CNS.SCORE || 29  -->  PERFORM_CNS.SCORE.DESCRIPTION || 30  -->  PRI.ACTIVE.ACCTS || 31  -->  PRI.CURRENT.BALANCE || 32  -->  PRI.DI

In [19]:
categorical_col_indices = [0, 1, 17, 18, 5, 8, 9, 37, 46, 6, 52, 48, 64, 43]

### Model with 10 Fold CV

In [20]:
strf_split = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
#ts_split = TimeSeriesSplit(n_splits=5)

In [21]:
train_preds_list_oof_semi_stacking = []
val_auc_scores = []
test_preds_list = []

all_train_predictions = np.zeros([X_train.shape[0]])

cv_counter = 1

for train_idx, val_idx in strf_split.split(X_train, Y):
    print("***************** ", cv_counter, " *****************", end="\n\n")
    
    t_x = X_train.iloc[train_idx]
    v_x = X_train.iloc[val_idx]
    
    t_y = Y[train_idx]
    v_y = Y[val_idx]
    
    test_x = X_test.copy()

    print('Generating New Features: ')
    #generate new features
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'ltv')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'PRI.CURRENT.BALANCE')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'disbursed_amount')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'asset_cost')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'PRIMARY.INSTAL.AMT')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'PERFORM_CNS.SCORE')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'F4_avg_primary_disbursed_amt')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'F2_difference_asset_disbursed')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'PRI.NO.OF.ACCTS')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'F6_age_at_disbursal')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'TotalCurrentBalance')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'TotalInstallAmt')
    t_x, v_x, test_x = generate_averaged_features(t_x, v_x, test_x, 'TotalAccts')
    
    t_x, v_x, test_x = generate_summed_features(t_x, v_x, test_x, 'PRI.ACTIVE.ACCTS')
    t_x, v_x, test_x = generate_summed_features(t_x, v_x, test_x, 'PRI.OVERDUE.ACCTS')
    t_x, v_x, test_x = generate_summed_features(t_x, v_x, test_x, 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS')
    t_x, v_x, test_x = generate_summed_features(t_x, v_x, test_x, 'NO.OF_INQUIRIES')
    t_x, v_x, test_x = generate_summed_features(t_x, v_x, test_x, 'TotalActiveAccts')
    t_x, v_x, test_x = generate_summed_features(t_x, v_x, test_x, 'TotalOverdueAccts')
    
    
    print('Target Encoding: ')
    t_x, v_x, test_x = perform_target_encoding(cols_to_target_encode, t_x, t_y, v_x, test_x)
    
    
    print("Train Shape: ", t_x.shape)
    print("Val Shape: ", v_x.shape)
    print("Test Shape: ", test_x.shape, end="\n\n")
    
    params = {
            'eval_metric': 'AUC',
            'learning_rate': 0.01,
            'random_seed': 12321,
            'l2_leaf_reg': 15,
            'bootstrap_type': 'Bernoulli',
            #'bagging_temperature': 0.3,
            'subsample': 0.5,
            'max_depth': 8,
            'feature_border_type': 'MinEntropy',
            'thread_count': 4, 
            'objective': 'CrossEntropy',
            #'min_data_in_leaf': 100,
            'task_type': 'GPU',
            'od_type': 'Iter',
            'allow_writing_files': False,
            'boosting_type': 'Plain'
        }

    #print(t_x.iloc[:,categorical_col_indices])
    
    dtrain = cb.Pool(t_x, label=t_y, cat_features=categorical_col_indices)
    dvalid = cb.Pool(v_x, label=v_y, cat_features=categorical_col_indices)
    dtest = cb.Pool(test_x, cat_features=categorical_col_indices)
        
    model = cb.train(dtrain=dtrain, params = params, num_boost_round=8000, eval_set=[dvalid], early_stopping_rounds=500, 
        verbose_eval=200) 
    
    val_preds = model.predict(dvalid, prediction_type='Probability')
    val_score = roc_auc_score(v_y, val_preds[:,1])
    
    print(val_score)
    
    val_auc_scores.append(val_score)
    
        
    test_preds = model.predict(dtest, prediction_type='Probability')
    test_preds_list.append(test_preds[:,1])
    
    all_train_predictions[val_idx] = val_preds[:,1]
    
    cv_counter+=1
    
    print("============"*8, end="\n\n")
    del t_x, v_x, test_x, model
    gc.collect()

*****************  1  *****************

Generating New Features: 


  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:16<00:00,  1.40s/it]


Train Shape:  (209837, 176)
Val Shape:  (23317, 176)
Test Shape:  (112392, 176)

0:	learn: 0.6167791	test: 0.6230155	best: 0.6230155 (0)	total: 77.6ms	remaining: 10m 20s
200:	learn: 0.6707112	test: 0.6714445	best: 0.6714445 (200)	total: 15.8s	remaining: 10m 11s
400:	learn: 0.6810493	test: 0.6771084	best: 0.6771084 (400)	total: 31.4s	remaining: 9m 55s
600:	learn: 0.6878504	test: 0.6801603	best: 0.6801603 (600)	total: 47.1s	remaining: 9m 39s
800:	learn: 0.6932588	test: 0.6820425	best: 0.6820425 (800)	total: 1m 2s	remaining: 9m 24s
1000:	learn: 0.6979547	test: 0.6832442	best: 0.6832442 (1000)	total: 1m 18s	remaining: 9m 7s
1200:	learn: 0.7025267	test: 0.6841015	best: 0.6841015 (1200)	total: 1m 33s	remaining: 8m 51s
1400:	learn: 0.7072847	test: 0.6849849	best: 0.6849849 (1400)	total: 1m 49s	remaining: 8m 34s
1600:	learn: 0.7123059	test: 0.6856980	best: 0.6856980 (1600)	total: 2m 4s	remaining: 8m 18s
1800:	learn: 0.7170574	test: 0.6863072	best: 0.6863072 (1800)	total: 2m 20s	remaining: 8m 3

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:15<00:00,  1.39s/it]


Train Shape:  (209838, 176)
Val Shape:  (23316, 176)
Test Shape:  (112392, 176)

0:	learn: 0.6210033	test: 0.6310947	best: 0.6310947 (0)	total: 79.5ms	remaining: 10m 35s
200:	learn: 0.6701289	test: 0.6701083	best: 0.6701083 (200)	total: 15.7s	remaining: 10m 10s
400:	learn: 0.6810117	test: 0.6761597	best: 0.6761597 (400)	total: 31.6s	remaining: 9m 59s
600:	learn: 0.6882247	test: 0.6790809	best: 0.6790809 (600)	total: 47.6s	remaining: 9m 45s
800:	learn: 0.6936496	test: 0.6806351	best: 0.6806351 (800)	total: 1m 3s	remaining: 9m 26s
1000:	learn: 0.6984667	test: 0.6815860	best: 0.6815860 (1000)	total: 1m 18s	remaining: 9m 10s
1200:	learn: 0.7032372	test: 0.6824947	best: 0.6825031 (1196)	total: 1m 34s	remaining: 8m 53s
1400:	learn: 0.7081476	test: 0.6833947	best: 0.6834019 (1398)	total: 1m 49s	remaining: 8m 37s
1600:	learn: 0.7132427	test: 0.6839151	best: 0.6839151 (1600)	total: 2m 5s	remaining: 8m 19s
1800:	learn: 0.7178706	test: 0.6844722	best: 0.6844755 (1797)	total: 2m 20s	remaining: 8m 

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:16<00:00,  1.38s/it]


Train Shape:  (209838, 176)
Val Shape:  (23316, 176)
Test Shape:  (112392, 176)

0:	learn: 0.6224090	test: 0.6135669	best: 0.6135669 (0)	total: 74ms	remaining: 9m 52s
200:	learn: 0.6698120	test: 0.6588967	best: 0.6588967 (200)	total: 15.9s	remaining: 10m 15s
400:	learn: 0.6811823	test: 0.6658657	best: 0.6658657 (400)	total: 31.7s	remaining: 10m
600:	learn: 0.6883530	test: 0.6691930	best: 0.6691930 (600)	total: 47.4s	remaining: 9m 43s
800:	learn: 0.6939388	test: 0.6710115	best: 0.6710115 (800)	total: 1m 2s	remaining: 9m 25s
1000:	learn: 0.6989383	test: 0.6722681	best: 0.6722681 (1000)	total: 1m 18s	remaining: 9m 8s
1200:	learn: 0.7035422	test: 0.6731232	best: 0.6731232 (1200)	total: 1m 33s	remaining: 8m 50s
1400:	learn: 0.7086532	test: 0.6738989	best: 0.6738989 (1400)	total: 1m 48s	remaining: 8m 33s
1600:	learn: 0.7136999	test: 0.6746976	best: 0.6746976 (1600)	total: 2m 4s	remaining: 8m 17s
1800:	learn: 0.7184489	test: 0.6752328	best: 0.6752402 (1797)	total: 2m 19s	remaining: 8m
2000:	l

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:16<00:00,  1.42s/it]


Train Shape:  (209839, 176)
Val Shape:  (23315, 176)
Test Shape:  (112392, 176)

0:	learn: 0.6162224	test: 0.6198521	best: 0.6198521 (0)	total: 75.9ms	remaining: 10m 7s
200:	learn: 0.6697836	test: 0.6624486	best: 0.6624486 (200)	total: 15.4s	remaining: 9m 57s
400:	learn: 0.6808368	test: 0.6691075	best: 0.6691075 (400)	total: 30.8s	remaining: 9m 43s
600:	learn: 0.6878198	test: 0.6720976	best: 0.6720990 (599)	total: 46s	remaining: 9m 25s
800:	learn: 0.6933207	test: 0.6739540	best: 0.6739540 (800)	total: 1m 1s	remaining: 9m 10s
1000:	learn: 0.6981641	test: 0.6752898	best: 0.6752898 (1000)	total: 1m 16s	remaining: 8m 55s
1200:	learn: 0.7029278	test: 0.6764778	best: 0.6764803 (1198)	total: 1m 31s	remaining: 8m 39s
1400:	learn: 0.7080299	test: 0.6775790	best: 0.6775790 (1400)	total: 1m 47s	remaining: 8m 25s
1600:	learn: 0.7131008	test: 0.6784388	best: 0.6784388 (1600)	total: 2m 2s	remaining: 8m 9s
1800:	learn: 0.7179363	test: 0.6791786	best: 0.6791786 (1800)	total: 2m 17s	remaining: 7m 54s
2

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:16<00:00,  1.45s/it]


Train Shape:  (209839, 176)
Val Shape:  (23315, 176)
Test Shape:  (112392, 176)

0:	learn: 0.6146754	test: 0.6215402	best: 0.6215402 (0)	total: 82.5ms	remaining: 10m 59s
200:	learn: 0.6704622	test: 0.6611506	best: 0.6611758 (199)	total: 15.7s	remaining: 10m 7s
400:	learn: 0.6815287	test: 0.6661491	best: 0.6661491 (400)	total: 31.2s	remaining: 9m 52s
600:	learn: 0.6886567	test: 0.6684213	best: 0.6684213 (600)	total: 46.7s	remaining: 9m 34s
800:	learn: 0.6941688	test: 0.6699767	best: 0.6699767 (800)	total: 1m 2s	remaining: 9m 17s
1000:	learn: 0.6992332	test: 0.6713733	best: 0.6713736 (998)	total: 1m 17s	remaining: 9m
1200:	learn: 0.7039113	test: 0.6723426	best: 0.6723426 (1200)	total: 1m 32s	remaining: 8m 44s
1400:	learn: 0.7090121	test: 0.6731371	best: 0.6731371 (1400)	total: 1m 48s	remaining: 8m 29s
1600:	learn: 0.7142613	test: 0.6737867	best: 0.6737909 (1599)	total: 2m 3s	remaining: 8m 13s
1800:	learn: 0.7191740	test: 0.6742530	best: 0.6742530 (1800)	total: 2m 18s	remaining: 7m 57s
20

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:16<00:00,  1.45s/it]


Train Shape:  (209839, 176)
Val Shape:  (23315, 176)
Test Shape:  (112392, 176)

0:	learn: 0.6151549	test: 0.6154524	best: 0.6154524 (0)	total: 74.9ms	remaining: 9m 59s
200:	learn: 0.6689078	test: 0.6632234	best: 0.6632234 (200)	total: 15.4s	remaining: 9m 56s
400:	learn: 0.6803861	test: 0.6708106	best: 0.6708106 (400)	total: 30.8s	remaining: 9m 42s
600:	learn: 0.6875666	test: 0.6744736	best: 0.6744736 (600)	total: 46.1s	remaining: 9m 27s
800:	learn: 0.6930436	test: 0.6765720	best: 0.6765802 (798)	total: 1m 1s	remaining: 9m 14s
1000:	learn: 0.6980210	test: 0.6779753	best: 0.6779791 (997)	total: 1m 17s	remaining: 9m
1200:	learn: 0.7027681	test: 0.6790825	best: 0.6790901 (1198)	total: 1m 32s	remaining: 8m 45s
1400:	learn: 0.7078647	test: 0.6800702	best: 0.6800702 (1400)	total: 1m 48s	remaining: 8m 29s
1600:	learn: 0.7131064	test: 0.6809216	best: 0.6809220 (1594)	total: 2m 3s	remaining: 8m 14s
1800:	learn: 0.7179102	test: 0.6815234	best: 0.6815247 (1799)	total: 2m 19s	remaining: 8m
2000:	l

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:16<00:00,  1.47s/it]


Train Shape:  (209839, 176)
Val Shape:  (23315, 176)
Test Shape:  (112392, 176)

0:	learn: 0.6215780	test: 0.6237437	best: 0.6237437 (0)	total: 83.8ms	remaining: 11m 10s
200:	learn: 0.6684874	test: 0.6715314	best: 0.6715314 (200)	total: 15.6s	remaining: 10m 7s
400:	learn: 0.6793643	test: 0.6792777	best: 0.6792777 (400)	total: 31.6s	remaining: 9m 58s
600:	learn: 0.6864786	test: 0.6825289	best: 0.6825289 (600)	total: 47.3s	remaining: 9m 42s
800:	learn: 0.6920188	test: 0.6846269	best: 0.6846269 (800)	total: 1m 2s	remaining: 9m 22s
1000:	learn: 0.6968067	test: 0.6859325	best: 0.6859325 (1000)	total: 1m 18s	remaining: 9m 5s
1200:	learn: 0.7013451	test: 0.6869628	best: 0.6869628 (1200)	total: 1m 33s	remaining: 8m 49s
1400:	learn: 0.7065727	test: 0.6879222	best: 0.6879222 (1400)	total: 1m 49s	remaining: 8m 34s
1600:	learn: 0.7118627	test: 0.6887027	best: 0.6887027 (1600)	total: 2m 4s	remaining: 8m 18s
1800:	learn: 0.7166115	test: 0.6891851	best: 0.6891851 (1800)	total: 2m 20s	remaining: 8m 2s

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:16<00:00,  1.44s/it]


Train Shape:  (209839, 176)
Val Shape:  (23315, 176)
Test Shape:  (112392, 176)

0:	learn: 0.6198643	test: 0.6259515	best: 0.6259515 (0)	total: 76.1ms	remaining: 10m 8s
200:	learn: 0.6700056	test: 0.6715818	best: 0.6715818 (200)	total: 15.7s	remaining: 10m 10s
400:	learn: 0.6806133	test: 0.6786610	best: 0.6786610 (400)	total: 31.5s	remaining: 9m 56s
600:	learn: 0.6875702	test: 0.6820753	best: 0.6820753 (600)	total: 47.1s	remaining: 9m 40s
800:	learn: 0.6930848	test: 0.6841141	best: 0.6841141 (800)	total: 1m 2s	remaining: 9m 24s
1000:	learn: 0.6979096	test: 0.6853163	best: 0.6853163 (1000)	total: 1m 18s	remaining: 9m 7s
1200:	learn: 0.7023709	test: 0.6862205	best: 0.6862205 (1200)	total: 1m 33s	remaining: 8m 51s
1400:	learn: 0.7072293	test: 0.6872193	best: 0.6872193 (1400)	total: 1m 49s	remaining: 8m 36s
1600:	learn: 0.7123097	test: 0.6879499	best: 0.6879499 (1600)	total: 2m 5s	remaining: 8m 21s
1800:	learn: 0.7169424	test: 0.6885833	best: 0.6885870 (1781)	total: 2m 20s	remaining: 8m 5s

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:16<00:00,  1.46s/it]


Train Shape:  (209839, 176)
Val Shape:  (23315, 176)
Test Shape:  (112392, 176)

0:	learn: 0.6181399	test: 0.6136481	best: 0.6136481 (0)	total: 78.4ms	remaining: 10m 26s
200:	learn: 0.6709478	test: 0.6603266	best: 0.6603266 (200)	total: 15.3s	remaining: 9m 55s
400:	learn: 0.6818608	test: 0.6681963	best: 0.6681985 (399)	total: 30.8s	remaining: 9m 43s
600:	learn: 0.6888341	test: 0.6717324	best: 0.6717383 (599)	total: 46.1s	remaining: 9m 27s
800:	learn: 0.6942391	test: 0.6733595	best: 0.6733595 (800)	total: 1m 1s	remaining: 9m 12s
1000:	learn: 0.6989957	test: 0.6745770	best: 0.6745770 (1000)	total: 1m 17s	remaining: 8m 59s
1200:	learn: 0.7034781	test: 0.6754852	best: 0.6754852 (1200)	total: 1m 32s	remaining: 8m 44s
1400:	learn: 0.7083984	test: 0.6764102	best: 0.6764102 (1400)	total: 1m 47s	remaining: 8m 28s
1600:	learn: 0.7134727	test: 0.6772803	best: 0.6772864 (1599)	total: 2m 3s	remaining: 8m 13s
1800:	learn: 0.7181854	test: 0.6778227	best: 0.6778227 (1800)	total: 2m 18s	remaining: 7m 5

  0%|          | 0/12 [00:00<?, ?it/s]

Target Encoding: 


100%|██████████| 12/12 [00:16<00:00,  1.45s/it]


Train Shape:  (209839, 176)
Val Shape:  (23315, 176)
Test Shape:  (112392, 176)

0:	learn: 0.6169198	test: 0.6246476	best: 0.6246476 (0)	total: 82.4ms	remaining: 10m 58s
200:	learn: 0.6691535	test: 0.6693305	best: 0.6693305 (200)	total: 15.3s	remaining: 9m 52s
400:	learn: 0.6805486	test: 0.6757071	best: 0.6757071 (400)	total: 31s	remaining: 9m 47s
600:	learn: 0.6876496	test: 0.6784222	best: 0.6784222 (600)	total: 46.7s	remaining: 9m 35s
800:	learn: 0.6931221	test: 0.6797179	best: 0.6797180 (799)	total: 1m 2s	remaining: 9m 21s
1000:	learn: 0.6978671	test: 0.6806652	best: 0.6806652 (1000)	total: 1m 18s	remaining: 9m 6s
1200:	learn: 0.7024758	test: 0.6813444	best: 0.6813444 (1200)	total: 1m 33s	remaining: 8m 51s
1400:	learn: 0.7076801	test: 0.6823133	best: 0.6823170 (1399)	total: 1m 49s	remaining: 8m 36s
1600:	learn: 0.7128085	test: 0.6830282	best: 0.6830337 (1598)	total: 2m 5s	remaining: 8m 21s
1800:	learn: 0.7175096	test: 0.6835082	best: 0.6835082 (1800)	total: 2m 21s	remaining: 8m 5s
2

In [22]:
print("CV Score: ", np.mean(val_auc_scores))

CV Score:  0.6846971174280216


In [23]:
### Combine all CV preds for test
test_preds_cv = pd.DataFrame(np.asarray(test_preds_list).T).mean(axis=1).values

In [24]:
sample_submission['loan_default'] = test_preds_cv
#sample_submission['loan_default'] = sample_submission['loan_default'].rank(pct=True)
sample_submission.head()

Unnamed: 0,UniqueID,loan_default
0,655269,0.321274
1,723482,0.37269
2,758529,0.352622
3,763449,0.382507
4,708663,0.3188


In [25]:
train_oof_preds = train_df[['UniqueID', 'loan_default']].copy()
train_oof_preds['loan_default'] = all_train_predictions
train_oof_preds.head()

Unnamed: 0,UniqueID,loan_default
0,420825,0.432766
1,537409,0.360284
2,417566,0.401869
3,624493,0.417869
4,539055,0.277674


In [26]:
sample_submission.to_csv('cb4.csv', index=False)
#train_oof_preds.to_csv('cb4_TRAIN.csv', index=False)