In [1]:
##Setting up basic libraries
import numpy as np
import pandas as pd
import random
import datetime as dt
import gc
import re
# Visualization libaries
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)
#Statistical libraries
from scipy import stats
from scipy.stats import norm, skew 
#sklearn for pre-processing the data and prepping test data
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import auc
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.metrics import roc_auc_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB

from scipy.stats import ranksums


In [2]:
dataloc = 'C://Users//sandip.bhattacharjee//Desktop//AV Hackathons//input//'
outloc = 'C://Users//sandip.bhattacharjee//Desktop//AV Hackathons//output//'

In [3]:
train = pd.read_csv(dataloc + "train.csv")
test = pd.read_csv(dataloc + "test.csv")
train.shape,test.shape

((233154, 41), (112392, 40))

In [4]:
def check_nmiss(df):
    df_na = (df.isnull().sum()/len(df))*100
    df_na = df_na.drop(df_na[df_na == 0].index).sort_values(ascending=False)[:30]
    missing_data = pd.DataFrame({'Missing Ratio' :df_na})
    return missing_data

In [5]:
train_miss = check_nmiss(train)
train_miss

Unnamed: 0,Missing Ratio
Employment.Type,3.285811


In [6]:
test_miss = check_nmiss(test)
test_miss

Unnamed: 0,Missing Ratio
Employment.Type,3.063385


Thus, there are no missing values other than emplyment type that needs to be treated

# Feature Creation

In [7]:
def prep_data_1(df):
    df['dob_date'] = pd.to_datetime(df['Date.of.Birth'])
    df['age']= pd.to_datetime("now") - df['dob_date']
    df['age'] = df['age'].dt.days
    df['disbursal_date'] = pd.to_datetime(df['DisbursalDate'])
    #### Seasonal features ? Can there by seasonality #####
    #df['disbursal_dayofweek'] = df['disbursal_date'].dt.dayofweek
    #df['disbursal_month'] = df['disbursal_date'].dt.month
    #df['disbursal_year'] = df['disbursal_date'].dt.year
    
    #######################################################
    df['days_since_disbursal']= pd.to_datetime("now") - df['disbursal_date']
    df['days_since_disbursal'] = df['days_since_disbursal'].dt.days
    df['tot_ch_years'] = df['CREDIT.HISTORY.LENGTH'].str.extract('^([^yrs]*).*', expand=False)
    df['tot_ch_years'] = df['tot_ch_years'].astype('int')
    df['tot_ch_months_']=df['CREDIT.HISTORY.LENGTH'].str.split(' ').str[1]
    df['tot_ch_months'] = df['tot_ch_months_'].str.extract('^([^mon]*).*', expand=False)
    df['tot_ch_months'] = df['tot_ch_months'].astype('int')
    df['total_ch_months'] = df['tot_ch_years']*12 + df['tot_ch_months']
    df.drop(['tot_ch_years','tot_ch_months_','tot_ch_months'],axis=1, inplace=True)
    df['avg_acc_age_years'] = df['AVERAGE.ACCT.AGE'].str.extract('^([^yrs]*).*', expand=False)
    df['avg_acc_age_years'] = df['avg_acc_age_years'].astype('int')
    df['avg_acc_age_month_']=df['AVERAGE.ACCT.AGE'].str.split(' ').str[1]
    df['avg_acc_age_month'] = df['avg_acc_age_month_'].str.extract('^([^mon]*).*', expand=False)
    df['avg_acc_age_month'] = df['avg_acc_age_month'].astype('int')
    df['total_acc_age_months'] = df['avg_acc_age_years']*12 + df['avg_acc_age_month']
    df.drop(['avg_acc_age_years','avg_acc_age_month_','avg_acc_age_month','disbursal_date'],axis=1, inplace=True)
    return df

In [8]:
train=prep_data_1(train)
test = prep_data_1(test)
train.shape,test.shape

((233154, 46), (112392, 45))

# Additional Features

In [9]:
def additional_feats(df):
    df['total_idproof_given']= df['Aadhar_flag']+df['PAN_flag']+df['VoterID_flag']+df['Driving_flag']+df['Passport_flag']
    df['pri_curr_balnc_pri_instal_ratio']=df['PRI.CURRENT.BALANCE']/df['PRIMARY.INSTAL.AMT']
    df['overdue_ac_to_activeAc_Ratio']=df['PRI.OVERDUE.ACCTS']/df['PRI.ACTIVE.ACCTS']
    df['Num_closed_loan_acc']=df['PRI.NO.OF.ACCTS']-df['PRI.OVERDUE.ACCTS']
    df['disbursed_to_asset_cost']= df['disbursed_amount']/df['asset_cost']
    df['ltv_diff_calc_ltv']=df['ltv']-df['disbursed_to_asset_cost']*100
    ##
    
    df['employ_type_salaried'] = np.where(df['Employment.Type']=='Salaried',1,0)
    df['employ_type_selfemplyd'] = np.where(df['Employment.Type']=='Self employed',1,0)
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    return df

In [10]:
train= additional_feats(train)
test = additional_feats(test)
train.shape,test.shape

((233154, 54), (112392, 53))

# Logistic Regression Trial

In [11]:
train_columns = [c for c in train.columns if c not in ['UniqueID', 'loan_default','dob_date',
                                                       'disbursal_date','Date.of.Birth',
                                                       'Employment.Type','DisbursalDate','PERFORM_CNS.SCORE.DESCRIPTION',
                                                       'AVERAGE.ACCT.AGE','CREDIT.HISTORY.LENGTH',
                                                      'PERFORM_CNS.SCORE.DESCRIPTION',
                                                      'PERFORM_CNS.SCORE.DESCRIPTION','disbursal_year',
                                                      'MobileNo_Avl_Flag','Passport_flag']]
target = train['loan_default']
test_df = test[train_columns]
folds = StratifiedKFold(n_splits=5,shuffle=True, random_state=99999)
oof = np.zeros(len(train))
#predictions_nn = np.zeros(len(test_df))
feature_importance_df = pd.DataFrame()
all_cols = train.columns.tolist()
scaler = StandardScaler()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train,train['loan_default'].values)):
    print("fold {}".format(fold_))
    train_set = train.iloc[trn_idx][all_cols]
    val_set = train.iloc[val_idx][all_cols]
    test_df = test[train_columns]
    
    train_set.fillna(0,inplace=True)
    val_set.fillna(0,inplace=True)
    
    test_df.fillna(0,inplace=True)
    
    X_train,y_train= train_set[train_columns],train_set['loan_default']
    X_train = scaler.fit_transform(X_train)
    del train_set
    gc.collect()
    X_val,y_val = val_set[train_columns],val_set['loan_default']
    X_val = scaler.fit_transform(X_val)
    del val_set
    gc.collect()
    
    test_df_std = test_df[train_columns]
    tst_df_shape = test_df_std.shape[0]
    test_df_std = scaler.fit_transform(test_df_std)
    
    
    clf = LogisticRegression(random_state=0,penalty='l2', solver='lbfgs',fit_intercept=True,multi_class='ovr',max_iter=1000).fit(X_train,y_train)
         
    
    oof = clf.predict_proba(X_val)
    print("CV score Current Fold: {:<8.5f}".format(roc_auc_score(y_val, oof[:,1])))
    if fold_ == 0:
        predictions_lr = clf.predict_proba(test_df_std) 
    elif fold_ >0:
        predictions_lr += clf.predict_proba(test_df_std) / folds.n_splits



fold 0
CV score Current Fold: 0.62848 
fold 1
CV score Current Fold: 0.63225 
fold 2
CV score Current Fold: 0.62997 
fold 3
CV score Current Fold: 0.62937 
fold 4
CV score Current Fold: 0.62568 


In [12]:
pred_logistic = predictions_lr[:,1]
pred_logistic

array([0.4360083 , 0.51573698, 0.37350353, ..., 0.25983445, 0.43988682,
       0.23360291])

In [13]:
sub = pd.DataFrame({"UniqueID": test.UniqueID.values})
sub["loan_default"] = pred_logistic
sub.to_csv(outloc + "final_submission_logistics_04212019_v1.csv", index=False)

# Naive Bayes

In [19]:
from sklearn.naive_bayes import BernoulliNB

In [40]:
train_columns = [c for c in train.columns if c not in ['UniqueID', 'loan_default','dob_date',
                                                       'disbursal_date','Date.of.Birth',
                                                       'Employment.Type','DisbursalDate','PERFORM_CNS.SCORE.DESCRIPTION',
                                                       'AVERAGE.ACCT.AGE','CREDIT.HISTORY.LENGTH',
                                                      'PERFORM_CNS.SCORE.DESCRIPTION',
                                                      'PERFORM_CNS.SCORE.DESCRIPTION','disbursal_year',
                                                      'MobileNo_Avl_Flag','Passport_flag']]
target = train['loan_default']
test_df = test[train_columns]
folds = StratifiedKFold(n_splits=7,shuffle=True, random_state=99999)
oof = np.zeros(len(train))
all_cols = train.columns.tolist()
scaler = StandardScaler()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train,train['loan_default'].values)):
    print("fold {}".format(fold_))
    train_set = train.iloc[trn_idx][all_cols]
    val_set = train.iloc[val_idx][all_cols]
    test_df = test[train_columns]
    
    train_set.fillna(0,inplace=True)
    val_set.fillna(0,inplace=True)
    
    test_df.fillna(0,inplace=True)
    
    X_train,y_train= train_set[train_columns],train_set['loan_default']
    X_train = scaler.fit_transform(X_train)
    del train_set
    gc.collect()
    X_val,y_val = val_set[train_columns],val_set['loan_default']
    X_val = scaler.fit_transform(X_val)
    del val_set
    gc.collect()
    
    test_df_std = test_df[train_columns]
    tst_df_shape = test_df_std.shape[0]
    test_df_std = scaler.fit_transform(test_df_std)
    
    
    #clf = GaussianNB( var_smoothing=0.005)
    clf = BernoulliNB(alpha=1250)
    clf.fit(X_train,y_train)
         
    
    oof = clf.predict_proba(X_val)
    print("CV score Current Fold: {:<8.5f}".format(roc_auc_score(y_val, oof[:,1])))
    if fold_ == 0:
        predictions_nb = clf.predict_proba(test_df_std) 
    elif fold_ >0:
        predictions_nb += clf.predict_proba(test_df_std) / folds.n_splits



fold 0
CV score Current Fold: 0.60838 
fold 1
CV score Current Fold: 0.60969 
fold 2
CV score Current Fold: 0.60994 
fold 3
CV score Current Fold: 0.60581 
fold 4
CV score Current Fold: 0.60421 
fold 5
CV score Current Fold: 0.60637 
fold 6
CV score Current Fold: 0.61658 


In [41]:
pred_nb = predictions_nb[:,1]
pred_nb

array([0.34441687, 0.39082651, 0.60938154, ..., 0.25832192, 0.36941329,
       0.25413693])

In [42]:
sub = pd.DataFrame({"UniqueID": test.UniqueID.values})
sub["loan_default"] = pred_nb
sub.to_csv(outloc + "final_submission_NB_localCV_605_7fold.csv", index=False)

# LightGBM Model

In [43]:
from lightgbm import LGBMClassifier
import lightgbm as lgb

In [44]:
train_columns = [c for c in train.columns if c not in ['UniqueID', 'loan_default','dob_date',
                                                       'disbursal_date','Date.of.Birth',
                                                       'Employment.Type','DisbursalDate','PERFORM_CNS.SCORE.DESCRIPTION',
                                                       'AVERAGE.ACCT.AGE','CREDIT.HISTORY.LENGTH',
                                                      'PERFORM_CNS.SCORE.DESCRIPTION',
                                                      'PERFORM_CNS.SCORE.DESCRIPTION']]
target = train['loan_default']
test_df = test[train_columns]

In [45]:
param = {'num_leaves': 65,
         'min_sum_hessian_in_leaf': 20.0,
         'min_data_in_leaf': 80, 
         'objective':'binary',
         'tree_learner': 'feature',   
         'boost_from_average':'true',
         'max_depth': -1,
         'learning_rate': 0.004,
         "min_child_samples": 50,
         "boosting": "gbdt",
         "feature_fraction": 0.15,
         "bagging_freq": 5,
         "bagging_fraction": 0.7 ,
         "bagging_seed": 11,
         "metric": 'auc',
         'reg_alpha': 0.35, 
         'reg_lambda': 0.25,
         "verbosity": -1,
         #categorical_feature': 'auto',
         'is_unbalanced':'True',
         "nthread": 6,
         "random_state": 99999}
folds = StratifiedKFold(n_splits=7,shuffle=True, random_state=99999)
oof = np.zeros(len(train))
predictions = np.zeros(len(test_df))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train,train['loan_default'].values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][train_columns], label=target.iloc[trn_idx])#, categorical_feature=categorical_feats)
    val_data = lgb.Dataset(train.iloc[val_idx][train_columns], label=target.iloc[val_idx])#, categorical_feature=categorical_feats)

    num_round = 100000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=500, early_stopping_rounds = 1000)
    oof[val_idx] = clf.predict(train.iloc[val_idx][train_columns], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = train_columns
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(test_df[train_columns], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

fold 0
Training until validation scores don't improve for 1000 rounds.
[500]	training's auc: 0.682924	valid_1's auc: 0.665181
[1000]	training's auc: 0.692679	valid_1's auc: 0.669479
[1500]	training's auc: 0.700833	valid_1's auc: 0.672148
[2000]	training's auc: 0.708067	valid_1's auc: 0.674068
[2500]	training's auc: 0.714557	valid_1's auc: 0.675421
[3000]	training's auc: 0.720906	valid_1's auc: 0.676258
[3500]	training's auc: 0.726828	valid_1's auc: 0.676993
[4000]	training's auc: 0.732591	valid_1's auc: 0.677661
[4500]	training's auc: 0.737977	valid_1's auc: 0.678127
[5000]	training's auc: 0.74317	valid_1's auc: 0.678418
[5500]	training's auc: 0.748027	valid_1's auc: 0.678706
[6000]	training's auc: 0.752774	valid_1's auc: 0.678947
[6500]	training's auc: 0.757379	valid_1's auc: 0.679086
[7000]	training's auc: 0.761881	valid_1's auc: 0.679083
[7500]	training's auc: 0.766133	valid_1's auc: 0.679071
Early stopping, best iteration is:
[6885]	training's auc: 0.760846	valid_1's auc: 0.679151


In [46]:
sub = pd.DataFrame({"UniqueID": test.UniqueID.values})
sub["loan_default"] = predictions
sub.to_csv(outloc + "final_submission_LightGBM_CV067659_04212019.csv", index=False)

# Importing other files for blending

In [47]:
fin_outloc = "C://Users//sandip.bhattacharjee//Desktop//AV Hackathons//final_submission//"

In [48]:
nn_pred = pd.read_csv(fin_outloc + "fianl_submission_NN_04212019.csv")

In [49]:
nn_pred_array = np.array(nn_pred['loan_default'])

In [70]:
blended_pred = predictions*0.92+0.08*nn_pred_array+0*pred_logistic+0*pred_nb

In [71]:
blended_pred

array([0.25113277, 0.2766064 , 0.24333842, ..., 0.16854025, 0.24677827,
       0.1152196 ])

In [72]:
sub = pd.DataFrame({"UniqueID": test.UniqueID.values})
sub["loan_default"] = blended_pred
sub.to_csv(fin_outloc + "final_blended_sub_04212019_v7.csv", index=False)