In [28]:
import pandas as pd
import datetime
import numpy
import re
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import decomposition, ensemble
from sklearn.ensemble import RandomForestClassifier

import tensorflow as tf

import keras
from keras.models import Sequential
from keras.layers import Dense

In [2]:
# Function for calculating age of a person given his date of birth
from datetime import date
def calculate_age(born):
    today = date.today()
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

In [3]:
#Function for calculating number of days between two dates 
def datediff(any_date):
    today = date.today()
    datediff = today-any_date 
    return datediff.days

In [4]:
# Calculating the Tenure in months given the data is in X Years and Y months format
def age_length(TenureAsYearsAndMonths):
    years = int(re.findall(r'(\d+)yrs',TenureAsYearsAndMonths )[0])
    months = int(re.findall(r'(\d+)mon',TenureAsYearsAndMonths )[0])
    total_tenure = (years*12)+months
    return total_tenure

In [12]:
# Function for doing all the data manipulations
def AllDataManipulations(MainDataFrame,Manip_Data = pd.DataFrame,temp1 = pd.DataFrame, temp2 = pd.DataFrame,Manip_Completed = pd.DataFrame):
    Manip_Data = MainDataFrame.copy()
    Manip_Data['Employment.Type'].fillna(Manip_Data['Employment.Type'].mode()[0],inplace = True)
    #Manip_Data[['loan_default','branch_id','supplier_id','manufacturer_id','Current_pincode_ID','State_ID','Employee_code_ID']] = Manip_Data[
    #    ['loan_default','branch_id','supplier_id','manufacturer_id','Current_pincode_ID','State_ID','Employee_code_ID']].apply(lambda x: x.astype('category'))
    Manip_Data['Date.of.Birth'] = pd.to_datetime(Manip_Data['Date.of.Birth'])
    Manip_Data['Age'] = Manip_Data['Date.of.Birth'].apply(lambda x: calculate_age(x))
    Manip_Data['DisbursalDate'] = pd.to_datetime(Manip_Data['DisbursalDate'])
    Manip_Data['HowManyDaysSinceDisburse'] = Manip_Data['DisbursalDate'].dt.date.apply(lambda x: datediff(x))
    Manip_Data['AvgAcctAgeInMonths'] = Manip_Data['AVERAGE.ACCT.AGE'].apply(lambda x: age_length(x))
    Manip_Data['CredHistLenInMonts'] = Manip_Data['CREDIT.HISTORY.LENGTH'].apply(lambda x: age_length(x))
    temp1 = pd.get_dummies(Manip_Data['Employment.Type'],prefix='EmploymentType')
    temp2 = pd.get_dummies(Manip_Data['PERFORM_CNS.SCORE.DESCRIPTION'],prefix='Bureau_score_description')
    Manip_Completed = pd.concat([Manip_Data[['disbursed_amount', 'asset_cost', 'ltv', 'branch_id',
       'supplier_id', 'manufacturer_id', 'Current_pincode_ID','Age',
       'HowManyDaysSinceDisburse','State_ID', 'Employee_code_ID',
       'MobileNo_Avl_Flag', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag',
       'Driving_flag', 'Passport_flag', 'PERFORM_CNS.SCORE',
       'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS','PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT',
       'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS', 'SEC.ACTIVE.ACCTS',
       'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT',
       'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT',
       'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
       'AvgAcctAgeInMonths','CredHistLenInMonts','NO.OF_INQUIRIES',
       'loan_default']],temp1,temp2],axis=1)
    return Manip_Data,Manip_Completed

In [13]:
MasterData = pd.read_csv("train.csv")
Data1 = MasterData.copy()

In [14]:
Train_ManipData_1,Train_ManipCompleted_1 = AllDataManipulations(Data1)

In [16]:
IndeVectors = Train_ManipCompleted_1.drop(columns = ['loan_default'])
Labels = Train_ManipCompleted_1['loan_default']

In [17]:
IndeVectors.dtypes

disbursed_amount                                                                      int64
asset_cost                                                                            int64
ltv                                                                                 float64
branch_id                                                                             int64
supplier_id                                                                           int64
manufacturer_id                                                                       int64
Current_pincode_ID                                                                    int64
Age                                                                                   int64
HowManyDaysSinceDisburse                                                              int64
State_ID                                                                              int64
Employee_code_ID                                                                

In [21]:
from keras.utils import to_categorical
label_cat = to_categorical(Labels)
print(label_cat[0:10])

[[1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]]


In [22]:
IndeVectors.shape

(233154, 59)

In [23]:
# Building the actual model 
Model_NN_1 = Sequential()

# Getting No.Of Columns 
ncols = IndeVectors.shape[1]

# Defining layers
Model_NN_1.add(Dense(250, activation = 'relu',input_shape = (ncols,)))
Model_NN_1.add(Dense(250, activation = 'relu'))
Model_NN_1.add(Dense(200, activation = 'relu'))
Model_NN_1.add(Dense(100, activation = 'relu'))
Model_NN_1.add(Dense(2,activation='softmax'))

In [67]:
from keras.callbacks import EarlyStopping

#set early stopping monitor so the model stops training when it won't improve anymore
early_stopping_monitor = EarlyStopping(patience=3)

In [25]:
# Defining a custom function for computing AUC, as its not predefined in keras
from keras import backend as K

def auc(y_true, y_pred):
    auc = tf.metrics.auc(y_true, y_pred)[1]
    K.get_session().run(tf.local_variables_initializer())
    return auc

In [29]:
# Compiling the model 
Model_NN_1.compile(optimizer='adam',loss = 'categorical_crossentropy',metrics = [auc])

In [54]:
# Fitting the model
Model_NN_1.fit(IndeVectors, label_cat, epochs=100, validation_split=0.1)

Train on 209838 samples, validate on 23316 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1ec05aabfd0>

In [55]:
Ypred_NN1 = Model_NN_1.predict(Test_ManipCompleted)

In [56]:
Ypred_NN1

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [57]:
YPred_df = [numpy.argmax(y, axis=None, out=None) for y in Ypred_NN1]

In [58]:
pd.DataFrame(YPred_df).head()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


In [50]:
NN1_Submissions = pd.concat([Test_Data['UniqueID'],pd.DataFrame(YPred_df,columns = ['loan_default'])],axis=1)

In [51]:
NN1_Submissions.head()

Unnamed: 0,UniqueID,loan_default
0,655269,0
1,723482,0
2,758529,0
3,763449,0
4,708663,0


In [59]:
NN1_Submissions['loan_default'].value_counts()

0    112392
Name: loan_default, dtype: int64

In [52]:
NN1_Submissions.to_csv("NN1_Submissions.csv",index=False)

In [60]:
import imblearn
from imblearn.over_sampling import SMOTE
smt = SMOTE()
X_train_smt, y_train_smt = smt.fit_sample(IndeVectors, label_cat)

In [65]:
y_train_smt_array = to_categorical(y_train_smt)

In [63]:
Model_NN_2 = Sequential()

# Getting No.Of Columns 
ncols_smot = X_train_smt.shape[1]

# Defining layers
Model_NN_2.add(Dense(250, activation = 'relu',input_shape = (ncols_smot,)))
Model_NN_2.add(Dense(250, activation = 'relu'))
Model_NN_2.add(Dense(200, activation = 'relu'))
Model_NN_2.add(Dense(100, activation = 'relu'))
Model_NN_2.add(Dense(2,activation='softmax'))

In [69]:
Model_NN_2.compile(optimizer='adam',loss = 'categorical_crossentropy',metrics = [auc])
Model_NN_2.fit(X_train_smt, y_train_smt_array, epochs=30, validation_split=0.2)

Train on 292068 samples, validate on 73018 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x1ec216d0c18>

In [70]:
Ypred_NN2_SmoteData = Model_NN_2.predict(Test_ManipCompleted)
YPreddf_NN2_SmoteData = [numpy.argmax(y, axis=None, out=None) for y in Ypred_NN2_SmoteData]
YPreddf_NN2_SmoteData  = pd.DataFrame(YPreddf_NN2_SmoteData,columns =['loan_default'])

In [71]:
YPreddf_NN2_SmoteData['loan_default'].value_counts()

0    112392
Name: loan_default, dtype: int64

In [72]:
Model_NN_3 = Sequential()

# Getting No.Of Columns 
ncols_smot = X_train_smt.shape[1]

# Defining layers
Model_NN_3.add(Dense(250, activation = 'relu',input_shape = (ncols_smot,)))
Model_NN_3.add(Dense(250, activation = 'relu'))
Model_NN_3.add(Dense(250, activation = 'relu'))
Model_NN_3.add(Dense(250, activation = 'relu'))
Model_NN_3.add(Dense(250, activation = 'relu'))
Model_NN_3.add(Dense(100, activation = 'relu'))
Model_NN_3.add(Dense(2,activation='softmax'))

Model_NN_3.compile(optimizer='adam',loss = 'categorical_crossentropy',metrics = [auc])
Model_NN_3.fit(X_train_smt, y_train_smt_array, epochs=30, validation_split=0.2)

Train on 292068 samples, validate on 73018 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x1ec2389d438>

In [73]:
Ypred_NN3_SmoteData = Model_NN_3.predict(Test_ManipCompleted)
YPreddf_NN3_SmoteData = [numpy.argmax(y, axis=None, out=None) for y in Ypred_NN3_SmoteData]
YPreddf_NN3_SmoteData  = pd.DataFrame(YPreddf_NN3_SmoteData,columns =['loan_default'])

In [76]:
YPreddf_NN3_SmoteData['loan_default'].value_counts()

0    112391
1         1
Name: loan_default, dtype: int64

In [None]:
Model_NN_4 = Sequential()

# Getting No.Of Columns 
ncols_smot = X_train_smt.shape[1]

# Defining layers
Model_NN_4.add(Dense(250, activation = 'relu',input_shape = (ncols_smot,)))
Model_NN_4.add(Dense(250, activation = 'relu'))
Model_NN_4.add(Dense(250, activation = 'relu'))
Model_NN_4.add(Dense(250, activation = 'relu'))
Model_NN_4.add(Dense(250, activation = 'relu'))
Model_NN_4.add(Dense(100, activation = 'relu'))
Model_NN_4.add(Dense(2,activation='softmax'))

Model_NN_4.compile(optimizer='adam',loss = 'categorical_crossentropy',metrics = [auc])
Model_NN_4.fit(X_train_smt, y_train_smt_array, epochs=30, validation_split=0.2)

Train on 292068 samples, validate on 73018 samples
Epoch 1/30

In [None]:
Ypred_NN4_SmoteData = Model_NN_4.predict(Test_ManipCompleted)
YPreddf_NN4_SmoteData = [numpy.argmax(y, axis=None, out=None) for y in Ypred_NN4_SmoteData]
YPreddf_NN4_SmoteData  = pd.DataFrame(YPreddf_NN4_SmoteData,columns =['loan_default'])
YPreddf_NN4_SmoteData['loan_default'].value_counts()

In [32]:
# Modifying data manipulations function to work for test data as it doesn't have labels column
# Function for doing all the data manipulations
def AllDataManipulations_forTest(MainDataFrame,Manip_Data = pd.DataFrame,temp1 = pd.DataFrame, temp2 = pd.DataFrame,Manip_Completed = pd.DataFrame):
    Manip_Data = MainDataFrame.copy()
    Manip_Data['Employment.Type'].fillna(Manip_Data['Employment.Type'].mode()[0],inplace = True)
    #Manip_Data[['branch_id','supplier_id','manufacturer_id','Current_pincode_ID','State_ID','Employee_code_ID']] = Manip_Data[
    #    ['branch_id','supplier_id','manufacturer_id','Current_pincode_ID','State_ID','Employee_code_ID']].apply(lambda x: x.astype('category'))
    Manip_Data['Date.of.Birth'] = pd.to_datetime(Manip_Data['Date.of.Birth'])
    Manip_Data['Age'] = Manip_Data['Date.of.Birth'].apply(lambda x: calculate_age(x))
    Manip_Data['DisbursalDate'] = pd.to_datetime(Manip_Data['DisbursalDate'])
    Manip_Data['HowManyDaysSinceDisburse'] = Manip_Data['DisbursalDate'].dt.date.apply(lambda x: datediff(x))
    Manip_Data['AvgAcctAgeInMonths'] = Manip_Data['AVERAGE.ACCT.AGE'].apply(lambda x: age_length(x))
    Manip_Data['CredHistLenInMonts'] = Manip_Data['CREDIT.HISTORY.LENGTH'].apply(lambda x: age_length(x))
    temp1 = pd.get_dummies(Manip_Data['Employment.Type'],prefix='EmploymentType')
    temp2 = pd.get_dummies(Manip_Data['PERFORM_CNS.SCORE.DESCRIPTION'],prefix='Bureau_score_description')
    Manip_Completed = pd.concat([Manip_Data[['disbursed_amount', 'asset_cost', 'ltv', 'branch_id',
       'supplier_id', 'manufacturer_id', 'Current_pincode_ID','Age',
       'HowManyDaysSinceDisburse','State_ID', 'Employee_code_ID',
       'MobileNo_Avl_Flag', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag',
       'Driving_flag', 'Passport_flag', 'PERFORM_CNS.SCORE',
       'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS','PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT',
       'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS', 'SEC.ACTIVE.ACCTS',
       'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT',
       'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT',
       'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
       'AvgAcctAgeInMonths','CredHistLenInMonts','NO.OF_INQUIRIES']],temp1,temp2],axis=1)
    return Manip_Data,Manip_Completed


In [33]:
Master_test = pd.read_csv("test_bqCt9Pv.csv")
Test_Data = Master_test.copy()
Test_ManipData,Test_ManipCompleted = AllDataManipulations_forTest(Test_Data)

In [34]:
Test_Data.head()

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,...,SEC.CURRENT.BALANCE,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES
0,655269,53478,63558,86.54,67,22807,45,1497,01-01-74,Salaried,...,0,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0
1,723482,55513,63163,89.45,67,22807,45,1497,20-05-85,Self employed,...,0,0,0,5605,0,1,0,0yrs 8mon,1yrs 0mon,1
2,758529,65282,84320,79.93,78,23135,86,2071,14-10-95,Salaried,...,0,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0
3,763449,46905,63896,76.58,78,17014,45,2070,01-06-73,Self employed,...,0,0,0,0,0,0,0,2yrs 5mon,2yrs 5mon,0
4,708663,51428,63896,86.08,78,17014,45,2069,01-06-72,Salaried,...,0,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0


In [35]:
Test_ManipCompleted.columns

Index(['disbursed_amount', 'asset_cost', 'ltv', 'branch_id', 'supplier_id',
       'manufacturer_id', 'Current_pincode_ID', 'Age',
       'HowManyDaysSinceDisburse', 'State_ID', 'Employee_code_ID',
       'MobileNo_Avl_Flag', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag',
       'Driving_flag', 'Passport_flag', 'PERFORM_CNS.SCORE', 'PRI.NO.OF.ACCTS',
       'PRI.ACTIVE.ACCTS', 'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE',
       'PRI.SANCTIONED.AMOUNT', 'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS',
       'SEC.ACTIVE.ACCTS', 'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE',
       'SEC.SANCTIONED.AMOUNT', 'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT',
       'SEC.INSTAL.AMT', 'NEW.ACCTS.IN.LAST.SIX.MONTHS',
       'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'AvgAcctAgeInMonths',
       'CredHistLenInMonts', 'NO.OF_INQUIRIES', 'EmploymentType_Salaried',
       'EmploymentType_Self employed',
       'Bureau_score_description_A-Very Low Risk',
       'Bureau_score_description_B-Very Low Risk',
       'Bureau_

In [36]:
Test_ManipCompleted['Bureau_score_description_Not Scored: More than 50 active Accounts found'] = 0

In [37]:
Test_ManipCompleted['Bureau_score_description_Not Scored: More than 50 active Accounts found'].unique()

array([0], dtype=int64)

In [38]:
Test_ManipCompleted.dtypes

disbursed_amount                                                                      int64
asset_cost                                                                            int64
ltv                                                                                 float64
branch_id                                                                             int64
supplier_id                                                                           int64
manufacturer_id                                                                       int64
Current_pincode_ID                                                                    int64
Age                                                                                   int64
HowManyDaysSinceDisburse                                                              int64
State_ID                                                                              int64
Employee_code_ID                                                                

ActualTestPred_rf1 = rf_model_1.predict(Test_ManipCompleted)
ActualTestPred_nb = gnb.predict(Test_ManipCompleted)
RF1_Submissions = pd.concat([Test_Data['UniqueID'],pd.DataFrame(ActualTestPred_rf1,columns=['loan_default'])],axis=1)
nb_Submissions = pd.concat([Test_Data['UniqueID'],pd.DataFrame(ActualTestPred_nb,columns=['loan_default'])],axis=1)

svm_1 = svm.SVC(kernel='linear', C=1)
svm_1.fit(train_x,list(train_y))

ypred_svm1 = svm_1.predict(valid_x)
auc_svm1 = metrics.roc_auc_score(valid_y,ypred_svm1)

In [16]:
#################################################################################################

In [19]:
Data1.columns

Index(['UniqueID', 'disbursed_amount', 'asset_cost', 'ltv', 'branch_id',
       'supplier_id', 'manufacturer_id', 'Current_pincode_ID', 'Date.of.Birth',
       'Employment.Type', 'DisbursalDate', 'State_ID', 'Employee_code_ID',
       'MobileNo_Avl_Flag', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag',
       'Driving_flag', 'Passport_flag', 'PERFORM_CNS.SCORE',
       'PERFORM_CNS.SCORE.DESCRIPTION', 'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS',
       'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT',
       'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS', 'SEC.ACTIVE.ACCTS',
       'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT',
       'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT',
       'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
       'AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH', 'NO.OF_INQUIRIES',
       'loan_default'],
      dtype='object')

In [26]:
Data2['DisbursalDate'] = pd.to_datetime(Data1['DisbursalDate'])

In [31]:
Data2['DisbursalDate'][0].month

3

In [None]:
def quarter_finder(DataFrameWithDate,Y = pd.DataFrame()):
    for x in range(len(DataFrameWithDate)):
        Y['DisbursalDate'][x] = DataFrameWithDate['DisbursalDate'][x]
        if(DataFrameWithDate['DisbursalDate'])

In [37]:
def quarter_finder(DataFrameWithDate,Y = 0):
    if(DataFrameWithDate['DisbursalDate'].month in ['1','2','3']):
        Y = 'Q4_FY_17-18'
    elif(DataFrameWithDate['DisbursalDate'].month in ['4','5','6']):
        Y = 'Q1_FY_18-19'
    elif(DataFrameWithDate['DisbursalDate'].month in ['7','8','9']):
        Y = 'Q2_FY_18-19'
    elif(DataFrameWithDate['DisbursalDate'].month in ['10','11','12']):
        Y = 'Q3_FY_18-19'
    return Y

In [42]:
train_y.value_counts()

0    164304
1     45534
Name: loan_default, dtype: int64

In [17]:
# SMOTE
import imblearn
from imblearn.over_sampling import SMOTE
smt = SMOTE()
X_train_smt, y_train_smt = smt.fit_sample(train_x, train_y)

In [20]:
print(len(X_train_smt))
print(len(y_train_smt))

328608
328608


In [24]:
shape(X_train_smt)

NameError: name 'shape' is not defined

In [23]:
rf_smt_m1 = RandomForestClassifier(n_estimators=300,bootstrap = True,max_features = 59) 
rf_smt_m1.fit(X_train_smt,y_train_smt) 
ypred_rf_sm_m1 = rf_smt_m1.predict(valid_x) 
print("AUC of Random forest on SMOTE data Model_1: ",metrics.roc_auc_score(valid_y,ypred_rf_sm_m1))

AUC of Random forest on SMOTE data Model_1:  0.527566570812557


In [27]:
from imblearn.under_sampling import NearMiss
nr = NearMiss()
train_x_nr, train_y_nr = nr.fit_sample(train_x,train_y)

In [29]:
rf_nr_m1 = RandomForestClassifier(n_estimators=300,bootstrap = True,max_features = 'sqrt') 
rf_nr_m1.fit(train_x_nr,train_y_nr) 
ypred_rf_nr_m1 = rf_nr_m1.predict(valid_x) 
print("AUC of Random forest on Near miss data Model_1: ",metrics.roc_auc_score(valid_y,ypred_rf_nr_m1))

AUC of Random forest on Near miss data Model_1:  0.4846127355702282


In [30]:
train_x.head()

Unnamed: 0,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Age,HowManyDaysSinceDisburse,State_ID,...,Bureau_score_description_K-High Risk,Bureau_score_description_L-Very High Risk,Bureau_score_description_M-Very High Risk,Bureau_score_description_No Bureau History Available,Bureau_score_description_Not Scored: More than 50 active Accounts found,Bureau_score_description_Not Scored: No Activity seen on the customer (Inactive),Bureau_score_description_Not Scored: No Updates available in last 36 months,Bureau_score_description_Not Scored: Not Enough Info available on the customer,Bureau_score_description_Not Scored: Only a Guarantor,Bureau_score_description_Not Scored: Sufficient History Not Available
132922,53303,61642,89.22,250,23098,45,1551,23,128,6,...,0,0,0,1,0,0,0,0,0,0
44426,63847,73371,88.45,2,14716,86,1669,34,182,4,...,0,0,0,0,0,0,0,0,0,0
139009,53803,63704,86.34,138,15943,45,3330,38,231,9,...,0,0,0,0,0,0,0,0,0,0
166761,37561,63740,61.19,202,23323,86,782,41,404,18,...,0,0,0,1,0,0,0,0,0,0
59850,60213,72350,84.31,9,16120,86,5479,40,201,3,...,0,0,0,0,0,0,0,0,0,0


In [31]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier() 
AdaBoost_m1 = AdaBoostClassifier(n_estimators=100, base_estimator=dt,learning_rate=1)
AdaBoost_m1.fit(train_x,train_y)
ypred_AdaBoost_m1 = AdaBoost_m1.predict(valid_x)
print("AUC For ADABoost: ",metrics.roc_auc_score(valid_y,ypred_AdaBoost_m1))

AUC For ADABoost:  0.5205109151729628


In [32]:
rf = RandomForestClassifier() 
AdaBoost_m2 = AdaBoostClassifier(n_estimators=300, base_estimator=rf,learning_rate=1)
AdaBoost_m2.fit(train_x,train_y)
ypred_AdaBoost_m2 = AdaBoost_m1.predict(valid_x)
print("AUC For ADABoost_Using RF and 300 trees: ",metrics.roc_auc_score(valid_y,ypred_AdaBoost_m2))



















AUC For ADABoost_Using RF and 300 trees:  0.5205109151729628
