In [38]:
import pandas as pd
import datetime
import numpy
import re
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import decomposition, ensemble
from sklearn.ensemble import RandomForestClassifier
import imblearn

In [None]:
## This notebook contains following trials

# Removing IDs with lot of data points - Random forest 1
# Removing all the IDs - Random forest 2 
# Smoting the data after 2nd trial - Random forest 3

In [39]:
# Function for calculating age of a person given his date of birth
from datetime import date
def calculate_age(born):
    today = date.today()
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

In [40]:
#Function for calculating number of days between two dates 
def datediff(any_date):
    today = date.today()
    datediff = today-any_date 
    return datediff.days

In [41]:
# Calculating the Tenure in months given the data is in X Years and Y months format
def age_length(TenureAsYearsAndMonths):
    years = int(re.findall(r'(\d+)yrs',TenureAsYearsAndMonths )[0])
    months = int(re.findall(r'(\d+)mon',TenureAsYearsAndMonths )[0])
    total_tenure = (years*12)+months
    return total_tenure

In [42]:
# Function for doing all the data manipulations
def AllDataManipulations(MainDataFrame,Manip_Data = pd.DataFrame,temp1 = pd.DataFrame, temp2 = pd.DataFrame,Manip_Completed = pd.DataFrame):
    Manip_Data = MainDataFrame.copy()
    Manip_Data['Employment.Type'].fillna(Manip_Data['Employment.Type'].mode()[0],inplace = True)
    Manip_Data[['loan_default','branch_id','supplier_id','manufacturer_id','Current_pincode_ID','State_ID','Employee_code_ID']] = Manip_Data[
        ['loan_default','branch_id','supplier_id','manufacturer_id','Current_pincode_ID','State_ID','Employee_code_ID']].apply(lambda x: x.astype('category'))
    Manip_Data['Date.of.Birth'] = pd.to_datetime(Manip_Data['Date.of.Birth'])
    Manip_Data['Age'] = Manip_Data['Date.of.Birth'].apply(lambda x: calculate_age(x))
    Manip_Data['DisbursalDate'] = pd.to_datetime(Manip_Data['DisbursalDate'])
    Manip_Data['HowManyDaysSinceDisburse'] = Manip_Data['DisbursalDate'].dt.date.apply(lambda x: datediff(x))
    Manip_Data['AvgAcctAgeInMonths'] = Manip_Data['AVERAGE.ACCT.AGE'].apply(lambda x: age_length(x))
    Manip_Data['CredHistLenInMonts'] = Manip_Data['CREDIT.HISTORY.LENGTH'].apply(lambda x: age_length(x))
    temp1 = pd.get_dummies(Manip_Data['Employment.Type'],prefix='EmploymentType')
    temp2 = pd.get_dummies(Manip_Data['PERFORM_CNS.SCORE.DESCRIPTION'],prefix='Bureau_score_description')
    Manip_Completed = pd.concat([Manip_Data[['disbursed_amount', 'asset_cost', 'ltv', 'branch_id',
       'supplier_id', 'manufacturer_id', 'Current_pincode_ID','Age',
       'HowManyDaysSinceDisburse','State_ID', 'Employee_code_ID',
       'MobileNo_Avl_Flag', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag',
       'Driving_flag', 'Passport_flag', 'PERFORM_CNS.SCORE',
       'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS','PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT',
       'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS', 'SEC.ACTIVE.ACCTS',
       'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT',
       'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT',
       'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
       'AvgAcctAgeInMonths','CredHistLenInMonts','NO.OF_INQUIRIES',
       'loan_default']],temp1,temp2],axis=1)
    return Manip_Data,Manip_Completed

In [43]:
MasterData = pd.read_csv("train.csv")
Data1 = MasterData.copy()

In [51]:
Train_ManipData_1,Train_ManipCompleted_1 = AllDataManipulations(Data1)

Train_ManipData_1.dtypes

In [150]:
len(Train_ManipData_1)

233154

In [153]:
Train_ManipCompleted_1_OtherMakers = Train_ManipCompleted_1[Train_ManipCompleted_1['manufacturer_id']!=86]

In [154]:
len(Train_ManipCompleted_1_OtherMakers)

123620

In [156]:
Train_ManipCompleted_1_manufacturer86 = Train_ManipCompleted_1[Train_ManipCompleted_1['manufacturer_id']==86]

In [157]:
len(Train_ManipCompleted_1_manufacturer86)

109534

In [158]:
# Independent vectors and Labels for All other manufacturers except '86'
IndeVectors_Non86 = Train_ManipCompleted_1_OtherMakers.drop(columns = ['loan_default'])
Labels_Non86 = Train_ManipCompleted_1_OtherMakers['loan_default']

In [165]:
# # Independent vectors and Labels for manufacturer except 86
IndeVectors_86 = Train_ManipCompleted_1_manufacturer86.drop(columns = ['loan_default'])
Labels_86 = Train_ManipCompleted_1_manufacturer86['loan_default']

In [159]:
# Train Test Split for All other manufacturers except 86
trainnon86_x, validnon86_x, trainnon86_y, validnon86_y = model_selection.train_test_split(IndeVectors_Non86,Labels_Non86,random_state = 1234,test_size = 0.10)

In [166]:
# Train Test Split for manufacturer 86
train86_x, valid86_x, train86_y, validn86_y = model_selection.train_test_split(IndeVectors_86,Labels_86,random_state = 1234,test_size = 0.10)

In [162]:
#Smoting data for all other manufacturers except 86
from imblearn.over_sampling import SMOTE
smt = SMOTE()
trainnon86_x_smt, trainnon86_y_smt = smt.fit_sample(trainnon86_x, trainnon86_y)

In [167]:
#Smoting data for manufacturer 86
from imblearn.over_sampling import SMOTE
smt = SMOTE()
train86_x_smt, train86_y_smt = smt.fit_sample(train86_x, train86_y)

print(Train_ManipData_1['branch_id'].value_counts()) # No need to drop.
print(Train_ManipData_1['supplier_id'].value_counts()) # Need to drop
print(Train_ManipData_1['manufacturer_id'].value_counts()) # no need to drop 
print(Train_ManipData_1['Current_pincode_ID'].value_counts()) # Need to drop
print(Train_ManipData_1['State_ID'].value_counts()) # No need to drop 
print(Train_ManipData_1['Employee_code_ID'].value_counts()) # Need to drop

In [45]:
#Train_ManipCompleted_1 = Train_ManipCompleted_1.drop(columns = ['supplier_id','Current_pincode_ID','Employee_code_ID','branch_id','manufacturer_id','State_ID'])

In [52]:
#Train_ManipCompleted_1.dtypes

In [53]:
#IndeVectors = Train_ManipCompleted_1.drop(columns = ['loan_default'])
#Labels = Train_ManipCompleted_1['loan_default']

In [54]:
#train_x, valid_x, train_y, valid_y = model_selection.train_test_split(IndeVectors,Labels,random_state = 1234,test_size = 0.10)

In [20]:
# Random forest # After removing just the IDs with lot of values, like in thousands
#rf_model_1 = RandomForestClassifier(n_estimators=300,bootstrap = True,max_features = 'sqrt')
#rf_model_1.fit(train_x,list(train_y))
#ypred_rf1 = rf_model_1.predict(valid_x)
#auc_rf1 = metrics.roc_auc_score(list(valid_y),ypred_rf1)
#print("AUC of Random forest Model_1: ",auc_rf1)
## Naive Bayes
#gnb = GaussianNB()
#gnb.fit(train_x,train_y)
#ypred_gnb = gnb.predict(valid_x)
#auc_gnb = metrics.roc_auc_score(list(valid_y),ypred_gnb)
#print("AUC of Naive Bayes Model_1: ",auc_gnb)

AUC of Random forest Model_1:  0.5106725526081415
AUC of Naive Bayes Model_1:  0.4978240842438261


In [26]:
'''# Random forest # After removing All the IDs
rf_model_2 = RandomForestClassifier(n_estimators=300,bootstrap = True,max_features = 'sqrt')
rf_model_2.fit(train_x,list(train_y))
ypred_rf2 = rf_model_2.predict(valid_x)
auc_rf2 = metrics.roc_auc_score(list(valid_y),ypred_rf2)
print("AUC of Random forest Model_1: ",auc_rf2)'''

AUC of Random forest Model_1:  0.5121761854123401


In [68]:
'''#import imblearn
from imblearn.over_sampling import SMOTE
smt = SMOTE()
X_train_smt, y_train_smt = smt.fit_sample(train_x, train_y)'''

In [49]:
'''# Random forest_Model 3 after Removing all the IDs and smoting the data
rf_model_3 = RandomForestClassifier(n_estimators=300,bootstrap = True,max_features = 'sqrt')
rf_model_3.fit(train_x,list(train_y))
ypred_rf3 = rf_model_3.predict(valid_x)
auc_rf3 = metrics.roc_auc_score(list(valid_y),ypred_rf3)
print("AUC of Random forest Model_1: ",auc_rf3)'''

AUC of Random forest Model_1:  0.512400565908616


In [50]:
'''# Finding the version of python installed
import struct;print(struct.calcsize("P") * 8)'''

64


In [161]:
# Logistic regression - First Trial
# Data used is generated through function "AllDataManipulations" - which is created just for train data

from sklearn.linear_model import LogisticRegression
LR1_non86 = LogisticRegression(class_weight = 'balanced',multi_class ='ovr',solver = 'liblinear') 
LR1_non86.fit(trainnon86_x,list(trainnon86_y, ))
ypred_lr1_non86 = LR1_non86.predict(validnon86_x)
print("AUC through Logistic Regression for all other manufacturers except 86 is: ", metrics.roc_auc_score(list(validnon86_y),ypred_lr1_non86))

AUC through Logistic Regression for all other manufacturers except 86 is:  0.5658069261020856


In [163]:
# Logistic regression - Second Trial
# Data used is generated through function "AllDataManipulations" - which is created just for train data

from sklearn.linear_model import LogisticRegression
LR2_non86 = LogisticRegression(class_weight = 'balanced',multi_class ='ovr',solver = 'liblinear') 
LR2_non86.fit(trainnon86_x_smt,list(trainnon86_y_smt, ))
ypred_lr2_non86 = LR2_non86.predict(validnon86_x)
print("AUC through Logistic Regression for all other manufacturers except 86 is - Using Smote Data: ", metrics.roc_auc_score(list(validnon86_y),ypred_lr2_non86))

AUC through Logistic Regression for all other manufacturers except 86 is - Using Smote Data:  0.5607912402025323


In [168]:
# Logistic regression - For Manufacturer 86
#from sklearn.linear_model import LogisticRegression
LR3_86 = LogisticRegression(class_weight = 'balanced',multi_class ='ovr',solver = 'liblinear') 
LR3_86.fit(train86_x_smt,list(train86_y_smt))
ypred_lr3_86 = LR3_86.predict(valid86_x)
print("AUC through Logistic Regression for manufacturer 86 is - Using Smote Data: ", metrics.roc_auc_score(list(validn86_y),ypred_lr3_86))

AUC through Logistic Regression for all other manufacturers except 86 is - Using Smote Data:  0.5839073565349296


In [169]:
'''# Trying out Decision tree
from sklearn.tree import DecisionTreeClassifier

# This uses default critetia - Gini

DT1 = DecisionTreeClassifier(max_depth=10,random_state=1234)
DT1.fit(train_x,list(train_y))
ypred_DT1 = DT1.predict(valid_x)
print("AUC through Decision tree using Obvious data manipulations is: ", metrics.roc_auc_score(list(valid_y),ypred_DT1))

DT2 = DecisionTreeClassifier(max_depth=10,random_state=1234)
DT2.fit(X_train_smt,list(y_train_smt))
ypred_DT2 = DT2.predict(valid_x)
print("AUC through Decision tree using SMOTE Data is: ", metrics.roc_auc_score(list(valid_y),ypred_DT2))'''

'# Trying out Decision tree\nfrom sklearn.tree import DecisionTreeClassifier\n\n# This uses default critetia - Gini\n\nDT1 = DecisionTreeClassifier(max_depth=10,random_state=1234)\nDT1.fit(train_x,list(train_y))\nypred_DT1 = DT1.predict(valid_x)\nprint("AUC through Decision tree using Obvious data manipulations is: ", metrics.roc_auc_score(list(valid_y),ypred_DT1))\n\nDT2 = DecisionTreeClassifier(max_depth=10,random_state=1234)\nDT2.fit(X_train_smt,list(y_train_smt))\nypred_DT2 = DT2.predict(valid_x)\nprint("AUC through Decision tree using SMOTE Data is: ", metrics.roc_auc_score(list(valid_y),ypred_DT2))'

In [89]:
'''# Decision Tree - Tweaking HyperParameters

DT3 = DecisionTreeClassifier(criterion='entropy',splitter = 'random', class_weight = 'balanced' ,random_state=1234)
DT3.fit(train_x,list(train_y))
ypred_DT3 = DT3.predict(valid_x)
print("AUC through Decision tree using Obvious data manipulations and tweaking hyperparameters: ", metrics.roc_auc_score(list(valid_y),ypred_DT3))

DT4 = DecisionTreeClassifier(criterion='entropy',splitter = 'random', class_weight = 'balanced' ,random_state=1234)
DT4.fit(X_train_smt,list(y_train_smt))
ypred_DT4 = DT4.predict(valid_x)
print("AUC through Decision tree using SMOTE Data and tweaking hyperparameters: ", metrics.roc_auc_score(list(valid_y),ypred_DT4))'''

AUC through Decision tree using Obvious data manipulations and tweaking hyperparameters:  0.5324534273725284
AUC through Decision tree using SMOTE Data and tweaking hyperparameters:  0.5246292408602246


In [93]:
'''# Using KNearest Neighbours

from sklearn.neighbors import KNeighborsClassifier

KNN1 = KNeighborsClassifier(n_neighbors=32, weights= 'uniform', algorithm= 'auto', n_jobs= -1)
KNN1.fit(train_x,list(train_y))
ypred_KNN1 = KNN1.predict(valid_x)
print("AUC with KNN1 - using obvious data manipulations: ", metrics.roc_auc_score(list(valid_y),ypred_KNN1))

KNN2 = KNeighborsClassifier(n_neighbors=32, weights= 'uniform', algorithm= 'auto', n_jobs= -1)
KNN2.fit(X_train_smt,list(y_train_smt))
ypred_KNN2 = KNN2.predict(valid_x)
print("AUC with KNN1 - using obvious data manipulations: ", metrics.roc_auc_score(list(valid_y),ypred_KNN2))'''

AUC with KNN1 - using obvious data manipulations:  0.5025899573024244
AUC with KNN1 - using obvious data manipulations:  0.5509846051599274


In [None]:
'''# Trying on SMOTE data - Without tweaking any hyperparameters

KNN3= KNeighborsClassifier(n_jobs= -1)
KNN3.fit(X_train_smt,list(y_train_smt))
ypred_KNN3 = KNN3.predict(valid_x)
print("AUC with KNN3 - using obvious data manipulations: ", metrics.roc_auc_score(list(valid_y),ypred_KNN3))'''

In [170]:
# Modifying data manipulations function to work for test data as it doesn't have labels column
# Function for doing all the data manipulations
def AllDataManipulations_forTest(MainDataFrame,Manip_Data = pd.DataFrame,temp1 = pd.DataFrame, temp2 = pd.DataFrame,Manip_Completed = pd.DataFrame):
    Manip_Data = MainDataFrame.copy()
    Manip_Data['Employment.Type'].fillna(Manip_Data['Employment.Type'].mode()[0],inplace = True)
    Manip_Data[['branch_id','supplier_id','manufacturer_id','Current_pincode_ID','State_ID','Employee_code_ID']] = Manip_Data[
        ['branch_id','supplier_id','manufacturer_id','Current_pincode_ID','State_ID','Employee_code_ID']].apply(lambda x: x.astype('category'))
    Manip_Data['Date.of.Birth'] = pd.to_datetime(Manip_Data['Date.of.Birth'])
    Manip_Data['Age'] = Manip_Data['Date.of.Birth'].apply(lambda x: calculate_age(x))
    Manip_Data['DisbursalDate'] = pd.to_datetime(Manip_Data['DisbursalDate'])
    Manip_Data['HowManyDaysSinceDisburse'] = Manip_Data['DisbursalDate'].dt.date.apply(lambda x: datediff(x))
    Manip_Data['AvgAcctAgeInMonths'] = Manip_Data['AVERAGE.ACCT.AGE'].apply(lambda x: age_length(x))
    Manip_Data['CredHistLenInMonts'] = Manip_Data['CREDIT.HISTORY.LENGTH'].apply(lambda x: age_length(x))
    temp1 = pd.get_dummies(Manip_Data['Employment.Type'],prefix='EmploymentType')
    temp2 = pd.get_dummies(Manip_Data['PERFORM_CNS.SCORE.DESCRIPTION'],prefix='Bureau_score_description')
    Manip_Completed = pd.concat([Manip_Data[['disbursed_amount', 'asset_cost', 'ltv', 'branch_id',
       'supplier_id', 'manufacturer_id', 'Current_pincode_ID','Age',
       'HowManyDaysSinceDisburse','State_ID', 'Employee_code_ID',
       'MobileNo_Avl_Flag', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag',
       'Driving_flag', 'Passport_flag', 'PERFORM_CNS.SCORE',
       'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS','PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT',
       'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS', 'SEC.ACTIVE.ACCTS',
       'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT',
       'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT',
       'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
       'AvgAcctAgeInMonths','CredHistLenInMonts','NO.OF_INQUIRIES']],temp1,temp2],axis=1)
    return Manip_Data,Manip_Completed

In [171]:
Master_test = pd.read_csv("test_bqCt9Pv.csv")
Test_Data = Master_test.copy()
Test_ManipData,Test_ManipCompleted = AllDataManipulations_forTest(Test_Data)

In [172]:
Test_ManipCompleted.head()

Unnamed: 0,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Age,HowManyDaysSinceDisburse,State_ID,...,Bureau_score_description_J-High Risk,Bureau_score_description_K-High Risk,Bureau_score_description_L-Very High Risk,Bureau_score_description_M-Very High Risk,Bureau_score_description_No Bureau History Available,Bureau_score_description_Not Scored: No Activity seen on the customer (Inactive),Bureau_score_description_Not Scored: No Updates available in last 36 months,Bureau_score_description_Not Scored: Not Enough Info available on the customer,Bureau_score_description_Not Scored: Only a Guarantor,Bureau_score_description_Not Scored: Sufficient History Not Available
0,53478,63558,86.54,67,22807,45,1497,45,402,6,...,0,0,0,0,1,0,0,0,0,0
1,55513,63163,89.45,67,22807,45,1497,33,148,6,...,0,0,0,0,0,0,0,0,0,0
2,65282,84320,79.93,78,23135,86,2071,23,139,4,...,0,0,0,0,1,0,0,0,0,0
3,46905,63896,76.58,78,17014,45,2070,46,139,4,...,0,0,0,0,0,0,0,0,1,0
4,51428,63896,86.08,78,17014,45,2069,47,151,4,...,0,0,0,0,1,0,0,0,0,0


In [195]:
Test_ManipCompleted['Bureau_score_description_Not Scored: More than 50 active Accounts found'] = 0

In [196]:
len(Test_ManipCompleted.columns)

59

In [191]:
import numpy as np
np.array(Test_ManipCompleted.iloc[1]).shape

(58,)

In [194]:
train86_x.columns

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [202]:
range(len(Test_ManipCompleted))

range(0, 112392)

In [204]:
ypred_diffModelStacker = pd.DataFrame(columns = ['loan_default'],index = range(len(Test_ManipCompleted.index)))
for i in range(len(Test_ManipCompleted)):
    if(Test_ManipCompleted['manufacturer_id'][i]==86):
        ypred_diffModelStacker['loan_default'][i] = LR3_86.predict(np.array(Test_ManipCompleted.iloc[i]).reshape(1,-1))
    else:
        ypred_diffModelStacker['loan_default'][i] = LR1_non86.predict(np.array(Test_ManipCompleted.iloc[i]).reshape(1,-1))

In [205]:
ypred_diffModelStacker.head()

Unnamed: 0,loan_default
0,[1]
1,[1]
2,[1]
3,[1]
4,[1]


In [206]:
len(ypred_diffModelStacker)

112392

In [207]:
DiffModelStacker = pd.concat([Test_Data['UniqueID'],ypred_diffModelStacker],axis=1,ignore_index=True)

In [209]:
DiffModelStacker.columns = ['UniqueID','loan_default']

In [211]:
DiffModelStacker.to_csv("DiffModelStacker.csv",index=False)

# Finding Feature importance
RF_feature_importances = pd.DataFrame(rf_model_1.feature_importances_,
                                   index = train_x.columns,
                                    columns=['importance']).sort_values('importance',ascending=False)

# Function to % defaulted for each supplier
def Supplier_DefaulterFinder(TotalDataFrame,tempDf_1=pd.DataFrame(),tempDf_2 =pd.DataFrame(),tempDf_3 = pd.DataFrame()):
    tempDf_1 = pd.DataFrame(TotalDataFrame['supplier_id'].value_counts())
    tempDf_1.reset_index(level=0, inplace=True)
    tempDf_1.columns = ['supplier_id_1','TotalNoOfVehiclesSold']
    tempDf_2 = pd.DataFrame(TotalDataFrame.groupby('supplier_id')['loan_default'].value_counts())
    tempDf_2.reset_index(level=0, inplace=True)
    tempDf_2.columns = ['supplier_id_2','NoOfDefaulted_Sales']
    tempDf_2.reset_index(level=0, inplace=True)
    tempDf_2 = tempDf_2[(tempDf_2['loan_default'] == 1)]
    tempDf_1['supplier_id_1'] = tempDf_1['supplier_id_1'].astype('int')
    tempDf_2['supplier_id_2'] = tempDf_2['supplier_id_2'].astype('int')
    tempDf3 = pd.merge(tempDf_2, tempDf_1, how='right', on= ['supplier_id_2','supplier_id_1'],
                       left_index=False, right_index=False)
    tempDf3 = tempDf3[['supplier_id','TotalNoOfVehiclesSold','NoOfDefaulted_Sales']]
    tempDf3['Defaults%_DueToSupplier'] = ((tempDf3['NoOfDefaulted_Sales'])/(tempDf3['TotalNoOfVehiclesSold']))*100
    return tempDf3

In [74]:
Test_Data.columns

Index(['UniqueID', 'disbursed_amount', 'asset_cost', 'ltv', 'branch_id',
       'supplier_id', 'manufacturer_id', 'Current_pincode_ID', 'Date.of.Birth',
       'Employment.Type', 'DisbursalDate', 'State_ID', 'Employee_code_ID',
       'MobileNo_Avl_Flag', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag',
       'Driving_flag', 'Passport_flag', 'PERFORM_CNS.SCORE',
       'PERFORM_CNS.SCORE.DESCRIPTION', 'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS',
       'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT',
       'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS', 'SEC.ACTIVE.ACCTS',
       'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT',
       'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT',
       'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
       'AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH', 'NO.OF_INQUIRIES'],
      dtype='object')

In [81]:
Testpred_LR1 = LR1.predict(Test_ManipCompleted)
Testpred_LR1_Submission = pd.concat([Test_Data['UniqueID'],pd.DataFrame(Testpred_LR1,columns = ['loan_default'])],axis =1)
Testpred_LR2 = LR2.predict(Test_ManipCompleted)
Testpred_LR2_Submission = pd.concat([Test_Data['UniqueID'],pd.DataFrame(Testpred_LR2,columns = ['loan_default'])],axis =1)

In [87]:
Testpred_LR1_Submission.head()

Unnamed: 0,UniqueID,loan_default
0,655269,1
1,723482,0
2,758529,1
3,763449,1
4,708663,1


In [83]:
Testpred_LR2_Submission.head()

Unnamed: 0,UniqueID,loan_default
0,655269,1
1,723482,1
2,758529,1
3,763449,1
4,708663,1


In [85]:
Testpred_LR1_Submission.reset_index(drop=True,inplace=True)
Testpred_LR2_Submission.reset_index(drop=True,inplace=True)

In [86]:
Testpred_LR1_Submission.to_csv("Testpred_LR1_Submission.csv")
Testpred_LR2_Submission.to_csv("Testpred_LR2_Submission.csv")

In [91]:
Testpred_DT3 = DT3.predict(Test_ManipCompleted)
Testpred_DT3_Submission = pd.concat([Test_Data['UniqueID'],pd.DataFrame(Testpred_DT3,columns = ['loan_default'])],axis =1)
Testpred_DT3_Submission.reset_index(drop=True, inplace=True)
Testpred_DT3_Submission.to_csv("Testpred_DT3_Submission.csv")

In [96]:
Testpred_KNN2 = KNN2.predict(Test_ManipCompleted)

In [103]:
Testpred_KNN2_Submission = pd.concat([Test_Data['UniqueID'],pd.DataFrame(Testpred_KNN2,columns = ['loan_default'])],axis =1)

In [104]:
Testpred_KNN2_Submission.head()

Unnamed: 0,UniqueID,loan_default
0,655269,0
1,723482,1
2,758529,0
3,763449,0
4,708663,1


In [107]:
#Testpred_KNN2_Submission = pd.concat([Test_Data['UniqueID'],pd.DataFrame(Testpred_KNN2,columns = ['loan_default'])],axis =1)
Testpred_KNN2_Submission = Testpred_KNN2_Submission.reset_index(drop=True)
Testpred_KNN2_Submission.to_csv("Testpred_KNN2_Submission.csv")

In [106]:
Testpred_KNN2_Submission.head()

Unnamed: 0,UniqueID,loan_default
0,655269,0
1,723482,1
2,758529,0
3,763449,0
4,708663,1


In [109]:
# Stacker

StackerPred = pd.merge(Testpred_LR1_Submission,Testpred_LR2_Submission,how='inner',left_on='UniqueID',right_on='UniqueID')

In [110]:
print(len(Testpred_LR1_Submission))
print(len(StackerPred))

112392
112392


In [111]:
StackerPred.head()

Unnamed: 0,UniqueID,loan_default_x,loan_default_y
0,655269,1,1
1,723482,0,1
2,758529,1,1
3,763449,1,1
4,708663,1,1


In [112]:
StackerPred_final = pd.merge(StackerPred,Testpred_KNN2_Submission,how='inner',left_on='UniqueID',right_on='UniqueID')

In [115]:
StackerPred_final.columns = ['UniqueID', 'loan_default_LR1', 'loan_default_LR2', 'loan_default_KNN2']

In [116]:
StackerPred_final.head()

Unnamed: 0,UniqueID,loan_default_LR1,loan_default_LR2,loan_default_KNN2
0,655269,1,1,0
1,723482,0,1,1
2,758529,1,1,0
3,763449,1,1,0
4,708663,1,1,1


In [117]:
max(StackerPred_final['loan_default_LR1'][1],StackerPred_final['loan_default_LR2'][1],StackerPred_final['loan_default_KNN2'][1])

1

In [126]:
max(StackerPred_final.iloc[1][[1,2,3]])

1

Just_List = pd.DataFrame(columns = ['UniqueID','loan_default_stacker'])
for i in range(len(StackerPred_final)):
    Just_List['UniqueID'][i] = StackerPred_final['UniqueID'][i]
    Just_List['loan_default_stacker'][i] = max(StackerPred_final['loan_default_LR1'][i],StackerPred_final['loan_default_LR2'][i],StackerPred_final['loan_default_KNN2'][i])

In [133]:
def max_finder(DataFrame,new_list = []):
    for i in range(len(DataFrame)):
        new_list.append(max(DataFrame.iloc[i][[1,2,3]]))
    return new_list

In [143]:
for index_label, row_series in StackerPred_final.iterrows():
   # For each row update the 'Bonus' value to it's double
   StackerPred_final.at[index_label , 'loan_default'] = max(row_series[['loan_default_LR1','loan_default_LR2','loan_default_KNN2']])

In [145]:
StackerPred_final['loan_default'] = StackerPred_final['loan_default'].astype('int')

In [146]:
StackerPred_final.head()

Unnamed: 0,UniqueID,loan_default_LR1,loan_default_LR2,loan_default_KNN2,loan_default
0,655269,1,1,0,1
1,723482,0,1,1,1
2,758529,1,1,0,1
3,763449,1,1,0,1
4,708663,1,1,1,1


In [147]:
StackerPred_Submission = StackerPred_final[['UniqueID','loan_default']]

In [148]:
len(StackerPred_Submission)

112392

In [149]:
StackerPred_Submission.to_csv("StackerPred_Submission.csv",index=False)

In [136]:
justlist = max_finder(StackerPred_final)

In [137]:
len(justlist)

337176

In [138]:
len(StackerPred_final)

112392