In [235]:
import pandas as pd 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor 
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.ensemble import BaggingClassifier 
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier 
from dmba import regressionSummary, classificationSummary,plotDecisionTree
from sklearn import preprocessing
import matplotlib.pyplot as plt
import scikitplot as skplt
import math 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, roc_curve, auc
from dmba import liftChart, gainsChart
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

In [236]:
# importing the data and preparing the training and validation data
customer_df = pd.read_csv('UniversalBank.csv') 
customer_df.drop( columns =['ID', 'ZIP Code'], inplace = True)
X = customer_df.drop( columns =['Personal Loan']) 
y = customer_df['Personal Loan'] 
train_X, valid_X, train_y, valid_y = train_test_split( X, y, test_size = 0.4, random_state = 1)
trainData = pd.concat([train_X,train_y.transpose()],axis=1)
validData = pd.concat([valid_X,valid_y.transpose()],axis=1)

KNN Modeling

In [237]:
#k-nearest neighbors with k = 3
# initialize normalized training, validation, and complete data frames 
# use the training data to learn the transformation.
scaler = preprocessing.StandardScaler()
scaler.fit( trainData[['Age','Experience','Income','Family','CCAvg','Education','Mortgage','Securities Account','CD Account','Online','CreditCard']]) # Use of array of column names

StandardScaler()

In [238]:
# Transform the full dataset
customer_df['Number'] = customer_df.index + 1
customerNorm = pd.concat([ pd.DataFrame( scaler.transform( customer_df[['Age','Experience','Income','Family','CCAvg','Education','Mortgage','Securities Account','CD Account','Online','CreditCard']]), columns =['zAge','zExperience','zIncome','zFamily','zCCAvg','zEducation','zMortgage','zSecurities Account','zCD Account','zOnline','zCreditCard']), customer_df[['Personal Loan','Number']]], axis = 1)

In [239]:
trainNorm = customerNorm.iloc[trainData.index] 
validNorm = customerNorm.iloc[validData.index]

In [240]:
train_X1 = trainNorm[['zAge','zExperience','zIncome','zFamily','zCCAvg','zEducation','zMortgage','zSecurities Account','zCD Account','zOnline','zCreditCard']]
train_y1 = trainNorm['Personal Loan']
valid_X1 = validNorm[['zAge','zExperience','zIncome','zFamily','zCCAvg','zEducation','zMortgage','zSecurities Account','zCD Account','zOnline','zCreditCard']] 
valid_y1 = validNorm['Personal Loan']

In [241]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(train_X1,train_y1)
predicted1=knn.predict(valid_X1)
classes = knn.classes_ 
validation_accuracy1=accuracy_score(valid_y1, predicted1)
print ({'Validation Accuracy': validation_accuracy1})
classificationSummary( valid_y1,predicted1, class_names = classes)

{'Validation Accuracy': 0.9545}
Confusion Matrix (Accuracy 0.9545)

       Prediction
Actual    0    1
     0 1793   14
     1   77  116




In [242]:
predProb_valid1 = knn.predict_proba(valid_X1)[:, 1]
df1 = pd.concat([ pd.DataFrame({'Personal Loan':valid_y1, 'KNN Predicted Value': predicted1}), pd.DataFrame( {'KNN Prob': predProb_valid1}, index = valid_y1.index)], axis = 1)



In [243]:
df1.head(10)

Unnamed: 0,Personal Loan,KNN Predicted Value,KNN Prob
2764,0,1,0.666667
4767,0,0,0.0
3814,0,0,0.0
3499,0,0,0.0
2735,0,0,0.0
3922,0,0,0.0
2701,0,0,0.0
1179,0,0,0.0
932,0,0,0.0
792,0,1,0.666667


Naive Bayes 

In [244]:
train_X2=train_X
valid_X2=valid_X
train_y2=train_y.astype('category')
valid_y2=valid_y.astype('category')

train_X2.Online = train_X2.Online.astype('category')
train_X2.CreditCard = train_X2.CreditCard.astype('category')
train_X2['CD Account'] = train_X2['CD Account'].astype('category')
train_X2['Securities Account'] = train_X2['Securities Account'].astype('category')
train_X2['Family'] = train_X2['Family'].astype('category')
train_X2['Education'] = train_X2['Education'].astype('category')
valid_X2.Online = valid_X2.Online.astype('category')
valid_X2.CreditCard = valid_X2.CreditCard.astype('category')
valid_X2.Family = train_X2.Family.astype('category')
valid_X2.Education = train_X2.Education.astype('category')
valid_X2['CD Account'] = valid_X2['CD Account'].astype('category')
valid_X2['Securities Account'] = valid_X2['Securities Account'].astype('category')


In [245]:
# The remaining columns (Age, Experience, Income, Mortgage and CCAvg) will be binned
train_X2['Age'] = pd.cut(train_X2['Age'], 5, labels=range(1, 6)).astype('category')
train_X2['Experience'] = pd.cut(train_X2['Experience'], 10, labels=range(1, 11)).astype('category')
train_X2['Income'] = pd.cut(train_X2['Income'], 5, labels=range(1, 6)).astype('category')
train_X2['CCAvg'] = pd.cut(train_X2['CCAvg'], 6, labels=range(1, 7)).astype('category')
train_X2['Mortgage'] = pd.cut(train_X2['Mortgage'], 10, labels=range(1, 11)).astype('category')
valid_X2['Age'] = pd.cut(valid_X2['Age'], 5, labels=range(1, 6)).astype('category')
valid_X2['Experience'] = pd.cut(valid_X2['Experience'], 10, labels=range(1, 11)).astype('category')
valid_X2['Income'] = pd.cut(valid_X2['Income'], 5, labels=range(1, 6)).astype('category')
valid_X2['CCAvg'] = pd.cut(valid_X2['CCAvg'], 6, labels=range(1, 7)).astype('category')
valid_X2['Mortgage'] = pd.cut(valid_X2['Mortgage'], 10, labels=range(1, 11)).astype('category')

In [246]:
# Create Dummies
predictors = ['Age','Experience','Income','Family','CCAvg','Education','Mortgage','Securities Account','CD Account','Online','CreditCard']
train_X2=pd.get_dummies(train_X2[predictors])
valid_X2=pd.get_dummies(valid_X2[predictors])

In [247]:
# run Naive Bayes
nb = MultinomialNB( alpha = 0.01) 
nb.fit(train_X2, train_y2)

MultinomialNB(alpha=0.01)

In [248]:
predicted2=nb.predict(valid_X2)
classes = nb.classes_ 
validation_accuracy2=accuracy_score(valid_y2, predicted2)
print ({'Validation Accuracy': validation_accuracy2})
classificationSummary( valid_y2,predicted2, class_names = classes)

{'Validation Accuracy': 0.8795}
Confusion Matrix (Accuracy 0.8795)

       Prediction
Actual    0    1
     0 1639  168
     1   73  120


In [249]:
predProb_valid2 = nb.predict_proba(valid_X2)[:, 1]
df2 = pd.concat([ pd.DataFrame({'Personal Loan':valid_y2,'NB Predicted Value': predicted2}), pd.DataFrame( {'NB Prob': predProb_valid2}, index = valid_y2.index)], axis = 1)

In [250]:
df2.head(10)

Unnamed: 0,Personal Loan,NB Predicted Value,NB Prob
2764,0,0,0.003908
4767,0,0,1e-06
3814,0,0,2e-06
3499,0,0,0.14193
2735,0,0,0.004629
3922,0,0,2e-06
2701,0,0,0.002367
1179,0,0,0.071464
932,0,1,0.718212
792,0,1,0.631708


Decision Tree Classifier

In [251]:
defaultTree = DecisionTreeClassifier( random_state = 1) 
defaultTree.fit(train_X,train_y) 
classes3 = defaultTree.classes_ 
predicted3=defaultTree.predict(valid_X)

validation_accuracy3=accuracy_score(valid_y, predicted3)
print ({'Validation Accuracy': validation_accuracy3})
classificationSummary( valid_y,predicted3, class_names = classes3)


{'Validation Accuracy': 0.8975}
Confusion Matrix (Accuracy 0.8975)

       Prediction
Actual    0    1
     0 1790   17
     1  188    5


In [252]:
predProb_valid3 = defaultTree.predict_proba(valid_X)[:, 1]
df3 = pd.concat([ pd.DataFrame({'Personal Loan':valid_y,'Default Tree Predicted Value': predicted3}), pd.DataFrame( {'Default Tree Prob': predProb_valid3}, index = valid_y1.index)], axis = 1)

In [253]:
df3.head(10)

Unnamed: 0,Personal Loan,Default Tree Predicted Value,Default Tree Prob
2764,0,0,0.0
4767,0,0,0.0
3814,0,0,0.0
3499,0,0,0.0
2735,0,0,0.0
3922,0,0,0.0
2701,0,0,0.0
1179,0,0,0.0
932,0,0,0.0
792,0,0,0.0


In [254]:
df_combined = pd.concat([df1,df2,df3],axis=1)
df_combined.head(10)

Unnamed: 0,Personal Loan,KNN Predicted Value,KNN Prob,Personal Loan.1,NB Predicted Value,NB Prob,Personal Loan.2,Default Tree Predicted Value,Default Tree Prob
2764,0,1,0.666667,0,0,0.003908,0,0,0.0
4767,0,0,0.0,0,0,1e-06,0,0,0.0
3814,0,0,0.0,0,0,2e-06,0,0,0.0
3499,0,0,0.0,0,0,0.14193,0,0,0.0
2735,0,0,0.0,0,0,0.004629,0,0,0.0
3922,0,0,0.0,0,0,2e-06,0,0,0.0
2701,0,0,0.0,0,0,0.002367,0,0,0.0
1179,0,0,0.0,0,0,0.071464,0,0,0.0
932,0,0,0.0,0,1,0.718212,0,0,0.0
792,0,1,0.666667,0,1,0.631708,0,0,0.0


In [255]:
df_combined['Average Probability'] = df_combined.iloc[:, [2,5,8]].mean(axis=1)

In [256]:
df_combined.head(10)

Unnamed: 0,Personal Loan,KNN Predicted Value,KNN Prob,Personal Loan.1,NB Predicted Value,NB Prob,Personal Loan.2,Default Tree Predicted Value,Default Tree Prob,Average Probability
2764,0,1,0.666667,0,0,0.003908,0,0,0.0,0.223525
4767,0,0,0.0,0,0,1e-06,0,0,0.0,3.898063e-07
3814,0,0,0.0,0,0,2e-06,0,0,0.0,6.778286e-07
3499,0,0,0.0,0,0,0.14193,0,0,0.0,0.04731009
2735,0,0,0.0,0,0,0.004629,0,0,0.0,0.001543003
3922,0,0,0.0,0,0,2e-06,0,0,0.0,5.088572e-07
2701,0,0,0.0,0,0,0.002367,0,0,0.0,0.0007889479
1179,0,0,0.0,0,0,0.071464,0,0,0.0,0.02382147
932,0,0,0.0,0,1,0.718212,0,0,0.0,0.239404
792,0,1,0.666667,0,1,0.631708,0,0,0.0,0.4327917


In [257]:
df_combined['Majority Vote'] = df_combined.iloc[:, [1,4,7]].mode(axis=1)[0]

In [258]:
df_combined.head(10)

Unnamed: 0,Personal Loan,KNN Predicted Value,KNN Prob,Personal Loan.1,NB Predicted Value,NB Prob,Personal Loan.2,Default Tree Predicted Value,Default Tree Prob,Average Probability,Majority Vote
2764,0,1,0.666667,0,0,0.003908,0,0,0.0,0.223525,0
4767,0,0,0.0,0,0,1e-06,0,0,0.0,3.898063e-07,0
3814,0,0,0.0,0,0,2e-06,0,0,0.0,6.778286e-07,0
3499,0,0,0.0,0,0,0.14193,0,0,0.0,0.04731009,0
2735,0,0,0.0,0,0,0.004629,0,0,0.0,0.001543003,0
3922,0,0,0.0,0,0,2e-06,0,0,0.0,5.088572e-07,0
2701,0,0,0.0,0,0,0.002367,0,0,0.0,0.0007889479,0
1179,0,0,0.0,0,0,0.071464,0,0,0.0,0.02382147,0
932,0,0,0.0,0,1,0.718212,0,0,0.0,0.239404,0
792,0,1,0.666667,0,1,0.631708,0,0,0.0,0.4327917,1


In [259]:
classificationSummary( valid_y,df_combined['Majority Vote'], class_names = classes3)

Confusion Matrix (Accuracy 0.9430)

       Prediction
Actual    0    1
     0 1798    9
     1  105   88


Bagging Tree with Decision Tree Classifier

In [260]:
bagging = BaggingClassifier( defaultTree, n_estimators = 100, random_state = 1)
bagging.fit( train_X, train_y)
classificationSummary( valid_y, bagging.predict( valid_X), class_names = classes3)

Confusion Matrix (Accuracy 0.9015)

       Prediction
Actual    0    1
     0 1800    7
     1  190    3


Boosted Tree with Decision Tree Classifier

In [261]:
boost = AdaBoostClassifier( defaultTree, n_estimators = 100, random_state = 1)
boost.fit( train_X, train_y)
classificationSummary( valid_y, boost.predict( valid_X), class_names = classes3)

Confusion Matrix (Accuracy 0.9030)

       Prediction
Actual    0    1
     0 1806    1
     1  193    0




Bagging Tree with Naive Bayes

In [262]:
bagging2 = BaggingClassifier( nb, n_estimators = 100, random_state = 1)
bagging2.fit( train_X2, train_y2)
classificationSummary( valid_y2, bagging2.predict( valid_X2), class_names = classes)

Confusion Matrix (Accuracy 0.8800)

       Prediction
Actual    0    1
     0 1640  167
     1   73  120


Boosted Tree with Naive Bayes

In [263]:
boost2 = AdaBoostClassifier(nb, n_estimators = 100, random_state = 1)
boost2.fit( train_X2, train_y2)
classificationSummary( valid_y2, boost2.predict( valid_X2), class_names = classes)

Confusion Matrix (Accuracy 0.9125)

       Prediction
Actual    0    1
     0 1718   89
     1   86  107




Bagging Tree with K-nearest neighbors with k = 3

In [264]:
bagging3 = BaggingClassifier( knn, n_estimators = 100, random_state = 1)
bagging3.fit( train_X1, train_y1)
classificationSummary( valid_y1, bagging3.predict( valid_X1), class_names = classes)

Confusion Matrix (Accuracy 0.9585)

       Prediction
Actual    0    1
     0 1797   10
     1   73  120


Boosted Tree with K-nearest neighbors with k = 3

In [265]:
boost3 = AdaBoostClassifier(knn, n_estimators = 100, random_state = 1)
boost3.fit( train_X1, train_y1)
classificationSummary( valid_y1, boost3.predict( valid_X1), class_names = classes)

ValueError: KNeighborsClassifier doesn't support sample_weight.

In [None]:
customerX1 = [[40,10,84,2,2,2,0,0,0,1,1]]
customerX2 = [[25,6,50,1,1.8,1,1,0,0,1,1]]
customerX3 = [[59,30,120,3,1.9,3,0,0,1,1,0]]

In [None]:
PredictedLoanAcceptancex1 = defaultTree.predict(customerX1)
print("Loan Acceptance by Customer 1 using Decision tree Classifier only is%s" % (PredictedLoanAcceptancex1))
PredictedLoanAcceptancex2 = bagging.predict(customerX1)
print("Loan Acceptance by Customer 1 using Decision tree Classifier and Bagging is %s" % (PredictedLoanAcceptancex2))
PredictedLoanAcceptancex3 = boost.predict(customerX1)
print("Loan Acceptance by Customer 1 using Decision tree Classifier and Boost is %s" % (PredictedLoanAcceptancex3))
PredictedLoanAcceptancex4 = defaultTree.predict(customerX2)
print("Loan Acceptance by Customer 2 using Decision tree Classifier only is%s" % (PredictedLoanAcceptancex4))
PredictedLoanAcceptancex5 = bagging.predict(customerX2)
print("Loan Acceptance by Customer 2 using Decision tree Classifier and Bagging is %s" % (PredictedLoanAcceptancex5))
PredictedLoanAcceptancex6 = boost.predict(customerX2)
print("Loan Acceptance by Customer 2 using Decision tree Classifier and Boost is %s" % (PredictedLoanAcceptancex6))
PredictedLoanAcceptancex7 = defaultTree.predict(customerX3)
print("Loan Acceptance by Customer 3 using Decision tree Classifier only is%s" % (PredictedLoanAcceptancex7))
PredictedLoanAcceptancex8 = bagging.predict(customerX3)
print("Loan Acceptance by Customer 3 using Decision tree Classifier and Bagging is %s" % (PredictedLoanAcceptancex8))
PredictedLoanAcceptancex9 = boost.predict(customerX3)
print("Loan Acceptance by Customer 3 using Decision tree Classifier and Boost is %s" % (PredictedLoanAcceptancex9))

In [None]:
PredictedLoanAcceptancex12 = knn.predict(customerX1)
print("Loan Acceptance by Customer 1 using KNN only is%s" % (PredictedLoanAcceptancex12))
PredictedLoanAcceptancex22 = bagging3.predict(customerX1)
print("Loan Acceptance by Customer 1 using KNN and Bagging is %s" % (PredictedLoanAcceptancex22))
PredictedLoanAcceptancex32 = knn.predict(customerX2)
print("Loan Acceptance by Customer 2 using KNN only is%s" % (PredictedLoanAcceptancex32))
PredictedLoanAcceptancex42 = bagging3.predict(customerX2)
print("Loan Acceptance by Customer 2 using KNN and Boost is %s" % (PredictedLoanAcceptancex42))
PredictedLoanAcceptancex52 = knn.predict(customerX3)
print("Loan Acceptance by Customer 3 using KNN only is%s" % (PredictedLoanAcceptancex52))
PredictedLoanAcceptancex62 = bagging3.predict(customerX3)
print("Loan Acceptance by Customer 3 using KNN and Bagging is %s" % (PredictedLoanAcceptancex62))

#The Naive Bayes has 51 features as input

#Thus the Customer data has to be broken down into the required 51 features which includes the binning and dummy variable

customerX_1 = [[...................]]
customerX_2 = [[...................]]
customerX_3 = [[...................]]

# the predicted values can then be calculated as follows. An easier way to calculate all the values will be to use a loop function.
PredictedLoanAcceptancex13 = nb.predict(customerX_1)

print("Loan Acceptance by Customer 1 using Naive Bayes only is%s" % (PredictedLoanAcceptancex13))

PredictedLoanAcceptancex23 = bagging2.predict(customerX_1)

print("Loan Acceptance by Customer 1 using Naive Bayes and Bagging is %s" % (PredictedLoanAcceptancex23))

PredictedLoanAcceptancex33 = boost2.predict(customerX_1)

print("Loan Acceptance by Customer 1 using Naive Bayes and Boost is %s" % (PredictedLoanAcceptancex33))

PredictedLoanAcceptancex43 = nb.predict(customerX_2)

print("Loan Acceptance by Customer 2 using Naive Bayes only is%s" % (PredictedLoanAcceptancex43))

PredictedLoanAcceptancex53 = bagging2.predict(customerX_2)

print("Loan Acceptance by Customer 2 using Naive Bayes and Bagging is %s" % (PredictedLoanAcceptancex53))

PredictedLoanAcceptancex63 = boost2.predict(customerX_2)

print("Loan Acceptance by Customer 2 using Naive Bayes and Boost is %s" % (PredictedLoanAcceptancex63))

PredictedLoanAcceptancex73 = nb.predict(customerX_3)

print("Loan Acceptance by Customer 3 using Naive Bayes only is%s" % (PredictedLoanAcceptancex73))

PredictedLoanAcceptancex83 = bagging2.predict(customerX_3)

print("Loan Acceptance by Customer 3 using Naive Bayes and Bagging is %s" % (PredictedLoanAcceptancex83))

PredictedLoanAcceptancex93 = boost2.predict(customerX_3)

print("Loan Acceptance by Customer 3 using Naive Bayes and Boost is %s" % (PredictedLoanAcceptancex93))
