In [18]:
# To upload our datasets from our working directory we need to mount our drive contents to the colab environment. 
# For the code to do so you can search “mount” in code snippets or use the code given below. 
# Our entire drive contents are now mounted on colab at the location “/gdrive”.
from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive


Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [19]:
!pip install vecstack




In [20]:
from vecstack import stacking
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score #works
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
#from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE 
from sklearn.svm import SVC
from collections import Counter #for Smote, 

import warnings
warnings.filterwarnings("ignore")


In [21]:

trainfile = r'/gdrive/My Drive/CIS508/Assignment-2/PortBank-TRAIN.csv'
train_data = pd.read_csv(trainfile)

#train_data = pd.read_csv("C:/Users/admin/Downloads/Insurance Fraud - TRAIN-3000(1).csv")


testfile = r'/gdrive/My Drive/CIS508/Assignment-2/PortBank-TEST.csv'
test_data = pd.read_csv(testfile)

#test_data = pd.read_csv("C:/Users/admin/Downloads/Insurance Fraud -TEST-12900(1).csv")


print(train_data.shape)
print(test_data.shape)
print(train_data.head())    


(4521, 17)
(45211, 17)
   age          job  marital  education  ... pdays  previous poutcome   y
0   30   unemployed  married    primary  ...    -1         0  unknown  no
1   33     services  married  secondary  ...   339         4  failure  no
2   35   management   single   tertiary  ...   330         1  failure  no
3   30   management  married   tertiary  ...    -1         0  unknown  no
4   59  blue-collar  married  secondary  ...    -1         0  unknown  no

[5 rows x 17 columns]


In [22]:
#Do one-hot encoding of categorical variables

categoricalFeatures = ["job",
"marital",
"education",
"default",
"housing",
"loan",
"contact",
"month",
"poutcome"]


#Combine Train and test for one Hot Encoding
combined_Data = pd.concat([train_data,test_data], keys=[0,1])

#Do one Hot encoding for categorical features
combined_Data["y"]=combined_Data["y"].map({"yes":1,"no":0})
combined_Data = pd.get_dummies(combined_Data,columns=categoricalFeatures)
print(combined_Data['y'])

#Separate Train data and test data
X_train = combined_Data.xs(0)
X_test = combined_Data.xs(1)
y_train=X_train["y"]
y_test=X_test["y"]
X_train1=X_train.iloc[:, :-1].copy()
X_test1=X_test.iloc[:, :-1].copy()


0  0        0
   1        0
   2        0
   3        0
   4        0
           ..
1  45206    1
   45207    1
   45208    1
   45209    0
   45210    0
Name: y, Length: 49732, dtype: int64


In [None]:
#CONSTRUCT DEFAULT DECISION TREE AND OBTAIN RESPECTIVE ACCURACY 
clf = DecisionTreeClassifier()
clf.fit(X_train1,y_train)
clf_predict=clf.predict(X_test1)
print("accuracy Score (training) for Decision Tree:{0:6f}".format(clf.score(X_test1,y_test)))
print("Confusion Matrix for Decision Tree")
print(confusion_matrix(y_test,clf_predict))


accuracy Score (training) for Decision Tree:1.000000
Confusion Matrix for Decision Tree
[[39922     0]
 [    0  5289]]


In [None]:
#Hyperparameter tuning done for decision tree classifier
parameters={'min_samples_split' : range(10,100,10),'max_depth': range(1,20,2)}
clf_random = RandomizedSearchCV(clf,parameters,n_iter=15)
clf_random.fit(X_train1, y_train)
grid_parm=clf_random.best_params_
print(grid_parm)

#Using the parameters obtained from HyperParameterTuning in the DecisionTreeClassifier 
clf = DecisionTreeClassifier(**grid_parm)
clf.fit(X_train1,y_train)
clf_predict = clf.predict(X_test1)

#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("accuracy Score (training) after hypertuning for Decision Tree:{0:6f}".format(clf.score(X_test1,y_test)))
print("Confusion Matrix after hypertuning for Decision Tree")
print(confusion_matrix(y_test,clf_predict))
print("=== Classification Report ===")
print(classification_report(y_test,clf_predict))

#get cross-validation report
clf_cv_score = cross_val_score(clf, X_train1, y_train, cv=10, scoring="roc_auc")
print("=== All AUC Scores ===")
print(clf_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Decision Tree: ",clf_cv_score.mean())


{'min_samples_split': 70, 'max_depth': 9}
accuracy Score (training) after hypertuning for Decision Tree:1.000000
Confusion Matrix after hypertuning for Decision Tree
[[39922     0]
 [    0  5289]]
=== Classification Report ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     39922
           1       1.00      1.00      1.00      5289

    accuracy                           1.00     45211
   macro avg       1.00      1.00      1.00     45211
weighted avg       1.00      1.00      1.00     45211

=== All AUC Scores ===
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


=== Mean AUC Score ===
Mean AUC Score - Decision Tree:  1.0


In [None]:
#Construct Random Forest Model

rfc = RandomForestClassifier()
rfc.fit(X_train1, y_train)
rfc_predict=rfc.predict(X_test1)
print("accuracy Score (training) for RandomForest:{0:6f}".format(rfc.score(X_test1,y_test)))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test,rfc_predict))

accuracy Score (training) for RandomForest:1.000000
Confusion Matrix for Random Forest:
[[39922     0]
 [    0  5289]]


In [None]:
#Hyperparameter tuning for random forest classifier
rfc_random = RandomizedSearchCV(rfc,parameters,n_iter=15)
rfc_random.fit(X_train1, y_train)
grid_parm_rfc=rfc_random.best_params_
print(grid_parm_rfc)

#Construct Random Forest with best parameters
rfc= RandomForestClassifier(**grid_parm_rfc)
rfc.fit(X_train1,y_train)
rfc_predict = rfc.predict(X_test1)
print("accuracy Score (training) after hypertuning for Random Forest:{0:6f}".format(rfc.score(X_test1,y_test)))
print("Confusion Matrix after hypertuning for Random Forest:")
print(confusion_matrix(y_test,rfc_predict))
print("=== Classification Report ===")
print(classification_report(y_test,rfc_predict))

#get cross-validation report
rfc_cv_score = cross_val_score(rfc, X_train1, y_train, cv=10, scoring="roc_auc")
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ",rfc_cv_score.mean())


{'min_samples_split': 80, 'max_depth': 5}
accuracy Score (training) after hypertuning for Random Forest:0.999359
Confusion Matrix after hypertuning for Random Forest:
[[39922     0]
 [   29  5260]]
=== Classification Report ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     39922
           1       1.00      0.99      1.00      5289

    accuracy                           1.00     45211
   macro avg       1.00      1.00      1.00     45211
weighted avg       1.00      1.00      1.00     45211

=== All AUC Scores ===
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  1.0


In [None]:
#Construct MultiLayer Perceptron Model
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(max_iter=100)
mlp.fit(X_train1, y_train)
mlp_predict=mlp.predict(X_test1)
print("accuracy Score (training) for MultiLayer Perceptron:{0:6f}".format(mlp.score(X_test1,y_test)))
print("Confusion Matrix for MultiLayer Perceptron:")
print(confusion_matrix(y_test,mlp_predict))


accuracy Score (training) for MultiLayer Perceptron:0.945323
Confusion Matrix for MultiLayer Perceptron:
[[38803  1119]
 [ 1353  3936]]


In [None]:
#Hyperparameter tuning done for MultiLayer Perceptron classifier

#parameters = {'hidden_layer_sizes':[(10,), (20,)], 'activation':['tanh', 'relu'], 'solver':['sgd', 'adam'], 'alpha': [0.0001, 0.05], 'learning_rate':['constant', 'adaptive']}
#parameters = {'hidden_layer_sizes':[(10,5), (20,5)], 'activation':['tanh', 'relu'], 'learning_rate':['constant', 'adaptive']}
parameters = {'hidden_layer_sizes':[(10,5,3), (20,7,3)], 'activation':['tanh', 'relu'], 'learning_rate':['constant', 'adaptive'], 'max_iter' :[100, 150]}
#parameters = {'hidden_layer_sizes':[(10,), (15,), (10,5), (20,7,3)]}

mlp_random = RandomizedSearchCV(mlp,parameters,n_iter=15)
mlp_random.fit(X_train1, y_train)
grid_parm=mlp_random.best_params_
print(grid_parm)

#Using the parameters obtained from HyperParameterTuning in the MLPClassifier 
mlp = MLPClassifier(**grid_parm)
mlp.fit(X_train1,y_train)
mlp_predict = mlp.predict(X_test1)

#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("accuracy Score (training) after hypertuning for MultiLayer Perceptron:{0:6f}".format(mlp.score(X_test1,y_test)))
print("Confusion Matrix after hypertuning for MultiLayer Perceptron")
print(confusion_matrix(y_test,mlp_predict))
print("=== Classification Report ===")
print(classification_report(y_test,mlp_predict))

#get cross-validation report
mlp_cv_score = cross_val_score(mlp, X_train1, y_train, cv=10, scoring="roc_auc")
print("=== All AUC Scores ===")
print(mlp_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - MultiLayer Perceptron: ",mlp_cv_score.mean())


{'max_iter': 150, 'learning_rate': 'constant', 'hidden_layer_sizes': (20, 7, 3), 'activation': 'tanh'}
accuracy Score (training) after hypertuning for MultiLayer Perceptron:0.964854
Confusion Matrix after hypertuning for MultiLayer Perceptron
[[38359  1563]
 [   26  5263]]
=== Classification Report ===
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     39922
           1       0.77      1.00      0.87      5289

    accuracy                           0.96     45211
   macro avg       0.89      0.98      0.92     45211
weighted avg       0.97      0.96      0.97     45211

=== All AUC Scores ===
[0.98636792 0.835625   0.7821875  0.83435096 0.86038462 0.85951923
 0.98055288 0.99173077 0.949375   0.80257212]


=== Mean AUC Score ===
Mean AUC Score - MultiLayer Perceptron:  0.888266600145138


In [None]:
#Construct K-Nearest Neighbor Model
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)

neigh.fit(X_train1, y_train)
neigh_predict=neigh.predict(X_test1)
print("accuracy Score (training) for KNeighborsClassifier:{0:6f}".format(neigh.score(X_test1,y_test)))
print("Confusion Matrix for KNeighborsClassifier:")
print(confusion_matrix(y_test,neigh_predict))


accuracy Score (training) for KNeighborsClassifier:0.880162
Confusion Matrix for KNeighborsClassifier:
[[38347  1575]
 [ 3843  1446]]


In [None]:
#Hyperparameter tuning done for K-Nearest Neighbor classifier

parameters = {'n_neighbors':[3,5,7,9,11], 'weights':['uniform', 'distance'], 'p':[1,2]}


neigh_random = RandomizedSearchCV(neigh,parameters,n_iter=15)
neigh_random.fit(X_train1, y_train)
grid_parm=neigh_random.best_params_
print(grid_parm)

#Using the parameters obtained from HyperParameterTuning in the MLPClassifier 
neigh = KNeighborsClassifier(**grid_parm)
neigh.fit(X_train1,y_train)
neigh_predict = neigh.predict(X_test1)

#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("accuracy Score (training) after hypertuning for KNeighborsClassifier:{0:6f}".format(neigh.score(X_test1,y_test)))
print("Confusion Matrix after hypertuning for KNeighborsClassifier")
print(confusion_matrix(y_test,neigh_predict))
print("=== Classification Report ===")
print(classification_report(y_test,neigh_predict))

#get cross-validation report
neigh_cv_score = cross_val_score(neigh, X_train1, y_train, cv=10, scoring="roc_auc")
print("=== All AUC Scores ===")
print(neigh_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - KNeighborsClassifier: ",neigh_cv_score.mean())


{'weights': 'uniform', 'p': 1, 'n_neighbors': 9}
accuracy Score (training) after hypertuning for KNeighborsClassifier:0.887196
Confusion Matrix after hypertuning for KNeighborsClassifier
[[39152   770]
 [ 4330   959]]
=== Classification Report ===
              precision    recall  f1-score   support

           0       0.90      0.98      0.94     39922
           1       0.55      0.18      0.27      5289

    accuracy                           0.89     45211
   macro avg       0.73      0.58      0.61     45211
weighted avg       0.86      0.89      0.86     45211

=== All AUC Scores ===
[0.78139151 0.77776442 0.70728365 0.77163462 0.80454327 0.83709135
 0.76987981 0.77536058 0.78526442 0.84980769]


=== Mean AUC Score ===
Mean AUC Score - KNeighborsClassifier:  0.7860021317126271


In [29]:
#Construct Linear Support Vector Machine Model
from sklearn.svm import LinearSVC 
linsvm = LinearSVC(max_iter=300) 
linsvm.fit(X_train1, y_train) 
linsvm_predict=linsvm.predict(X_test1) 
print("accuracy Score (training) for Linear SVM Classifier:{0:6f}".format(linsvm.score(X_test1,y_test))) 
print("Confusion Matrix for Linear SVM Classifier:") 
print(confusion_matrix(y_test,linsvm_predict))


accuracy Score (training) for Linear SVM Classifier:0.873394
Confusion Matrix for Linear SVM Classifier:
[[36871  3051]
 [ 2673  2616]]


In [32]:
#Construct Support Vector Machine Model
from sklearn.svm import SVC 
svm = SVC(max_iter=500) 
svm.fit(X_train1, y_train) 
svm_predict=svm.predict(X_test1) 
print("accuracy Score (training) for SVM Classifier:{0:6f}".format(svm.score(X_test1,y_test))) 
print("Confusion Matrix for SVM Classifier:") 
print(confusion_matrix(y_test,svm_predict))

accuracy Score (training) for SVM Classifier:0.859769
Confusion Matrix for SVM Classifier:
[[38588  1334]
 [ 5006   283]]


In [None]:
#Construct Gradient Boosting model

search_grid={'n_estimators':[5,10,20],'learning_rate':[0.01,.1]}
abc =GradientBoostingClassifier()
abc.fit(X_train1, y_train)
abc_predict=abc.predict(X_test1)
print("accuracy Score (training) for Boosting:{0:6f}".format(abc.score(X_test1,y_test)))
print("Confusion Matrix for boosting:")
print(confusion_matrix(y_test,abc_predict))
abc_random = RandomizedSearchCV(abc,search_grid,n_iter=15)
abc_random.fit(X_train1, y_train)
grid_parm_abc=abc_random.best_params_
print(grid_parm_abc)
abc= GradientBoostingClassifier(**grid_parm_abc)
abc.fit(X_train1,y_train)
abc_predict = abc.predict(X_test1)
print("accuracy Score (training) after hypertuning for Boosting:{0:6f}".format(abc.score(X_test1,y_test)))
print("Confusion Matrix after hypertuning for Boosting:")
print(confusion_matrix(y_test,abc_predict))
print("=== Classification Report ===")
print(classification_report(y_test,abc_predict))
abc_cv_score = cross_val_score(abc, X_train1, y_train, cv=10, scoring="roc_auc")
print("=== All AUC Scores ===")
print(abc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Boosting: ",abc_cv_score.mean())

accuracy Score (training) for Boosting:1.000000
Confusion Matrix for boosting:
[[39922     0]
 [    0  5289]]
{'n_estimators': 5, 'learning_rate': 0.1}
accuracy Score (training) after hypertuning for Boosting:1.000000
Confusion Matrix after hypertuning for Boosting:
[[39922     0]
 [    0  5289]]
=== Classification Report ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     39922
           1       1.00      1.00      1.00      5289

    accuracy                           1.00     45211
   macro avg       1.00      1.00      1.00     45211
weighted avg       1.00      1.00      1.00     45211

=== All AUC Scores ===
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


=== Mean AUC Score ===
Mean AUC Score - Boosting:  1.0


# ***SMOTE***

In [33]:
print("___________________________________________________________________\nSMOTE\n")
print('Original dataset shape %s' % Counter(y_train))
sm = SMOTE(sampling_strategy='float', ratio=0.5)
X_res, y_res = sm.fit_resample(X_train1, y_train)
print('Resampled dataset shape %s' % Counter(y_res))

___________________________________________________________________
SMOTE

Original dataset shape Counter({0: 4000, 1: 521})
Resampled dataset shape Counter({0: 4000, 1: 2000})


# ***ENSEMBLE METHODS STACKING***

In [34]:
print("___________________________________________________________________________________________\nEnsemble Methods Predictions using GradientBoosting, RandomForest and Decision Tree Classifier\n")

models = [ KNeighborsClassifier(), MLPClassifier(), SVC(), LinearSVC(), RandomForestClassifier(), DecisionTreeClassifier() ]
      
S_Train, S_Test = stacking(models,                   
                           X_res, y_res, X_test1,   
                           regression=False, 
     
                           mode='oof_pred_bag', 
       
                           needs_proba=False,
         
                           save_dir=None, 
            
                           metric=accuracy_score, 
    
                           n_folds=4, 
                 
                           stratified=True,
            
                           shuffle=True,  
            
                           random_state=0,    
         
                           verbose=2)


___________________________________________________________________________________________
Ensemble Methods Predictions using GradientBoosting, RandomForest and Decision Tree Classifier

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [6]

model  0:     [KNeighborsClassifier]
    fold  0:  [0.83266667]
    fold  1:  [0.83200000]
    fold  2:  [0.82600000]
    fold  3:  [0.84800000]
    ----
    MEAN:     [0.83466667] + [0.00812404]
    FULL:     [0.83466667]

model  1:     [MLPClassifier]
    fold  0:  [0.98866667]
    fold  1:  [0.96466667]
    fold  2:  [0.89066667]
    fold  3:  [0.95800000]
    ----
    MEAN:     [0.95050000] + [0.03637879]
    FULL:     [0.95050000]

model  2:     [SVC]
    fold  0:  [0.76333333]
    fold  1:  [0.74000000]
    fold  2:  [0.76666667]
    fold  3:  [0.77733333]
    ----
    MEAN:     [0.76183333] + [0.01362494]
    FULL:     [0.76183333]

model  3:     [LinearSVC]
    fold  

In [36]:
model = MLPClassifier()
    
model = model.fit(S_Train, y_res)
y_pred = model.predict(S_Test)
print('Final prediction score for ensemble methods: [%.8f]' % accuracy_score(y_test, y_pred))


Final prediction score for ensemble methods: [1.00000000]


In [37]:
model = KNeighborsClassifier()
    
model = model.fit(S_Train, y_res)
y_pred = model.predict(S_Test)
print('Final prediction score for ensemble methods: [%.8f]' % accuracy_score(y_test, y_pred))

Final prediction score for ensemble methods: [1.00000000]


In [35]:
model = GradientBoostingClassifier()
    
model = model.fit(S_Train, y_res)
y_pred = model.predict(S_Test)
print('Final prediction score for ensemble methods: [%.8f]' % accuracy_score(y_test, y_pred))

Final prediction score for ensemble methods: [1.00000000]


In [None]:
#Get Prediction Probability for the predicted class as a dataframe
pred_Probability =pd.DataFrame(model.predict_proba(S_Test))

pred_Probability.head()

Unnamed: 0,0,1
0,0.999783,0.000217
1,0.999783,0.000217
2,0.999783,0.000217
3,0.999783,0.000217
4,0.999783,0.000217
