In [1]:
#Import all necessary packages to solve these problems

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics 
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive



Mounted at /gdrive
/gdrive


Portuguese Bank Problem

In [2]:
#Import Target Marketing Training Data and create dataframe for it
TrainingFile = r'/gdrive/My Drive/Projects/Target Marketing/TargetMarketingTrain.csv'
TrainingData = pd.read_csv(TrainingFile) 

#Import Insurance Target Marketing Data and create dataframe for it
TestFile = r'/gdrive/My Drive/Projects/Target Marketing/TargetMarketingTest.csv'
TestData = pd.read_csv(TestFile)  

print(TrainingData.shape)
print(TestData.shape)

(4521, 17)
(45211, 17)


In [3]:
TrainingData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


In [4]:
TrainingData.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.170095,1422.657819,15.915284,263.961292,2.79363,39.766645,0.542579
std,10.576211,3009.638142,8.247667,259.856633,3.109807,100.121124,1.693562
min,19.0,-3313.0,1.0,4.0,1.0,-1.0,0.0
25%,33.0,69.0,9.0,104.0,1.0,-1.0,0.0
50%,39.0,444.0,16.0,185.0,2.0,-1.0,0.0
75%,49.0,1480.0,21.0,329.0,3.0,-1.0,0.0
max,87.0,71188.0,31.0,3025.0,50.0,871.0,25.0


In [None]:
# Evaluating how balanced our training dataset is

cases=TrainingData["y"].value_counts()
non_subscribed=np.round(100*cases[0]/(cases[0]+cases[1]),2)
subscribed=np.round(100*cases[1]/(cases[0]+cases[1]),2)

print('Customers not subscribed to term deposit', non_subscribed, '%')
print('Customers subscribed to term deposit', subscribed, '%')

Customers not subscribed to term deposit 88.48 %
Customers subscribed to term deposit 11.52 %


In [5]:
# Select features for training dataset
XTrain = TrainingData.copy()
del XTrain['y']
print(XTrain.shape)

# Identify target column for training dataset
YTrain = TrainingData[['y']].copy()
print(YTrain.shape)

# Select same features as above for test dataset
XTest = TestData.copy()
del XTest['y']
print(XTest.shape)

#Identify target column for test dataset
YTest = TestData[['y']].copy()
print(YTest.shape)

(4521, 16)
(4521, 1)
(45211, 16)
(45211, 1)


In [6]:
# Identify categorical features to be One Hot Encoded in next step

CatFeatures = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

In [7]:
# Perform OHE on Training Data categorical features

OneHot = OneHotEncoder(handle_unknown='ignore',sparse=False)
XCategories = pd.DataFrame(OneHot.fit_transform(XTrain[CatFeatures]),columns=OneHot.get_feature_names(),index=XTrain.index)
XTrain = pd.concat([XTrain,XCategories],axis=1)
XTrain.drop(labels=CatFeatures,axis=1,inplace=True)
XTrain.head(5)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,x0_admin.,x0_blue-collar,x0_entrepreneur,x0_housemaid,x0_management,x0_retired,x0_self-employed,x0_services,x0_student,x0_technician,x0_unemployed,x0_unknown,x1_divorced,x1_married,x1_single,x2_primary,x2_secondary,x2_tertiary,x2_unknown,x3_no,x3_yes,x4_no,x4_yes,x5_no,x5_yes,x6_cellular,x6_telephone,x6_unknown,x7_apr,x7_aug,x7_dec,x7_feb,x7_jan,x7_jul,x7_jun,x7_mar,x7_may,x7_nov,x7_oct,x7_sep,x8_failure,x8_other,x8_success,x8_unknown
0,30,1787,19,79,1,-1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,33,4789,11,220,1,339,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,35,1350,16,185,1,330,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,30,1476,3,199,4,-1,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,59,0,5,226,1,-1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [8]:
# Perform OHE on Test Data categorical features

XCategories = pd.DataFrame(OneHot.transform(XTest[CatFeatures]),columns=OneHot.get_feature_names(),index=XTest.index)
XTest = pd.concat([XTest,XCategories],axis=1)
XTest.drop(labels=CatFeatures,axis=1,inplace=True)
XTest.head(5)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,x0_admin.,x0_blue-collar,x0_entrepreneur,x0_housemaid,x0_management,x0_retired,x0_self-employed,x0_services,x0_student,x0_technician,x0_unemployed,x0_unknown,x1_divorced,x1_married,x1_single,x2_primary,x2_secondary,x2_tertiary,x2_unknown,x3_no,x3_yes,x4_no,x4_yes,x5_no,x5_yes,x6_cellular,x6_telephone,x6_unknown,x7_apr,x7_aug,x7_dec,x7_feb,x7_jan,x7_jul,x7_jun,x7_mar,x7_may,x7_nov,x7_oct,x7_sep,x8_failure,x8_other,x8_success,x8_unknown
0,58,2143,5,261,1,-1,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,44,29,5,151,1,-1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,33,2,5,76,1,-1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,47,1506,5,92,1,-1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,33,1,5,198,1,-1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
# Instantiate Decision Tree Model and fit Training Data to it
dt = DecisionTreeClassifier()
dt.fit(XTrain, YTrain)

#Set up prediction with Test Data
Y_Prediction = dt.predict(XTest)
Y_Prediction = pd.DataFrame(Y_Prediction, columns=['y'])

In [10]:
# Instantiate Random Forest Model and fit Training Data to it

rf = RandomForestClassifier()
rf.fit(XTrain, YTrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [11]:
#Set up predictions for decision tree training and test data
X_Prediction_Test = dt.predict(XTest)
X_Prediction_Train = dt.predict(XTrain)

#Evaluate accuracy of decision tree training and test data
print("Accuracy of Training is - ", metrics.accuracy_score(YTrain,X_Prediction_Train))
print("Accuracy of Test is -", metrics.accuracy_score(YTest,X_Prediction_Test))

#Review Confusion Matrix
print("Confusion Matrix - Decision Tree")
print(confusion_matrix(YTest,X_Prediction_Test))
print("Max Depth",dt.get_depth())
print("Leaf",dt.get_n_leaves())
print('Additional important metrics')
print(metrics.classification_report(YTest, X_Prediction_Test))

Accuracy of Training is -  1.0
Accuracy of Test is - 0.8813341885824246
Confusion Matrix - Decision Tree
[[37118  2804]
 [ 2561  2728]]
Max Depth 26
Leaf 384
Additional important metrics
              precision    recall  f1-score   support

          no       0.94      0.93      0.93     39922
         yes       0.49      0.52      0.50      5289

    accuracy                           0.88     45211
   macro avg       0.71      0.72      0.72     45211
weighted avg       0.88      0.88      0.88     45211



In [12]:
#Set up predictions for random forest training and test data
X_Prediction_Test_RF = rf.predict(XTest)
X_Prediction_Train_RF = rf.predict(XTrain)

#Evaluate accuracy of random forest training and test data
print("Accuracy of Training is - ", metrics.accuracy_score(YTrain,X_Prediction_Train_RF))
print("Accuracy of Test is - ", metrics.accuracy_score(YTest,X_Prediction_Test_RF))

#Review confusion matrix
print("Confusion Matrix - Random Forest")
print(confusion_matrix(YTest,X_Prediction_Test_RF))
print('Additional important metrics')
print(metrics.classification_report(YTest, X_Prediction_Test_RF))

Accuracy of Training is -  1.0
Accuracy of Test is -  0.9122558669350379
Confusion Matrix - Random Forest
[[39232   690]
 [ 3277  2012]]
Additional important metrics
              precision    recall  f1-score   support

          no       0.92      0.98      0.95     39922
         yes       0.74      0.38      0.50      5289

    accuracy                           0.91     45211
   macro avg       0.83      0.68      0.73     45211
weighted avg       0.90      0.91      0.90     45211



Hyperparameter Tuning - Decision

In [34]:
#Perform Random Search for decision tree

import time
start_time = time.time()

print("Random Search - Decision Tree")
param={'max_depth': range(5,15,1),'criterion':['gini','entropy']}
dt_random = RandomizedSearchCV(dt,param,n_iter=10,cv=5)
dt_random.fit(XTrain, YTrain)
grid_parameters=dt_random.best_params_
print(grid_parameters)
print("Decision Tree accuracy is --> {0:6f}".
      format(dt_random.score(XTest,YTest)))

print("--- %s seconds to compute ---" % (time.time() - start_time))

Random Search - Decision Tree
{'max_depth': 9, 'criterion': 'gini'}
Decision Tree accuracy is --> 0.899759
--- 1.6445610523223877 seconds to compute ---


In [37]:
#Perform Grid Search for decision tree

import time
start_time = time.time()

print("Decision Tree Grid Search - ")
dt_grid = GridSearchCV(dt,param)
dt_grid.fit(XTrain, YTrain)
grid_parameters1=dt_grid.best_params_
print(grid_parameters1)
print("accuracy Score for Decision Tree --> {0:6f}".
      format(dt_grid.score(XTest,YTest)))

print("--- %s seconds to compute ---" % (time.time() - start_time))

Decision Tree Grid Search - 
{'criterion': 'entropy', 'max_depth': 5}
accuracy Score for Decision Tree --> 0.897790
--- 3.0522849559783936 seconds to compute ---


In [38]:
#Utilizing new parameters in the decision tree
dtRandomSearch = DecisionTreeClassifier(**grid_parameters)
dtGridSearch = DecisionTreeClassifier(**grid_parameters1)

#Fitting random search hypertuned parameters to model
dtRandomSearch.fit(XTrain,YTrain)
dtRandomSearch_Pred = dtRandomSearch.predict(XTest)

#Fitting grid search hypertuned parameters to model
dtGridSearch.fit(XTrain,YTrain)
dtGridSearch_Pred = dtGridSearch.predict(XTest)

In [39]:
# Evaluating accuracy of model using new hypertuned parameters (Random Search)
print("Accuracy of Random Search -", metrics.accuracy_score(YTest,dtRandomSearch_Pred))
print("Confusion Matrix of Random Search Decision Tree Model")
print(confusion_matrix(YTest,dtRandomSearch_Pred))
print('Additional important metrics')
print(metrics.classification_report(YTest, dtRandomSearch_Pred))

#Evaluating Cross Validation Score
CrossValidation_Score = cross_val_score(dtRandomSearch, XTrain, YTrain, cv=5, scoring="balanced_accuracy")
print(CrossValidation_Score)

Accuracy of Random Search - 0.899891619296189
Confusion Matrix of Random Search Decision Tree Model
[[38627  1295]
 [ 3231  2058]]
Additional important metrics
              precision    recall  f1-score   support

          no       0.92      0.97      0.94     39922
         yes       0.61      0.39      0.48      5289

    accuracy                           0.90     45211
   macro avg       0.77      0.68      0.71     45211
weighted avg       0.89      0.90      0.89     45211

[0.65642857 0.64326923 0.66331731 0.67524038 0.70240385]


In [40]:
# Evaluating accuracy of model using new hypertuned parameters (Grid Search)
print("Accuracy of Grid Search -", metrics.accuracy_score(YTest,dtGridSearch_Pred))
print("Confusion Matrix of Grid Search Decision Tree Model")
print(confusion_matrix(YTest,dtGridSearch_Pred))
print('Additional Important Metrics')
print(metrics.classification_report(YTest, dtGridSearch_Pred))

#Evaluating Cross Validation Score
CrossValidation_Score = cross_val_score(dtGridSearch, XTrain, YTrain, cv=5, scoring="balanced_accuracy")
print(CrossValidation_Score)

Accuracy of Grid Search - 0.8983433235274602
Confusion Matrix of Grid Search Decision Tree Model
[[38981   941]
 [ 3655  1634]]
Additional Important Metrics
              precision    recall  f1-score   support

          no       0.91      0.98      0.94     39922
         yes       0.63      0.31      0.42      5289

    accuracy                           0.90     45211
   macro avg       0.77      0.64      0.68     45211
weighted avg       0.88      0.90      0.88     45211

[0.66657738 0.664375   0.64490385 0.62043269 0.60182692]


Random Forest Hyperparameter tuning- Random and Grid Search

In [111]:
#Perform Random Search for Random Forest
import time
start_time = time.time()

print("Random Forest - Random Search")
Random_Param={'max_features' : range(10,30,2),'max_depth': 
            range(50,100,10), 'n_estimators':[80,150,10]}
RF_RandomSearch = RandomizedSearchCV(rf,Random_Param,n_iter=10,cv=5)

#Fitting new parameters to Random Forest Model
RF_RandomSearch.fit(XTrain, YTrain)
grid_parameters=RF_RandomSearch.best_params_
print(grid_parameters)
print((RF_RandomSearch.score(XTest,YTest)))

print("--- %s seconds ---" % (time.time() - start_time))

Random Forest - Random Search
{'n_estimators': 150, 'max_features': 16, 'max_depth': 50}
0.9132954369511844
--- 35.319870710372925 seconds ---


In [114]:
import time
start_time = time.time()

print("GridSearchCV-Random Forest")
rf_grid = GridSearchCV(rf,Random_Param)
rf_grid.fit(XTrain, YTrain)
grid_parm1=rf_grid.best_params_
print(grid_parm1)
print("accuracy Score for Decision Tree:{0:6f}".
      format(rf_grid.score(XTest,YTest)))

print("--- %s seconds ---" % (time.time() - start_time))

GridSearchCV-Random Forest
{'max_depth': 60, 'max_features': 18, 'n_estimators': 150}
accuracy Score for Decision Tree:0.913893
--- 523.1104764938354 seconds ---


In [115]:
#Using the parameters obtained from HyperParameterTuning in the RandomForestClassifier 
rfRandomSearch = RandomForestClassifier(**grid_parameters)
rfGridSearch = RandomForestClassifier(**grid_parm1)

rfRandomSearch.fit(XTrain,YTrain)
rfRandomSearch_Prediction = rfRandomSearch.predict(XTest)
rfGridSearch.fit(XTrain,YTrain)
rfGridSearch_Prediction = rfGridSearch.predict(XTest)

In [116]:
# Accuracy for Random Forest using Random Search CV for Hyperparameter Tuning

print("Test Accuracy:", metrics.accuracy_score(YTest,rfRandomSearch_Prediction))
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(YTest,rfRandomSearch_Prediction))
print('Other important metrics')
print(metrics.classification_report(YTest, rfRandomSearch_Prediction))
CrossValidation_Score = cross_val_score(rfRandomSearch, XTrain, YTrain, cv=5, scoring="balanced_accuracy")
print(CrossValidation_Score)

Test Accuracy: 0.91336179248413
Confusion Matrix for Decision Tree:
[[38883  1039]
 [ 2878  2411]]
Other important metrics
              precision    recall  f1-score   support

          no       0.93      0.97      0.95     39922
         yes       0.70      0.46      0.55      5289

    accuracy                           0.91     45211
   macro avg       0.81      0.71      0.75     45211
weighted avg       0.90      0.91      0.91     45211

[0.65705357 0.66519231 0.66581731 0.66288462 0.675     ]


In [117]:
# Accuracy for Random Forest using Grid Search for Hyperparameter Tuning

print("Test Accuracy:", metrics.accuracy_score(YTest,rfGridSearch_Prediction))
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(YTest,rfGridSearch_Prediction))
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(YTest, rfGridSearch_Prediction))

#Evaluating Cross Validation Score
CrossValidation_Score = cross_val_score(dtGridSearch, XTrain, YTrain, cv=5, scoring="balanced_accuracy")
print(CrossValidation_Score)

Test Accuracy: 0.913627214615912
Confusion Matrix for Decision Tree:
[[38810  1112]
 [ 2793  2496]]
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

          no       0.93      0.97      0.95     39922
         yes       0.69      0.47      0.56      5289

    accuracy                           0.91     45211
   macro avg       0.81      0.72      0.76     45211
weighted avg       0.90      0.91      0.91     45211

[0.67258929 0.66375    0.64490385 0.62043269 0.60182692]
