# Data Pre-processing

# Load Packages

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
import datetime
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [2]:
data = pd.read_csv("../data/interim/bank_modeling.csv")

In [3]:
bank = data.copy()

# Inspect Data

In [4]:
bank.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contactType,month,dayOfWeek,...,currentCampaignContacts,daysLastContacted,previousCampaignContacts,lastCampaignOutcome,employmentRate,cpi,cci,euribor3m,noEmployed,response
0,56,housemaid,married,basic_4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,57,services,married,high_school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,37,services,married,high_school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,40,admin.,married,basic_6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,56,services,married,high_school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0


In [65]:
bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       41188 non-null  int64  
 1   job                       41188 non-null  object 
 2   marital                   41188 non-null  object 
 3   education                 41188 non-null  object 
 4   default                   41188 non-null  object 
 5   housing                   41188 non-null  object 
 6   loan                      41188 non-null  object 
 7   contactType               41188 non-null  object 
 8   month                     41188 non-null  object 
 9   dayOfWeek                 41188 non-null  object 
 10  duration                  41188 non-null  int64  
 11  currentCampaignContacts   41188 non-null  int64  
 12  daysLastContacted         41188 non-null  int64  
 13  previousCampaignContacts  41188 non-null  int64  
 14  lastCa

# Feature Selection

As noted during EDA we will be dropping these features and for the following reasons:
- 'duration' - This feature was found to be highly correlated with response because a dropped call (0s duration) necessarily implies a negative response.
- 'daysLastContacted' - We observed that most of the values were '999', which was a placeholder to note that the information was missing. Over 96% of the values were '999'.
- We also noted that 'cci' and 'currentCampaignContacts' and age have a low correlation with response. We will include these features in our base model, but will consider removing these features as we tune our model.

# Splitting Features From Target

In [5]:
X = bank.drop(columns = ['daysLastContacted', 'duration', 'response'])
y = bank.response

# Encoding Categorical Variables

We will use the get_dummies function to encode categorical variables

In [7]:
#Encoding categorical variables
X_dummies = pd.get_dummies(X)
X_dummies.head()

Unnamed: 0,age,currentCampaignContacts,previousCampaignContacts,employmentRate,cpi,cci,euribor3m,noEmployed,job_admin.,job_blue-collar,...,month_oct,month_sep,dayOfWeek_fri,dayOfWeek_mon,dayOfWeek_thu,dayOfWeek_tue,dayOfWeek_wed,lastCampaignOutcome_failure,lastCampaignOutcome_nonexistent,lastCampaignOutcome_success
0,56,1,0,1.1,93.994,-36.4,4.857,5191.0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,57,1,0,1.1,93.994,-36.4,4.857,5191.0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,37,1,0,1.1,93.994,-36.4,4.857,5191.0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,40,1,0,1.1,93.994,-36.4,4.857,5191.0,1,0,...,0,0,0,1,0,0,0,0,1,0
4,56,1,0,1.1,93.994,-36.4,4.857,5191.0,0,0,...,0,0,0,1,0,0,0,0,1,0


# Scaling the Data

We will use a standard scaler to scale out features

In [8]:
scaler = StandardScaler()

In [10]:
X_scaled = scaler.fit_transform(X_dummies)
X_scaled

array([[ 1.53303429, -0.56592197, -0.34949428, ..., -0.3392905 ,
         0.39770593, -0.1857    ],
       [ 1.62899323, -0.56592197, -0.34949428, ..., -0.3392905 ,
         0.39770593, -0.1857    ],
       [-0.29018564, -0.56592197, -0.34949428, ..., -0.3392905 ,
         0.39770593, -0.1857    ],
       ...,
       [ 1.53303429, -0.20490853, -0.34949428, ..., -0.3392905 ,
         0.39770593, -0.1857    ],
       [ 0.38152696, -0.56592197, -0.34949428, ..., -0.3392905 ,
         0.39770593, -0.1857    ],
       [ 3.26029527,  0.15610492,  1.67113606, ...,  2.94732687,
        -2.51442063, -0.1857    ]])

# Creating Train / Test Split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.3, random_state = 77)

In [19]:
X_train.shape, X_test.shape

((28831, 61), (12357, 61))

In [20]:
y_train.shape, y_test.shape

((28831,), (12357,))

In [15]:
y_train.value_counts()

0    25520
1     3311
Name: response, dtype: int64

# Baseline Model - Logistic Regression

In [21]:
logreg = LogisticRegression(random_state = 77)

In [22]:
logreg.fit(X_train, y_train);

In [23]:
#Determine predictions for train and the test set
y_train_pred_lg = logreg.predict(X_train)
y_test_pred_lg= logreg.predict(X_test)

In [24]:
#Accuracy scores
logreg.score(X_test, y_test)

0.9031318281136198

In [25]:
#Check confusion matrix
confusion_matrix(y_test, y_test_pred_lg)

array([[10859,   169],
       [ 1028,   301]], dtype=int64)

In [26]:
#Check classification report
print(classification_report(y_test, y_test_pred_lg))

              precision    recall  f1-score   support

           0       0.91      0.98      0.95     11028
           1       0.64      0.23      0.33      1329

    accuracy                           0.90     12357
   macro avg       0.78      0.61      0.64     12357
weighted avg       0.88      0.90      0.88     12357



In [27]:
#Check area under the curve
y_pred_prob_logreg = logistic.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_pred_prob_logreg)

KNN

In [29]:
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train, y_train.values.ravel());

In [30]:
#Determine predictions for train and the test set
y_train_pred_knn = knn.predict(X_train)
y_test_pred_knn = knn.predict(X_test)

In [31]:
knn.score(X_test,y_test)

0.8976288743222465

In [32]:
confusion_matrix(y_test, y_test_pred_knn)

array([[10827,   201],
       [ 1064,   265]], dtype=int64)

In [33]:
print(classification_report(y_test, y_test_pred_knn))

              precision    recall  f1-score   support

           0       0.91      0.98      0.94     11028
           1       0.57      0.20      0.30      1329

    accuracy                           0.90     12357
   macro avg       0.74      0.59      0.62     12357
weighted avg       0.87      0.90      0.87     12357



In [34]:
y_pred_prob_knn = knn.predict_proba(X_test)[:,1]

In [35]:
roc_auc_score(y_test, y_pred_prob_knn)

0.7257466322130166