# Data Pre-processing

# Load Packages

In [2]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
import datetime
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [3]:
data = pd.read_csv("../data/interim/bank_modeling.csv")

In [4]:
bank = data.copy()

# Inspect Data

In [5]:
bank.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contactType,month,dayOfWeek,...,currentCampaignContacts,daysLastContacted,previousCampaignContacts,lastCampaignOutcome,employmentRate,cpi,cci,euribor3m,noEmployed,response
0,56,housemaid,married,basic_4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,57,services,married,high_school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,37,services,married,high_school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,40,admin.,married,basic_6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,56,services,married,high_school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0


In [6]:
bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       41188 non-null  int64  
 1   job                       41188 non-null  object 
 2   marital                   41188 non-null  object 
 3   education                 41188 non-null  object 
 4   default                   41188 non-null  object 
 5   housing                   41188 non-null  object 
 6   loan                      41188 non-null  object 
 7   contactType               41188 non-null  object 
 8   month                     41188 non-null  object 
 9   dayOfWeek                 41188 non-null  object 
 10  duration                  41188 non-null  int64  
 11  currentCampaignContacts   41188 non-null  int64  
 12  daysLastContacted         41188 non-null  int64  
 13  previousCampaignContacts  41188 non-null  int64  
 14  lastCa

# Feature Selection

As noted during EDA we will be dropping these features and for the following reasons:
- 'duration' - This feature was found to be highly correlated with response because a dropped call (0s duration) necessarily implies a negative response.
- 'daysLastContacted' - We observed that most of the values were '999', which was a placeholder to note that the information was missing. Over 96% of the values were '999'.
- We also noted that 'cci' and 'currentCampaignContacts' and age have a low correlation with response. We will include these features in our base model, but will consider removing these features as we tune our model.

# Splitting Features From Target

In [7]:
X = bank.drop(columns = ['daysLastContacted', 'duration', 'response'])
y = bank.response

# Encoding Categorical Variables

We will use the get_dummies function to encode categorical variables

In [8]:
#Encoding categorical variables
X_dummies = pd.get_dummies(X)
X_dummies.head()

Unnamed: 0,age,currentCampaignContacts,previousCampaignContacts,employmentRate,cpi,cci,euribor3m,noEmployed,job_admin.,job_blue-collar,...,month_oct,month_sep,dayOfWeek_fri,dayOfWeek_mon,dayOfWeek_thu,dayOfWeek_tue,dayOfWeek_wed,lastCampaignOutcome_failure,lastCampaignOutcome_nonexistent,lastCampaignOutcome_success
0,56,1,0,1.1,93.994,-36.4,4.857,5191.0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,57,1,0,1.1,93.994,-36.4,4.857,5191.0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,37,1,0,1.1,93.994,-36.4,4.857,5191.0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,40,1,0,1.1,93.994,-36.4,4.857,5191.0,1,0,...,0,0,0,1,0,0,0,0,1,0
4,56,1,0,1.1,93.994,-36.4,4.857,5191.0,0,0,...,0,0,0,1,0,0,0,0,1,0


# Scaling the Data

We will use a standard scaler to scale out features

In [9]:
scaler = StandardScaler()

In [10]:
X_scaled = scaler.fit_transform(X_dummies)
X_scaled[:10]

array([[ 1.53303429e+00, -5.65921974e-01, -3.49494284e-01,
         6.48092267e-01,  7.22722470e-01,  8.86446562e-01,
         7.12459879e-01,  3.31679907e-01, -5.82022825e-01,
        -5.38316990e-01, -1.91430209e-01,  6.15277204e+00,
        -2.76435300e-01, -2.08757296e-01, -1.89032128e-01,
        -3.26556400e-01, -1.47326702e-01, -4.42449272e-01,
        -1.58871662e-01, -8.98707607e-02, -3.55096625e-01,
         8.07637643e-01, -6.24937539e-01, -4.41145466e-02,
         2.97708361e+00, -2.42747539e-01, -4.14742686e-01,
        -5.48099992e-01, -2.09096045e-02, -3.81918490e-01,
        -6.47531495e-01, -2.09452960e-01,  5.13712782e-01,
        -5.13599533e-01, -8.53475566e-03,  1.10081447e+00,
        -1.56933397e-01, -1.04887691e+00,  4.61731390e-01,
        -1.56933397e-01, -4.22872127e-01, -1.31826996e+00,
         1.31826996e+00, -2.61274459e-01, -4.20076026e-01,
        -6.66211293e-02, -4.59252821e-01, -3.85042331e-01,
        -1.15906765e-01,  1.41115463e+00, -3.32532450e-0

# Creating Train / Test Split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.3, random_state = 77)

In [12]:
X_train.shape, X_test.shape

((28831, 61), (12357, 61))

In [13]:
y_train.shape, y_test.shape

((28831,), (12357,))

In [14]:
y_train.value_counts()

0    25520
1     3311
Name: response, dtype: int64

# Baseline Model - Logistic Regression

In [15]:
logreg = LogisticRegression(random_state = 77)

In [16]:
logreg.fit(X_train, y_train);

In [17]:
#Determine predictions for train and the test set
y_train_pred_lg = logreg.predict(X_train)
y_test_pred_lg= logreg.predict(X_test)

In [18]:
#Accuracy scores
logreg.score(X_test, y_test)

0.9031318281136198

In [19]:
#Check confusion matrix
confusion_matrix(y_test, y_test_pred_lg)

array([[10859,   169],
       [ 1028,   301]], dtype=int64)

In [20]:
#Check classification report
print(classification_report(y_test, y_test_pred_lg))

              precision    recall  f1-score   support

           0       0.91      0.98      0.95     11028
           1       0.64      0.23      0.33      1329

    accuracy                           0.90     12357
   macro avg       0.78      0.61      0.64     12357
weighted avg       0.88      0.90      0.88     12357



In [22]:
#Check area under the curve
y_pred_prob_logreg = logreg.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_pred_prob_logreg)

0.7794802640682327

# Summary

Our base model gives a good accuracy of 90%, but this score is meaningless in light of the unbalanced nature of the labels. The more reflective performance metric are the recall and precision scores of 0.23 and 0.64 respectively. Our work we turn to optimizing the F1 score for the class of positive responses ('1').  

# Extended Modelling Plan

We will extend our model by doing the following:
1. Feature selection: As noted 'cci' and 'currentCampaignContacts' were show to have a lower correlation coefficient than the other features. We will drop these features and see if performance improves.
2. Algorithms: We will trial tree-based algorithms (e.g. Random Forest, XGBoost) and nearest neigbhour models (e.g. kNN) to see if performance improves.
3. Hyper-parameter tuning: We will tune hyperparamaters appropriately for all models.
4. Resampling methods: We will use sampling methods to appropriately represent the under-represented class.
