# C. Data Pre-processing

# Load Packages

In [62]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, classification_report, fbeta_score
from sklearn.neighbors import KNeighborsClassifier
import datetime
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [33]:
data = pd.read_csv("../data/interim/bank_modeling.csv")

In [34]:
bank = data.copy()

# Inspect Data

In [35]:
bank.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contactType,month,dayOfWeek,...,currentCampaignContacts,daysLastContacted,previousCampaignContacts,lastCampaignOutcome,employmentRate,cpi,cci,euribor3m,noEmployed,response
0,56,housemaid,married,basic_4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high_school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high_school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic_6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high_school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [36]:
bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       41188 non-null  int64  
 1   job                       41188 non-null  object 
 2   marital                   41188 non-null  object 
 3   education                 41188 non-null  object 
 4   default                   41188 non-null  object 
 5   housing                   41188 non-null  object 
 6   loan                      41188 non-null  object 
 7   contactType               41188 non-null  object 
 8   month                     41188 non-null  object 
 9   dayOfWeek                 41188 non-null  object 
 10  duration                  41188 non-null  int64  
 11  currentCampaignContacts   41188 non-null  int64  
 12  daysLastContacted         41188 non-null  int64  
 13  previousCampaignContacts  41188 non-null  int64  
 14  lastCa

# Feature Selection

As noted during EDA we will be dropping these features and for the following reasons:
- 'duration' - This feature was found to be highly correlated with response because a dropped call (0s duration) necessarily implies a negative response. The duration is not known before the call is made. Including this feature in our model may lead to data leakage. 
- 'daysLastContacted' - We observed that most of the values were '999', which was a placeholder to note that the information was missing. Over 96% of the values were '999'. We will consider removing this feature as we tune our model.
- We also noted that 'cci' and 'currentCampaignContacts' and age have a low correlation with response. We will include these features in our base model, but will consider removing these features as we tune our model.

# Splitting Features From Target

In [37]:
#Converting the subscribed variable from a boolean feature to numerical feature
bank.response = bank.response.replace({'yes': 1, 'no': 0})

In [38]:
X = bank.drop(columns = ['duration', 'response'])
y = bank.response

In [39]:
feature_names = X.columns

# Encoding Categorical Variables

We will use the get_dummies function to encode categorical variables

In [65]:
#Encoding categorical variables
X_dummies = pd.get_dummies(X)
features_coded = X_dummies.columns
X_dummies.head()

Unnamed: 0,age,currentCampaignContacts,daysLastContacted,previousCampaignContacts,employmentRate,cpi,cci,euribor3m,noEmployed,job_admin.,...,month_oct,month_sep,dayOfWeek_fri,dayOfWeek_mon,dayOfWeek_thu,dayOfWeek_tue,dayOfWeek_wed,lastCampaignOutcome_failure,lastCampaignOutcome_nonexistent,lastCampaignOutcome_success
0,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,0,1,0
1,57,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,0,1,0
2,37,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,0,1,0
3,40,1,999,0,1.1,93.994,-36.4,4.857,5191.0,1,...,0,0,0,1,0,0,0,0,1,0
4,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,0,1,0


# Scaling the Data

We will use a standard scaler to scale out features

In [41]:
scaler = StandardScaler()

In [42]:
X_scaled = scaler.fit_transform(X_dummies)

# Creating Train / Test Split

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, stratify = y, test_size = 0.3, random_state = 77)

In [44]:
X_train.shape, X_test.shape

((28831, 62), (12357, 62))

In [45]:
y_train.shape, y_test.shape

((28831,), (12357,))

In [46]:
y_train.value_counts()

0    25583
1     3248
Name: response, dtype: int64

# Baseline Model - Logistic Regression

The baseline algorithm that we shall use to create a model is logistic regression.

In [47]:
logreg = LogisticRegression(random_state = 77)

In [48]:
logreg.fit(X_train, y_train);

In [49]:
#Determine predictions for train and the test set
y_train_pred_lg = logreg.predict(X_train)
y_test_pred_lg= logreg.predict(X_test)

# Model Assessment

## Imbalanced Dataset

In [50]:
#Get the number of negative and positive responses
y.value_counts()

0    36548
1     4640
Name: response, dtype: int64

In [51]:
#Get the ratio of positive to negative responses
y.value_counts()[0] / y.value_counts()[1]

7.876724137931035

Our data is imbalanced. The ratio of positive responses to negative responses is roughly 1 to 8. It is therefore very likely that the typical metrics will not yield favorable outcomes. 

## Metrics

It is important that our model is able to prioritize the prediction of positive responses, in this case, a 'yes' indicating that a potential client will accept a marketing request to open a term-deposit account. Our model must find these clients more often than nought. 

Our metric of relevance will therefore be recall. Recall is the proportion of relevant cases that is selected by the model. We define our positive case as the case related to clients accepting a request to open a term-deposit account (i.e. the 'yes' or '1' response value). Relevant cases are actual positive cases (predicted or not). The selected cases are the positive cases that were appropriately predicted by the model. 

For this project, we want to err on the side of predicting positive cases, meaning that we will favor a probability threshold for selecting positive cases. Accordingly, it is very likely that our precision suffers and we get a higher false positive rate. 

In reality, there are costs associated with making wrong predictions. This means that we will be contacting clients that are unlikely to favorably respond to the marketing campaign. There should therefore be a cost / benefit analysis done to determine the trade-off between precision and recall when choosing a model.

We shall use a classification report to review model performance, with a focus on the positive case's recall. Furthermore, it should be noted that the classification report will change based on the probability threshold for predicting the positive case - the default probability being 0.5. The Area Under the Curve (AUC) gives us a metric for all such thresholds for the positive case from 0 to 1. 

In [52]:
#Accuracy scores
logreg.score(X_test, y_test)

0.9019179412478757

In [53]:
#Check confusion matrix
confusion_matrix(y_test, y_test_pred_lg)

array([[10792,   173],
       [ 1039,   353]], dtype=int64)

In [54]:
#Check classification report
print(classification_report(y_test, y_test_pred_lg))

              precision    recall  f1-score   support

           0       0.91      0.98      0.95     10965
           1       0.67      0.25      0.37      1392

    accuracy                           0.90     12357
   macro avg       0.79      0.62      0.66     12357
weighted avg       0.89      0.90      0.88     12357



In [55]:
#Check area under the curve
y_pred_prob_logreg = logreg.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_pred_prob_logreg)

0.7968221771467208

In [63]:
#Determine area under precision-recall curve
average_precision_score(y_test, y_pred_prob_logreg)

0.4640770191901702

In [56]:
#F2 score
fbeta_score(y_test, y_test_pred_lg, beta = 2)

0.2896291434197571

# Summary

Our base model gives a good accuracy of 90%, but this score is meaningless in light of the unbalanced nature of the labels. The more reflective performance metric are the recall and precision scores of 0.23 and 0.64 respectively. Our work we turn to optimizing the F1 score for the class of positive responses ('1').  

# Saving

In [57]:
import pickle

In [58]:
filepath = '../data/interim/'

In [66]:
pickle.dump(logreg, open(filepath + 'logreg_base.pkl', 'wb'))
pickle.dump(X_test, open(filepath + 'X_test_base.pkl', 'wb'))
pickle.dump(y_test, open(filepath + 'y_test_base.pkl', 'wb'))
pickle.dump(X_train, open(filepath + 'X_train_base.pkl', 'wb'))
pickle.dump(y_train, open(filepath + 'y_train_base.pkl', 'wb'))
pickle.dump(feature_names, open(filepath + 'feature_names_base.pkl', 'wb'))
pickle.dump(features_coded, open(filepath + 'features_coded_base.pkl', 'wb'))

In [60]:
pickle.dump(X, open(filepath + 'X.pkl', 'wb'))

In [61]:
pickle.dump(y, open(filepath + 'y.pkl', 'wb'))

# Extended Modelling Plan

We will extend our model by doing the following:
1. Feature selection: As noted 'cci' and 'currentCampaignContacts' were show to have a lower correlation coefficient than the other features. We will drop these features and see if performance improves.
2. Algorithms: We will trial tree-based algorithms (e.g. Random Forest, XGBoost) and nearest neigbhour models (e.g. kNN) to see if performance improves.
3. Hyper-parameter tuning: We will tune hyperparamaters appropriately for all models.
4. Resampling methods: We will use sampling methods to appropriately represent the under-represented class.
