# C. Data Pre-processing

# Load Packages

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, classification_report, fbeta_score
from sklearn.neighbors import KNeighborsClassifier
import datetime
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [2]:
data = pd.read_csv("../data/interim/bank_modeling.csv")

In [3]:
bank = data.copy()

# Inspect Data

In [4]:
bank.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contactType,month,dayOfWeek,...,employmentRate_grp,employmentRate_grp_rr,cpi_grp,cpi_grp_rr,cci_grp,cci_grp_rr,euribor3m_grp,euribor3m_grp_rr,noEmployed_grp,noEmployed_grp_rr
0,56,housemaid,married,basic_4y,no,no,no,telephone,may,mon,...,0.1_1.5,0.046089,93.001_94,0.074241,neg40.9_neg36,0.099414,4.201_5.1,0.048503,5140.1_5230,0.048357
1,57,services,married,high_school,unknown,no,no,telephone,may,mon,...,0.1_1.5,0.046089,93.001_94,0.074241,neg40.9_neg36,0.099414,4.201_5.1,0.048503,5140.1_5230,0.048357
2,37,services,married,high_school,no,yes,no,telephone,may,mon,...,0.1_1.5,0.046089,93.001_94,0.074241,neg40.9_neg36,0.099414,4.201_5.1,0.048503,5140.1_5230,0.048357
3,40,admin.,married,basic_6y,no,no,no,telephone,may,mon,...,0.1_1.5,0.046089,93.001_94,0.074241,neg40.9_neg36,0.099414,4.201_5.1,0.048503,5140.1_5230,0.048357
4,56,services,married,high_school,no,no,yes,telephone,may,mon,...,0.1_1.5,0.046089,93.001_94,0.074241,neg40.9_neg36,0.099414,4.201_5.1,0.048503,5140.1_5230,0.048357


In [5]:
bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 51 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   age                              41188 non-null  int64  
 1   job                              41188 non-null  object 
 2   marital                          41188 non-null  object 
 3   education                        41188 non-null  object 
 4   default                          41188 non-null  object 
 5   housing                          41188 non-null  object 
 6   loan                             41188 non-null  object 
 7   contactType                      41188 non-null  object 
 8   month                            41188 non-null  object 
 9   dayOfWeek                        41188 non-null  object 
 10  duration                         41188 non-null  int64  
 11  currentCampaignContacts          41188 non-null  int64  
 12  daysLastContacted 

# Feature Selection

As noted during EDA we will be dropping these features and for the following reasons:
- 'duration' - The duration is not known before the call is made. Including this feature in our model may lead to data leakage. 
- 'daysLastContacted' - We observed that most of the values were '999', which was a placeholder to note that the information was missing. Over 96% of the values were '999'. We will consider removing this feature as we tune our model.
- We also noted that 'cci' and 'currentCampaignContacts' and age have a low correlation with response. We will include these features in our base model, but will consider removing these features as we tune our model.

In [6]:
bank.drop(columns = ['duration', 'duration_grp', 'duration_grp_rr'], inplace = True)

# Encoding the response

In [7]:
#Converting the subscribed variable from a boolean feature to numerical feature
bank.response = bank.response.replace({'yes': 1, 'no': 0})

# Application set development

An application set will be used to observe the performance of the model. A random sample of 20 clients will be selected. We will first of all obtain the index.

In [8]:
#Create application set of observations
application_set = bank.sample(20, random_state = 77)
application_set_index = application_set.index
application_set[['age', 'job', 'marital', 'education', 'response']]

Unnamed: 0,age,job,marital,education,response
11994,39,blue-collar,married,basic_4y,0
39824,29,unemployed,single,university_degree,0
10278,50,blue-collar,single,basic_9y,0
29948,40,blue-collar,married,basic_9y,0
13012,34,technician,single,university_degree,0
23403,29,admin.,single,university_degree,0
40112,28,student,single,high_school,1
24566,30,admin.,single,university_degree,0
38540,54,retired,married,basic_4y,1
29175,43,blue-collar,married,basic_6y,0


# Encoding Categorical Variables

In [9]:
#Encode cateogrical features
bank_df = pd.get_dummies(bank)

In [10]:
#Extracting application set samples from encoded bank dataframe
application_df = bank_df.loc[application_set_index]
#Drop values in the application set from dataframe
bank_df = bank_df.drop(application_set_index)

In [11]:
bank_df.shape

(41168, 130)

In [12]:
application_df.shape

(20, 130)

# Splitting Features From Target

In [13]:
X = bank_df.drop(columns = ['response'])
y = bank_df.response

In [14]:
feature_names = X.columns

# Scaling the Data

We will use a standard scaler to scale out features

In [15]:
scaler = StandardScaler()

In [16]:
X_scaled = scaler.fit_transform(X)

# Creating Train / Test Split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, stratify = y, test_size = 0.3, random_state = 77)

In [18]:
X_train.shape, X_test.shape

((28817, 129), (12351, 129))

In [19]:
y_train.shape, y_test.shape

((28817,), (12351,))

In [20]:
y_train.value_counts()

0    25572
1     3245
Name: response, dtype: int64

# Baseline Model - Logistic Regression

The baseline algorithm that we shall use to create a model is logistic regression.

In [21]:
logreg = LogisticRegression(random_state = 77)

In [22]:
logreg.fit(X_train, y_train);

In [23]:
#Determine predictions for train and the test set
y_train_pred_lg = logreg.predict(X_train)
y_test_pred_lg= logreg.predict(X_test)

# Model Assessment

## Imbalanced Dataset

In [24]:
#Get the number of negative and positive responses
y.value_counts()

0    36532
1     4636
Name: response, dtype: int64

In [25]:
#Get the ratio of positive to negative responses
y.value_counts()[0] / y.value_counts()[1]

7.88006902502157

Our data is imbalanced. The ratio of positive responses to negative responses is roughly 1 to 8. It is therefore very likely that the typical metrics will not yield favorable outcomes. 

## Metrics

It is important that our model is able to prioritize the prediction of positive responses, in this case, a 'yes' indicating that a potential client will accept a marketing request to open a term-deposit account. Our model must find these clients more often than nought. 

Our metric of relevance will therefore be recall. Recall is the proportion of relevant cases that is selected by the model. We define our positive case as the case related to clients accepting a request to open a term-deposit account (i.e. the 'yes' or '1' response value). Relevant cases are actual positive cases (predicted or not). The selected cases are the positive cases that were appropriately predicted by the model. 

For this project, we want to err on the side of predicting positive cases, meaning that we will favor a probability threshold for selecting positive cases. Accordingly, it is very likely that our precision suffers and we get a higher false positive rate. 

In reality, there are costs associated with making wrong predictions. This means that we will be contacting clients that are unlikely to favorably respond to the marketing campaign. There should therefore be a cost / benefit analysis done to determine the trade-off between precision and recall when choosing a model.

We shall use a classification report to review model performance, with a focus on the positive case's recall. Furthermore, it should be noted that the classification report will change based on the probability threshold for predicting the positive case - the default probability being 0.5. The Area Under the Curve (AUC) gives us a metric for all such thresholds for the positive case from 0 to 1. 

In [26]:
#Accuracy scores
logreg.score(X_test, y_test)

0.9031657355679702

In [27]:
#Check confusion matrix
confusion_matrix(y_test, y_test_pred_lg)

array([[10806,   154],
       [ 1042,   349]], dtype=int64)

In [28]:
#Check classification report
print(classification_report(y_test, y_test_pred_lg))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95     10960
           1       0.69      0.25      0.37      1391

    accuracy                           0.90     12351
   macro avg       0.80      0.62      0.66     12351
weighted avg       0.89      0.90      0.88     12351



In [29]:
#Check area under the curve
y_pred_prob_logreg = logreg.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_pred_prob_logreg)

0.8065633084427

In [30]:
#Determine area under precision-recall curve
average_precision_score(y_test, y_pred_prob_logreg)

0.4760023086498295

In [31]:
#F2 score
fbeta_score(y_test, y_test_pred_lg, beta = 2)

0.28762155925498595

# Summary

Our base model gives a good accuracy of 90%, but this score is meaningless in light of the unbalanced nature of the labels. The more reflective performance metric are the recall and precision scores of 0.23 and 0.64 respectively. Our work we turn to optimizing the F1 score for the class of positive responses ('1').  

# Saving

In [32]:
import pickle

In [33]:
filepath = '../data/interim/'

In [34]:
pickle.dump(logreg, open(filepath + 'logreg_base.pkl', 'wb'))
pickle.dump(X_test, open(filepath + 'X_test_base.pkl', 'wb'))
pickle.dump(y_test, open(filepath + 'y_test_base.pkl', 'wb'))
pickle.dump(X_train, open(filepath + 'X_train_base.pkl', 'wb'))
pickle.dump(y_train, open(filepath + 'y_train_base.pkl', 'wb'))
pickle.dump(feature_names, open(filepath + 'feature_names_base.pkl', 'wb'))
pickle.dump(application_df, open(filepath + 'application_df.pkl', 'wb'))
pickle.dump(X, open(filepath + 'X.pkl', 'wb'))
pickle.dump(X_scaled, open(filepath + 'X_scaled.pkl', 'wb'))
pickle.dump(y, open(filepath + 'y.pkl', 'wb'))
pickle.dump(logreg, open('../models/logreg.pkl', 'wb'))

# Extended Modelling Plan

We will extend our model by doing the following:
1. Feature selection: As noted 'cci' and 'currentCampaignContacts' were show to have a lower correlation coefficient than the other features. We will drop these features and see if performance improves.
2. Algorithms: We will trial tree-based algorithms (e.g. Random Forest, XGBoost) and nearest neigbhour models (e.g. kNN) to see if performance improves.
3. Hyper-parameter tuning: We will tune hyperparamaters appropriately for all models.
4. Resampling methods: We will use sampling methods to appropriately represent the under-represented class.
