## EDA

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from scipy.stats import loguniform, randint, uniform
from sklearn.impute import SimpleImputer

In [27]:
## Download Data -- NEEDS ATTRIBUTION FOR DOWNLOAD CODE from UCI ML github repo

## Uncomment and run to install necessary packages
# %pip install -U ucimlrepo

from pathlib import Path

from ucimlrepo import fetch_ucirepo

# fetch dataset
bank_marketing = fetch_ucirepo(id=222)

# data (as pandas dataframes)
X = bank_marketing.data.features
y = bank_marketing.data.targets

# metadata
print(bank_marketing.metadata)

# variable information
print(bank_marketing.variables)

# Combine features and target into a NEW frame. Plain assignment
# (`bank_marketing_data = X`) only aliases X, so adding the 'y' column would
# silently leak the target into the feature frame X as well.
bank_marketing_data = pd.concat([X, y], axis=1)

# Make sure the output directory exists so the notebook runs from a fresh checkout.
data_dir = Path('data')
data_dir.mkdir(parents=True, exist_ok=True)
bank_marketing_data.to_csv(data_dir / 'bank_marketing.csv')

# Fixed-seed 4000-row sample used by the rest of the notebook.
bank_marketing_sample = bank_marketing_data.sample(4000, random_state=522)
bank_marketing_sample.to_csv(data_dir / 'bank_marketing_small.csv')

{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'ID': 277, 'type': 'NATIVE', 'title': 'A data-driven approach to predict the s

### Model Building and Evaluation

In [28]:
# Percentage of missing values in each column of the sample
bank_marketing_sample.isna().sum().div(len(bank_marketing_sample)).mul(100)

age             0.000
job             0.775
marital         0.000
education       3.700
default         0.000
balance         0.000
housing         0.000
loan            0.000
contact        29.600
day_of_week     0.000
month           0.000
duration        0.000
campaign        0.000
pdays           0.000
previous        0.000
poutcome       83.525
y               0.000
dtype: float64

In [29]:
# preprocessing
# The derived columns are computed first and the frame is rebuilt in a single
# non-mutating step, so a failed or repeated run cannot leave
# `bank_marketing_sample` half-transformed (e.g. with an all-NaN target).

# map the target variable to numeric
target_numeric = bank_marketing_sample['y'].map({'yes': 1, 'no': 0})
if target_numeric.isna().any():
    # Any label other than 'yes'/'no' maps to NaN; this also happens when the
    # cell is re-run on an already-encoded target.
    raise ValueError(
        "Target column 'y' contains values other than 'yes'/'no'; "
        "was this cell already run on this frame?"
    )

# feature engineering on 'pdays' column into categorical determining if client was contacted before or not
# (pdays == -1 is treated as "never contacted"); vectorised instead of a row-wise apply
pdays_contacted = bank_marketing_sample['pdays'].eq(-1).map({True: 'never', False: 'contacted'})

# attach the derived columns, then drop the columns that are not used for modelling
bank_marketing_sample = (
    bank_marketing_sample
    .assign(y=target_numeric, pdays_contacted=pdays_contacted)
    .drop(columns=['day_of_week', 'pdays', 'poutcome'])
)

# split data (80% train / 20% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    bank_marketing_sample.drop(columns='y'),
    bank_marketing_sample['y'],
    test_size=0.2,
    random_state=522,
)

In [30]:
# Names of the numeric columns in the sample
bank_marketing_sample.select_dtypes(include='number').columns

Index(['age', 'balance', 'duration', 'campaign', 'previous', 'y'], dtype='object')

In [38]:
# Names of the object-dtype columns in the sample
bank_marketing_sample.select_dtypes(include='object').columns

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'pdays_contacted'],
      dtype='object')

In [41]:
# Summarise column dtypes and non-null counts of the sample frame
bank_marketing_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4000 entries, 30577 to 9839
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              4000 non-null   int64 
 1   job              3969 non-null   object
 2   marital          4000 non-null   object
 3   education        3852 non-null   object
 4   default          4000 non-null   object
 5   balance          4000 non-null   int64 
 6   housing          4000 non-null   object
 7   loan             4000 non-null   object
 8   contact          2816 non-null   object
 9   month            4000 non-null   object
 10  duration         4000 non-null   int64 
 11  campaign         4000 non-null   int64 
 12  previous         4000 non-null   int64 
 13  y                4000 non-null   int64 
 14  pdays_contacted  4000 non-null   object
dtypes: int64(6), object(9)
memory usage: 500.0+ KB


In [None]:
# Group the feature columns by the transformation each group receives.

# Nominal features -> one-hot encoding
categorical_cols = [
    'job',
    'marital',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'pdays_contacted',
]

# Ordered feature -> ordinal encoding
ordinal_cols = ['education']

# Numeric features -> standard scaling
numerical_cols = [
    'age',
    'balance',
    'duration',
    'campaign',
    'previous',
]



We decided to train both a Logistic Regression and a Support Vector Classifier (SVC) to determine which was more effective at predicting whether a customer would subscribe to the bank's term deposit offering. We performed hyperparameter tuning using RandomizedSearchCV to find the best parameters for each model. The models were evaluated based on their accuracy on the test set.

In [46]:
# defining the preprocessor

# Nominal categoricals: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform from failing on
# categories that were not seen during fit.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

# Education: fill missing values with the most frequent level, then encode with
# an explicit low-to-high ordering. The encoder's default float output is used
# (rather than dtype=object) so the combined feature matrix stays numeric and
# does not need to be coerced back from object dtype downstream.
ordinal_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OrdinalEncoder(categories=[['unknown', 'primary', 'secondary', 'tertiary']]),
)

# Numeric columns are standardised; each transformer is applied only to its own column group.
data_preprocessor = make_column_transformer(
    (categorical_transformer, categorical_cols),
    (ordinal_transformer, ordinal_cols),
    (StandardScaler(), numerical_cols),
)

In [47]:
# Randomized, cross-validated hyperparameter search for Logistic Regression
lr_pipe = make_pipeline(
    data_preprocessor,
    LogisticRegression(random_state=42, max_iter=1000),
)

# C is sampled log-uniformly between 1e-4 and 1e3
param_dist1 = {"logisticregression__C": loguniform(1e-4, 1e3)}

random_lr = RandomizedSearchCV(
    lr_pipe,
    param_distributions=param_dist1,
    n_iter=100,
    n_jobs=-1,
    return_train_score=True,
    random_state=522,
)

In [48]:
# Run the Logistic Regression search, then report the score of the refit best
# estimator on the training and test splits
random_lr.fit(X_train, y_train)

lr_train_score = random_lr.score(X_train, y_train)
lr_test_score = random_lr.score(X_test, y_test)

print(f'Train Score: {lr_train_score}')
print(f'Test Score: {lr_test_score}')

Train Score: 0.905625
Test Score: 0.90375


In [49]:
# Randomized, cross-validated hyperparameter search for the SVC
svc_pipe = make_pipeline(
    data_preprocessor,
    SVC(random_state=42),
)

# Both C and gamma are sampled log-uniformly between 1e-2 and 1e3
param_dist = {
    "svc__C": loguniform(1e-2, 1e3),
    "svc__gamma": loguniform(1e-2, 1e3),
}

random_svc = RandomizedSearchCV(
    svc_pipe,
    param_distributions=param_dist,
    n_iter=100,
    n_jobs=-1,
    return_train_score=True,
    random_state=522,
)


In [50]:
# Run the SVC search, then report the score of the refit best estimator on the
# training and test splits
random_svc.fit(X_train, y_train)

svc_train_score = random_svc.score(X_train, y_train)
svc_test_score = random_svc.score(X_test, y_test)

print(f'Train Score: {svc_train_score}')
print(f'Test Score: {svc_test_score}')

Train Score: 0.919375
Test Score: 0.90125
