In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

import time as time
import gc
import warnings
warnings.filterwarnings("ignore")

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve, roc_auc_score

In [2]:
path = os.getcwd()
data = path + '/dataset/bank/bank-additional/bank-additional-full.csv'
df_orig = pd.read_csv(data, sep=';')
print(f'size of the data --> {df_orig.shape}')
df_orig.head()

size of the data --> (41188, 21)


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [3]:
# create table listing column types etc
df = df_orig.copy()        # make a copy of the data before any transformations

df_info = pd.DataFrame([df.columns,df.dtypes, df.isna().sum(), df.nunique()])
df_info = df_info.T
df_info.columns = ['column','dtype','NA_count', 'unique_count']
print(f'Duplicates dropped --> {df.duplicated().sum()}')
# drop duplicates, 12 rows
df = df.drop_duplicates()
df_info

Duplicates dropped --> 12


Unnamed: 0,column,dtype,NA_count,unique_count
0,age,int64,0,78
1,job,object,0,12
2,marital,object,0,4
3,education,object,0,8
4,default,object,0,3
5,housing,object,0,3
6,loan,object,0,3
7,contact,object,0,2
8,month,object,0,10
9,day_of_week,object,0,5


In [4]:
'''pipeline for preprocessing'''
# Drop response variable from the dataframe
y = df['y']
df = df.drop('y', axis=1)
# drop duration as it is not known before a call is performed
df.drop('duration', axis=1)

# Create a pipeline for categorical features
cat_features = df.select_dtypes(include=['object']).columns
cat_pipeline = Pipeline([
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# df_cat= cat_pipeline.fit_transform(df[cat_features])

# Create a pipeline for numerical features
num_features = df.select_dtypes(include=['int64', 'float64']).columns
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler())
])

# Create a column transformer
preprocessor = ColumnTransformer([
        ('cat', cat_pipeline, cat_features),
        ('num', num_pipeline, num_features)
])

In [5]:
'''Linear Regression'''

# split into train and test
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

# Fit the logistic regression model
clf = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=42))
])

clf.fit(X_train, y_train)
# Get the accuracy score
accuracy = clf.score(X_test, y_test)
print('Accuracy: ', accuracy)

Accuracy:  0.9059009227780476


In [6]:
# parameter grid
parameters = {
    'classifier__penalty' : ['l2', 'none'], 
    'classifier__C'       : np.logspace(-3,3,5),
    'classifier__solver'  : ['newton-cg', 'lbfgs', 'liblinear'],
}

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
clf_grid = GridSearchCV(clf,                    # model
                   param_grid = parameters,   # hyperparameters
                   scoring='accuracy',        # metric for scoring
                   cv=cv)                     # number of folds

start_time = time.time()
clf_grid.fit(X_train,y_train)
print("Tuned Hyperparameters :", clf_grid.best_params_)
print("Accuracy :",clf_grid.best_score_)
print('Training Time :', time.time() - start_time)
print("Test Accuracy :",clf_grid.score(X_test,y_test))


Tuned Hyperparameters : {'classifier__C': 1.0, 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'}
Accuracy : 0.912446873102611


In [8]:
df_param = pd.DataFrame(clf_grid.cv_results_)
df_param[['param_classifier__C','param_classifier__penalty','param_classifier__solver','mean_test_score']]

Unnamed: 0,param_classifier__C,param_classifier__penalty,param_classifier__solver,mean_test_score
0,0.001,l2,newton-cg,0.908622
1,0.001,l2,lbfgs,0.908622
2,0.001,l2,liblinear,0.90762
3,0.001,none,newton-cg,0.912234
4,0.001,none,lbfgs,0.912295
5,0.001,none,liblinear,
6,0.031623,l2,newton-cg,0.912356
7,0.031623,l2,lbfgs,0.912356
8,0.031623,l2,liblinear,0.912417
9,0.031623,none,newton-cg,0.912234


In [9]:
clf.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'classifier', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__verbose_feature_names_out', 'preprocessor__cat', 'preprocessor__num', 'preprocessor__cat__memory', 'preprocessor__cat__steps', 'preprocessor__cat__verbose', 'preprocessor__cat__onehot', 'preprocessor__cat__onehot__categories', 'preprocessor__cat__onehot__drop', 'preprocessor__cat__onehot__dtype', 'preprocessor__cat__onehot__handle_unknown', 'preprocessor__cat__onehot__sparse', 'preprocessor__num__memory', 'preprocessor__num__steps', 'preprocessor__num__verbose', 'preprocessor__num__std_scaler', 'preprocessor__num__std_scaler__copy', 'preprocessor__num__std_scaler__with_mean', 'preprocessor__num__std_scaler__with_std', 'classifier__C', 'classifier__class_weight', 'classifier__dual', 'classifier__fit_intercept', 'classifier__inte