In [35]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, precision_score

In this bank marketing model we need to predict whether a prospect will make a deposit or not in the next marketing campaign.

In [2]:
# Load the bank marketing campaign data; the bare last expression renders it.
csv_path = 'data_bank_marketing_campaign.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,age,job,balance,housing,loan,contact,month,campaign,pdays,poutcome,deposit
0,55,admin.,1662,no,no,cellular,jun,2,-1,unknown,yes
1,39,self-employed,-3058,yes,yes,cellular,apr,3,-1,unknown,yes
2,51,admin.,3025,no,no,cellular,may,1,352,other,yes
3,38,services,-87,yes,no,cellular,may,1,-1,unknown,no
4,36,housemaid,205,yes,no,telephone,nov,4,-1,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...
7808,62,housemaid,2,no,no,cellular,nov,1,-1,unknown,yes
7809,48,housemaid,5473,no,no,cellular,feb,2,184,success,yes
7810,36,unknown,179,no,no,cellular,aug,8,-1,unknown,no
7811,31,admin.,54,yes,no,cellular,nov,1,-1,unknown,no


From a business perspective, wrongly predicting a positive ('yes') deposit for a prospect who does not actually deposit (a false positive) is the more important error to avoid.

Since a false positive (FP) matters more than a false negative (FN), we use precision as the scoring metric.

In [3]:
# Check how balanced the target ('deposit') is: class counts, then the
# share of 'no' rows and the share of 'yes' rows.

deposit_counts = df['deposit'].value_counts()
n_rows = len(df)

display(
    deposit_counts,
    deposit_counts['no'] / n_rows,
    deposit_counts['yes'] / n_rows,
)

# The closer both shares are to 0.5, the more balanced the two classes are.

no     4081
yes    3732
Name: deposit, dtype: int64

0.5223345705874824

0.4776654294125176

In [5]:
# For every object (string) column, print its name, the number of distinct
# values and the distinct values themselves.
for col in df.describe(include='O').columns:
    distinct = df[col].unique()
    print(col, len(distinct), distinct, '\n')

job 12 ['admin.' 'self-employed' 'services' 'housemaid' 'technician' 'management'
 'student' 'blue-collar' 'entrepreneur' 'retired' 'unemployed' 'unknown'] 

housing 2 ['no' 'yes'] 

loan 2 ['no' 'yes'] 

contact 3 ['cellular' 'telephone' 'unknown'] 

month 12 ['jun' 'apr' 'may' 'nov' 'jan' 'sep' 'feb' 'mar' 'aug' 'jul' 'oct' 'dec'] 

poutcome 4 ['unknown' 'other' 'failure' 'success'] 

deposit 2 ['yes' 'no'] 



In [6]:
# Count how many 'unknown' placeholders each object column holds
# (share of all rows, then absolute count) to decide whether the column
# needs converting.

for col in df.describe(include='O').columns:
    n_unknown = len(df[df[col] == 'unknown'])
    print(col, n_unknown / len(df), n_unknown)

job 0.0069115576603097405 54
housing 0.0 0
loan 0.0 0
contact 0.20977857417125303 1639
month 0.0 0
poutcome 0.7447843338026366 5819
deposit 0.0 0


In [7]:
# Show the summary statistics of the numeric columns together with the rows
# whose 'balance' is negative.

numeric_summary = df.describe()
negative_balance = df[df['balance'] < 0]

display(numeric_summary, negative_balance)

Unnamed: 0,age,balance,campaign,pdays
count,7813.0,7813.0,7813.0,7813.0
mean,41.257264,1512.448611,2.519775,51.40855
std,11.91971,3089.291831,2.727001,108.072739
min,18.0,-6847.0,1.0,-1.0
25%,32.0,127.0,1.0,-1.0
50%,39.0,549.0,2.0,-1.0
75%,49.0,1684.0,3.0,40.0
max,95.0,66653.0,63.0,854.0


Unnamed: 0,age,job,balance,housing,loan,contact,month,campaign,pdays,poutcome,deposit
1,39,self-employed,-3058,yes,yes,cellular,apr,3,-1,unknown,yes
3,38,services,-87,yes,no,cellular,may,1,-1,unknown,no
5,41,admin.,-76,yes,no,cellular,apr,1,-1,unknown,no
59,50,admin.,-194,yes,no,cellular,apr,2,-1,unknown,no
65,31,services,-327,yes,no,unknown,may,1,-1,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...
7712,40,management,-63,no,yes,cellular,jul,1,-1,unknown,no
7757,33,technician,-479,no,no,cellular,aug,1,-1,unknown,no
7758,57,retired,-157,no,no,cellular,aug,9,-1,unknown,no
7763,28,management,-994,yes,yes,cellular,jul,2,-1,unknown,no


In [8]:
# Convert the value -1 in 'pdays' to NaN.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['pdays']: the in-place form works on an
# intermediate Series, which recent pandas versions warn about and which
# leaves df unchanged when copy-on-write is enabled.

df['pdays'] = df['pdays'].replace(-1, np.nan)
df.isna().sum()

age            0
job            0
balance        0
housing        0
loan           0
contact        0
month          0
campaign       0
pdays       5817
poutcome       0
deposit        0
dtype: int64

In [9]:
# Replace missing 'pdays' values with the median 'pdays' of the same month.
# NOTE(review): the medians are computed on the full frame, i.e. before the
# train/test split performed later in the notebook - confirm this is intended.

pdays_month_median = df.groupby('month')['pdays'].transform('median')
df['pdays'] = df['pdays'].fillna(pdays_month_median)
df

Unnamed: 0,age,job,balance,housing,loan,contact,month,campaign,pdays,poutcome,deposit
0,55,admin.,1662,no,no,cellular,jun,2,104.0,unknown,yes
1,39,self-employed,-3058,yes,yes,cellular,apr,3,237.0,unknown,yes
2,51,admin.,3025,no,no,cellular,may,1,352.0,other,yes
3,38,services,-87,yes,no,cellular,may,1,305.5,unknown,no
4,36,housemaid,205,yes,no,telephone,nov,4,146.0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...
7808,62,housemaid,2,no,no,cellular,nov,1,146.0,unknown,yes
7809,48,housemaid,5473,no,no,cellular,feb,2,184.0,success,yes
7810,36,unknown,179,no,no,cellular,aug,8,130.0,unknown,no
7811,31,admin.,54,yes,no,cellular,nov,1,146.0,unknown,no


In [10]:
# Convert 'deposit' to numeric.
# The business objective is 'yes', so it becomes 1; everything else becomes 0.
# Matching an already-encoded 1 as well as 'yes' keeps the cell idempotent:
# with a plain == 'yes' test, running the cell a second time would silently
# turn every label into 0.

df['deposit'] = np.where(df['deposit'].isin(['yes', 1]), 1, 0)
df

Unnamed: 0,age,job,balance,housing,loan,contact,month,campaign,pdays,poutcome,deposit
0,55,admin.,1662,no,no,cellular,jun,2,104.0,unknown,1
1,39,self-employed,-3058,yes,yes,cellular,apr,3,237.0,unknown,1
2,51,admin.,3025,no,no,cellular,may,1,352.0,other,1
3,38,services,-87,yes,no,cellular,may,1,305.5,unknown,0
4,36,housemaid,205,yes,no,telephone,nov,4,146.0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...
7808,62,housemaid,2,no,no,cellular,nov,1,146.0,unknown,1
7809,48,housemaid,5473,no,no,cellular,feb,2,184.0,success,1
7810,36,unknown,179,no,no,cellular,aug,8,130.0,unknown,0
7811,31,admin.,54,yes,no,cellular,nov,1,146.0,unknown,0


In [11]:
# Separate the feature columns (X) from the target column (y).

target_col = 'deposit'
y = df[target_col]
X = df.drop(columns=target_col)

display(X, y)

Unnamed: 0,age,job,balance,housing,loan,contact,month,campaign,pdays,poutcome
0,55,admin.,1662,no,no,cellular,jun,2,104.0,unknown
1,39,self-employed,-3058,yes,yes,cellular,apr,3,237.0,unknown
2,51,admin.,3025,no,no,cellular,may,1,352.0,other
3,38,services,-87,yes,no,cellular,may,1,305.5,unknown
4,36,housemaid,205,yes,no,telephone,nov,4,146.0,unknown
...,...,...,...,...,...,...,...,...,...,...
7808,62,housemaid,2,no,no,cellular,nov,1,146.0,unknown
7809,48,housemaid,5473,no,no,cellular,feb,2,184.0,success
7810,36,unknown,179,no,no,cellular,aug,8,130.0,unknown
7811,31,admin.,54,yes,no,cellular,nov,1,146.0,unknown


0       1
1       1
2       1
3       0
4       0
       ..
7808    1
7809    1
7810    0
7811    0
7812    1
Name: deposit, Length: 7813, dtype: int32

In [12]:
# Keep 80% of the rows for training and hold out the rest as a test set.
# The split is stratified on the target and seeded so it is reproducible.

from sklearn.model_selection import train_test_split

X_tr, X_ts, y_tr, y_ts = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=10,
)
X_tr

Unnamed: 0,age,job,balance,housing,loan,contact,month,campaign,pdays,poutcome
4843,46,blue-collar,229,yes,no,cellular,apr,1,237.0,unknown
7500,25,services,0,yes,no,cellular,jul,9,184.0,unknown
5961,37,admin.,796,yes,no,telephone,jan,3,194.0,unknown
2126,39,technician,393,no,no,cellular,jun,1,94.0,success
2301,33,admin.,640,no,no,cellular,jul,1,184.0,unknown
...,...,...,...,...,...,...,...,...,...,...
963,28,admin.,352,yes,yes,cellular,may,1,305.5,unknown
6053,47,services,1717,no,no,cellular,aug,9,130.0,unknown
7212,28,blue-collar,3817,yes,no,cellular,may,3,305.5,unknown
5746,30,technician,-522,yes,yes,cellular,may,2,286.0,failure


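<br> • job will be encoded with binary encoding
<br> • month will be encoded with ordinal encoding
<br> • housing, loan, contact and poutcome will be encoded with one-hot encoding
<br> • pdays will be imputed with IterativeImputer by month (planned only: the IterativeImputer step is commented out in the transformer below, and missing pdays values were filled with the per-month median earlier)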

In [13]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer

import category_encoders as ce

from sklearn.compose import ColumnTransformer

In [14]:
# List the distinct month labels, in order of first appearance.
pd.unique(df['month'])

array(['jun', 'apr', 'may', 'nov', 'jan', 'sep', 'feb', 'mar', 'aug',
       'jul', 'oct', 'dec'], dtype=object)

In [15]:
# Mapping for ordinal encoding

ordinal_map = [{
    'col':'month',
    'mapping':{'jan':1,'feb':2,'mar':3,'apr':4,'may':5,'jun':6,'jul':7,'aug':8,'sep':9,'oct':10,'nove':11,'dec':12,}
    }]

In [16]:
# Column-wise preprocessing: binary-encode 'job', map 'month' through
# ordinal_map, one-hot encode the remaining categorical columns (first level
# dropped) and pass every other column through unchanged.
onehot_cols = ['housing', 'loan', 'contact', 'poutcome']

encoding_steps = [
    ('binary', ce.BinaryEncoder(), ['job']),
    ('ordinal', ce.OrdinalEncoder(mapping=ordinal_map), ['month']),
    ('onehot', OneHotEncoder(drop='first'), onehot_cols),
]

transformer = ColumnTransformer(transformers=encoding_steps, remainder='passthrough')

In [17]:
# Fit the transformer on the training features and preview the encoded result.

X_tr_encoded = transformer.fit_transform(X_tr)
pd.DataFrame(X_tr_encoded)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,0.0,0.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,46.0,229.0,1.0,237.0
1,0.0,0.0,1.0,0.0,7.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,25.0,0.0,9.0,184.0
2,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,37.0,796.0,3.0,194.0
3,0.0,1.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,39.0,393.0,1.0,94.0
4,0.0,0.0,1.0,1.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,33.0,640.0,1.0,184.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6245,0.0,0.0,1.0,1.0,5.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,28.0,352.0,1.0,305.5
6246,0.0,0.0,1.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,47.0,1717.0,9.0,130.0
6247,0.0,0.0,0.0,1.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,28.0,3817.0,3.0,305.5
6248,0.0,1.0,0.0,0.0,5.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,30.0,-522.0,2.0,286.0


In [19]:
# Define the scaler that the modelling pipelines in this notebook plug in as
# their 'scaler' step. MinMaxScaler and StandardScaler are imported as well,
# but only RobustScaler is instantiated in this cell.

from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler

scaler = RobustScaler()

In [132]:
# Candidate classifiers to compare. random_state is pinned on every
# estimator that is given one so the comparison is reproducible.

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

logreg = LogisticRegression(random_state=10)
ridge = RidgeClassifier(random_state=10)
knn = KNeighborsClassifier()
# Depth-1 tree (a single split) using the entropy criterion.
tree = DecisionTreeClassifier(
    criterion='entropy',
    splitter='best',
    max_depth=1,
    random_state=10,
)

list_model = [logreg, ridge, knn, tree]

In [22]:
# import Pipeline, cross_val_score, GridSearchCV, and RandomizedSearchCV

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV

In [133]:
# Cross-validate every candidate inside the same preprocessing pipeline
# (5 folds, precision) and print the fold scores with their mean and std,
# so that the best candidate can be picked.

for candidate in list_model:

    pipe = Pipeline(steps=[
        ('transformer', transformer),
        ('scaler', scaler),
        ('model', candidate),
    ])

    cvs = cross_val_score(pipe, X_tr, y_tr, scoring='precision', cv=5, n_jobs=-1)

    print(candidate, cvs, cvs.mean(), cvs.std())

LogisticRegression(random_state=10) [0.67790262 0.69444444 0.69512195 0.64325323 0.68207024] 0.6785584984866028 0.018899393629175372
RidgeClassifier(random_state=10) [0.67790262 0.68773234 0.69270833 0.64338235 0.68508287] 0.6773617045865936 0.01764978472704459
KNeighborsClassifier() [0.65120594 0.669627   0.66959578 0.65804067 0.68007313] 0.6657085017589629 0.010059106295059099
DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=10) [0.8852459  0.92307692 0.89655172 0.90677966 0.92682927] 0.9076966956327661 0.01569519815652774


In [56]:
# Tune KNeighborsClassifier: odd neighbour counts from 5 to 99 crossed with
# three distance metrics, scored by precision over 5 folds.

best_model = KNeighborsClassifier()

hyper_param = dict(
    model__n_neighbors=range(5, 100, 2),
    model__metric=['euclidean', 'manhattan', 'minkowski'],
)

pipe = Pipeline(steps=[
    ('transformer', transformer),
    ('scaler', scaler),
    ('model', best_model),
])

grid_search = GridSearchCV(pipe, hyper_param, scoring='precision', cv=5, n_jobs=-1)
grid_search.fit(X_tr, y_tr)

# Best mean CV precision and the parameter combination that produced it.
display(grid_search.best_score_, grid_search.best_params_)

0.7553025302755763

{'model__metric': 'manhattan', 'model__n_neighbors': 71}

In [57]:
# Five best parameter combinations of the latest grid search, by mean CV score.
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results.sort_values('mean_test_score', ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__metric,param_model__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
81,0.066959,0.003836,0.326902,0.005772,manhattan,71,"{'model__metric': 'manhattan', 'model__n_neigh...",0.760369,0.762222,0.759062,0.737557,0.757303,0.755303,0.009018,1
77,0.064474,0.003665,0.333779,0.015878,manhattan,63,"{'model__metric': 'manhattan', 'model__n_neigh...",0.756944,0.763636,0.763043,0.733032,0.755605,0.754452,0.011176,2
86,0.077983,0.020244,0.392711,0.047348,manhattan,81,"{'model__metric': 'manhattan', 'model__n_neigh...",0.75576,0.765101,0.758475,0.740406,0.749431,0.753834,0.008389,3
79,0.067349,0.007587,0.321131,0.021417,manhattan,67,"{'model__metric': 'manhattan', 'model__n_neigh...",0.76092,0.757303,0.762931,0.730512,0.757303,0.753794,0.01184,4
88,0.065299,0.003485,0.346126,0.018419,manhattan,85,"{'model__metric': 'manhattan', 'model__n_neigh...",0.7669,0.759551,0.758547,0.735763,0.747685,0.753689,0.010862,5


In [58]:
# Refit KNN with the tuned hyper-parameters and report precision on the
# held-out test set.

final_model = KNeighborsClassifier(n_neighbors=71, metric='manhattan')

pipe = Pipeline(steps=[
    ('transformer', transformer),
    ('scaler', scaler),
    ('model', final_model),
])

# Pipeline.fit returns the fitted pipeline itself.
pipe_fit = pipe.fit(X_tr, y_tr)
y_pred = pipe_fit.predict(X_ts)

precision_score(y_true=y_ts, y_pred=y_pred)

0.7482517482517482

In [52]:
# Baseline for comparison: KNN with its default hyper-parameters, scored by
# precision on the held-out test set.

original_model = KNeighborsClassifier()

pipe = Pipeline(steps=[
    ('transformer', transformer),
    ('scaler', scaler),
    ('model', original_model),
])

# Pipeline.fit returns the fitted pipeline itself.
pipe_fit = pipe.fit(X_tr, y_tr)
y_pred = pipe_fit.predict(X_ts)

precision_score(y_true=y_ts, y_pred=y_pred)

0.6817518248175183

In [60]:
# find alternate best model and find best params (LogisticRegression)

best_model = LogisticRegression(random_state=10)

# Regularisation strengths tried for every solver/penalty pair.
c_values = [100, 10, 1.0, 0.1, 0.01]

# Only solver/penalty pairs that LogisticRegression supports are searched.
# A full cross product also contains unsupported pairs (e.g. 'lbfgs' with
# 'l1', 'liblinear' without a penalty) and 'elasticnet' without an l1_ratio;
# every such fit fails and is scored NaN, so dropping them changes no usable
# candidate but avoids wasted fits and failure warnings.
# NOTE(review): 'none' is the spelling this notebook was written against;
# newer scikit-learn releases expect None instead - confirm against the
# installed version.
hyper_param = [
    {
        'model__solver': ['newton-cg', 'lbfgs', 'sag'],
        'model__penalty': ['none', 'l2'],
        'model__C': c_values,
    },
    {
        'model__solver': ['liblinear'],
        'model__penalty': ['l1', 'l2'],
        'model__C': c_values,
    },
    {
        'model__solver': ['saga'],
        'model__penalty': ['none', 'l1', 'l2'],
        'model__C': c_values,
    },
]

pipe = Pipeline([
        ('transformer',transformer),
        ('scaler',scaler),
        ('model',best_model),
    ])

grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=hyper_param,
    cv=5,
    n_jobs=-1,
    scoring='precision'
)

grid_search.fit(X_tr,y_tr)

# Best mean CV precision and the parameter combination that produced it.
display(
    grid_search.best_score_,
    grid_search.best_params_
    )

# Silences every warning for the rest of the kernel session (global filter).
warnings.filterwarnings('ignore')

0.6798953748879828

{'model__C': 0.01, 'model__penalty': 'l1', 'model__solver': 'liblinear'}

In [59]:
# Five best parameter combinations of the latest grid search, by mean CV score.
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results.sort_values('mean_test_score', ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__metric,param_model__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
81,0.066959,0.003836,0.326902,0.005772,manhattan,71,"{'model__metric': 'manhattan', 'model__n_neigh...",0.760369,0.762222,0.759062,0.737557,0.757303,0.755303,0.009018,1
77,0.064474,0.003665,0.333779,0.015878,manhattan,63,"{'model__metric': 'manhattan', 'model__n_neigh...",0.756944,0.763636,0.763043,0.733032,0.755605,0.754452,0.011176,2
86,0.077983,0.020244,0.392711,0.047348,manhattan,81,"{'model__metric': 'manhattan', 'model__n_neigh...",0.75576,0.765101,0.758475,0.740406,0.749431,0.753834,0.008389,3
79,0.067349,0.007587,0.321131,0.021417,manhattan,67,"{'model__metric': 'manhattan', 'model__n_neigh...",0.76092,0.757303,0.762931,0.730512,0.757303,0.753794,0.01184,4
88,0.065299,0.003485,0.346126,0.018419,manhattan,85,"{'model__metric': 'manhattan', 'model__n_neigh...",0.7669,0.759551,0.758547,0.735763,0.747685,0.753689,0.010862,5


In [61]:
# Refit LogisticRegression with the tuned hyper-parameters and report
# precision on the held-out test set.

final_model = LogisticRegression(
    C=0.01,
    penalty='l1',
    solver='liblinear',
    random_state=10,
)

pipe = Pipeline(steps=[
    ('transformer', transformer),
    ('scaler', scaler),
    ('model', final_model),
])

# Pipeline.fit returns the fitted pipeline itself.
pipe_fit = pipe.fit(X_tr, y_tr)
y_pred = pipe_fit.predict(X_ts)

precision_score(y_true=y_ts, y_pred=y_pred)

0.692063492063492

In [62]:
# Baseline for comparison: LogisticRegression with default hyper-parameters
# (only the seed is fixed), scored by precision on the held-out test set.

original_model = LogisticRegression(random_state=10)

pipe = Pipeline(steps=[
    ('transformer', transformer),
    ('scaler', scaler),
    ('model', original_model),
])

# Pipeline.fit returns the fitted pipeline itself.
pipe_fit = pipe.fit(X_tr, y_tr)
y_pred = pipe_fit.predict(X_ts)

precision_score(y_true=y_ts, y_pred=y_pred)

0.6805755395683454

In [87]:
# Tune DecisionTreeClassifier: both split criteria and both splitter
# strategies crossed with tree depths 1 to 100, scored by precision over
# 5 folds.

best_model = DecisionTreeClassifier(random_state=10)

hyper_param = dict(
    model__criterion=['gini', 'entropy'],
    model__splitter=['best', 'random'],
    model__max_depth=range(1, 101),
)

pipe = Pipeline(steps=[
    ('transformer', transformer),
    ('scaler', scaler),
    ('model', best_model),
])

grid_search = GridSearchCV(pipe, hyper_param, scoring='precision', cv=5, n_jobs=-1)
grid_search.fit(X_tr, y_tr)

# Best mean CV precision and the parameter combination that produced it.
display(grid_search.best_score_, grid_search.best_params_)

# Silences every warning for the rest of the kernel session (global filter).
warnings.filterwarnings('ignore')

0.9076966956327661

{'model__criterion': 'entropy',
 'model__max_depth': 1,
 'model__splitter': 'best'}

In [88]:
# Five best parameter combinations of the latest grid search, by mean CV score.
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results.sort_values('mean_test_score', ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__criterion,param_model__max_depth,param_model__splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
200,0.077172,0.006875,0.019187,0.006396,entropy,1,best,"{'model__criterion': 'entropy', 'model__max_de...",0.885246,0.923077,0.896552,0.90678,0.926829,0.907697,0.015695,1
201,0.070806,0.004128,0.019189,0.003914,entropy,1,random,"{'model__criterion': 'entropy', 'model__max_de...",0.885246,0.923077,0.896552,0.90678,0.926829,0.907697,0.015695,1
2,0.083226,0.002595,0.022385,0.003197,gini,2,best,"{'model__criterion': 'gini', 'model__max_depth...",0.885246,0.923077,0.896552,0.532587,0.926829,0.832858,0.150953,3
3,0.083957,0.007999,0.017085,0.001227,gini,2,random,"{'model__criterion': 'gini', 'model__max_depth...",0.885246,0.923077,0.896552,0.531633,0.926829,0.832667,0.151333,4
202,0.074462,0.008471,0.019755,0.003555,entropy,2,best,"{'model__criterion': 'entropy', 'model__max_de...",0.885246,0.923077,0.896552,0.530675,0.926829,0.832476,0.151714,5


In [126]:
# Refit the decision tree with the tuned hyper-parameters and report
# precision on the held-out test set.

final_model = DecisionTreeClassifier(
    criterion='entropy',
    splitter='best',
    max_depth=1,
    random_state=10,
)

pipe = Pipeline(steps=[
    ('transformer', transformer),
    ('scaler', scaler),
    ('model', final_model),
])

# Pipeline.fit returns the fitted pipeline itself.
pipe_fit = pipe.fit(X_tr, y_tr)
y_pred = pipe_fit.predict(X_ts)

precision_score(y_true=y_ts, y_pred=y_pred)

0.9393939393939394

In [128]:
# Accuracy of the current test-set predictions.
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_ts, y_pred=y_pred)

0.6148432501599488

In [129]:
# Recall of the current test-set predictions.
from sklearn.metrics import recall_score

recall_score(y_true=y_ts, y_pred=y_pred)

0.2074966532797858

In [123]:
# Compare the class distribution of the true test labels with that of the
# current predictions.
a = pd.DataFrame({'y_ts': y_ts, 'y_pred': y_pred})
display(*(a[col].value_counts() for col in ('y_ts', 'y_pred')))

0    816
1    747
Name: y_ts, dtype: int64

0    1398
1     165
Name: y_pred, dtype: int64

In [124]:
# Confusion matrix of the current test-set predictions.
confusion_matrix(y_true=y_ts, y_pred=y_pred)

array([[806,  10],
       [592, 155]], dtype=int64)

In [131]:
# Class counts of the target column over the full data set.
df.loc[:, 'deposit'].value_counts()

0    4081
1    3732
Name: deposit, dtype: int64

In [77]:
# Baseline for comparison: DecisionTreeClassifier with default
# hyper-parameters (only the seed is fixed), scored by precision on the
# held-out test set.

original_model = DecisionTreeClassifier(random_state=10)

pipe = Pipeline(steps=[
    ('transformer', transformer),
    ('scaler', scaler),
    ('model', original_model),
])

# Pipeline.fit returns the fitted pipeline itself.
pipe_fit = pipe.fit(X_tr, y_tr)
y_pred = pipe_fit.predict(X_ts)

precision_score(y_true=y_ts, y_pred=y_pred)

0.6408544726301736

In [103]:
from sklearn import tree

Extract the coefficients.