In [69]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, train_test_split
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression


In [70]:
RANDOM_SEED = 42

In [71]:
df = pd.read_csv('./data/bank.csv', sep = ';')
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['response']), df['response'], test_size=0.2, random_state=RANDOM_SEED, stratify=df['response'])

In [72]:
print(X_train.shape)
X_train.isnull().sum()

(3616, 16)


age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
dtype: int64

In [73]:
X_train.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
2236,32,admin.,divorced,secondary,yes,-18,yes,no,unknown,12,may,155,3,-1,0,unknown
858,34,technician,single,tertiary,no,144,yes,yes,cellular,14,oct,104,1,85,19,failure
3531,44,management,divorced,tertiary,no,0,no,no,cellular,22,aug,54,2,-1,0,unknown
2737,34,blue-collar,married,secondary,no,8309,yes,yes,cellular,19,nov,50,1,-1,0,unknown
1257,51,blue-collar,married,primary,no,5050,no,yes,unknown,16,jun,75,7,-1,0,unknown


In [74]:
X_train.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,3616.0,3616.0,3616.0,3616.0,3616.0,3616.0,3616.0
mean,41.043971,1403.934458,16.040929,263.769358,2.795354,40.218473,0.545907
std,10.4648,2811.286428,8.207003,260.675975,3.144846,100.170192,1.657239
min,19.0,-3313.0,1.0,4.0,1.0,-1.0,0.0
25%,33.0,65.0,9.0,104.0,1.0,-1.0,0.0
50%,39.0,450.5,16.0,185.0,2.0,-1.0,0.0
75%,48.0,1477.75,21.0,330.0,3.0,-1.0,0.0
max,87.0,42045.0,31.0,3025.0,50.0,871.0,24.0


In [163]:
binary_features = ['default', 'housing', 'loan']

binary_transformer = OneHotEncoder(drop='if_binary', sparse=False)

preprocessor = ColumnTransformer(
    transformers=[('bin',binary_transformer, binary_features)]
)

clf = Pipeline(
    steps=[('preprocessor', preprocessor),
    ('classifier', BernoulliNB())]
)

In [164]:
clf.fit(X_train[binary_features], y_train.ravel())

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('bin',
                                                  OneHotEncoder(drop='if_binary',
                                                                sparse=False),
                                                  ['default', 'housing',
                                                   'loan'])])),
                ('classifier', LogisticRegression())])

In [166]:
y_pred = clf.predict_proba(X_test[binary_features])

In [167]:
roc_auc_score(y_test, (y_pred[:, 1:] > 0.1))

0.5

In [168]:
y_test.mean()

0.11491712707182321

In [169]:
y_pred.mean()

0.5000000000000001

In [170]:
y_pred

array([[0.83300458, 0.16699542],
       [0.83300458, 0.16699542],
       [0.83300458, 0.16699542],
       ...,
       [0.83300458, 0.16699542],
       [0.83300458, 0.16699542],
       [0.83300458, 0.16699542]])