In [1]:
import pandas as pd
import numpy as np

In [8]:
path = 'https://archive.ics.uci.edu/static/public/222/bank+marketing.zip'
!curl -sSL $path > 'bank+marketing.zip'


In [10]:
!unzip 'bank+marketing.zip'

Archive:  bank+marketing.zip
 extracting: bank.zip                
 extracting: bank-additional.zip     


In [11]:
!unzip bank.zip
!unzip bank-additional.zip 

Archive:  bank.zip
  inflating: bank-full.csv           
  inflating: bank-names.txt          
  inflating: bank.csv                
Archive:  bank-additional.zip
   creating: bank-additional/
  inflating: bank-additional/.DS_Store  
   creating: __MACOSX/
   creating: __MACOSX/bank-additional/
  inflating: __MACOSX/bank-additional/._.DS_Store  
  inflating: bank-additional/.Rhistory  
  inflating: bank-additional/bank-additional-full.csv  
  inflating: bank-additional/bank-additional-names.txt  
  inflating: bank-additional/bank-additional.csv  
  inflating: __MACOSX/._bank-additional  


In [13]:
# The CSV is semicolon-delimited; preview the first rows after loading.
df = pd.read_csv('bank-full.csv', delimiter=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [36]:
# Input features used for modelling.
features = [
    'age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
    'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
]
# Columns kept from the raw frame: every feature followed by the target 'y'.
columns = features + ['y']

In [19]:
# Keep only the columns used in the analysis. The explicit copy makes the
# result an independent frame, so the in-place column assignments made in
# later cells never act on (or warn about) a slice of the loaded data.
df = df[columns].copy()
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [29]:
# Names of every column whose dtype is 'object'.
categorical_cols = df.dtypes.loc[lambda dtypes: dtypes == 'object'].index.tolist()

In [71]:
# Exclude the target 'y' from the categorical columns. Filtering (instead of
# list.remove) keeps the cell re-runnable: remove() raises ValueError once
# 'y' is already gone.
categorical_cols = [col for col in categorical_cols if col != 'y']

In [40]:
# Feature columns whose dtype is anything other than 'object'.
numerical_cols = df[features].dtypes.loc[lambda dtypes: dtypes != 'object'].index.tolist()

In [30]:
# Normalise categorical values: lower-case them and swap spaces for underscores.
for col in categorical_cols:
    df[col] = (
        df[col]
        .str.lower()
        .str.replace(" ", "_")
    )

In [31]:
# Preview the first rows after normalising the categorical columns.
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [34]:
# Count missing values per column.
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [35]:
# Most frequent value(s) of the education column.
df['education'].mode()

0    secondary
Name: education, dtype: object

In [41]:
# Pairwise Pearson correlation between the numerical features.
df[numerical_cols].corr(method='pearson')

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [49]:
# Encode the target as 1 for 'yes' and 0 otherwise. Also accepting an already
# encoded 1 keeps the cell idempotent: re-running it no longer compares the
# integers against 'yes' and silently turns every label into 0.
df['y'] = df['y'].isin(['yes', 1]).astype(int)

In [52]:
# Total number of rows in the data set.
df.shape[0]

45211

In [51]:
from sklearn.model_selection import train_test_split

In [53]:
# Hold out 20% of the rows as the test set; the seed fixes the shuffle.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [54]:
# Row counts of the full-train and test partitions.
df_full_train.shape[0], df_test.shape[0]

(36168, 9043)

In [55]:
# Split the full-train part again: 25% of it becomes the validation set.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

In [56]:
# Row counts of the training and validation partitions.
df_train.shape[0], df_val.shape[0]

(27126, 9042)

In [59]:
# Pull the target out of each partition as a NumPy array.
y_train = df_train['y'].to_numpy()
y_val = df_val['y'].to_numpy()
y_test = df_test['y'].to_numpy()

In [61]:
# Remove the target column from each feature frame so it cannot leak into the model inputs.
for frame in (df_train, df_val, df_test):
    del frame['y']

In [62]:
from sklearn.metrics import mutual_info_score

In [65]:
def mutual_info_train_score(series):
    """Return the mutual information score between ``series`` and ``df_full_train.y``.

    ``df_full_train`` is read from the notebook's global namespace, so
    ``series`` must be aligned row-for-row with that frame.
    """
    target = df_full_train.y
    return mutual_info_score(series, target)

In [74]:
# Score every categorical column against the target, then rank from most to least informative.
mi = df_full_train[categorical_cols].apply(mutual_info_train_score)
mi.sort_values(ascending=False).round(2)

poutcome     0.03
month        0.02
contact      0.01
housing      0.01
job          0.01
education    0.00
marital      0.00
dtype: float64

In [75]:
from sklearn.feature_extraction import DictVectorizer

In [76]:
# Convert the training frame into a list with one {column: value} dict per row.
dicts_train = df_train.to_dict(orient='records')

In [77]:
# sparse=False asks the vectorizer for dense arrays instead of sparse matrices.
dv = DictVectorizer(sparse=False)

In [79]:
# Fit the vectorizer on the training records and encode them in one step.
X_train = dv.fit_transform(dicts_train)
# Show the first encoded row as a sanity check.
X_train[0]

array([ 3.2e+01,  1.1e+03,  1.0e+00,  1.0e+00,  0.0e+00,  0.0e+00,
        1.1e+01,  6.7e+01,  0.0e+00,  0.0e+00,  1.0e+00,  0.0e+00,
        0.0e+00,  1.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,
        0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  1.0e+00,
        0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  1.0e+00,  0.0e+00,
        1.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,
        0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00, -1.0e+00,
        0.0e+00,  0.0e+00,  0.0e+00,  1.0e+00,  0.0e+00])

In [96]:
# Encode the validation rows with the already-fitted vectorizer
# (transform only, so the vectorizer is not re-fitted on validation data).
dicts_val = df_val.to_dict(orient='records')
X_val = dv.transform(dicts_val)

In [80]:
# List the names of the encoded features produced by the vectorizer.
dv.get_feature_names_out()

array(['age', 'balance', 'campaign', 'contact=cellular',
       'contact=telephone', 'contact=unknown', 'day', 'duration',
       'education=primary', 'education=secondary', 'education=tertiary',
       'education=unknown', 'housing=no', 'housing=yes', 'job=admin.',
       'job=blue-collar', 'job=entrepreneur', 'job=housemaid',
       'job=management', 'job=retired', 'job=self-employed',
       'job=services', 'job=student', 'job=technician', 'job=unemployed',
       'job=unknown', 'marital=divorced', 'marital=married',
       'marital=single', 'month=apr', 'month=aug', 'month=dec',
       'month=feb', 'month=jan', 'month=jul', 'month=jun', 'month=mar',
       'month=may', 'month=nov', 'month=oct', 'month=sep', 'pdays',
       'poutcome=failure', 'poutcome=other', 'poutcome=success',
       'poutcome=unknown', 'previous'], dtype=object)

In [82]:
from sklearn.linear_model import LogisticRegression 

In [83]:
# Logistic regression configured with a fixed seed for reproducibility.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)

In [84]:
# Train on the encoded training matrix and its labels.
model.fit(X=X_train, y=y_train)

In [91]:
# Keep only the second probability column (index 1) of the training predictions.
y_pred_proba = model.predict_proba(X_train)[:, 1]

In [92]:
# Turn probabilities into hard 0/1 predictions at the 0.5 threshold.
y_pred = np.greater_equal(y_pred_proba, 0.5).astype(int)

In [94]:
# Accuracy on the training set: the share of predictions that match the labels.
np.mean(y_pred == y_train)

0.9020865590208655

In [97]:
# Repeat the scoring on the validation split: probabilities first, then 0/1 labels.
y_pred_proba = model.predict_proba(X_val)[:, 1]
y_pred = np.greater_equal(y_pred_proba, 0.5).astype(int)

In [99]:
# Validation accuracy, rounded to two decimals.
round(np.mean(y_pred == y_val), 2)

0.9