In [1]:
!wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip

--2024-10-10 12:11:47--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bank+marketing.zip’

bank+marketing.zip      [       <=>          ] 999.85K   724KB/s    in 1.4s    

2024-10-10 12:11:50 (724 KB/s) - ‘bank+marketing.zip’ saved [1023843]



In [3]:
import os
import glob
from zipfile import ZipFile 

# download files and remove unnecessary ones
with ZipFile("bank+marketing.zip", 'r') as zObject:
    zObject.extractall()
with ZipFile("bank.zip", 'r') as zObject:
    zObject.extractall()

file_paths = ['bank_marketing.ipynb', 'bank-full.csv']

for clean_up in glob.glob("*.*"):
    print(clean_up)
    if not clean_up in file_paths:
        os.remove(clean_up)

In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [10]:
column_list = """age,
job,
marital,
education,
balance,
housing,
contact,
day,
month,
duration,
campaign,
pdays,
previous,
poutcome,
y""".split(',\n')
column_list

['age',
 'job',
 'marital',
 'education',
 'balance',
 'housing',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'y']

In [12]:
df = pd.read_csv('bank-full.csv', sep=";")
df = df[column_list]
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [13]:
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   balance    45211 non-null  int64 
 5   housing    45211 non-null  object
 6   contact    45211 non-null  object
 7   day        45211 non-null  int64 
 8   month      45211 non-null  object
 9   duration   45211 non-null  int64 
 10  campaign   45211 non-null  int64 
 11  pdays      45211 non-null  int64 
 12  previous   45211 non-null  int64 
 13  poutcome   45211 non-null  object
 14  y          45211 non-null  object
dtypes: int64(7), object(8)
memory usage: 5.2+ MB


# Question 1
**Answer**: `secondary`

In [14]:
df['education'].mode()

0    secondary
Name: education, dtype: object

# Question 2
**Answer**: `pdays` and `previous`

In [26]:
# df.select_dtypes(int).columns
numerical = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
df[numerical].corr().unstack().abs().sort_values().drop_duplicates()

duration  previous    0.001203
previous  age         0.001288
duration  pdays       0.001565
pdays     balance     0.003435
balance   day         0.004503
age       duration    0.004648
campaign  age         0.004760
day       age         0.009120
balance   campaign    0.014578
          previous    0.016674
duration  balance     0.021560
pdays     age         0.023758
day       duration    0.030206
campaign  previous    0.032855
day       previous    0.051710
duration  campaign    0.084570
campaign  pdays       0.088628
day       pdays       0.093044
age       balance     0.097783
day       campaign    0.162490
previous  pdays       0.454820
age       age         1.000000
dtype: float64

In [31]:
# target encoding
df['y'] = df['y'].replace({'yes': 1, 'no': 0})

In [33]:
# split the data
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.2, random_state=42)

X_train, y_train = df_train.drop('y', axis=1), df_train['y']
X_val, y_val = df_val.drop('y', axis=1), df_val['y']
X_test, y_test = df_test.drop('y', axis=1), df_test['y']

# Question 3
**Answer**: `poutcome`

In [35]:
categorical = [i for i in df.columns if i not in numerical]
categorical.remove('y')
categorical

['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [38]:
def calculate_mi(series, y=y_train):
    return mutual_info_score(series, y)

In [39]:
df_mi = X_train[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi

Unnamed: 0,MI
poutcome,0.029389
month,0.024972
contact,0.013437
housing,0.010465
job,0.007172
education,0.002777
marital,0.002019


# Question 4
**Answer**: `0.9`

In [42]:
train_dict = X_train[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
X_train = dv.transform(train_dict)

In [43]:
val_dict = X_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [44]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [46]:
y_pred = model.predict_proba(X_val)[:, 1]
y_hat = y_pred > 0.5
accuracy = (y_val == y_hat).mean()
round(accuracy, 2)

np.float64(0.9)

# Question 5