In [22]:
import pandas as pd
import numpy as np

# Dataset

In [23]:
# Download the UCI "Bank Marketing" archive (dataset id 222) into the working directory.
# `wget -O` pins the local file name; `unzip -o` overwrites existing files without
# prompting, so the cell can be re-run non-interactively.
!wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip -O bank+marketing.zip
!unzip -o bank+marketing.zip

--2024-10-13 17:22:09--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bank+marketing.zip’

bank+marketing.zip      [   <=>              ] 999.85K  1.79MB/s    in 0.5s    

2024-10-13 17:22:10 (1.79 MB/s) - ‘bank+marketing.zip’ saved [1023843]

Archive:  bank+marketing.zip
 extracting: bank.zip                
 extracting: bank-additional.zip     


In [24]:
# Unpack the inner archive; `-o` overwrites existing files without prompting.
!unzip -o bank.zip

Archive:  bank.zip
  inflating: bank-full.csv           
  inflating: bank-names.txt          
  inflating: bank.csv                


In [25]:
# The CSV uses ';' as its field separator rather than a comma.
df = pd.read_csv('bank-full.csv', delimiter=';')
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


# Data preparation

In [26]:
# Columns kept for the rest of the notebook; the last entry, `y`, is the target.
base = [
    'age', 'job', 'marital', 'education', 'balance',
    'housing', 'contact', 'day', 'month', 'duration',
    'campaign', 'pdays', 'previous', 'poutcome', 'y',
]
df = df.loc[:, base]
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [27]:
# Number of missing values in each column.
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

# Question 1

In [28]:
# Most frequent value(s) of the `education` column.
df['education'].mode()

0    secondary
Name: education, dtype: object

In [29]:
# Show each column's dtype; the next cell splits features into numerical and
# categorical lists based on these dtypes.
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [30]:
# Split the feature columns by dtype.
# - `select_dtypes(include='number')` picks up every numeric dtype, not only int64.
# - The target `y` is excluded explicitly from BOTH lists, so the result is the same
#   whether this cell runs before or after `y` is encoded as an integer. Previously,
#   re-running it after the encoding put `y` into `numerical` and made
#   `categorical.remove('y')` raise ValueError.
numerical = [col for col in df.select_dtypes(include='number').columns if col != 'y']
categorical = [col for col in df.select_dtypes(exclude='number').columns if col != 'y']
numerical, categorical

(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'],
 ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome'])

# Question 2

# Correlation

In [31]:
# Absolute pairwise correlations between the numerical features,
# rendered as a table whose cell colour tracks the magnitude.
(
    df[numerical]
    .corr()
    .abs()
    .style
    .background_gradient(cmap='viridis')
)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,0.00912,0.004648,0.00476,0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,0.014578,0.003435,0.016674
day,0.00912,0.004503,1.0,0.030206,0.16249,0.093044,0.05171
duration,0.004648,0.02156,0.030206,1.0,0.08457,0.001565,0.001203
campaign,0.00476,0.014578,0.16249,0.08457,1.0,0.088628,0.032855
pdays,0.023758,0.003435,0.093044,0.001565,0.088628,1.0,0.45482
previous,0.001288,0.016674,0.05171,0.001203,0.032855,0.45482,1.0


# Target encoding

In [32]:
# Encode the target as 1 for 'yes' and 0 for anything else.
# The dtype guard keeps this cell idempotent: once `y` is numeric, comparing it with
# the string 'yes' again would be False everywhere and silently zero out every label.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = (df['y'] == 'yes').astype(int)
df['y'].value_counts()

y
0    39922
1     5289
Name: count, dtype: int64

# Split the data

In [33]:
from sklearn.model_selection import train_test_split

# First cut: hold out 20% of all rows as the test set.
df_train_full, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
# Second cut: an absolute row count makes the validation set exactly as large as the
# test set, which yields a 60/20/20 train/val/test split.
df_train, df_val = train_test_split(
    df_train_full,
    test_size=df_test.shape[0],
    random_state=42,
)

tuple(len(part) for part in (df_train, df_val, df_test))

(27125, 9043, 9043)

In [34]:
# Handle one partition at a time: give it a fresh 0..n-1 index, then move the target
# out of the feature frame. `pop` returns the `y` column and removes it from the frame
# in a single step; `.values` keeps only the underlying NumPy array.
df_train_full = df_train_full.reset_index(drop=True)
y_train_full = df_train_full.pop('y').values

df_train = df_train.reset_index(drop=True)
y_train = df_train.pop('y').values

df_val = df_val.reset_index(drop=True)
y_val = df_val.pop('y').values

df_test = df_test.reset_index(drop=True)
y_test = df_test.pop('y').values

# Question 3

# Mutual Information

In [35]:
from sklearn.metrics import mutual_info_score

def mutual_info_term_score(series):
    """Return the mutual information between `series` and `y_train`, rounded to 2 decimals.

    `y_train` is read from the notebook's global scope, so `series` must be aligned
    row-for-row with the training target.
    """
    score = mutual_info_score(series, y_train)
    return score.round(2)

In [36]:
# Score each candidate categorical feature against the training target and
# list the scores from most to least informative.
mi = pd.Series({
    feature: mutual_info_term_score(df_train[feature])
    for feature in ['contact', 'education', 'housing', 'poutcome']
})
mi.sort_values(ascending=False)

poutcome     0.03
contact      0.01
housing      0.01
education    0.00
dtype: float64

# Question 4

# One-Hot Encoding

In [37]:
from sklearn.feature_extraction import DictVectorizer

# Turn each row into a {column: value} record for the vectorizer.
train_dict = df_train[categorical + numerical].to_dict(orient='records')
val_dict = df_val[categorical + numerical].to_dict(orient='records')

# Fit the vectorizer on the training records only, then reuse the fitted
# vectorizer to encode the validation records.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

# Logistic regression

In [38]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier trained on all features.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)

model.fit(X_train, y_train)

In [45]:
# Probability of the positive class for every validation row.
y_pred = model.predict_proba(X_val)[:, 1]
# Predict "subscribed" when that probability reaches 0.5.
term_decision = y_pred >= 0.5
# Share of validation rows where the decision matches the true label.
accuracy = np.mean(y_val == term_decision)
np.round(accuracy, 2)

0.9004755059161783

# Question 5

# Feature Elimination Technique

In [44]:
# Leave-one-feature-out check: retrain without each candidate feature and compare the
# validation accuracy with the baseline `accuracy` of the model trained on all features.
#
# The loop uses its own vectorizer, matrices and model (`*_ablate` names). It therefore
# does not overwrite the baseline `dv`, `X_train`, `X_val`, `model` or `y_pred`, and
# re-running the baseline accuracy cell afterwards still scores the full model rather
# than the last reduced one.
elimination_records = []

for column in ['age', 'balance', 'marital', 'previous']:

    # Every training column except the one being dropped.
    columns = df_train.columns.difference([column])

    dv_ablate = DictVectorizer(sparse=False)
    X_train_ablate = dv_ablate.fit_transform(df_train[columns].to_dict(orient='records'))
    X_val_ablate = dv_ablate.transform(df_val[columns].to_dict(orient='records'))

    model_ablate = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_ablate.fit(X_train_ablate, y_train)

    y_pred_ablate = model_ablate.predict_proba(X_val_ablate)[:, 1]
    accuracy_ = (y_val == (y_pred_ablate >= 0.5)).mean()

    elimination_records.append(
        {"feature": column, "accuracy": accuracy_, "difference": abs(accuracy - accuracy_)}
    )

# Build the result frame once from the collected records instead of appending row by row.
df_feature_elim = pd.DataFrame(elimination_records, columns=["feature", "accuracy", "difference"])

df_feature_elim.sort_values(by='difference', ascending=True).reset_index(drop=True)

Unnamed: 0,feature,accuracy,difference
0,previous,0.900476,0.0
1,marital,0.900365,0.000111
2,balance,0.900807,0.000332
3,age,0.901028,0.000553


# Question 6

In [41]:
# Regularization sweep: train one model per value of C on ALL features and record the
# validation accuracy rounded to 3 decimals.
#
# The feature list is spelled out as `categorical + numerical`. It must not come from a
# loop variable left behind by an earlier cell, which would silently drop a feature
# from this experiment.
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

regul_records = []

for c_value in [0.01, 0.1, 1, 10, 100]:

    model = LogisticRegression(solver='liblinear', C=c_value, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:, 1]
    term_decision = (y_pred >= 0.5)
    accuracy_ = (y_val == term_decision).mean().round(3)

    regul_records.append({"C": c_value, "accuracy": accuracy_})

# Build the result frame once from the collected records instead of appending row by row.
df_regul_regression = pd.DataFrame(regul_records, columns=["C", "accuracy"])

# Best accuracy first; ties are broken by the smallest C so the ordering is deterministic.
df_regul_regression.sort_values(by=['accuracy', 'C'], ascending=[False, True]).reset_index(drop=True)

Unnamed: 0,C,accuracy
0,0.1,0.901
1,10.0,0.901
2,100.0,0.901
3,1.0,0.9
4,0.01,0.899
