In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the semicolon-separated bank marketing CSV into a DataFrame.
df = pd.read_csv('bank-full.csv', sep=';')

In [3]:
# Remove any double-quote characters from the header names.
df.columns = [name.replace('"', '') for name in df.columns]

In [4]:
# Show the loaded frame (the rendered output is truncated to the first and last rows).
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [5]:
# Columns of interest; 'y' is the column later split off as the target.
features = [
    'age',
    'job',
    'marital',
    'education',
    'balance',
    'housing',
    'contact',
    'day',
    'month',
    'duration',
    'campaign',
    'pdays',
    'previous',
    'poutcome',
    'y',
]

In [6]:
# Count missing values per selected column.
df[features].isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [7]:
# Most frequent value of the education column.
df.education.mode()

0    secondary
Name: education, dtype: object

In [8]:
# Inspect column dtypes to tell numeric columns (int64) from text columns (object).
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [9]:
# Integer-typed columns.
numerical = [
    'age',
    'balance',
    'day',
    'duration',
    'campaign',
    'pdays',
    'previous',
]

# Text-typed columns, excluding the target 'y'.
# NOTE(review): 'default' and 'loan' appear here but not in `features` — confirm
# which column set is the intended one.
categorical = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'poutcome',
]

In [10]:
# Correlation between age and balance (Series.corr defaults to Pearson).
ab = df.age.corr(df.balance)
ab

0.09778273937134747

In [11]:
# Correlation between day and campaign (Series.corr defaults to Pearson).
dc = df.day.corr(df.campaign)
dc

0.16249021632619284

In [12]:
# Correlation between day and pdays (Series.corr defaults to Pearson).
dp = df.day.corr(df.pdays)
dp

-0.09304407377294052

In [13]:
# Correlation between pdays and previous (Series.corr defaults to Pearson).
pp = df.pdays.corr(df.previous)
pp

0.45481963548050114

In [14]:
# Encode the target as 0/1 ('yes' -> 1, anything else -> 0).
# The dtype guard keeps this cell idempotent: without it, re-running the cell
# would compare the already-numeric column against 'yes' and silently
# overwrite every label with 0.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = (df['y'] == 'yes').astype(int)

In [15]:
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,1
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,1
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,1
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,0


### Split the Dataset

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split reproducible.
df_full_train, df_test = train_test_split(df, random_state=42, test_size=0.2)
df_full_train.shape[0], df_test.shape[0]

(36168, 9043)

In [18]:
# Take 25% of the remaining rows as validation (0.25 * 0.8 = 20% of all rows).
df_train, df_val = train_test_split(df_full_train, random_state=42, test_size=0.25)
df_train.shape[0], df_val.shape[0], df_test.shape[0]

(27126, 9042, 9043)

In [19]:
# Trim the test set to the validation-set size (this removes the trailing row
# on the first run). Slicing by length instead of dropping the last index label
# makes the cell idempotent: re-running it no longer discards one more test row
# each time.
# NOTE(review): confirm that equal-sized validation/test splits are actually
# required; otherwise this trimming throws away a test sample for no benefit.
df_test = df_test.iloc[:len(df_val)]

In [20]:
# Re-check the split sizes after trimming the test set.
tuple(len(part) for part in (df_train, df_val, df_test))

(27126, 9042, 9042)

In [21]:
# Replace the shuffled row labels of every split with a fresh 0..n-1 index.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Pull the target column out of each split.
y_train, y_val, y_test = df_train['y'], df_val['y'], df_test['y']

In [22]:
# Remove the target from the feature frames so it cannot be used as an input.
# NOTE: this mutates the frames in place, so running the cell a second time raises KeyError.
del df_train['y'], df_val['y'], df_test['y']

### Calculate Mutual Information Score

In [23]:
from sklearn.metrics import mutual_info_score

In [24]:
def mis(series):
    """Return the mutual information between ``series`` and the training target.

    Args:
        series: Values of a single feature; passed as the first argument to
            ``mutual_info_score``, with the global ``y_train`` as the second.

    Returns:
        The value returned by ``mutual_info_score(series, y_train)``.
    """
    score = mutual_info_score(series, y_train)
    return score

In [25]:
# Score each categorical column against the training target, highest score first.
mi = df_train.loc[:, categorical].apply(mis)
mi.sort_values(ascending=False)

poutcome     0.029533
month        0.025090
contact      0.013356
housing      0.010343
job          0.007316
loan         0.002714
education    0.002697
marital      0.002050
default      0.000293
dtype: float64

### One-hot Encoding using DictVectorizer

In [26]:
from sklearn.feature_extraction import DictVectorizer

In [27]:
# Turn each training row into a {column: value} dict for DictVectorizer.
# NOTE(review): only the numerical columns are included, so no one-hot encoding
# actually takes place — confirm whether `categorical + numerical` was intended
# (the validation dicts would need the same change).
train_dicts = df_train.loc[:, numerical].to_dict(orient='records')

In [28]:
# Peek at the first record to check the dict layout.
train_dicts[0]

{'age': 32,
 'balance': 1100,
 'day': 11,
 'duration': 67,
 'campaign': 1,
 'pdays': -1,
 'previous': 0}

##### Create feature matrix

In [29]:
# sparse=False makes the vectorizer return dense NumPy arrays rather than SciPy sparse matrices.
dv = DictVectorizer(sparse=False)

In [30]:
# Build the validation records from the same numerical columns as the training records.
val_dicts = df_val.loc[:, numerical].to_dict(orient='records')

In [31]:
# Fit the vectorizer on the training records and build the training matrix in one step.
X_train = dv.fit_transform(train_dicts)

In [32]:
# Reuse the vectorizer fitted on the training records; it is not refitted on validation rows.
X_val = dv.transform(val_dicts)

In [33]:
# Both matrices must have the same number of columns.
X_train.shape, X_val.shape

((27126, 7), (9042, 7))

### LogisticRegression

In [34]:
from sklearn.linear_model import LogisticRegression

In [35]:
# Baseline logistic regression; the fixed seed keeps the liblinear solver reproducible.
model = LogisticRegression(
    C=1.0,
    solver='liblinear',
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)

In [36]:
# Column 1 of predict_proba holds the probability of the positive class (y == 1).
y_pred = model.predict_proba(X_val)[:, 1]
# Classify as positive at the conventional 0.5 probability threshold.
y_decision = np.greater_equal(y_pred, 0.5)

In [37]:
# Validation accuracy: fraction of rows whose thresholded prediction equals the label.
round((y_val == y_decision).mean(), 2)

0.89

### Mean Difference from the Global Target Rate, per Numerical Feature

In [38]:
from IPython.display import display

In [39]:
# Overall positive rate of the target across the full training split (train + validation).
global_y_mean = df_full_train['y'].mean()

In [43]:
# For each numerical feature, average the gap between the per-value target rate
# and the global target rate, and remember the feature with the lowest average.
# NOTE(review): the comparison uses the signed value, so a negative average
# beats every positive one — confirm whether abs() was intended.
smallest_diff_value = float('inf')
for n in numerical:
    print(n)
    # Target rate for each distinct value of the feature.
    df_group = df_full_train.groupby(n)['y'].mean().to_frame('mean')
    df_group['diff'] = df_group['mean'].sub(global_y_mean)
    # Unweighted mean over the distinct values (group sizes are ignored).
    mean_diff = df_group['diff'].mean()
    display(mean_diff)

    if smallest_diff_value > mean_diff:
        smallest_diff_value, smallest_diff_feature = mean_diff, n

age


0.15857752975131117

balance


0.005046699469794647

day


0.011541940378786749

duration


0.2655993682498444

campaign


-0.08513099249526401

pdays


0.1672077455396622

previous


0.06347391136358485

### Find the Smallest `C` with the Best Validation Accuracy

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [48]:
# Candidate values for LogisticRegression's C parameter, in ascending order.
C_values = [0.01, 0.1, 1, 10, 100]

In [49]:
# Running best for the C sweep; accuracy starts at 0 so the first candidate always wins.
best_C, best_accuracy = None, 0

In [51]:
# Refit the model for every candidate C and keep the one with the highest
# validation accuracy. The comparison is strict, so on an exact tie the earlier
# (smaller) C is kept.
# NOTE: `model` and `y_pred` from the earlier cells are overwritten here;
# `y_pred` now holds hard class labels instead of probabilities.
for C in C_values:
    model = LogisticRegression(
        C=C,
        solver='liblinear',
        max_iter=1000,
        random_state=42,
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"C: {C}, Accuracy: {accuracy}")

    if best_accuracy < accuracy:
        best_accuracy, best_C = accuracy, C

C: 0.01, Accuracy: 0.8885202388852024
C: 0.1, Accuracy: 0.8895155938951559
C: 1, Accuracy: 0.889294403892944
C: 10, Accuracy: 0.889294403892944
C: 100, Accuracy: 0.88940499889405
