In [311]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mutual_info_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [312]:
df = pd.read_csv('data/bank-full.csv', sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [313]:
features = ['age', 'job', 'marital', 'education', 'balance', 'housing',
            'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
            'previous', 'poutcome', 'y']

In [314]:
df.y = (df.y == 'yes').astype(int)

In [315]:
df = df[features]
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,0


In [316]:
# No nulls
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [317]:
# 1. What is the most frequent observation (mode) for the column education? secondary
df.education.value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

In [318]:
# 2. What are the two features that have the biggest correlation? pdays and previous
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y             int32
dtype: object

In [319]:
numerical = ['age','balance','day','duration','campaign','pdays','previous']

In [320]:
categorical = ['job','marital','education','housing','contact','month','poutcome']

In [321]:
df[['age']].corrwith(df.balance)

age    0.097783
dtype: float64

In [322]:
df[['day']].corrwith(df.campaign)

day    0.16249
dtype: float64

In [323]:
df[['day']].corrwith(df.pdays)

day   -0.093044
dtype: float64

In [324]:
df[['pdays']].corrwith(df.previous)

pdays    0.45482
dtype: float64

In [325]:
# Train test split
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [326]:
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [327]:
# Making index sequencial, not mandatory and no effect
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [328]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

In [329]:
# Drop target variable from df
df_train = df_train.drop(['y'], axis=1)
df_val = df_val.drop(['y'], axis=1)
df_test = df_test.drop(['y'], axis=1)

In [330]:
# 3. Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only
# poutcome
def mutual_info_churn_score(series):
    return round(mutual_info_score(series, df_full_train.y),2)
mi = df_full_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending = False)

poutcome     0.03
month        0.02
job          0.01
housing      0.01
contact      0.01
marital      0.00
education    0.00
dtype: float64

In [331]:
# 4. What accuracy did you get? 0.9
train_dicts = df_train[categorical+numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
val_dicts = df_val[categorical+numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [332]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [333]:
model.fit(X_train, y_train)

In [334]:
y_pred = model.predict_proba(X_val)[:,1]
y_pred

array([0.01252233, 0.01036543, 0.14732673, ..., 0.05617135, 0.00940853,
       0.28917677])

In [335]:
bank = (y_pred>= 0.5)

In [336]:
acc_score = accuracy_score(y_val, bank)
acc_score

0.9015704490157045

In [337]:
# 5. Which of following feature has the smallest difference? previous
features = ['age', 'balance', 'marital', 'previous']

In [338]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
accuracies = []
for i in features:
    df_train_temp = df_train[features].copy()
    df_val_temp = df_val[features].copy()

    df_train_temp.drop(i, axis=1, inplace=True)
    df_val_temp.drop(i, axis=1, inplace=True)

    train_dicts = df_train_temp.to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)

    model.fit(X_train, y_train)

    val_dicts = df_val_temp.to_dict(orient='records')
    X_val = dv.transform(val_dicts)

    y_pred = model.predict_proba(X_val)[:, 1]
    bank = (y_pred >= 0.5)
    accuracy = accuracy_score(y_val, bank)
    accuracies.append(accuracy)

    print(f'Accuracy without {i}: {accuracy:.4f}')
    
    print(accuracy_score(y_val, bank))

Accuracy without age: 0.8804
0.880446803804468
Accuracy without balance: 0.8803
0.880336208803362
Accuracy without marital: 0.8802
0.8802256138022562
Accuracy without previous: 0.8817
0.8816633488166334


In [339]:
accuracies

[0.880446803804468, 0.880336208803362, 0.8802256138022562, 0.8816633488166334]

In [340]:
for i in accuracies:
    print(i-acc_score)

-0.02112364521123644
-0.021234240212342437
-0.02134483521344832
-0.01990710019907105


In [341]:
# 6. Which of these C leads to the best accuracy on the validation set? 1
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values
df_train = df_train.drop(['y'], axis=1)
df_val = df_val.drop(['y'], axis=1)
df_test = df_test.drop(['y'], axis=1)
train_dicts = df_train[categorical+numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
val_dicts = df_val[categorical+numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [342]:
val_score = []
C= [0.01, 0.1, 1, 10, 100]
for i in C:
    model = LogisticRegression(solver='liblinear', C=i, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:,1]
    bank = (y_pred>= 0.5)
    val_score.append(accuracy_score(y_val, bank))

In [343]:
val_score

[0.8979208139792081,
 0.9009068790090687,
 0.9015704490157045,
 0.9009068790090687,
 0.9012386640123866]