# Homework 03 - Classification

## Preparations

### Dependencies

In [122]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from pathlib import Path


### Download and Unzip Data

In [None]:
!curl -o ../data/bank_marketing.zip https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
!unzip -d ../data/bank_marketing ../data/bank_marketing.zip
!unzip -d ../data/bank_marketing/bank ../data/bank_marketing/bank.zip
!rm -r ../data/bank_marketing.zip

### Set up Paths

In [124]:
PATH_DATA = Path("../data")
PATH_DATA_BANKING = PATH_DATA / "bank_marketing/bank/bank-full.csv"

## Data Preparation and Exploratory Data Analysis

In [125]:
# read data
data_banking = pd.read_csv(PATH_DATA_BANKING, delimiter=";")

In [126]:
# make a list containing the names of the required columns
req_cols = ["age", "job", "marital", "education",
            "balance", "housing", "contact", "day",
            "month", "duration", "campaign", "pdays",
            "previous", "poutcome", "y"]

# select required columns from data
data_banking = data_banking[req_cols]
data_banking

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,825,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,1729,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,5715,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,668,no,telephone,17,nov,508,4,-1,0,unknown,no


In [127]:
# check if there are missing values in the data
data_banking.isna().any()

age          False
job          False
marital      False
education    False
balance      False
housing      False
contact      False
day          False
month        False
duration     False
campaign     False
pdays        False
previous     False
poutcome     False
y            False
dtype: bool

No, there are no missing values in the data.

### Question 1

In [128]:
# determine most frequent observation in the column "education"
data_banking["education"].mode()

0    secondary
Name: education, dtype: object

The most frequent observation in the column `education` is `secondary`.

### Question 2

In [129]:
# get numerical columns
# because correlation is not defined for categorical ones
data_banking_numerical = data_banking.select_dtypes(include=["number"])
data_banking_numerical

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,58,2143,5,261,1,-1,0
1,44,29,5,151,1,-1,0
2,33,2,5,76,1,-1,0
3,47,1506,5,92,1,-1,0
4,33,1,5,198,1,-1,0
...,...,...,...,...,...,...,...
45206,51,825,17,977,3,-1,0
45207,71,1729,17,456,2,-1,0
45208,72,5715,17,1127,5,184,3
45209,57,668,17,508,4,-1,0


In [130]:
# calculate correlation matrix
correlation_matrix = data_banking_numerical.corr()
correlation_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [131]:
# get list of all correlations we are asked to compare in the question
correlations = [
    correlation_matrix["age"]["balance"],
    correlation_matrix["day"]["campaign"],
    correlation_matrix["day"]["pdays"],
    correlation_matrix["pdays"]["previous"]
]

# get a list of combinations corresponding to the correlations
combinations = [
    "age_balance",
    "day_campaign",
    "day_pdays",
    "pdays_previous"
]

list(zip(combinations, correlations))

[('age_balance', np.float64(0.09778273937134807)),
 ('day_campaign', np.float64(0.16249021632619218)),
 ('day_pdays', np.float64(-0.0930440737729405)),
 ('pdays_previous', np.float64(0.4548196354805043))]

The two features having the biggest correlation are `pdays`and `previous`.

In [132]:
# prepare label data
data_banking["y"] = data_banking["y"].replace({"yes": 1, "no": 0})

  data_banking["y"] = data_banking["y"].replace({"yes": 1, "no": 0})


In [133]:
# separate featuers and labels and save each to one object
X = data_banking.drop(columns=["y"])
y = data_banking["y"]

In [134]:
# have a look at new object X
X

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,825,no,cellular,17,nov,977,3,-1,0,unknown
45207,71,retired,divorced,primary,1729,no,cellular,17,nov,456,2,-1,0,unknown
45208,72,retired,married,secondary,5715,no,cellular,17,nov,1127,5,184,3,success
45209,57,blue-collar,married,secondary,668,no,telephone,17,nov,508,4,-1,0,unknown


In [135]:
# have a look at new object
y

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int64

In [136]:
# make first split: between test and train/val
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [137]:
# make second split: between train and val
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)

### Question 3

In [138]:
# get categorical variables
categorical_train = X_train.select_dtypes(include=["object"])

# join with y
categorical_train = pd.concat([categorical_train, y_train], axis=1)

categorical_train

Unnamed: 0,job,marital,education,housing,contact,month,poutcome,y
20326,technician,single,tertiary,yes,cellular,aug,unknown,0
24301,entrepreneur,married,secondary,yes,cellular,nov,unknown,0
38618,blue-collar,married,secondary,yes,cellular,may,unknown,0
18909,housemaid,married,primary,no,cellular,aug,unknown,0
23081,self-employed,married,tertiary,no,cellular,aug,unknown,0
...,...,...,...,...,...,...,...,...
13264,services,single,secondary,no,cellular,jul,unknown,0
28829,technician,single,tertiary,no,cellular,jan,unknown,0
3844,technician,divorced,secondary,yes,unknown,may,unknown,0
15597,services,single,secondary,no,cellular,jul,unknown,1


In [139]:
# custom function for looping though columns to get mutual info score
def mutinfo_custom(df, colname):
    return mutual_info_score(df["y"], df[colname])

for column in categorical_train.columns:
    print(column, mutinfo_custom(df=categorical_train, colname=column))

job 0.007316082778474635
marital 0.0020495925927810216
education 0.0026967549991295282
housing 0.010343105891750026
contact 0.013356062198247219
month 0.02509003344365025
poutcome 0.029532821290436224
y 0.35748273204991365


`poutcome` has the largest mutual information score with `y`.

## Model Training

### Question 4

In [140]:
# one hot encode categorical variables

# for train set, this is necessary immediately
X_train_1h = pd.get_dummies(X_train).astype(int)

# for val and test, it's not necessary immediately
# but I'll already do it, because I'll need it later anyway
X_val_1h = pd.get_dummies(X_val).astype(int)
X_test_1h = pd.get_dummies(X_test).astype(int)

In [141]:
# define a model
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

In [142]:
# fit the model to the train data set
model.fit(X_train_1h, y_train)

## Model Evaluation

In [143]:
# get predictions from model
y_val_pred = model.predict(X_val_1h)
y_val_pred

array([0, 0, 0, ..., 0, 0, 0])

In [144]:
# calculate accuracy using sklearn's built in function
accuracy_val = accuracy_score(y_true=y_val, y_pred=y_val_pred)

accuracy_val

0.9007962840079629

### Question 5

In [154]:
# function for one run of feature elimination, training and evaluation
def train_eval_eliminate_one_feature(feature, X_t=X_train, y_t=y_train, X_v=X_val,
                                     y_v=y_val, accuracy_original=accuracy_val):
    
    # eliminate one feature in train and val data
    X_t = X_t.drop(feature, axis=1)
    X_v = X_v.drop(feature, axis=1)
    
    # one hot encode
    X_t = pd.get_dummies(X_t).astype(int)
    X_v = pd.get_dummies(X_v).astype(int)
    
    # define a model
    model_elim = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    
    # fit the model to the training data
    model_elim.fit(X=X_t, y=y_t)
    
    # get predictions from model
    y_p = model_elim.predict(X=X_v)
    
    # get accuracy
    accuracy = accuracy_score(y_true=y_v,
                              y_pred=y_p)
    
    # calculate difference between accuracy of model trained on full features
    # and model with one feature eliminated from data
    accuracy_diff = accuracy_original - accuracy
    
    return {feature: accuracy_diff}

In [155]:
# function for running the entire feature elimination technique
def feature_elimination(X_t=X_train, features_all=None):
    
    # make empty dictionary to contain results
    features_differences = {}
    
    # if no features have been passed get all features from passed data
    if features_all == None:
        features_all = X_t.columns
    
    # call the function for calculating feature difference for all features
    for feature in features_all:
        # add the result to the dictionary
        features_differences.update(train_eval_eliminate_one_feature(feature=feature))
    
    return features_differences

In [156]:
# function for determining the minimum distance
def determine_min_diff(result):
    
    # find the key with the smallest absolute value
    key_of_smallest_abs_value = min(result, key=lambda k: abs(result[k]))
    
    # get the smallest absolute value
    smallest_abs_value = result[key_of_smallest_abs_value]
    
    return key_of_smallest_abs_value, smallest_abs_value

In [157]:
# call the function and save the result in an object
feature_elimination_result = feature_elimination(X_train)

# get smallest difference
determine_min_diff(feature_elimination_result)

('campaign', 0.0)

The smallest value does not belong to any of the answers available in the multiple choice, so run the function using only the four features that are available.

In [158]:
# run the feature elimination method for the features available in the question only
feature_elimination_q5 = feature_elimination(X_t=X_train, features_all=["age", "balance", "marital", "previous"])

# get smallest difference
determine_min_diff(feature_elimination_q5)

('age', 0.0001105950011059953)

The feature `age` has the smallest difference of the ones available in the multiple choice answer.

## Regularization

### Question 6

In [161]:
# make list containing the different values of the regularization parameter
C_list = [0.01, 0.1, 1, 10, 100]

In [169]:
# 
def evaluate_regularization(regC, X_t=X_train_1h, y_t=y_train, X_v=X_val_1h, y_v=y_val):
    
    # define regularized model
    model_reg = LogisticRegression(solver="liblinear", C=regC, max_iter=1000, random_state=42)
    
    # fit model
    model_reg.fit(X_t, y_t)
    
    # get predictions
    predictions = model_reg.predict(X_v)
    
    # calculate accuracy
    accuracy = round(accuracy_score(y_true=y_v, y_pred=predictions), 3)
    
    return regC, accuracy

In [171]:
for C in C_list:
    print(evaluate_regularization(regC=C))

(0.01, 0.898)
(0.1, 0.901)
(1, 0.901)
(10, 0.901)
(100, 0.901)


0.1, 1, 10, 100 all have an accuracy of 0.901 when rounded to three digits.
In this case, we're supposed to pick the smallest C.
So, this questions's answer is: 0.1