# Homework 03 - Classification

## Preparations

### Dependencies

In [46]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from pathlib import Path


### Download and Unzip Data

In [None]:
!curl -o ../data/bank_marketing.zip https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
!unzip -d ../data/bank_marketing ../data/bank_marketing.zip
!unzip -d ../data/bank_marketing/bank ../data/bank_marketing/bank.zip
!rm -r ../data/bank_marketing.zip

### Set up Paths

In [3]:
PATH_DATA = Path("../data")
PATH_DATA_BANKING = PATH_DATA / "bank_marketing/bank/bank-full.csv"

## Data Preparation and Exploratory Data Analysis

In [5]:
# read data
data_banking = pd.read_csv(PATH_DATA_BANKING, delimiter=";")

In [9]:
# make a list containing the names of the required columns
req_cols = ["age", "job", "marital", "education",
            "balance", "housing", "contact", "day",
            "month", "duration", "campaign", "pdays",
            "previous", "poutcome", "y"]

# select required columns from data
data_banking = data_banking[req_cols]
data_banking

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,825,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,1729,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,5715,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,668,no,telephone,17,nov,508,4,-1,0,unknown,no


In [11]:
# check if there are missing values in the data
data_banking.isna().any()

age          False
job          False
marital      False
education    False
balance      False
housing      False
contact      False
day          False
month        False
duration     False
campaign     False
pdays        False
previous     False
poutcome     False
y            False
dtype: bool

No, there are no missing values in the data.

### Question 1

In [13]:
# determine most frequent observation in the column "education"
data_banking["education"].mode()

0    secondary
Name: education, dtype: object

The most frequent observation in the column `education` is `secondary`.

### Question 2

In [18]:
# get numerical columns
# because correlation is not defined for categorical ones
data_banking_numerical = data_banking.select_dtypes(include=["number"])
data_banking_numerical

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,58,2143,5,261,1,-1,0
1,44,29,5,151,1,-1,0
2,33,2,5,76,1,-1,0
3,47,1506,5,92,1,-1,0
4,33,1,5,198,1,-1,0
...,...,...,...,...,...,...,...
45206,51,825,17,977,3,-1,0
45207,71,1729,17,456,2,-1,0
45208,72,5715,17,1127,5,184,3
45209,57,668,17,508,4,-1,0


In [24]:
# calculate correlation matrix
correlation_matrix = data_banking_numerical.corr()
correlation_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [35]:
# get list of all correlations we are asked to compare in the question
correlations = [
    correlation_matrix["age"]["balance"],
    correlation_matrix["day"]["campaign"],
    correlation_matrix["day"]["pdays"],
    correlation_matrix["pdays"]["previous"]
]

# get a list of combinations corresponding to the correlations
combinations = [
    "age_balance",
    "day_campaign",
    "day_pdays",
    "pdays_previous"
]

list(zip(combinations, correlations))

[('age_balance', np.float64(0.09778273937134807)),
 ('day_campaign', np.float64(0.16249021632619218)),
 ('day_pdays', np.float64(-0.0930440737729405)),
 ('pdays_previous', np.float64(0.4548196354805043))]

The two features having the biggest correlation are `pdays`and `previous`.

In [39]:
# prepare label data
data_banking["y"] = data_banking["y"].replace({"yes": 1, "no": 0})

  data_banking["y"] = data_banking["y"].replace({"yes": 1, "no": 0})


In [41]:
# separate featuers and labels and save each to one object
X = data_banking.drop(columns=["y"])
y = data_banking["y"]

In [44]:
# have a look at new object X
X

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,825,no,cellular,17,nov,977,3,-1,0,unknown
45207,71,retired,divorced,primary,1729,no,cellular,17,nov,456,2,-1,0,unknown
45208,72,retired,married,secondary,5715,no,cellular,17,nov,1127,5,184,3,success
45209,57,blue-collar,married,secondary,668,no,telephone,17,nov,508,4,-1,0,unknown


In [45]:
# have a look at new object
y

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int64

In [47]:
# make first split: between test and train/val
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [48]:
# make second split: between train and val
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)

### Question 3

In [None]:
# get categorical variables
