In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 200) 

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import mutual_info_score, accuracy_score

dataset https://archive.ics.uci.edu/static/public/222/bank+marketing.zip

In [2]:
df = pd.read_csv('bank-full.csv', sep=';')
df.head(2)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no


In [3]:
# column names
print(df.columns)

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y'], dtype='object')


In [4]:
df = df[[
    'age',
    'job',
    'marital',
    'education',
    'balance',
    'housing',
    'contact',
    'day',
    'month',
    'duration',
    'campaign',
    'pdays',
    'previous',
    'poutcome',
    'y',
]]

In [5]:
# distribution and missing values
print(df.describe().T)
df.isna().any()

            count         mean          std     min    25%    50%     75%       max
age       45211.0    40.936210    10.618762    18.0   33.0   39.0    48.0      95.0
balance   45211.0  1362.272058  3044.765829 -8019.0   72.0  448.0  1428.0  102127.0
day       45211.0    15.806419     8.322476     1.0    8.0   16.0    21.0      31.0
duration  45211.0   258.163080   257.527812     0.0  103.0  180.0   319.0    4918.0
campaign  45211.0     2.763841     3.098021     1.0    1.0    2.0     3.0      63.0
pdays     45211.0    40.197828   100.128746    -1.0   -1.0   -1.0    -1.0     871.0
previous  45211.0     0.580323     2.303441     0.0    0.0    0.0     0.0     275.0


age          False
job          False
marital      False
education    False
balance      False
housing      False
contact      False
day          False
month        False
duration     False
campaign     False
pdays        False
previous     False
poutcome     False
y            False
dtype: bool

### Question 1

What is the most frequent observation (mode) for the column `education`?

In [6]:
print(df.education.mode()[0])

secondary


### Question 2

Create the [correlation matrix](https://www.google.com/search?q=correlation+matrix) for the numerical features of your dataset. 
In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

- 'age' and 'balance'
- 'day' and 'campaign'
- 'day' and `pdays`
-'pdays` and'previous`


### Target encoding

* Now we want to encode the'y` variable.
* Let's replace the values'yes`/`no` with'1`/`0`.

### Split the data

* Split your data in train/val/test sets with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the `train_test_split` function) and set the seed to `42`.
* Make sure that the target value `y` is not in your dataframe.

In [7]:
print(df.dtypes)
numerical_columns = list(df.dtypes[df.dtypes == 'int64'].index)
numerical_columns

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object


['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [8]:
# correlation matrix
df_numerical = df[numerical_columns]
df_numerical.corr()
# or
df.corr(numeric_only=True)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


What are the two features that have the biggest correlation?  
  
age - balance (0.098)  
day - campaign (0.162)  
day - pdays (-0.093)    
pdays - previous (0.455) <--

### Target encoding    
yes/no -> 1/0

In [9]:
df.y = (df.y == 'yes').astype(int)
df.y.describe()

count    45211.000000
mean         0.116985
std          0.321406
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: y, dtype: float64

### Split the data  
Split your data in train/val/test sets with 60%/20%/20% distribution.  
Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.  
Make sure that the target value y is not in your dataframe.

In [47]:
### Split the data, seed to 42
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)

df_train, df_val = train_test_split(df_train_full, test_size=0.25) # , random_state=42
# n_total = df.shape[0]
# n_test = df_test.shape[0]
# n_train = n_total - 2*n_test
# df_train = df_train_full.loc[:n_train]
# df_val = df_train_full.loc[n_train:]


df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

del df_train['y']
del df_val['y']
del df_test['y']

### Question 3

* Calculate the mutual information score between `y` and other categorical variables in the dataset. Use the training set only.
* Round the scores to 2 decimals using `round(score, 2)`.

Which of these variables has the biggest mutual information score?
  
- `contact`
- `education`
- `housing`
- `poutcome`

In [48]:
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

In [49]:
for cat in categorical_columns:
    print(f"{cat}: " + str(mutual_info_score(df_train_full.y, df_train_full[cat]).round(2)))

job: 0.01
marital: 0.0
education: 0.0
housing: 0.01
contact: 0.01
month: 0.02
poutcome: 0.03


### Question 4

* Now let's train a logistic regression.
* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - `model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)`
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

- 0.6
- 0.7
- 0.8
- 0.9

In [50]:
# One-hot encoding
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

In [51]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [52]:
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [53]:
y_pred = model.predict_proba(X_val)[:, 1]

In [54]:
positive_decision = (y_pred >= 0.5)

In [55]:
accuracy = (y_val == positive_decision).mean()
print(accuracy)

0.9031187790311878


In [56]:
accuracy0 = accuracy_score(y_val, model.predict(X_val))
print(f"{accuracy0:5.2f}")

 0.90


### Question 5 

* Let's find the least useful feature using the *feature elimination* technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature. 

Which of following feature has the smallest difference?

- `age`
- `balance`
- `marital`
- `previous`

> **Note**: The difference doesn't have to be positive.

In [58]:
def accuracy_excluded_feature(df_train, df_val, feature_to_exclude, y_val):

    df_train_ex = df_train.drop(feature_to_exclude, axis=1)
    df_val_ex = df_val.drop(feature_to_exclude, axis=1)
    
    # one hot encoding
    train_dict = df_train_ex.to_dict(orient='records')
    val_dict = df_val_ex.to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dict)
    X_val = dv.transform(val_dict)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    accuracy = accuracy_score(y_val, y_pred)

    print(f"accuracy={accuracy:10.8f}, difference={abs(accuracy - accuracy0):10.8f} {feature_to_exclude}")


In [59]:
results = []

for column in ['age', 'balance', 'marital', 'previous']: # df_train.columns:
    results.append(accuracy_excluded_feature(df_train, df_val, column, y_val))


accuracy=0.90522008, difference=0.00210131 age
accuracy=0.90466711, difference=0.00154833 balance
accuracy=0.90400354, difference=0.00088476 marital
accuracy=0.90444592, difference=0.00132714 previous


### Question 6

* Now let's train a regularized logistic regression.
* Let's try the following values of the parameter `C`: `[0.01, 0.1, 1, 10, 100]`.
* Train models using all the features as in Q4.
* Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these `C` leads to the best accuracy on the validation set?

- 0.01
- 0.1
- 1
- 10
- 100

> **Note**: If there are multiple options, select the smallest `C`.

In [60]:
c_values = [0.01, 0.1, 1, 10, 100]

for c in c_values:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)

    accuracy = round(accuracy_score(y_val, y_pred), 5)

    print(c, '\t', accuracy)



0.01 	 0.90024
0.1 	 0.90279
1 	 0.90312
10 	 0.90511
100 	 0.90478
