### Setup

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import mutual_info_score, accuracy_score

### Run in terminal
```bash
wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
unzip bank+marketing.zip -d .
unzip bank.zip -d .
```

In [3]:
df = pd.read_csv('bank-full.csv', sep=';')

### Data Preparation

In [4]:
df_columns = [
    'age',
    'job',
    'marital',
    'education',
    'balance',
    'housing',
    'contact',
    'day',
    'month',
    'duration',
    'campaign',
    'pdays',
    'previous',
    'poutcome',
    'y',
]

In [5]:
df = df[df_columns]

Check whether any of the features contain missing values.

In [26]:
df.isna().any()

age          False
job          False
marital      False
education    False
balance      False
housing      False
contact      False
day          False
month        False
duration     False
campaign     False
pdays        False
previous     False
poutcome     False
y            False
dtype: bool

Question 1  
What is the most frequent observation (mode) for the column education?

In [28]:
print(df.education.mode()[0])

secondary


Question 2  
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

In [27]:
# Names of all columns stored as 64-bit integers (the numerical features)
numerical_columns = df.select_dtypes(include='int64').columns.tolist()

In [34]:
df_numerical = df[numerical_columns]

In [38]:
# correlation matrix
df_numerical.corr()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


What are the two features that have the biggest correlation?  
  
age and balance (0.097)  
day and campaign (0.162)  
day and pdays (-0.093)    
pdays and previous **(0.454)**

### Target encoding    
Now we want to encode the y variable.   
Let's replace the values yes/no with 1/0.  

In [6]:
# Encode the target: 'yes' -> 1, anything else -> 0.
# The dtype guard keeps this cell idempotent: once y is already numeric,
# comparing it to 'yes' again would silently turn every label into 0.
# assign() builds a new frame instead of writing into the column-sliced df.
if not pd.api.types.is_integer_dtype(df['y']):
    df = df.assign(y=(df['y'] == 'yes').astype(int))

### Split the data  
Split your data in train/val/test sets with 60%/20%/20% distribution.  
Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.  
Make sure that the target value y is not in your dataframe.

In [7]:
# 60/20/20 split: hold out 20% for test first, then a quarter of the remaining
# 80% (i.e. 20% of the whole dataset) for validation.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Give every split a fresh 0..n-1 index
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# pop() removes the target column from each feature frame and hands it back,
# so y never stays inside the feature matrices.
y_train = df_train.pop('y').values
y_val = df_val.pop('y').values
y_test = df_test.pop('y').values

### Question 3  
  
Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.  
Round the scores to 2 decimals using round(score, 2). 
   
Which of these variables has the biggest mutual information score?  
  
contact  
education  
housing  
**poutcome**

In [52]:
# Names of the string-typed (object dtype) columns, i.e. the categorical features.
# When the notebook is run top to bottom, `y` is not picked up here because the
# target-encoding cell has already converted it to integers.
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

In [54]:
# Mutual information between the target and each categorical feature.
# The question asks for the training set only, so score df_train against
# y_train rather than df_full_train (which still contains the validation rows).
for cat in categorical_columns:
    score = mutual_info_score(y_train, df_train[cat])
    print(f"{cat}: {round(score, 2)}")

job: 0.01
marital: 0.0
education: 0.0
housing: 0.01
contact: 0.01
month: 0.02
poutcome: 0.03


### Question 4

In [8]:
# One-hot encoding
dv = DictVectorizer(sparse=False)

In [9]:
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

### Training logistic regression

In [20]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [21]:
model.fit(X_train, y_train)

In [12]:
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [36]:
y_pred = model.predict_proba(X_val)[:, 1]

In [37]:
positive_decision = (y_pred >= 0.5)

In [70]:
# y_val

array([0, 0, 1, ..., 0, 0, 1])

In [71]:
# positive_decision.astype(int)

array([0, 0, 0, ..., 0, 0, 0])

In [47]:
print((y_val == positive_decision).mean().round(10))

0.900906879


What accuracy did you get?  
  
0.6  
0.7  
0.8  
**0.9**

In [59]:
original_accuracy = (y_val == positive_decision).mean()
print(original_accuracy)

0.9009068790090687


In [1]:
# Hard predictions

In [22]:
y_pred = model.predict(X_val)

In [23]:
original_accuracy = accuracy_score(y_val, y_pred)

### Question 5  
  
Let's find the least useful feature using the feature elimination technique.  
Train a model with all these features (using the same parameters as in Q4).  
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.  
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

In [18]:
def model_accuracy(df_train, df_val, feature_to_exclude):
    """Train a Q4-style logistic regression without one feature and score it.

    Parameters
    ----------
    df_train, df_val : pandas.DataFrame
        Training and validation feature frames (target column already removed).
    feature_to_exclude : str
        Column dropped from both frames before one-hot encoding.

    Returns
    -------
    list of dict
        ``[{"feature excluded": ...}, {"accuracy": ...}, {"difference": ...}]``
        where ``difference`` is the absolute gap between this accuracy and the
        notebook-level ``original_accuracy``.

    Notes
    -----
    Reads ``y_train``, ``y_val`` and ``original_accuracy`` from the notebook
    namespace. A fresh vectorizer and a fresh model are created on every call,
    so the notebook-level ``dv`` and ``model`` (fitted on all features) are
    never refitted as a side effect, and the result does not depend on which
    estimator ``model`` happens to be bound to when the cell runs.
    """
    # remove feature
    train_minus = df_train.drop(feature_to_exclude, axis=1)
    val_minus = df_val.drop(feature_to_exclude, axis=1)

    # one-hot encoding with a vectorizer local to this call
    local_dv = DictVectorizer(sparse=False)
    X_train = local_dv.fit_transform(train_minus.to_dict(orient='records'))
    X_val = local_dv.transform(val_minus.to_dict(orient='records'))

    # fit a new model with the same parameters as in Q4
    local_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    local_model.fit(X_train, y_train)

    # accuracy of the hard (0/1) predictions on the validation split
    accuracy = accuracy_score(y_val, local_model.predict(X_val))

    return [
        {"feature excluded": feature_to_exclude},
        {"accuracy": accuracy},
        {"difference": abs(accuracy - original_accuracy)},
    ]

In [None]:
# One elimination run per feature column, collected in column order
results = [
    model_accuracy(df_train=df_train, df_val=df_val, feature_to_exclude=column)
    for column in df_train.columns
]

results

In [27]:
sorted_data = sorted(results, key=lambda x: x[2]['difference'])

In [None]:
sorted_data

In [29]:
for entry in sorted_data:
    print(entry)

[{'feature excluded': 'marital'}, {'accuracy': 0.9009068790090687}, {'difference': 0.0}]
[{'feature excluded': 'education'}, {'accuracy': 0.9009068790090687}, {'difference': 0.0}]
[{'feature excluded': 'pdays'}, {'accuracy': 0.9009068790090687}, {'difference': 0.0}]
[{'feature excluded': 'previous'}, {'accuracy': 0.9009068790090687}, {'difference': 0.0}]
[{'feature excluded': 'balance'}, {'accuracy': 0.9010174740101747}, {'difference': 0.0001105950011059953}]
[{'feature excluded': 'job'}, {'accuracy': 0.9011280690112807}, {'difference': 0.0002211900022119906}]
[{'feature excluded': 'housing'}, {'accuracy': 0.9011280690112807}, {'difference': 0.0002211900022119906}]
[{'feature excluded': 'contact'}, {'accuracy': 0.900464499004645}, {'difference': 0.00044238000442375913}]
[{'feature excluded': 'age'}, {'accuracy': 0.9013492590134926}, {'difference': 0.00044238000442387015}]
[{'feature excluded': 'day'}, {'accuracy': 0.9013492590134926}, {'difference': 0.00044238000442387015}]
[{'feature 

### Question 6  
  
Now let's train a regularized logistic regression.  
Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].  
Train models using all the features as in Q4.  
Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

In [15]:
c_values = [0.01, 0.1, 1, 10, 100]

# Validation accuracy (rounded to 3 decimals) keyed by C, so every printed
# score is explicitly tied to the regularization value that produced it.
accuracy_by_c = {}

for c in c_values:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)

    accuracy = round(accuracy_score(y_val, y_pred), 3)
    accuracy_by_c[c] = accuracy

    print(f"C={c}: {accuracy}")

# Ties are broken in favour of the smallest C, as the question requires.
best_accuracy = max(accuracy_by_c.values())
best_c = min(c for c, acc in accuracy_by_c.items() if acc == best_accuracy)
print(f"best C: {best_c} (accuracy {best_accuracy})")



0.898
0.901
0.901
0.901
0.901


Which of these C leads to the best accuracy on the validation set?  
  
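Note: If there are multiple options, select the smallest C.  

In the output above, C = 0.01 gives 0.898, while C = 0.1, 1, 10 and 100 all give 0.901, so the smallest C with the best validation accuracy is **0.1**.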