In [1]:
import requests
import zipfile

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction import DictVectorizer

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import warnings
# Turn off warnings
warnings.filterwarnings("ignore")

In [2]:

# URL of the dataset
url = 'https://archive.ics.uci.edu/static/public/222/bank+marketing.zip'

# Download the file
response = requests.get(url)

# Write the file to a local path
with open('bank_marketing.zip', 'wb') as file:
    file.write(response.content)

print("Dataset downloaded successfully.")


Dataset downloaded successfully.


In [3]:

# Unpack the dataset
with zipfile.ZipFile('bank_marketing.zip', 'r') as zip_ref:
    zip_ref.extractall()

print("Dataset unpacked successfully.")


Dataset unpacked successfully.


In [2]:
# Load the dataset
data = pd.read_csv('bank/bank-full.csv', sep=';')

# Select the specified columns
columns = ['age', 'job', 'marital', 'education', 'balance', 'housing', 
           'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 
           'previous', 'poutcome', 'y']
data = data[columns]

# Display the first few rows
print(data.head())


   age           job  marital  education  balance housing  contact  day month  \
0   58    management  married   tertiary     2143     yes  unknown    5   may   
1   44    technician   single  secondary       29     yes  unknown    5   may   
2   33  entrepreneur  married  secondary        2     yes  unknown    5   may   
3   47   blue-collar  married    unknown     1506     yes  unknown    5   may   
4   33       unknown   single    unknown        1      no  unknown    5   may   

   duration  campaign  pdays  previous poutcome   y  
0       261         1     -1         0  unknown  no  
1       151         1     -1         0  unknown  no  
2        76         1     -1         0  unknown  no  
3        92         1     -1         0  unknown  no  
4       198         1     -1         0  unknown  no  


In [3]:
# Check for missing values in the selected features
missing_values = data.isnull().sum()

# Display the missing values
print("Missing values in each feature:")
print(missing_values)

# If no missing values, display a message
if missing_values.sum() == 0:
    print("No missing values found.")

Missing values in each feature:
age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64
No missing values found.


## Question 1
What is the most frequent observation (mode) for the column education?

In [4]:
# Find the mode (most frequent observation) for the 'education' column
education_mode = data['education'].mode()[0]

# Display the mode
print(f"The most frequent observation (mode) for the column 'education' is: {education_mode}")


The most frequent observation (mode) for the column 'education' is: secondary


## Question 2
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

In [5]:
# Select numerical features
numerical_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Create a DataFrame with only numerical columns
numerical_data = data[numerical_features]

# Display the first few rows to confirm
print(numerical_data.head())


   age  balance  day  duration  campaign  pdays  previous
0   58     2143    5       261         1     -1         0
1   44       29    5       151         1     -1         0
2   33        2    5        76         1     -1         0
3   47     1506    5        92         1     -1         0
4   33        1    5       198         1     -1         0


In [6]:
# Compute the correlation matrix
correlation_matrix = numerical_data.corr()

# Display the correlation matrix
print("Correlation Matrix:")
print(correlation_matrix)


Correlation Matrix:
               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000


In [7]:
# Unstack the correlation matrix to find the pair of features with the highest correlation
correlation_pairs = correlation_matrix.unstack()

# Sort the correlation pairs by their absolute value in descending order
sorted_pairs = correlation_pairs.abs().sort_values(ascending=False)

# Exclude the diagonal (i.e., correlation of a feature with itself)
sorted_pairs = sorted_pairs[sorted_pairs < 1]

# Get the pair with the highest correlation
most_correlated_pair = sorted_pairs.idxmax()

# Display the most correlated pair and their correlation value
print(f"The two features with the highest correlation are: {most_correlated_pair[0]} and {most_correlated_pair[1]}")


The two features with the highest correlation are: previous and pdays


## Target encoding
Now we want to encode the y variable.
Let's replace the values yes/no with 1/0.

In [8]:
# Replace 'yes' with 1 and 'no' with 0 in the 'y' column
data['y'] = data['y'].replace({'yes': 1, 'no': 0})

# Confirm the transformation
print(data['y'].head())


0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int64


## Split the data
Split your data in train/val/test sets with 60%/20%/20% distribution.
Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
Make sure that the target value y is not in your dataframe.

In [10]:
from sklearn.model_selection import train_test_split

# Features (X) and target (y)
X = data.drop('y', axis=1)  # Remove the target column from the features
y = data['y']  # Target column

# Split the data: 60% train, 20% validation, 20% test
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)


In [11]:
# List of categorical features
categorical_features = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

# Slice the training and validation sets to keep only the categorical features
X_train_categorical = X_train[categorical_features]
X_val_categorical = X_val[categorical_features]

# Display the first few rows to confirm
print(X_train_categorical.head())


                 job  marital  education housing   contact month poutcome
20326     technician   single   tertiary     yes  cellular   aug  unknown
24301   entrepreneur  married  secondary     yes  cellular   nov  unknown
38618    blue-collar  married  secondary     yes  cellular   may  unknown
18909      housemaid  married    primary      no  cellular   aug  unknown
23081  self-employed  married   tertiary      no  cellular   aug  unknown


In [12]:

# Convert the training and validation sets into lists of dictionaries
train_dicts = X_train_categorical.to_dict(orient='records')
val_dicts = X_val_categorical.to_dict(orient='records')

# Initialize the DictVectorizer
dv = DictVectorizer(sparse=False)

# Fit the DictVectorizer on the training data and transform both training and validation data
X_train_encoded = dv.fit_transform(train_dicts)
X_val_encoded = dv.transform(val_dicts)

# Display the feature names created by the DictVectorizer (optional)
print(dv.feature_names_)


['contact=cellular', 'contact=telephone', 'contact=unknown', 'education=primary', 'education=secondary', 'education=tertiary', 'education=unknown', 'housing=no', 'housing=yes', 'job=admin.', 'job=blue-collar', 'job=entrepreneur', 'job=housemaid', 'job=management', 'job=retired', 'job=self-employed', 'job=services', 'job=student', 'job=technician', 'job=unemployed', 'job=unknown', 'marital=divorced', 'marital=married', 'marital=single', 'month=apr', 'month=aug', 'month=dec', 'month=feb', 'month=jan', 'month=jul', 'month=jun', 'month=mar', 'month=may', 'month=nov', 'month=oct', 'month=sep', 'poutcome=failure', 'poutcome=other', 'poutcome=success', 'poutcome=unknown']


## Question 3
Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
Round the scores to 2 decimals using round(score, 2).


In [14]:
# Calculate mutual information for the categorical features in the training data
mi_scores = mutual_info_classif(X_train_encoded, y_train)

# Round the scores to 2 decimal places
mi_scores_rounded = [round(score, 2) for score in mi_scores]

# Map the scores back to their respective features
mi_scores_dict = dict(zip(dv.feature_names_, mi_scores_rounded))

# Display the mutual information scores for all features
print("Mutual Information Scores between y and categorical features:")
for feature, score in mi_scores_dict.items():
    print(f"{feature}: {score}")


Mutual Information Scores between y and categorical features:
contact=cellular: 0.01
contact=telephone: 0.0
contact=unknown: 0.01
education=primary: 0.0
education=secondary: 0.0
education=tertiary: 0.0
education=unknown: 0.0
housing=no: 0.01
housing=yes: 0.02
job=admin.: 0.0
job=blue-collar: 0.0
job=entrepreneur: 0.0
job=housemaid: 0.0
job=management: 0.0
job=retired: 0.0
job=self-employed: 0.0
job=services: 0.0
job=student: 0.0
job=technician: 0.0
job=unemployed: 0.0
job=unknown: 0.0
marital=divorced: 0.0
marital=married: 0.01
marital=single: 0.01
month=apr: 0.0
month=aug: 0.0
month=dec: 0.0
month=feb: 0.0
month=jan: 0.0
month=jul: 0.0
month=jun: 0.0
month=mar: 0.01
month=may: 0.01
month=nov: 0.0
month=oct: 0.0
month=sep: 0.01
poutcome=failure: 0.0
poutcome=other: 0.0
poutcome=success: 0.03
poutcome=unknown: 0.02


In [15]:
# Find the feature with the highest mutual information score
max_mi_feature = max(mi_scores_dict, key=mi_scores_dict.get)
max_mi_score = mi_scores_dict[max_mi_feature]

# Display the result
print(f"The feature with the highest mutual information score is: {max_mi_feature} with a score of {max_mi_score}")


The feature with the highest mutual information score is: poutcome=success with a score of 0.03


## Question 4
Now let's train a logistic regression.
Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.

In [16]:
from sklearn.feature_extraction import DictVectorizer

# Convert the training and validation sets into lists of dictionaries
train_dicts = X_train_categorical.to_dict(orient='records')
val_dicts = X_val_categorical.to_dict(orient='records')

# Initialize DictVectorizer
dv = DictVectorizer(sparse=False)

# Fit the DictVectorizer on the training data and transform both training and validation data
X_train_encoded = dv.fit_transform(train_dicts)
X_val_encoded = dv.transform(val_dicts)

# Check the encoded feature names (optional)
print(dv.feature_names_)


['contact=cellular', 'contact=telephone', 'contact=unknown', 'education=primary', 'education=secondary', 'education=tertiary', 'education=unknown', 'housing=no', 'housing=yes', 'job=admin.', 'job=blue-collar', 'job=entrepreneur', 'job=housemaid', 'job=management', 'job=retired', 'job=self-employed', 'job=services', 'job=student', 'job=technician', 'job=unemployed', 'job=unknown', 'marital=divorced', 'marital=married', 'marital=single', 'month=apr', 'month=aug', 'month=dec', 'month=feb', 'month=jan', 'month=jul', 'month=jun', 'month=mar', 'month=may', 'month=nov', 'month=oct', 'month=sep', 'poutcome=failure', 'poutcome=other', 'poutcome=success', 'poutcome=unknown']


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Initialize the Logistic Regression model with specified parameters
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

# Train (fit) the model on the training set
model.fit(X_train_encoded, y_train)

# Predict the target values on the validation set
y_pred = model.predict(X_val_encoded)


In [18]:
# Calculate accuracy on the validation set
accuracy = accuracy_score(y_val, y_pred)

# Round the accuracy to 2 decimal places
accuracy_rounded = round(accuracy, 2)

# Display the accuracy
print(f"Accuracy on the validation set: {accuracy_rounded}")


Accuracy on the validation set: 0.89


## Question 5
Let's find the least useful feature using the feature elimination technique.
Train a model with all these features (using the same parameters as in Q4).

In [24]:
# Train the baseline model with all features
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)

# Calculate the baseline accuracy on the validation set
y_pred = model.predict(X_val_encoded)
baseline_accuracy = accuracy_score(y_val, y_pred)

# Display the baseline accuracy
print(f"Baseline Accuracy: {round(baseline_accuracy, 2)}")


Baseline Accuracy: 0.89


In [25]:
# List of features to exclude
features_to_test = ['age', 'balance', 'marital', 'previous']

# Initialize a dictionary to store accuracy differences
accuracy_differences = {}

# Loop through each feature in the specified list
for feature in features_to_test:
    # Drop the feature from the original training and validation sets
    X_train_dropped = X_train.drop(columns=[feature])
    X_val_dropped = X_val.drop(columns=[feature])
    
    # Convert to dictionaries and re-encode using DictVectorizer
    train_dicts_dropped = X_train_dropped.to_dict(orient='records')
    val_dicts_dropped = X_val_dropped.to_dict(orient='records')
    
    # Re-encode the remaining features
    X_train_encoded_dropped = dv.fit_transform(train_dicts_dropped)
    X_val_encoded_dropped = dv.transform(val_dicts_dropped)
    
    # Train a new model without the dropped feature
    model.fit(X_train_encoded_dropped, y_train)
    
    # Predict on the validation set without the feature
    y_pred_dropped = model.predict(X_val_encoded_dropped)
    
    # Calculate the accuracy without the feature
    dropped_accuracy = accuracy_score(y_val, y_pred_dropped)
    
    # Calculate the difference from the baseline accuracy
    accuracy_diff = baseline_accuracy - dropped_accuracy
    
    # Store the result in the dictionary
    accuracy_differences[feature] = accuracy_diff

# Display the accuracy differences
for feature, diff in accuracy_differences.items():
    print(f"Feature: {feature}, Accuracy Difference: {round(diff, 4)}")


Feature: age, Accuracy Difference: -0.0114
Feature: balance, Accuracy Difference: -0.0112
Feature: marital, Accuracy Difference: -0.0118
Feature: previous, Accuracy Difference: -0.0118


In [26]:
# Find the feature with the smallest difference
least_useful_feature = min(accuracy_differences, key=accuracy_differences.get)
smallest_difference = accuracy_differences[least_useful_feature]

# Display the least useful feature and the smallest difference
print(f"The least useful feature (from the given options) is: {least_useful_feature} with a difference of {round(smallest_difference, 4)}")


The least useful feature (from the given options) is: marital with a difference of -0.0118


## Question 6
Now let's train a regularized logistic regression.
Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
Train models using all the features as in Q4.
Calculate the accuracy on the validation dataset and round it to 3 decimal digits.
Which of these C leads to the best accuracy on the validation set?

In [29]:

# List of C values to try
C_values = [0.01, 0.1, 1, 10, 100]

# Dictionary to store accuracy for each C value
accuracy_results = {}

# Loop through each value of C, train the model, and evaluate accuracy
for C in C_values:
    # Initialize the logistic regression model with the given C
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    
    # Train the model on the training data
    model.fit(X_train_encoded, y_train)
    
    # Predict the target values on the validation set
    y_pred = model.predict(X_val_encoded)
    
    # Calculate the accuracy on the validation set
    accuracy = accuracy_score(y_val, y_pred)
    
    # Round the accuracy to 3 decimal places
    accuracy_rounded = round(accuracy, 4)
    
    # Store the accuracy in the dictionary
    accuracy_results[C] = accuracy_rounded

# Display the accuracy for each value of C
print("Accuracy for each C value:")
for C, acc in accuracy_results.items():
    print(f"C = {C}: Accuracy = {acc}")


Accuracy for each C value:
C = 0.01: Accuracy = 0.8881
C = 0.1: Accuracy = 0.8898
C = 1: Accuracy = 0.8896
C = 10: Accuracy = 0.8894
C = 100: Accuracy = 0.8894


In [31]:
# Find the C value with the highest accuracy
best_C = max(accuracy_results, key=accuracy_results.get)
best_accuracy = accuracy_results[best_C]

# Display the best C value and its corresponding accuracy
print(f"\nThe best C value is: {best_C} with an accuracy of {best_accuracy}")



The best C value is: 0.1 with an accuracy of 0.8898
