In [2]:
import pandas as pd
import requests
import zipfile
import io
import os
import numpy as np

In [3]:
# Step 1: Download the zip file from the URL
url = 'https://archive.ics.uci.edu/static/public/222/bank+marketing.zip'
response = requests.get(url)

# Create a new folder to extract the files into
extraction_folder = 'bank_marketing_files'
if not os.path.exists(extraction_folder):
    os.makedirs(extraction_folder)

# Step 2: Extract the contents of the main zip file into the new folder
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    z.extractall(extraction_folder)

# Step 3: Check the contents of the folder to verify files
print("Contents after extracting bank+marketing.zip:")
print(os.listdir(extraction_folder))

Contents after extracting bank+marketing.zip:
['bank-names.txt', 'bank.csv', 'bank.zip', 'bank', 'bank-full.csv', 'bank-additional.zip']


In [5]:
# Step 4: Extract 'bank.zip' from the new folder
bank_zip_path = os.path.join(extraction_folder, 'bank.zip')
bank_extraction_folder = os.path.join(extraction_folder, 'bank')

with zipfile.ZipFile(bank_zip_path, 'r') as bank_zip:
    bank_zip.extractall(bank_extraction_folder)

# Step 5: Check the contents of the extracted 'bank/' folder
if not os.path.exists(bank_extraction_folder):
    os.makedirs(bank_extraction_folder)

if os.path.exists(bank_zip_path):
    with zipfile.ZipFile(bank_zip_path, 'r') as bank_zip:
        bank_zip.extractall(bank_extraction_folder)
    print("bank.zip extracted successfully into 'bank' folder.")
else:
    print("bank.zip not found!")

bank.zip extracted successfully into 'bank' folder.


In [7]:
bank_full_path = os.path.join(bank_extraction_folder, 'bank-full.csv')

In [8]:
# Step 5: Load the data
df = pd.read_csv(bank_full_path, sep=';')

In [9]:
# Step 6: Display the first few rows to verify
print("First few rows of the dataset:")
print(df.head())

First few rows of the dataset:
   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  


In [10]:
# Step 7: Select only the required columns
columns = [
    'age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
    'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y'
]

In [11]:
df = df[columns]

In [12]:
# Step 8: Check if there are any missing values
missing_values = df.isnull().sum()
print("\nMissing values in each column:")
print(missing_values)


Missing values in each column:
age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [13]:
# Step 9: Find the most frequent observation (mode) for the 'education' column
education_mode = df['education'].mode()[0]
print(f"\nThe most frequent observation (mode) for education is: {education_mode}")


The most frequent observation (mode) for education is: secondary


In [14]:
from sklearn.model_selection import train_test_split

# Step 1: Split data into train (60%) and temp (40%) using seed 42
train_data, temp_data = train_test_split(df, test_size=0.4, random_state=42)

# Step 2: Further split temp into validation (20%) and test (20%)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Verify the sizes of the splits
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")

# Ensure the target value 'y' is not included in the feature set
X_train = train_data.drop(columns=['y'])
y_train = train_data['y']

X_val = val_data.drop(columns=['y'])
y_val = val_data['y']

X_test = test_data.drop(columns=['y'])
y_test = test_data['y']

# Check to confirm y has been removed from the features
print("First few rows of X_train:")
print(X_train.head())
# Select only the numerical features
numerical_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Compute the correlation matrix for the numerical features
correlation_matrix = df[numerical_features].corr()

# Display the correlation matrix
print("Correlation matrix:")
print(correlation_matrix)

# Find the two features with the highest correlation (ignoring self-correlation)
correlation_matrix_abs = correlation_matrix.abs()  # Get the absolute values
np.fill_diagonal(correlation_matrix_abs.values, 0)  # Fill diagonal with 0s to ignore self-correlation
max_correlation = correlation_matrix_abs.unstack().idxmax()

print(f"\nThe two features with the highest correlation are: {max_correlation}")

Training set size: 27126
Validation set size: 9042
Test set size: 9043
First few rows of X_train:
       age            job  marital  education  balance housing   contact  day  \
6377    45   entrepreneur  married    primary     -100     yes   unknown   27   
17236   29       services   single  secondary      166      no  cellular   28   
4490    31         admin.   single  secondary      121     yes   unknown   20   
24231   40  self-employed   single   tertiary     1693     yes  cellular   17   
3978    28       services   single  secondary      317     yes   unknown   16   

      month  duration  campaign  pdays  previous poutcome  
6377    may       240         6     -1         0  unknown  
17236   jul       108         8     -1         0  unknown  
4490    may       187         1     -1         0  unknown  
24231   nov       353         1     -1         0  unknown  
3978    may        21         3     -1         0  unknown  
Correlation matrix:
               age   balance       

In [16]:
# Step 9: One-Hot Encode categorical variables
categorical_features = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

# One-hot encoding the categorical features
df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# Step 10: Ensure the target value 'y' is not included in the feature set
X = df_encoded.drop(columns=['y'])
y = df_encoded['y']

# Step 11: Split data into train (60%), validation (20%), and test (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Verify the sizes of the splits
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

# Check the first few rows of the encoded data
print("First few rows of X_train after encoding:")
print(X_train.head())


Training set size: 27126
Validation set size: 9042
Test set size: 9043
First few rows of X_train after encoding:
       age  balance  day  duration  campaign  pdays  previous  \
6377    45     -100   27       240         6     -1         0   
17236   29      166   28       108         8     -1         0   
4490    31      121   20       187         1     -1         0   
24231   40     1693   17       353         1     -1         0   
3978    28      317   16        21         3     -1         0   

       job_blue-collar  job_entrepreneur  job_housemaid  ...  month_jul  \
6377             False              True          False  ...      False   
17236            False             False          False  ...       True   
4490             False             False          False  ...      False   
24231            False             False          False  ...      False   
3978             False             False          False  ...      False   

       month_jun  month_mar  month_may  month

Question 3

In [17]:
from sklearn.feature_selection import mutual_info_classif

# Step 1: Select only the categorical variables from the training set
categorical_vars = ['contact', 'education', 'housing', 'poutcome']

# We already one-hot encoded these in the previous steps, so we can select their respective one-hot encoded columns
encoded_categorical_vars = [col for col in X_train.columns if any(var in col for var in categorical_vars)]

# Step 2: Calculate the mutual information between y and each of the categorical variables
mi_scores = mutual_info_classif(X_train[encoded_categorical_vars], y_train, discrete_features=True)

# Step 3: Round the scores to 2 decimal places
mi_scores_rounded = [round(score, 2) for score in mi_scores]

# Step 4: Create a dictionary to map variables to their mutual information scores
mi_dict = dict(zip(encoded_categorical_vars, mi_scores_rounded))

# Step 5: Find the categorical variable with the highest mutual information score
highest_mi_variable = max(mi_dict, key=mi_dict.get)
highest_mi_score = mi_dict[highest_mi_variable]

# Print the mutual information scores for each categorical variable
print("Mutual Information scores for each categorical variable:")
for var, score in mi_dict.items():
    print(f"{var}: {score}")

print(f"\nThe categorical variable with the highest mutual information score is: {highest_mi_variable} with a score of {highest_mi_score}")

Mutual Information scores for each categorical variable:
education_secondary: 0.0
education_tertiary: 0.0
education_unknown: 0.0
housing_yes: 0.01
contact_telephone: 0.0
contact_unknown: 0.01
poutcome_other: 0.0
poutcome_success: 0.03
poutcome_unknown: 0.01

The categorical variable with the highest mutual information score is: poutcome_success with a score of 0.03


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Step 1: Initialize the logistic regression model with the specified parameters
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

# Step 2: Train the model on the training data (X_train and y_train)
model.fit(X_train, y_train)

# Step 3: Make predictions on the validation set
y_val_pred = model.predict(X_val)

# Step 4: Calculate the accuracy on the validation set
accuracy = accuracy_score(y_val, y_val_pred)

# Step 5: Round the accuracy to 2 decimal places
accuracy_rounded = round(accuracy, 2)

# Step 6: Print the accuracy on the validation dataset
print(f"Accuracy on the validation dataset: {accuracy_rounded}")


Accuracy on the validation dataset: 0.9


In [20]:
# Dictionary to store the accuracy differences
accuracy_differences = {}

# Iterate over each feature, remove the associated one-hot encoded columns, and calculate the accuracy
for feature in features_to_test:
    # Find all one-hot encoded columns related to the feature
    feature_cols = [col for col in X_train.columns if col.startswith(feature)]
    
    # Drop the feature's one-hot encoded columns from the training and validation sets
    X_train_reduced = X_train.drop(columns=feature_cols)
    X_val_reduced = X_val.drop(columns=feature_cols)
    
    # Train a new logistic regression model without this feature
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_reduced, y_train)
    
    # Make predictions on the validation set
    y_val_pred_reduced = model.predict(X_val_reduced)
    
    # Calculate the accuracy without this feature
    accuracy_reduced = accuracy_score(y_val, y_val_pred_reduced)
    
    # Calculate the difference from the original accuracy
    accuracy_diff = original_accuracy - accuracy_reduced
    accuracy_differences[feature] = round(accuracy_diff, 4)  # Round to 4 decimal places
    
    # Print the result for each feature
    print(f"Accuracy without {feature}: {round(accuracy_reduced, 2)}, Difference: {round(accuracy_diff, 4)}")

# Find the feature with the smallest difference
least_important_feature = min(accuracy_differences, key=accuracy_differences.get)
smallest_difference = accuracy_differences[least_important_feature]

print(f"\nThe feature with the smallest difference is: {least_important_feature} with a difference of {smallest_difference}")



Accuracy without age: 0.9, Difference: -0.0009
Accuracy without balance: 0.9, Difference: -0.0005
Accuracy without marital: 0.9, Difference: 0.0001
Accuracy without previous: 0.9, Difference: -0.0009

The feature with the smallest difference is: age with a difference of -0.0009


In [21]:
# List of C values to try
C_values = [0.01, 0.1, 1, 10, 100]

# Dictionary to store the accuracy for each value of C
accuracy_results = {}

# Train logistic regression models for each value of C and evaluate accuracy
for C in C_values:
    # Initialize the logistic regression model with the current value of C
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    
    # Train the model using the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the validation set
    y_val_pred = model.predict(X_val)
    
    # Calculate the accuracy on the validation set
    accuracy = accuracy_score(y_val, y_val_pred)
    
    # Round the accuracy to 3 decimal places
    accuracy_rounded = round(accuracy, 3)
    
    # Store the accuracy in the dictionary
    accuracy_results[C] = accuracy_rounded
    
    # Print the accuracy for the current value of C
    print(f"Accuracy with C={C}: {accuracy_rounded}")

# Find the C value with the best accuracy
best_C = max(accuracy_results, key=accuracy_results.get)
best_accuracy = accuracy_results[best_C]

print(f"\nThe best C value is: {best_C} with an accuracy of {best_accuracy}")


Accuracy with C=0.01: 0.898
Accuracy with C=0.1: 0.9
Accuracy with C=1: 0.901
Accuracy with C=10: 0.9
Accuracy with C=100: 0.9

The best C value is: 1 with an accuracy of 0.901
