### Set up the environment

### Q1. Pandas version

* What's the version of Pandas that you installed?

* You can get the version information using the __version__ field:

In [7]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [1]:
!wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [2]:
import requests
import zipfile
import os

# URL of the dataset
url = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip"

# Download the ZIP file
response = requests.get(url)

# Save the ZIP file
with open("bank_marketing.zip", "wb") as file:
    file.write(response.content)

# Unzip the file
with zipfile.ZipFile("bank_marketing.zip", 'r') as zip_ref:
    zip_ref.extractall("bank_marketing")

print("Download and extraction complete!")

Download and extraction complete!


In [4]:
import zipfile
import os

# Specify the path to the zip file and the extraction directory
zip_file_path = './data/bank+marketing.zip'
extract_to_dir = './data'

# Check if the zip file exists
if os.path.exists(zip_file_path):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to_dir)
    print("Extraction complete.")
else:
    print(f"The file {zip_file_path} does not exist.")

The file ./data/bank+marketing.zip does not exist.


In [2]:
pd.__version__

'2.2.2'

In [8]:
df_bank = pd.read_csv('bank.csv', sep=';')
df_bank_full = pd.read_csv('bank_full.csv',sep=';')
df_bank_additional_full = pd.read_csv('bank_additional_full.csv',sep=';')
df_bank_additional = pd.read_csv('bank_additional.csv',sep=';')

In [51]:
df_bank.head().columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [52]:
df_bank.shape

(4521, 17)

In [53]:
df_bank_full.head().columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [11]:
df_bank_full.shape

(45211, 17)

### Data preparation
* Select only the features from above.
* Check if the missing values are presented in the features.

In [14]:
import pandas as pd

# Load the dataset
df = pd.read_csv('bank_full.csv', sep=';')

In [15]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [16]:
df.head().T

Unnamed: 0,0,1,2,3,4
age,58,44,33,47,33
job,management,technician,entrepreneur,blue-collar,unknown
marital,married,single,married,married,single
education,tertiary,secondary,secondary,unknown,unknown
default,no,no,no,no,no
balance,2143,29,2,1506,1
housing,yes,yes,yes,yes,no
loan,no,no,yes,no,no
contact,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5


In [17]:
df.shape

(45211, 17)

In [18]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [21]:
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

###  Selecting the Required Columns

In [22]:
required_columns = [
    'age', 'job', 'marital', 'education', 'balance', 
    'housing', 'contact', 'day', 'month', 'duration', 
    'campaign', 'pdays', 'previous', 'poutcome', 'y'
]

# Filter the DataFrame with these columns
df = df[required_columns]

In [23]:
df.head().T

Unnamed: 0,0,1,2,3,4
age,58,44,33,47,33
job,management,technician,entrepreneur,blue-collar,unknown
marital,married,single,married,married,single
education,tertiary,secondary,secondary,unknown,unknown
balance,2143,29,2,1506,1
housing,yes,yes,yes,yes,no
contact,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5
month,may,may,may,may,may
duration,261,151,76,92,198


### Check for Missing Values

In [24]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [36]:
#the required columns
required_columns = [
    'age', 'job', 'marital', 'education', 'balance', 'housing', 
    'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 
    'previous', 'poutcome', 'y'
]

### Questions
* Question 1. Mode for the column `education` (1 point)

In [12]:
education_mode = df_bank_full['education'].mode()[0]
print(f"Most frequent value in education: {education_mode}")

Most frequent value in education: secondary


### Question 2. 
* Two features with the biggest correlation (1 point)

In [28]:
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [29]:
# Step 1: Select numerical columns
numerical_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Step 2: Create the correlation matrix
correlation_matrix = df[numerical_columns].corr()

# Step 3: Display the correlation matrix
print(correlation_matrix)

# Step 4: Find the highest correlated pairs
# Unstack the correlation matrix, sort it, and find the pair with the highest correlation
corr_pairs = correlation_matrix.unstack().sort_values(ascending=False)

# Skipping the first pair as it's the feature with itself (which will always have a correlation of 1)
high_corr = corr_pairs[corr_pairs != 1].head(1)
print(f"Two features with the highest correlation: \n{high_corr}")

               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000
Two features with the highest correlation: 
previous  pdays    0.45482
dtype: float64


In [30]:
# Selecting numerical columns
numerical_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Compute the correlation matrix
correlation_matrix = df[numerical_cols].corr()

# Display the correlation matrix
print(correlation_matrix)

# Finding the two features with the highest correlation
corr_pairs = correlation_matrix.unstack().sort_values(ascending=False)
# Skipping the first as it's always 1 (correlation of feature with itself)
high_corr = corr_pairs[corr_pairs != 1].head(1)
print(high_corr)

               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000
previous  pdays    0.45482
dtype: float64


### Step 4: Target Encoding
* Replace the target variable y values yes/no with 1/0

In [32]:
# Encoding the target variable y
df.loc[:, 'y'] = df['y'].apply(lambda x: 1 if x == 'yes' else 0)

In [34]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,0


### Step 5: Train-Test Split
* You need to split the data into training, validation, and test sets using a 60%/20%/20% ratio:

In [40]:
from sklearn.model_selection import train_test_split

# Splitting the data
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

# Separating the target variable 'y'
y_train = df_train.pop('y')
y_val = df_val.pop('y')
y_test = df_test.pop('y')

# Check the shapes of the splits
print(df_train.shape, df_val.shape, df_test.shape)

(27126, 14) (9042, 14) (9043, 14)


### Question 3: Mutual Information Scores
* You need to calculate the mutual information score for the categorical variables

In [41]:
from sklearn.metrics import mutual_info_score

categorical_cols = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

def calculate_mi(series):
    return mutual_info_score(series, y_train)

# Calculate mutual information for each categorical variable
mi_scores = df_train[categorical_cols].apply(calculate_mi)
mi_scores = mi_scores.sort_values(ascending=False).round(2)
print(mi_scores)

job          0.0
marital      0.0
education    0.0
housing      0.0
contact      0.0
month        0.0
poutcome     0.0
dtype: float64


### Question 4: Train Logistic Regression
* Now, you’ll train a logistic regression model. Make sure to use one-hot encoding for the categorical variables:

In [48]:
df_train[categorical_cols].columns

Index(['job', 'marital', 'education', 'housing', 'contact', 'month',
       'poutcome'],
      dtype='object')

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

# One-hot encoding categorical variables
#ohe = OneHotEncoder(sparse=False, drop='first')
ohe = OneHotEncoder(sparse_output=False, drop='first')

# One-hot encoding categorical variables
ohe = OneHotEncoder(sparse_output=False, drop='first')

X_train = ohe.fit_transform(df_train[categorical_cols])
X_val = ohe.transform(df_val[categorical_cols])

# Combining with numerical features
X_train_full = pd.concat([df_train[numerical_cols].reset_index(drop=True), pd.DataFrame(X_train)], axis=1)
X_val_full = pd.concat([df_val[numerical_cols].reset_index(drop=True), pd.DataFrame(X_val)], axis=1)

# Train logistic regression model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_full, y_train)

# Predict on validation set
y_pred_val = model.predict(X_val_full)
accuracy = accuracy_score(y_val, y_pred_val)
print(f"Validation Accuracy: {accuracy:.2f}")

TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.

In [86]:
numerical_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

correlation_matrix = combined_df[numerical_cols].corr()

# Display the correlation matrix
print(correlation_matrix)

# Find the two features with the highest correlation
high_corr = correlation_matrix.unstack().sort_values(ascending=False)
# Skip diagonal correlations
high_corr = high_corr[high_corr < 1]
print(f"Two features with the highest correlation: {high_corr.idxmax()}")

TypeError: float() argument must be a string or a real number, not 'NAType'

In [86]:
numerical_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

correlation_matrix = combined_df[numerical_cols].corr()

# Display the correlation matrix
print(correlation_matrix)

# Find the two features with the highest correlation
high_corr = correlation_matrix.unstack().sort_values(ascending=False)
# Skip diagonal correlations
high_corr = high_corr[high_corr < 1]
print(f"Two features with the highest correlation: {high_corr.idxmax()}")

TypeError: float() argument must be a string or a real number, not 'NAType'

In [86]:
numerical_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

correlation_matrix = combined_df[numerical_cols].corr()

# Display the correlation matrix
print(correlation_matrix)

# Find the two features with the highest correlation
high_corr = correlation_matrix.unstack().sort_values(ascending=False)
# Skip diagonal correlations
high_corr = high_corr[high_corr < 1]
print(f"Two features with the highest correlation: {high_corr.idxmax()}")

TypeError: float() argument must be a string or a real number, not 'NAType'

* Question 3. Variable with the biggest mutual information score (1 point)

* We are interested in the following columns:

* Question 4. Accuracy on the validation dataset (1 point)

* Question 5. Feature with the smallest difference (1 point)

* Question 6. Smallest `C` that leads to the best accuracy on the validation set (1 point)