In [None]:
import pandas as pd

In [44]:
# read in data
fname = 'bank/bank-full.csv'

columns = ['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
features = list(set(columns) - set(['y']))
df = pd.read_csv(fname, delimiter=';')
df = df[columns]

### Q1

In [45]:
df['education'].mode()

0    secondary
dtype: object

### Q2

In [46]:
# compute correlation matrix
df_features = df[features]
correlation_matrix = df_features.corr()

In [47]:
correlation_matrix

Unnamed: 0,campaign,day,duration,pdays,balance,previous,age
campaign,1.0,0.16249,-0.08457,-0.088628,-0.014578,-0.032855,0.00476
day,0.16249,1.0,-0.030206,-0.093044,0.004503,-0.05171,-0.00912
duration,-0.08457,-0.030206,1.0,-0.001565,0.02156,0.001203,-0.004648
pdays,-0.088628,-0.093044,-0.001565,1.0,0.003435,0.45482,-0.023758
balance,-0.014578,0.004503,0.02156,0.003435,1.0,0.016674,0.097783
previous,-0.032855,-0.05171,0.001203,0.45482,0.016674,1.0,0.001288
age,0.00476,-0.00912,-0.004648,-0.023758,0.097783,0.001288,1.0


In [48]:
# Find the largest and smallest values
flattened_corr = correlation_matrix.unstack()
filtered_corr = flattened_corr[flattened_corr != 1]

max_corr_value = filtered_corr.max()
min_corr_value = filtered_corr.min()

max_corr_columns = filtered_corr.idxmax()
min_corr_columns = filtered_corr.idxmin()

print(f"Largest correlation: {max_corr_value} between columns {max_corr_columns}")
print(f"Smallest correlation: {min_corr_value} between columns {min_corr_columns}")

Largest correlation: 0.4548196354805043 between columns ('pdays', 'previous')
Smallest correlation: -0.09304407377294048 between columns ('day', 'pdays')


In [49]:
# Target encoding
df['y'] = df['y'].replace({'yes': 1, 'no': 0})

In [50]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_temp, y_train, y_temp = train_test_split(df_features, df['y'], test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

### Q3

In [51]:
X_train['poutcome'].unique()

array(['unknown', 'other', 'failure', 'success'], dtype=object)

In [52]:
from sklearn.metrics import mutual_info_score

categoricals = ['contact', 'education', 'housing', 'poutcome']
scores = {}
for column in categoricals:
    score = mutual_info_score(df[column], df['y'])
    scores[column] = round(score, 2)
scores_df = pd.DataFrame(list(scores.items()), columns=['Feature', 'Mutual Information Score'])

# Display the mutual information scores
print(scores_df.sort_values(by='Mutual Information Score', ascending=False))


     Feature  Mutual Information Score
3   poutcome                      0.03
0    contact                      0.01
2    housing                      0.01
1  education                      0.00


### Q4

In [55]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# One-hot encode categorical columns
categoricals = ['job', 'marital', 'month', 'contact', 'education', 'housing', 'poutcome']
df_encoded = pd.get_dummies(df, columns=categoricals, drop_first=True)
X = df_encoded.drop('y', axis=1)  
y = df_encoded['y']  

# Split the data into training, testing and valuation sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train a logistic regression model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_val)

# Calculate and print accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.90


### Q5

In [67]:
# find least important feature(s)
accuracies = {}
for feature in features:
    X_train_filtered = X_train.loc[:, ~X_train.columns.str.startswith(feature)]
    X_val_filtered = X_val.loc[:, ~X_val.columns.str.startswith(feature)]
    X_test_filtered = X_test.loc[:, ~X_test.columns.str.startswith(feature)]

    model.fit(X_train_filtered, y_train)
    y_pred = model.predict(X_val_filtered)
    new_accuracy = accuracy_score(y_val, y_pred)
    
    accuracies[feature] = new_accuracy - accuracy

accuracies_df = pd.DataFrame(list(accuracies.items()), columns=['Feature', 'Accuracy Difference'])
print(accuracies_df.sort_values(by='Accuracy Difference', ascending=False))

      Feature  Accuracy Difference
0       month             0.001880
2     housing             0.001106
7       pdays             0.000885
5   education             0.000332
11   previous             0.000332
12        age             0.000332
8     contact             0.000221
9     balance             0.000221
4         day             0.000111
10    marital             0.000000
1    campaign            -0.000111
13        job            -0.000995
6    duration            -0.006525
3    poutcome            -0.008516


### Q6

In [69]:
# One-hot encode categorical columns
categoricals = ['job', 'marital', 'month', 'contact', 'education', 'housing', 'poutcome']
df_encoded = pd.get_dummies(df, columns=categoricals, drop_first=True)
X = df_encoded.drop('y', axis=1)  
y = df_encoded['y']  

# Split the data into training, testing and valuation sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train a logistic regression model
for C in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    accuracy = round(accuracy_score(y_val, y_pred), 3)
    print(f"C={C}: {accuracy}")

C=0.01: 0.898
C=0.1: 0.9
C=1: 0.9
C=10: 0.901
C=100: 0.9
