# Problem Statement

A bank runs marketing campaigns to convince clients to subscribe to a term deposit (y = yes/no). You must build a model that predicts whether a client will subscribe based on demographic and campaign attributes. Since false positives and false negatives have different business costs, your evaluation must go beyond accuracy.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('bank-full.csv', sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [5]:
df.isna().sum()

Unnamed: 0,0
age,0
job,0
marital,0
education,0
default,0
balance,0
housing,0
loan,0
contact,0
day,0


In [9]:
X = df.iloc[:, 2:-1].values
y = df.iloc[:, 1].values

# encoding
from sklearn.preprocessing import LabelEncoder

X = pd.get_dummies(df.drop('y', axis=1), drop_first=True)
y = LabelEncoder().fit_transform(df['y'])



In [11]:
# splitting data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42,stratify=y)

# feature scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [12]:
from sklearn.linear_model import LogisticRegression
log_class = LogisticRegression()
log_class.fit(X_train, y_train)

In [13]:
y_pred = log_class.predict(X_test)

In [18]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Calculate metrics
cm = confusion_matrix(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('Confusion Matrix:')
print(cm)
print(f'Precision: {prec:.4f}')
print(f'Recall:    {rec:.4f}')
print(f'F1 Score:  {f1:.4f}')


Confusion Matrix:
[[7755  197]
 [ 718  373]]
Precision: 0.6544
Recall:    0.3419
F1 Score:  0.4491


# Threshold analysis

In [20]:
y_prob = log_class.predict_proba(X_test)[:, 1]
roc = roc_auc_score(y_test, y_prob)
print(f'ROC AUC Score: {roc:.4f}')

ROC AUC Score: 0.9046


In [21]:
#at 0.5
threshold_05 = 0.5
y_pred_05 = (y_prob >= threshold_05).astype(int)

from sklearn.metrics import confusion_matrix

cm_05 = confusion_matrix(y_test, y_pred_05)
tn, fp, fn, tp = cm_05.ravel()

sensitivity_05 = tp / (tp + fn)
specificity_05 = tn / (tn + fp)

print("Threshold = 0.5")
print(cm_05)
print(f"Sensitivity (Recall): {sensitivity_05:.4f}")
print(f"Specificity:          {specificity_05:.4f}")


Threshold = 0.5
[[7755  197]
 [ 718  373]]
Sensitivity (Recall): 0.3419
Specificity:          0.9752


In [22]:
#optimized threshold
thresholds = np.arange(0.1, 0.9, 0.01)
f1_scores = []

for t in thresholds:
    y_pred_t = (y_prob >= t).astype(int)
    f1_scores.append(f1_score(y_test, y_pred_t))

best_threshold = thresholds[np.argmax(f1_scores)]
print("Optimized Threshold:", best_threshold)


Optimized Threshold: 0.16999999999999998


In [23]:
#metrics at optimised threshold
y_pred_opt = (y_prob >= best_threshold).astype(int)

cm_opt = confusion_matrix(y_test, y_pred_opt)
tn, fp, fn, tp = cm_opt.ravel()

print(cm_opt)
print("Precision:", precision_score(y_test, y_pred_opt))
print("Recall:", recall_score(y_test, y_pred_opt))
print("F1:", f1_score(y_test, y_pred_opt))


[[7122  830]
 [ 297  794]]
Precision: 0.48891625615763545
Recall: 0.7277726856095326
F1: 0.5848987108655617


In [24]:
roc = roc_auc_score(y_test, y_prob)
print("ROC-AUC:", roc)


ROC-AUC: 0.9045509306987665


In [27]:
output = pd.DataFrame({
    "RecordId": range(len(y_prob)),
    "Probability_yes": y_prob,
    "PredictedLabel": y_pred_opt
})

output.to_csv("probabilities.csv", index=False)



# Why ROC curve is useful

ROC curve evaluates model performance across all possible thresholds by plotting True Positive Rate against False Positive Rate. It is threshold-independent and helps assess classifier quality even when class distribution is imbalanced.

# Threshold trade off

Increasing the classification threshold improves precision but reduces recall, while lowering the threshold improves recall at the cost of precision. This trade off is important in banking since false positives and false negatives have different business costs.