
# Lead Scoring Homework — Bank Marketing Dataset

This notebook solves the lead scoring classification task using the Bank Marketing dataset.  
Steps included:
1. Data preparation (missing-value handling and train/validation/test splitting)
2. ROC AUC-based feature importance for the numerical variables
3. Logistic Regression model training
4. Precision, recall, and F1-score threshold analysis
5. 5-fold cross-validation evaluation
6. Hyperparameter tuning (regularization parameter `C`) for Logistic Regression

All comments in code are in **English**, as requested.


In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Location of the raw CSV file; adjust it if the file lives elsewhere
DATA_PATH = 'bank_marketing_leads.csv'  # change if needed

# Read the whole file into a DataFrame, then report its size and preview the first rows
df = pd.read_csv(DATA_PATH)
print("Dataset shape:", df.shape)
display(df.head(n=5))


In [None]:

# Data preparation: replace missing values with neutral placeholders
#   - categorical (object / category) columns -> the string 'NA'
#   - numerical columns                       -> 0.0

# Column groups are derived from the dtypes of the loaded frame
cat_cols = list(df.select_dtypes(include=['object', 'category']).columns)
num_cols = list(df.select_dtypes(include=[np.number]).columns)

print("Categorical columns:", cat_cols)
print("Numerical columns:", num_cols)

# One fill value per column; fillna returns a new frame, so `df` itself is left untouched
fill_values = {**dict.fromkeys(cat_cols, 'NA'), **dict.fromkeys(num_cols, 0.0)}
df_prep = df.fillna(value=fill_values)

# Sanity check: list only the columns that still contain nulls (expected: none)
print("Missing values after filling:")
remaining_nulls = df_prep.isnull().sum()
print(remaining_nulls[remaining_nulls > 0])


In [None]:

# Split dataset into Train (60%), Validation (20%), Test (20%)
TARGET = 'converted'  # update if target column has another name

# A text target ('yes' / 'no') is encoded as 1 / 0. Any other label (including the
# 'NA' placeholder written into missing object values during data preparation)
# would silently become NaN under .map(), so fail early with a clear message instead.
if df_prep[TARGET].dtype == 'object':
    target_encoded = df_prep[TARGET].map({'yes': 1, 'no': 0})
    unmapped = target_encoded.isnull()
    if unmapped.any():
        bad_labels = sorted(df_prep.loc[unmapped, TARGET].astype(str).unique())
        raise ValueError(
            f"Target column '{TARGET}' contains values other than 'yes'/'no': {bad_labels}"
        )
    df_prep[TARGET] = target_encoded

# First hold out 20% of all rows as the test set (stratified on the target) ...
df_train_full, df_test = train_test_split(df_prep, test_size=0.2, random_state=1, stratify=df_prep[TARGET])
# ... then take 25% of the remaining 80% as validation, i.e. 20% of the full dataset
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=1, stratify=df_train_full[TARGET])

print(f"Train: {df_train.shape}, Validation: {df_val.shape}, Test: {df_test.shape}")


In [None]:

from sklearn.metrics import roc_auc_score

# Numerical variables to rank by how well each one alone separates the target classes
features_num = ['lead_score', 'number_of_courses_viewed', 'interaction_count', 'annual_income']

# Use the raw feature value as the "prediction score" and measure ROC AUC on the training set
auc_scores = {}
for col in features_num:
    score = roc_auc_score(df_train[TARGET], df_train[col])
    # An AUC below 0.5 means the feature is negatively related to the target:
    # score the negated feature instead so every variable is compared on the same footing
    auc_scores[col] = score if score >= 0.5 else roc_auc_score(df_train[TARGET], -df_train[col])

print("AUC scores:")
print(auc_scores)

# Feature with the highest AUC (the first one wins on ties)
best_feature = max(auc_scores.items(), key=lambda item: item[1])[0]
print("\nBest numerical variable:", best_feature)


In [None]:

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score

# Every column except the target is used as a model input
features = df_train.columns.drop(TARGET).tolist()

# Row-wise dict records are the input format DictVectorizer expects
train_dicts = df_train[features].to_dict(orient='records')
val_dicts = df_val[features].to_dict(orient='records')
y_train, y_val = df_train[TARGET].values, df_val[TARGET].values

# Fit the vectorizer on the training records only, then reuse the same mapping for validation
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

# Train Logistic Regression (fit returns the estimator itself)
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000).fit(X_train, y_train)

# Probability of the positive class for each validation row, scored with ROC AUC
y_val_pred = model.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, y_val_pred)
print("Validation AUC:", round(val_auc, 3))


In [None]:

import numpy as np
from sklearn.metrics import precision_score, recall_score
import matplotlib.pyplot as plt

# Evaluate precision and recall of the validation predictions at thresholds 0.00, 0.01, ..., 1.00.
# linspace gives exactly 101 evenly spaced points with both endpoints included, which avoids
# the length ambiguity of np.arange with a floating-point step.
thresholds = np.linspace(0.0, 1.0, 101)
precisions = []
recalls = []

for t in thresholds:
    y_pred = (y_val_pred >= t).astype(int)
    # zero_division=0: precision is reported as 0 when no row is predicted positive
    precisions.append(precision_score(y_val, y_pred, zero_division=0))
    recalls.append(recall_score(y_val, y_pred))

plt.figure(figsize=(7,5))
plt.plot(thresholds, precisions, label='Precision')
plt.plot(thresholds, recalls, label='Recall')
plt.legend()
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Precision and Recall vs Threshold')
plt.grid(True)
plt.show()

# Locate the threshold where the two curves cross (smallest absolute gap).
precision_arr = np.array(precisions)
recall_arr = np.array(recalls)
diff = np.abs(precision_arr - recall_arr)

# At high thresholds nothing is predicted positive, so precision (via zero_division=0)
# and recall are both 0 and the gap is exactly 0. That is not a real crossing, so points
# where either metric is 0 are excluded from the search whenever a non-trivial point exists.
non_trivial = (precision_arr > 0) & (recall_arr > 0)
search_diff = np.where(non_trivial, diff, np.inf) if non_trivial.any() else diff
threshold_intersection = thresholds[np.argmin(search_diff)]
print("Precision = Recall at threshold:", round(threshold_intersection, 3))


In [None]:

# F1 is the harmonic mean of precision and recall at each threshold;
# it is taken as 0 where precision + recall is 0 to avoid dividing by zero
f1_scores = [
    0 if (prec + rec) == 0 else 2 * prec * rec / (prec + rec)
    for prec, rec in zip(precisions, recalls)
]

# Plot the F1 curve over the same threshold grid
fig, ax = plt.subplots(figsize=(7,5))
ax.plot(thresholds, f1_scores)
ax.set_title('F1 Score vs Threshold')
ax.set_xlabel('Threshold')
ax.set_ylabel('F1')
ax.grid(True)
plt.show()

# Threshold with the highest F1 (argmax returns the first one on ties)
best_t = thresholds[np.argmax(f1_scores)]
print("Best F1 threshold:", round(best_t, 3))


In [None]:

from sklearn.model_selection import KFold

# 5-fold cross-validation of the logistic regression on the combined train+validation data.
# Shuffling with a fixed seed keeps the fold assignment reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=1)
scores = []

for train_idx, val_idx in kf.split(df_train_full):
    df_t = df_train_full.iloc[train_idx]
    df_v = df_train_full.iloc[val_idx]

    # Use a fresh vectorizer and model per fold. Refitting the shared `dv` or rebinding
    # `model` here would leave them out of sync with X_train / X_val / y_val_pred
    # produced by the earlier training cell.
    fold_dv = DictVectorizer(sparse=False)
    X_t = fold_dv.fit_transform(df_t[features].to_dict(orient='records'))
    X_v = fold_dv.transform(df_v[features].to_dict(orient='records'))
    y_t = df_t[TARGET].values
    y_v = df_v[TARGET].values

    fold_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
    fold_model.fit(X_t, y_t)

    # Score the held-out fold with ROC AUC on the positive-class probability
    y_v_pred = fold_model.predict_proba(X_v)[:, 1]
    scores.append(roc_auc_score(y_v, y_v_pred))

print("AUC scores for 5 folds:", np.round(scores, 3))
print("Mean AUC:", round(np.mean(scores), 3))
print("Std deviation:", round(np.std(scores), 3))


In [None]:

def cv_auc_scores(df_full, feature_names, target, C, n_splits=5, random_state=1):
    """Return the per-fold validation ROC AUCs of a logistic regression.

    Parameters
    ----------
    df_full : pandas.DataFrame
        Data to cross-validate on; must contain `feature_names` and `target`.
    feature_names : list of str
        Columns used as model inputs.
    target : str
        Name of the binary target column.
    C : float
        Inverse regularization strength passed to LogisticRegression.
    n_splits : int, default 5
        Number of folds.
    random_state : int, default 1
        Seed for the shuffled fold assignment, so every C sees the same folds.

    Returns
    -------
    list of float
        One ROC AUC per fold, in fold order.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_aucs = []

    for train_idx, val_idx in splitter.split(df_full):
        fold_train = df_full.iloc[train_idx]
        fold_val = df_full.iloc[val_idx]

        # Fresh vectorizer and model per fold: nothing outside this function is refit
        fold_dv = DictVectorizer(sparse=False)
        X_t = fold_dv.fit_transform(fold_train[feature_names].to_dict(orient='records'))
        X_v = fold_dv.transform(fold_val[feature_names].to_dict(orient='records'))

        fold_model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
        fold_model.fit(X_t, fold_train[target].values)

        fold_pred = fold_model.predict_proba(X_v)[:, 1]
        fold_aucs.append(roc_auc_score(fold_val[target].values, fold_pred))

    return fold_aucs


# Candidate regularization strengths, in ascending order
C_values = [0.000001, 0.001, 1]
cv_results = []

for c in C_values:
    fold_scores = cv_auc_scores(df_train_full, features, TARGET, C=c)
    mean_score = np.mean(fold_scores)
    std_score = np.std(fold_scores)
    cv_results.append((c, mean_score, std_score))

cv_df = pd.DataFrame(cv_results, columns=['C', 'mean_auc', 'std_auc'])
display(cv_df)

# idxmax returns the first maximum, so ties go to the smallest C (C_values is ascending)
best_row = cv_df.loc[cv_df['mean_auc'].idxmax()]
print("\nBest C value:", best_row['C'])
