In [1]:
import pandas as pd
import numpy as np


PATH = "../data/UCI_Credit_Card.csv"


def _read_delimited(path):
    """Load the CSV at ``path``, trying ';' first and falling back to ','.

    A file parsed with the wrong delimiter collapses into a single column.
    In that case the file is re-read with a comma separator so that pandas'
    own CSV parser deals with quoting and dtype inference, rather than
    splitting each row by hand (which leaves every value as a string and
    mis-splits quoted fields that contain commas).

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed table with whitespace/quote-stripped column names when
        the comma fallback was needed.
    """
    frame = pd.read_csv(path, sep=";")
    if frame.shape[1] == 1:
        # Wrong delimiter guess: everything landed in one column.
        frame = pd.read_csv(path, sep=",")
        # Keep the header clean-up the manual fallback used to perform.
        frame.columns = [str(name).strip().strip('"') for name in frame.columns]
    return frame


df = _read_delimited(PATH)

print(df.shape)
df.head()


(30000, 25)


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score,
    confusion_matrix, classification_report
)
TARGET = "default.payment.next.month"

# Work on a private copy and discard the row identifier when it is present.
df = df.copy()
if "ID" in df.columns:
    df = df.drop(columns=["ID"])

# Coerce every column to a numeric dtype; unparseable entries become NaN.
df = df.apply(pd.to_numeric, errors="coerce")

y = df[TARGET].astype(int)
X = df.drop(columns=[TARGET])

print("Data:", df.shape)
print("X:", X.shape, "| y mean (default rate):", y.mean())

# Two-stage stratified split: 20% held out as test, then 25% of the
# remainder held out as validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    stratify=y_train_val,
    random_state=42,
)

print("\nSplits:")
for _label, _features, _labels in (
    ("train:", X_train, y_train),
    ("val:  ", X_val, y_val),
    ("test: ", X_test, y_test),
):
    print(_label, _features.shape, "default rate:", _labels.mean())

# Baseline: logistic regression on median-imputed, standardised features.
baseline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        (
            "clf",
            LogisticRegression(
                max_iter=3000,
                class_weight="balanced",
                solver="lbfgs",
            ),
        ),
    ]
)
baseline.fit(X_train, y_train)

# Validation scores at the default 0.5 cut-off.
val_proba = baseline.predict_proba(X_val)[:, 1]
val_pred_05 = (val_proba >= 0.5).astype(int)

roc = roc_auc_score(y_val, val_proba)
pr = average_precision_score(y_val, val_proba)
f1_05 = f1_score(y_val, val_pred_05)

print("\nValidation metrics (threshold=0.5):")
print("ROC-AUC:", roc)
print("PR-AUC: ", pr)
print("F1:     ", f1_05)
print("Confusion matrix:\n", confusion_matrix(y_val, val_pred_05))

# Pick the threshold that maximises F1 on the validation split.
# np.argmax returns the first maximum, i.e. the lowest threshold on ties.
thresholds = np.linspace(0.05, 0.95, 19)
_val_f1_scores = [
    f1_score(y_val, (val_proba >= cutoff).astype(int)) for cutoff in thresholds
]
_best_idx = int(np.argmax(_val_f1_scores))
best_t = thresholds[_best_idx]
best_f1 = _val_f1_scores[_best_idx]

print("\nBest threshold on VAL for F1:", best_t, "| F1:", best_f1)

# Final check on the test split, using the threshold chosen on validation.
test_proba = baseline.predict_proba(X_test)[:, 1]
test_pred = (test_proba >= best_t).astype(int)

print("\nTest metrics (threshold=best_t):")
print("ROC-AUC:", roc_auc_score(y_test, test_proba))
print("PR-AUC: ", average_precision_score(y_test, test_proba))
print("F1:     ", f1_score(y_test, test_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, test_pred))
print("\nReport:\n", classification_report(y_test, test_pred, digits=3))

Data: (30000, 24)
X: (30000, 23) | y mean (default rate): 0.2212

Splits:
train: (18000, 23) default rate: 0.2212222222222222
val:   (6000, 23) default rate: 0.22116666666666668
test:  (6000, 23) default rate: 0.22116666666666668

Validation metrics (threshold=0.5):
ROC-AUC: 0.7255086258486639
PR-AUC:  0.496234548263599
F1:      0.48376259798432253
Confusion matrix:
 [[3292 1381]
 [ 463  864]]

Best threshold on VAL for F1: 0.5499999999999999 | F1: 0.5097613882863341

Test metrics (threshold=best_t):
ROC-AUC: 0.7094250331918471
PR-AUC:  0.4921995374588172
F1:      0.5003615328994938
Confusion matrix:
 [[3926  747]
 [ 635  692]]

Report:
               precision    recall  f1-score   support

           0      0.861     0.840     0.850      4673
           1      0.481     0.521     0.500      1327

    accuracy                          0.770      6000
   macro avg      0.671     0.681     0.675      6000
weighted avg      0.777     0.770     0.773      6000



In [3]:
import numpy as np
import pandas as pd

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, confusion_matrix, classification_report

def eval_model(name, model, X_val, y_val, X_test, y_test, thresholds=None):
    """Score a fitted binary classifier on the validation and test splits.

    The decision threshold is chosen on the validation data only (the grid
    value with the highest F1) and is then applied unchanged to the test
    data.

    Parameters
    ----------
    name : str
        Label stored under the ``"model"`` key of the result row.
    model : object
        Fitted estimator exposing ``predict_proba``; column 1 of its output
        is used as the score.
    X_val, y_val : array-like
        Validation features and labels, used for threshold selection.
    X_test, y_test : array-like
        Test features and labels.
    thresholds : array-like of float, optional
        Candidate decision thresholds. Defaults to
        ``np.linspace(0.05, 0.95, 19)``.

    Returns
    -------
    out : dict
        Model name, selected threshold, and ROC-AUC / PR-AUC / F1 for both
        splits.
    (val_proba, test_proba, best_t) : tuple
        The scores for each split and the selected threshold.

    Raises
    ------
    ValueError
        If ``thresholds`` is empty.
    """
    if thresholds is None:
        thresholds = np.linspace(0.05, 0.95, 19)
    thresholds = np.asarray(thresholds, dtype=float).ravel()
    if thresholds.size == 0:
        raise ValueError("thresholds must contain at least one value")

    val_proba = model.predict_proba(X_val)[:, 1]
    test_proba = model.predict_proba(X_test)[:, 1]

    # F1 on validation for every candidate; np.argmax keeps the first
    # maximum, so ties resolve to the earliest threshold in the grid.
    val_f1_scores = [
        f1_score(y_val, (val_proba >= t).astype(int)) for t in thresholds
    ]
    best_idx = int(np.argmax(val_f1_scores))
    best_t = thresholds[best_idx]
    best_f1 = val_f1_scores[best_idx]

    out = {
        "model": name,
        "best_threshold_val_f1": best_t,
        "val_roc_auc": roc_auc_score(y_val, val_proba),
        "val_pr_auc": average_precision_score(y_val, val_proba),
        # Already computed during the search; no need to score it again.
        "val_f1(best_t)": best_f1,
        "test_roc_auc": roc_auc_score(y_test, test_proba),
        "test_pr_auc": average_precision_score(y_test, test_proba),
        "test_f1(best_t)": f1_score(y_test, (test_proba >= best_t).astype(int)),
    }
    return out, (val_proba, test_proba, best_t)

# main model: gradient boosting on median-imputed features

# Medians come from the training split only and are reused for val/test.
# fillna returns a new frame, so the original splits are left untouched.
medians = X_train.median(numeric_only=True)
X_train_med = X_train.fillna(medians)
X_val_med = X_val.fillna(medians)
X_test_med = X_test.fillna(medians)

boost = HistGradientBoostingClassifier(
    max_iter=400,
    max_depth=3,
    learning_rate=0.05,
    random_state=42,
)
boost.fit(X_train_med, y_train)

# Evaluate baseline and boosting with the same procedure.
b_row, (b_val_proba, b_test_proba, b_t) = eval_model(
    "LogReg baseline", baseline, X_val, y_val, X_test, y_test
)
g_row, (g_val_proba, g_test_proba, g_t) = eval_model(
    "HGB boosting", boost, X_val_med, y_val, X_test_med, y_test
)
rows = [b_row, g_row]

results = pd.DataFrame(rows).sort_values(by="test_roc_auc", ascending=False)
display(results)

# Detailed test-set view for the boosting model at its validation-chosen threshold.
test_pred = (g_test_proba >= g_t).astype(int)
print("\n Boosting: TEST confusion matrix and report (threshold from VAL) ")
print("Best threshold (VAL):", g_t)
print("Confusion matrix:\n", confusion_matrix(y_test, test_pred))
print("\nReport:\n", classification_report(y_test, test_pred, digits=3))


Unnamed: 0,model,best_threshold_val_f1,val_roc_auc,val_pr_auc,val_f1(best_t),test_roc_auc,test_pr_auc,test_f1(best_t)
1,HGB boosting,0.25,0.778421,0.542013,0.538381,0.776797,0.551262,0.542049
0,LogReg baseline,0.55,0.725509,0.496235,0.509761,0.709425,0.4922,0.500362



 Boosting: TEST confusion matrix and report (threshold from VAL) 
Best threshold (VAL): 0.25
Confusion matrix:
 [[3937  736]
 [ 560  767]]

Report:
               precision    recall  f1-score   support

           0      0.875     0.842     0.859      4673
           1      0.510     0.578     0.542      1327

    accuracy                          0.784      6000
   macro avg      0.693     0.710     0.700      6000
weighted avg      0.795     0.784     0.789      6000



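Boosting (HistGradientBoosting) заметно лучше baseline (логистической регрессии) по всем ключевым метрикам на тестовой выборке: ROC-AUC 0.777 против 0.709, PR-AUC 0.551 против 0.492, F1 0.542 против 0.500.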