In [2]:
import numpy as np, pandas as pd, joblib, json
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (precision_recall_fscore_support, average_precision_score,
                             precision_recall_curve, roc_auc_score)

from xgboost import XGBClassifier
import torch, torch.nn as nn, torch.optim as optim

ART = Path("../artifacts")
ART.mkdir(exist_ok=True)
# Load data from existing artifacts directory
df = pd.read_csv(ART / "creditcard.csv")
print(df.head())

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

TEST-TRAIN Split 

In [3]:
X = df.drop(columns=['Class'])
y = df['Class'].astype(int)

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

pd.concat([X_train, y_train], axis=1).to_csv(ART/"train.csv", index=False)
pd.concat([X_test, y_test], axis=1).to_csv(ART/"test.csv", index=False)


Pre-processing

In [4]:

num_cols = ['Time','Amount']
preprocessor = ColumnTransformer(
    transformers=[('scale', StandardScaler(), num_cols)],
    remainder='passthrough',  
)

X_train_p = preprocessor.fit_transform(X_train)
X_val_p   = preprocessor.transform(X_val)
X_test_p  = preprocessor.transform(X_test)

joblib.dump(preprocessor, ART/"preprocessor.pkl")


['../artifacts/preprocessor.pkl']

XGBoost

In [5]:
pos = y_train.sum()
neg = len(y_train) - pos
spw = neg / pos  

xgb = XGBClassifier(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    min_child_weight=1,
    n_jobs=-1,
    random_state=42,
    scale_pos_weight=spw,
    eval_metric="logloss",
)
xgb.fit(X_train_p, y_train)
xgb_val_proba = xgb.predict_proba(X_val_p)[:,1]
print("XGB PR-AUC:", average_precision_score(y_val, xgb_val_proba))
joblib.dump(xgb, ART/"xgb_model.pkl")

XGB PR-AUC: 0.8346189169786565


['../artifacts/xgb_model.pkl']

Neural Net


In [6]:
class MLP(nn.Module):
    def __init__(self, d_in):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_in, 64), nn.ReLU(),
            nn.BatchNorm1d(64), nn.Dropout(0.3),
            nn.Linear(64, 32), nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 1), nn.Sigmoid()
        )
    def forward(self, x): return self.net(x)

d_in = X_train_p.shape[1]
nn_model = MLP(d_in)

# class weights
pos_weight = torch.tensor([spw], dtype=torch.float32)  # weight positive errors higher
criterion = nn.BCELoss()  # alternative: BCEWithLogitsLoss(pos_weight=pos_weight) w/o Sigmoid

optimizer = optim.Adam(nn_model.parameters(), lr=1e-3)
Xtr_t = torch.tensor(X_train_p, dtype=torch.float32)
ytr_t = torch.tensor(y_train.values.reshape(-1,1), dtype=torch.float32)

for epoch in range(10):
    nn_model.train()
    optimizer.zero_grad()
    out = nn_model(Xtr_t)
    loss = criterion(out, ytr_t)
    loss.backward(); optimizer.step()
    if (epoch+1)%2==0: print(f"epoch {epoch+1}: loss {loss.item():.4f}")

# validation probabilities
nn_model.eval()
Xval_t = torch.tensor(X_val_p, dtype=torch.float32)
with torch.no_grad():
    nn_val_proba = nn_model(Xval_t).cpu().numpy().ravel()

print("NN PR-AUC:", average_precision_score(y_val, nn_val_proba))
torch.save(nn_model.state_dict(), ART/"nn_model.pth")

epoch 2: loss 0.7334
epoch 4: loss 0.7035
epoch 6: loss 0.6753
epoch 8: loss 0.6490
epoch 10: loss 0.6227
NN PR-AUC: 0.015046340858149175


Probability Calibration

In [7]:
from sklearn.linear_model import LogisticRegression

def platt_calibrate(probs, y_true):
    lr = LogisticRegression(max_iter=1000)
    lr.fit(probs.reshape(-1,1), y_true)
    def calibrator(p):
        return lr.predict_proba(p.reshape(-1,1))[:,1]
    return calibrator, lr

xgb_cal, xgb_lr = platt_calibrate(xgb_val_proba, y_val)
nn_cal, nn_lr   = platt_calibrate(nn_val_proba, y_val)


print("Calibrated XGB PR-AUC:", average_precision_score(y_val, xgb_cal(xgb_val_proba)))
print("Calibrated NN  PR-AUC:", average_precision_score(y_val, nn_cal(nn_val_proba)))


Calibrated XGB PR-AUC: 0.8346189169786565
Calibrated NN  PR-AUC: 0.00816974137096763


Weighted Ensemble

In [13]:
def weighted_ensemble(xgb_p, nn_p, w):
    """Linear ensemble of two models."""
    return w * xgb_p + (1 - w) * nn_p


best_w, best_ap = None, -1
for w in np.linspace(0, 1, 21):
    ens_val = weighted_ensemble(xgb_cal(xgb_val_proba), nn_cal(nn_val_proba), w)
    ap = average_precision_score(y_val, ens_val)
    if ap > best_ap:
        best_ap, best_w = ap, w

print(f"Best ensemble weight (XGB={best_w:.2f}, NN={1-best_w:.2f}), PR-AUC={best_ap:.4f}")


val_probs_ens = weighted_ensemble(xgb_cal(xgb_val_proba), nn_cal(nn_val_proba), best_w)

Best ensemble weight (XGB=1.00, NN=0.00), PR-AUC=0.8346
