In [1]:
# 03_model_training.ipynb
# Purpose: Train a baseline fraud/anomaly detection model (unsupervised)
# Outputs: models/isolation_forest.pkl, models/model_metadata.json, models/thresholds.json, reports/eval_summary.json

import json
import time
from pathlib import Path
from datetime import datetime, timezone

import numpy as np
import pandas as pd

from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score
import joblib

from pathlib import Path

# Locate every copy of X_train.parquet below the current working directory.
# This is a quick orientation aid for finding where the processed features live.
root = Path.cwd().resolve()
hits = [candidate for candidate in root.rglob("X_train.parquet")]
print("Found:", len(hits))
if hits:
    # One path per line, same as printing each entry individually.
    print("\n".join(str(candidate) for candidate in hits))



Found: 1
C:\Users\admin\Desktop\AI Sec Project\GitHub\secure-ai-fraud-detection-pipeline\notebooks\data\processed\X_train.parquet


In [2]:
# Project layout, resolved relative to the kernel's working directory.
# The notebook is expected to run from notebooks/, so ".." is the repo root.
REPO_ROOT = Path("..").resolve()

DATA_DIR = REPO_ROOT.joinpath("data")
PROCESSED_DIR = DATA_DIR.joinpath("processed")
ARTIFACTS_DIR = DATA_DIR.joinpath("artifacts")  # only relevant if artifacts are stored this way

MODELS_DIR = REPO_ROOT.joinpath("models")
REPORTS_DIR = REPO_ROOT.joinpath("reports")

# Create the output folders up front; an already existing folder is fine.
for output_dir in (MODELS_DIR, REPORTS_DIR):
    output_dir.mkdir(exist_ok=True)

# Seed NumPy's legacy global RNG with the shared seed constant.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("REPO_ROOT:", REPO_ROOT)


REPO_ROOT: C:\Users\admin\Desktop\AI Sec Project\GitHub\secure-ai-fraud-detection-pipeline


In [3]:
# Load the engineered feature matrices written by 02_feature_engineering.
#
# The parquet files live next to this notebook (notebooks/data/processed), so the
# data paths are resolved from the working directory. REPO_ROOT is deliberately
# NOT rebound here: it keeps meaning "repository root" for the rest of the
# notebook instead of silently switching to the notebooks/ folder.
NOTEBOOK_DIR = Path(".").resolve()        # expected to be notebooks/
DATA_DIR = NOTEBOOK_DIR / "data"          # -> notebooks/data
PROCESSED_DIR = DATA_DIR / "processed"

X_train_path = PROCESSED_DIR / "X_train.parquet"
X_test_path = PROCESSED_DIR / "X_test.parquet"

# Fail early, naming exactly the files that are absent.
missing_paths = [path for path in (X_train_path, X_test_path) if not path.exists()]
if missing_paths:
    raise FileNotFoundError(
        "Train/Test Parquet files not found. Run notebooks/02_feature_engineering.ipynb first.\n"
        f"Missing: {', '.join(str(path) for path in missing_paths)}"
    )

X_train = pd.read_parquet(X_train_path)
X_test = pd.read_parquet(X_test_path)

print("X_train shape:", X_train.shape)
print("X_test  shape:", X_test.shape)
X_train.head()


X_train shape: (1600, 14)
X_test  shape: (400, 14)


Unnamed: 0,amount,hour,day,weekday,country_AT,country_CH,country_DE,country_ES,country_FR,country_GB,country_IT,country_NL,country_PL,country_US
0,1.099704,0.25,0.466667,-0.666667,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.686189,0.25,0.8,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.723177,0.083333,-0.466667,-0.666667,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.085596,-0.583333,-0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.421695,-0.5,-0.533333,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [4]:
def sanity_check(df: pd.DataFrame, name: str):
    """Validate that a feature frame is non-empty, NaN-free, numeric and finite.

    Args:
        df: Feature matrix to validate.
        name: Label used in error messages (e.g. "X_train").

    Returns:
        True when every check passes.

    Raises:
        ValueError: If the frame is empty, contains NaNs, contains columns that
            are neither numeric nor boolean, or contains inf/-inf values.
    """
    if df.empty:
        raise ValueError(f"{name} is empty. Check your upstream notebooks.")

    # Compute the per-column NaN mask once; it drives both the check and the message.
    na_mask = df.isna().any().to_numpy()
    if na_mask.any():
        # IsolationForest can handle NaNs poorly; we enforce none.
        na_cols = df.columns[na_mask].tolist()
        raise ValueError(f"{name} contains NaNs in columns: {na_cols}")

    # Report unsupported dtypes explicitly instead of letting np.isfinite fail
    # with an opaque TypeError on an object array.
    non_numeric = df.select_dtypes(exclude=[np.number, "bool"]).columns.tolist()
    if non_numeric:
        raise ValueError(f"{name} contains non-numeric columns: {non_numeric}")

    # Only numeric columns can hold inf/-inf. Boolean columns are skipped so a
    # bool + float frame is not collapsed into an object array by to_numpy().
    numeric = df.select_dtypes(include=[np.number])
    if not np.isfinite(numeric.to_numpy(dtype="float64")).all():
        raise ValueError(f"{name} contains non-finite values (inf/-inf).")
    return True

# Validate both splits before any modelling; the first failing frame raises.
for frame_name, frame in (("X_train", X_train), ("X_test", X_test)):
    sanity_check(frame, frame_name)
print("Sanity checks: OK")


Sanity checks: OK


In [5]:
# Fit the IsolationForest baseline on the training features and time the fit.
# perf_counter() is a monotonic clock intended for measuring intervals, so the
# reported duration cannot be skewed by system clock adjustments.
start = time.perf_counter()

model = IsolationForest(
    n_estimators=200,
    contamination="auto",  # the alert threshold is set separately later on
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

model.fit(X_train)

train_time_s = round(time.perf_counter() - start, 3)
print("Train time (s):", train_time_s)


Train time (s): 0.42


In [6]:
# Raw scores straight from the fitted model for both splits.
train_score_raw = model.score_samples(X_train)
test_score_raw = model.score_samples(X_test)

# Flip the sign so that a higher risk score means a more anomalous sample.
train_risk = np.negative(train_score_raw)
test_risk = np.negative(test_score_raw)

# Print min / mean / max per split.
for stats_label, risk in (("Train risk stats:", train_risk), ("Test  risk stats:", test_risk)):
    print(stats_label, risk.min(), risk.mean(), risk.max())


Train risk stats: 0.4050715939850383 0.47033638839372677 0.572984901296734
Test  risk stats: 0.41131516431710496 0.47283053129466296 0.5581420215731182


In [7]:
# Share of samples that should raise an alert (1 %); adjustable later.
TOP_PCT = 0.01

# The cut-off is the (1 - TOP_PCT) quantile of the training risk scores.
alert_quantile = 1 - TOP_PCT
threshold = float(np.quantile(train_risk, alert_quantile))
print("Threshold (train quantile):", threshold)

# Label: 1 = flagged (high risk), 0 = not flagged
is_high_risk = test_risk >= threshold
test_flagged = is_high_risk.astype(int)

print("Flagged count:", int(np.count_nonzero(is_high_risk)), "of", test_flagged.size)


Threshold (train quantile): 0.5315013306922654
Flagged count: 3 of 400


In [8]:
# Guard against label leakage: a ground-truth column must never be part of the
# feature matrix. Known label column names are looked up in X_test's columns.
label_candidates = [c for c in ["label", "is_fraud", "fraud"] if c in X_test.columns]

if label_candidates:
    # Raise immediately. No labels are extracted first, because that step was
    # never used and could fail with an unrelated error before this one.
    raise RuntimeError(
        f"Found label column '{label_candidates[0]}' inside X_test features. "
        "This indicates label leakage. Remove label from feature set in 02_feature_engineering."
    )
else:
    print("No labels found in X_test (expected for unsupervised baseline). Skipping ROC-AUC.")


No labels found in X_test (expected for unsupervised baseline). Skipping ROC-AUC.


In [9]:
# Persist the fitted model together with its metadata and the alert threshold.
model_path = MODELS_DIR / "isolation_forest.pkl"
meta_path  = MODELS_DIR / "model_metadata.json"
thr_path   = MODELS_DIR / "thresholds.json"

# NOTE: joblib files are pickles; only load this artifact from a trusted location.
joblib.dump(model, model_path)

# Read the hyperparameters from the fitted estimator so the metadata always
# matches the model that was actually saved.
model_params = model.get_params()

metadata = {
    "model_type": "IsolationForest",
    "trained_at_utc": datetime.now(timezone.utc).isoformat(),
    "train_shape": [int(X_train.shape[0]), int(X_train.shape[1])],
    "test_shape": [int(X_test.shape[0]), int(X_test.shape[1])],
    "random_state": RANDOM_STATE,
    "n_estimators": model_params["n_estimators"],
    "contamination": model_params["contamination"],
    "train_time_s": train_time_s,
    "notes": "Unsupervised baseline. Risk score = -score_samples(). Threshold derived from train quantile."
}

thresholds = {
    "top_pct": TOP_PCT,
    "risk_threshold": threshold
}

meta_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
thr_path.write_text(json.dumps(thresholds, indent=2), encoding="utf-8")

print("Saved:", model_path)
print("Saved:", meta_path)
print("Saved:", thr_path)


Saved: C:\Users\admin\Desktop\AI Sec Project\GitHub\secure-ai-fraud-detection-pipeline\models\isolation_forest.pkl
Saved: C:\Users\admin\Desktop\AI Sec Project\GitHub\secure-ai-fraud-detection-pipeline\models\model_metadata.json
Saved: C:\Users\admin\Desktop\AI Sec Project\GitHub\secure-ai-fraud-detection-pipeline\models\thresholds.json


In [10]:
def _risk_summary(risk_values):
    """Return min, mean and max of a risk-score array as plain Python floats."""
    return {
        "min": float(np.min(risk_values)),
        "mean": float(np.mean(risk_values)),
        "max": float(np.max(risk_values)),
    }


# Evaluation summary: score distribution per split plus alert statistics on the test split.
report = {
    "train_risk": _risk_summary(train_risk),
    "test_risk": _risk_summary(test_risk),
    "alerts": {
        "threshold": threshold,
        "flagged_count": int(test_flagged.sum()),
        "flagged_rate": float(test_flagged.mean()),
    },
}

report_path = REPORTS_DIR / "eval_summary.json"
report_json = json.dumps(report, indent=2)
report_path.write_text(report_json, encoding="utf-8")
print("Saved:", report_path)


Saved: C:\Users\admin\Desktop\AI Sec Project\GitHub\secure-ai-fraud-detection-pipeline\reports\eval_summary.json
