# 02 – Feature Engineering (Fraud Detection)
**Projekt:** Secure AI Fraud Detection Pipeline  
**Zweck:** Robuste Feature-Pipeline für Fraud-Detection mit Privacy-by-Design-Grundsätzen.

**Output dieses Notebooks**
- Zeit-, Betrag-, Frequenz- und Kontext-Features
- Scaler/Encoder-Pipeline (`models/feature_pipeline.pkl`)
- Feature-Namen (`models/feature_names.json`)
- Vorverarbeitete Daten (`data/processed/features.parquet`; falls der Parquet-Export nicht möglich ist, ersatzweise `data/processed/features.csv`)
- Konfig (`models/feature_config.json`)

> Hinweise:  
> - Das Notebook nutzt `data/processed/fraud_cleaned.csv` (falls vorhanden) oder `data/raw/fraud_simulated.csv`.  
> - Wenn beides fehlt, wird ein **synthetischer Demo-Datensatz** erzeugt und unter `data/raw/fraud_simulated.csv` gespeichert (für reproduzierbare Läufe).

In [None]:
# Imports und Pfade
import os, json, warnings, joblib
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime, timedelta

from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Suppress all warnings for the rest of the session to keep cell output compact.
warnings.filterwarnings("ignore")

# Every location is derived from the directory the notebook is started in.
PROJECT_ROOT = Path(".").resolve()
_DATA_DIR = PROJECT_ROOT / "data"
DATA_PROCESSED = _DATA_DIR / "processed"
DATA_RAW = _DATA_DIR / "raw"
MODELS = PROJECT_ROOT / "models"

# Create the working directories up front (no error if they already exist).
for _directory in (DATA_PROCESSED, DATA_RAW, MODELS):
    _directory.mkdir(parents=True, exist_ok=True)

# Candidate input files; nothing is opened here.
CLEAN_PATH = DATA_PROCESSED.joinpath("fraud_cleaned.csv")
RAW_PATH = DATA_RAW.joinpath("fraud_simulated.csv")

print(f"PROJECT_ROOT = {PROJECT_ROOT}")

## 1. Daten laden (mit Fallback & synthetischem Demo-Datensatz)

In [None]:
def _generate_synthetic(n=5000, seed=42):
    """Create a reproducible synthetic transaction table and save it as CSV.

    Transactions are spread over 30 days starting 2024-01-01 with
    gamma-distributed amounts, user ids in ``[1000, 2000)`` and randomly drawn
    country / channel / merchant-category values. The fraud label is sampled
    per row from a base rate of 2% that is raised for US/GB countries, the
    "web" channel, the "gaming" category and amounts above 300.

    Args:
        n: Number of rows to generate.
        seed: Seed for ``numpy.random.default_rng``; the same seed yields the
            same table.

    Returns:
        pandas.DataFrame with the columns ``timestamp``, ``amount``,
        ``user_id``, ``country``, ``channel``, ``merchant_category`` and
        ``is_fraud`` (0/1).

    Side effects:
        Writes the table to ``RAW_PATH`` (parent directory is created if
        needed), overwriting an existing file.
    """
    rng = np.random.default_rng(seed)
    start = datetime(2024, 1, 1)
    ts = [start + timedelta(minutes=int(x)) for x in rng.integers(0, 60*24*30, size=n)]
    amount = np.round(rng.gamma(shape=2.0, scale=50.0, size=n), 2)
    user_id = rng.integers(1000, 2000, size=n)
    country = rng.choice(
        ["DE", "AT", "CH", "FR", "IT", "ES", "NL", "PL", "US", "GB"],
        size=n,
        p=[.22, .08, .05, .12, .08, .08, .08, .09, .1, .1],
    )
    channel = rng.choice(["app", "web", "pos"], size=n, p=[.4, .4, .2])
    merchant_category = rng.choice(
        ["grocery", "electronics", "travel", "gaming", "fashion", "other"], size=n
    )

    # Per-row fraud probability. ``country`` is a NumPy array (no ``.isin``
    # method), so membership is tested with ``np.isin``.
    fraud_probability = (
        0.02
        + 0.03 * np.isin(country, ["US", "GB"])
        + 0.02 * (channel == "web")
        + 0.04 * (merchant_category == "gaming")
        + 0.03 * (amount > 300)
    )
    # Label for later evaluation (optional)
    fraud = (rng.random(size=n) < fraud_probability).astype(int)

    df = pd.DataFrame({
        "timestamp": ts,
        "amount": amount,
        "user_id": user_id,
        "country": country,
        "channel": channel,
        "merchant_category": merchant_category,
        "is_fraud": fraud,
    })
    RAW_PATH.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(RAW_PATH, index=False)
    return df

def load_data():
    """Load the transaction table from the first available source.

    ``CLEAN_PATH`` is preferred over ``RAW_PATH``. When neither file exists,
    a synthetic demo dataset is generated via ``_generate_synthetic()`` and
    returned instead.

    Returns:
        The loaded (or freshly generated) pandas.DataFrame.
    """
    source = next((p for p in (CLEAN_PATH, RAW_PATH) if p.exists()), None)
    if source is None:
        print("Weder cleaned noch raw gefunden – generiere synthetischen Demo-Datensatz…")
        return _generate_synthetic()

    print(f"Lade Daten aus: {source}")
    return pd.read_csv(source)

df = load_data()

# Normalise the timestamp column to a datetime dtype so that the ``.dt``
# accessor works regardless of whether the data came from CSV (strings) or
# from the synthetic generator.
if "timestamp" in df.columns:
    try:
        df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce", utc=False)
    except Exception as exc:
        # Still best effort (the column is left unchanged), but the failure is
        # reported instead of being swallowed silently.
        print(f"Warnung: 'timestamp' konnte nicht in datetime konvertiert werden ({exc!r}).")
    else:
        # errors="coerce" turns unparseable values into NaT without raising;
        # report how many rows are affected.
        n_invalid = int(df["timestamp"].isna().sum())
        if n_invalid:
            print(f"Warnung: {n_invalid} Zeitstempel fehlen oder waren nicht parsebar (NaT).")

print(df.head())
print(df.dtypes)

## 2. Basisbereinigung & Schema-Erkennung

In [None]:
# Candidate column names per role; they are checked against df below.
possible_categorical = ["country", "channel", "merchant_category", "device", "currency"]
possible_numeric = ["amount", "balance", "tx_count_1d", "tx_count_7d", "avg_amount_7d"]
possible_id_like = ["user_id", "account_id", "customer_id", "merchant_id"]
label_cols = ["is_fraud", "label", "y"]


def _present(candidates):
    """Return the candidates that are columns of ``df``, in candidate order."""
    return [name for name in candidates if name in df.columns]


# Keep only the columns that actually exist in the loaded data.
categorical_cols = _present(possible_categorical)
numeric_cols = _present(possible_numeric)
id_cols = _present(possible_id_like)

# The first matching label candidate wins; None when there is no label column.
_found_labels = _present(label_cols)
label_col = _found_labels[0] if _found_labels else None

print("Gefundene Spalten:")
for _tag, _cols in (
    ("categorical:", categorical_cols),
    ("numeric:", numeric_cols),
    ("id-like:", id_cols),
    ("label:", label_col),
):
    print(_tag, _cols)

## 3. Zeit-Features & Betragstransformationen

In [None]:
# Time-derived features
if "timestamp" in df.columns:
    df["ts_hour"] = df["timestamp"].dt.hour
    df["ts_dayofweek"] = df["timestamp"].dt.dayofweek
    df["ts_is_weekend"] = (df["ts_dayofweek"] >= 5).astype(int)
    # Day-part bins by hour: 0-5 night, 6-11 morning, 12-17 afternoon,
    # 18-23 evening. pd.cut returns a Categorical; it is converted to plain
    # objects because filling missing values with "unknown" raises on a
    # Categorical that has no such category.
    df["ts_daypart"] = pd.cut(
        df["ts_hour"],
        bins=[-1, 5, 11, 17, 23],
        labels=["night", "morning", "afternoon", "evening"],
    ).astype(object)
    if "ts_daypart" not in categorical_cols:
        categorical_cols.append("ts_daypart")
    # NOTE(review): ts_hour, ts_dayofweek and ts_is_weekend are not registered
    # in numeric_cols/categorical_cols, so a transformer built only from those
    # lists with remainder="drop" will not use them - confirm this is intended.

# Amount (log transform to dampen the effect of outliers)
if "amount" in df.columns:
    # Coerce first: the column may still hold text at this point, and
    # np.log1p cannot operate on an object column.
    amount_numeric = pd.to_numeric(df["amount"], errors="coerce")
    # NOTE(review): log1p is undefined for amounts <= -1 (NaN / -inf);
    # presumably amounts are non-negative - verify against the data source.
    df["amount_log1p"] = np.log1p(amount_numeric)
    numeric_cols = sorted(set(numeric_cols + ["amount", "amount_log1p"]))

## 4. Häufigkeits- & Nutzer-Statistik-Features

In [None]:
# Frequency features (simplified; requires a timestamp and an id-like column)
if "timestamp" in df.columns and id_cols:
    id_col = id_cols[0]
    df = df.sort_values(["timestamp"])
    # Transactions per user per calendar day
    df["date"] = df["timestamp"].dt.date
    tx_per_user_day = df.groupby([id_col, "date"])["timestamp"].transform("count")
    # Rows with a missing timestamp have no (user, day) group and may come back
    # as NaN; count them as 0 so the integer cast cannot fail.
    df["freq_user_day"] = tx_per_user_day.fillna(0).astype(int)
    numeric_cols = sorted(set(numeric_cols + ["freq_user_day"]))

# Coerce numeric feature columns that were read as text
for c in numeric_cols:
    if df[c].dtype == "O":
        # try to convert; unparseable values become NaN
        df[c] = pd.to_numeric(df[c], errors="coerce")

# Simple missing-value strategy: column median for numeric features,
# the literal "unknown" for categorical ones.
# Guarded so that an empty numeric_cols list is a no-op.
if numeric_cols:
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
for c in categorical_cols:
    column = df[c]
    # A Categorical rejects fill values outside its categories, so such
    # columns are converted to plain objects before filling.
    if isinstance(column.dtype, pd.CategoricalDtype):
        column = column.astype(object)
    df[c] = column.fillna("unknown")

## 5. Preprocessing-Pipeline (OneHotEncoder + RobustScaler)

In [None]:
from sklearn.utils.validation import check_is_fitted

# Numeric columns are scaled with RobustScaler, categorical columns are
# one-hot encoded (unknown categories are ignored at transform time, dense
# output). Every column not listed is dropped by the transformer.
numeric_step = ("num", RobustScaler(), numeric_cols or [])
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    categorical_cols or [],
)
preprocess = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder="drop",
)

# Column roles, exported later as JSON metadata
feature_config = dict(
    numeric_cols=numeric_cols,
    categorical_cols=categorical_cols,
    label_col=label_col,
    id_cols=id_cols,
)

# Build X / y: the label and the raw time columns are removed from X
drop_cols = [] if label_col is None else [label_col]
drop_cols += [name for name in ("timestamp", "date") if name in df.columns]

existing_drop_cols = [name for name in drop_cols if name in df.columns]
X = df.drop(columns=existing_drop_cols, errors="ignore")
if label_col in df.columns:
    y = df[label_col]
else:
    y = None

# Input for the transformer: the registered feature columns, or - when none
# are registered - every numeric/object column of X
selected_cols = numeric_cols + categorical_cols
if selected_cols:
    X_for_fit = X[selected_cols]
else:
    X_for_fit = X.select_dtypes(include=[np.number, "object"])

Xt = preprocess.fit_transform(X_for_fit)

# Rebuild the output feature names: numeric names first, then the one-hot
# names reported by the fitted encoder
num_feats = numeric_cols
cat_encoder = [step for step in preprocess.transformers_ if step[0] == "cat"]
cat_feats = []
if cat_encoder and categorical_cols:
    enc = cat_encoder[0][1]
    cat_feats = list(enc.get_feature_names_out(categorical_cols))
feature_names = [*num_feats, *cat_feats]

print(f"Shape (features): {Xt.shape}, Num features: {len(feature_names)}")

## 6. Artefakte & Daten exportieren

In [None]:
# Output locations
features_path = DATA_PROCESSED / "features.parquet"
feature_pipeline_path = MODELS / "feature_pipeline.pkl"
feature_names_path = MODELS / "feature_names.json"
feature_config_path = MODELS / "feature_config.json"

# Build the export frame once; the label (if any) is appended as last column.
features_df = pd.DataFrame(Xt, columns=feature_names)
if y is not None:
    features_df[label_col] = y.values

# Prefer Parquet; fall back to CSV when pyarrow is missing or the write fails.
# The reason for the fallback is printed instead of being discarded.
try:
    import pyarrow  # noqa: F401
    features_df.to_parquet(features_path, index=False)
except Exception as exc:
    features_path = DATA_PROCESSED / "features.csv"
    print(f"Parquet-Export nicht möglich ({exc!r}) – schreibe stattdessen CSV.")
    features_df.to_csv(features_path, index=False)

# Pipeline & metadata
# NOTE: joblib files are pickle-based; load them only from trusted sources.
joblib.dump(preprocess, feature_pipeline_path)
with open(feature_names_path, "w", encoding="utf-8") as f:
    json.dump(feature_names, f, indent=2)
with open(feature_config_path, "w", encoding="utf-8") as f:
    json.dump(feature_config, f, indent=2)

print("Gespeichert:")
print("-", features_path)
print("-", feature_pipeline_path)
print("-", feature_names_path)
print("-", feature_config_path)

## 7. Optional: Train/Test Split speichern (für 03_model_training.ipynb)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

if y is not None:
    # NOTE(review): Xt was produced by fitting the preprocessing pipeline on
    # all rows, so scaler/encoder statistics include the test rows as well -
    # confirm this is acceptable for the evaluation.
    # Single stratified 80/20 split with a fixed seed for reproducibility.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    idx_train, idx_test = next(sss.split(Xt, y.values))
    X_train, X_test = Xt[idx_train], Xt[idx_test]
    y_train, y_test = y.values[idx_train], y.values[idx_test]

    def _export_split(arr, path, columns=None):
        """Write ``arr`` as Parquet to ``path``, or as CSV if that fails.

        Args:
            arr: 2-D array holding one split of the feature matrix.
            path: Target path; on fallback its suffix is replaced by ``.csv``.
            columns: Optional column names. Without them the frame gets
                integer column labels, which some pandas/pyarrow versions
                reject when writing Parquet.

        Returns:
            The path that was actually written.
        """
        frame = pd.DataFrame(arr, columns=columns)
        try:
            import pyarrow  # noqa: F401
            frame.to_parquet(path, index=False)
        except Exception as exc:
            path = path.with_suffix(".csv")
            print(f"Parquet-Export nicht möglich ({exc!r}) – schreibe {path.name}.")
            frame.to_csv(path, index=False)
        return path

    # Export with the real feature names so the splits stay self-describing.
    _export_split(X_train, DATA_PROCESSED / "X_train.parquet", columns=feature_names)
    _export_split(X_test, DATA_PROCESSED / "X_test.parquet", columns=feature_names)
    pd.Series(y_train).to_csv(DATA_PROCESSED / "y_train.csv", index=False)
    pd.Series(y_test).to_csv(DATA_PROCESSED / "y_test.csv", index=False)

    print("Train/Test Splits exportiert.")
else:
    print("Kein Label gefunden – überspringe Train/Test Split (unüberwachtes Setting).")

## 8. Kurzer Qualitätscheck
- Anzahl Features & Beispielzeilen
- Verteilung zentraler numerischer Variablen
- Kardinalität der kategorischen Variablen

In [None]:
# Preview of the exported feature matrix
print('Feature-Matrix Vorschau:')
display(features_df.head())

# Summary statistics of the numeric input columns (pre-scaling values in df)
numeric_summary_cols = feature_config.get("numeric_cols", [])
if numeric_summary_cols:
    display(df[numeric_summary_cols].describe())

# Ten most frequent values of each categorical column
for cat_name in feature_config.get("categorical_cols", []):
    print(f"Top-Kategorien für {cat_name}:")
    top_values = df[cat_name].value_counts().head(10)
    print(top_values)

---
### Nächste Schritte (für `03_model_training.ipynb`)
- IsolationForest oder andere Anomaly-Modelle auf `data/processed/X_train.parquet` trainieren
- Modell & Threshold speichern (`models/isolation_forest.joblib`, `models/threshold.json`)
- Explainability in `05_explainability_shap.ipynb` vorbereiten (mit Fokus auf die wichtigsten Merkmale)