# 02 – Feature Engineering (Fraud Detection)

**Project:** Secure AI Fraud Detection Pipeline  
**Purpose:** Build a robust feature pipeline for fraud detection following Privacy-by-Design principles.

**Outputs of this notebook**
- Time, amount, frequency, and contextual features
- Preprocessing pipeline (`models/feature_pipeline.pkl`)
- Feature names (`models/feature_names.json`)
- Preprocessed dataset (`data/processed/features.parquet`)
- Configuration file (`models/feature_config.json`)

> Notes:  
> - The notebook first looks for `data/processed/fraud_cleaned.csv` (from notebook 01) or falls back to `data/raw/fraud_simulated.csv`.  
> - If neither is available, a **synthetic demo dataset** is generated for reproducibility.


## Block 2 – Imports & Project Paths


In [19]:
# Imports
import os, json, warnings, joblib
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime, timedelta

from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

# Determine project root robustly
cwd = Path.cwd()
PROJECT_ROOT = cwd if (cwd / "data").exists() else (cwd.parent if cwd.name == "notebooks" else cwd)

# Define key paths
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
DATA_RAW       = PROJECT_ROOT / "data" / "raw"
MODELS         = PROJECT_ROOT / "models"

# Ensure directories exist
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
DATA_RAW.mkdir(parents=True, exist_ok=True)
MODELS.mkdir(parents=True, exist_ok=True)

# File paths
CLEAN_PATH = DATA_PROCESSED / "fraud_cleaned.csv"
RAW_PATH   = DATA_RAW / "fraud_simulated.csv"

print(f"PROJECT_ROOT = {PROJECT_ROOT}")
print(f"CLEAN_PATH   = {CLEAN_PATH}")
print(f"RAW_PATH     = {RAW_PATH}")


PROJECT_ROOT = C:\Users\admin\Desktop\AI Sec Project\GitHub\secure-ai-fraud-detection-pipeline\notebooks
CLEAN_PATH   = C:\Users\admin\Desktop\AI Sec Project\GitHub\secure-ai-fraud-detection-pipeline\notebooks\data\processed\fraud_cleaned.csv
RAW_PATH     = C:\Users\admin\Desktop\AI Sec Project\GitHub\secure-ai-fraud-detection-pipeline\notebooks\data\raw\fraud_simulated.csv


## Block 3 – Data loading (with fallback & synthetic demo dataset)


In [20]:
def _generate_synthetic(n: int = 5000, seed: int = 42) -> pd.DataFrame:
    """
    Generate a small synthetic fraud-like dataset for reproducible runs.
    Saves it to data/raw/fraud_simulated.csv as a convenience.
    """
    rng = np.random.default_rng(seed)
    start = datetime(2024, 1, 1)

    ts = [start + timedelta(minutes=int(x)) for x in rng.integers(0, 60*24*30, size=n)]
    amount = np.round(rng.gamma(shape=2.0, scale=50.0, size=n), 2)
    user_id = rng.integers(1000, 2000, size=n)
    country = rng.choice(
        ["DE","AT","CH","FR","IT","ES","NL","PL","US","GB"],
        size=n,
        p=[.22,.08,.05,.12,.08,.08,.08,.09,.10,.10]
    )
    channel = rng.choice(["app","web","pos"], size=n, p=[.4,.4,.2])
    merchant_category = rng.choice(["grocery","electronics","travel","gaming","fashion","other"], size=n)

    # Simple fraud label for later evaluation (optional downstream)
    fraud = (
        rng.random(size=n) < (
            0.02
            + 0.03*np.isin(country, ["US","GB"])
            + 0.02*(channel == "web")
            + 0.04*(merchant_category == "gaming")
            + 0.03*(amount > 300)
        )
    ).astype(int)

    df = pd.DataFrame({
        "timestamp": ts,
        "amount": amount,
        "user_id": user_id,
        "country": country,
        "channel": channel,
        "merchant_category": merchant_category,
        "is_fraud": fraud,
    })

    # Persist for reuse
    RAW_PATH.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(RAW_PATH, index=False)
    return df


def load_data() -> pd.DataFrame:
    """
    Load the cleaned dataset if present, otherwise raw.
    If neither exists, generate a synthetic dataset.
    """
    if CLEAN_PATH.exists():
        path = CLEAN_PATH
    elif RAW_PATH.exists():
        path = RAW_PATH
    else:
        print("No cleaned/raw CSV found – generating a synthetic demo dataset…")
        return _generate_synthetic()

    print(f"Loading data from: {path}")
    return pd.read_csv(path)


# Load and normalize timestamp dtype
df = load_data()

if "timestamp" in df.columns:
    try:
        df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce", utc=False)
    except Exception as e:
        print("Warning: could not parse 'timestamp' column:", e)

print(df.head())
print(df.dtypes)


Loading data from: C:\Users\admin\Desktop\AI Sec Project\GitHub\secure-ai-fraud-detection-pipeline\notebooks\data\processed\fraud_cleaned.csv
            timestamp  user_id  amount country event_type  hour  day  weekday  \
0 2025-07-24 10:15:00  USER123   199.5      DE      login    10   24        3   

   amount_scaled  
0            0.0  
timestamp        datetime64[ns]
user_id                  object
amount                  float64
country                  object
event_type               object
hour                      int64
day                       int64
weekday                   int64
amount_scaled           float64
dtype: object


## Block 4 – Schema Detection (numeric, categorical, ID, and label columns)


In [21]:
# Candidate lists – extend with your project’s actual column names if needed
possible_categorical = [
    "country", "channel", "merchant_category", "device", "currency", "ts_daypart"
]
possible_numeric = [
    "amount", "balance", "tx_count_1d", "tx_count_7d", "avg_amount_7d",
    "amount_log1p", "freq_user_day"
]
possible_id_like = ["user_id", "account_id", "customer_id", "merchant_id"]
label_cols = ["is_fraud", "label", "y"]

# Detect actual columns present in df
categorical_cols = [c for c in possible_categorical if c in df.columns]
numeric_cols     = [c for c in possible_numeric     if c in df.columns]
id_cols          = [c for c in possible_id_like     if c in df.columns]
label_col        = next((c for c in label_cols if c in df.columns), None)

print("=== Schema Detection ===")
print("categorical:", categorical_cols)
print("numeric    :", numeric_cols)
print("id-like    :", id_cols)
print("label      :", label_col)

# Quick sanity stats (optional)
if numeric_cols:
    print("\nNumeric preview (describe):")
    display(df[numeric_cols].describe())

for c in categorical_cols:
    print(f"\nTop categories for '{c}':")
    print(df[c].value_counts().head(10))

# Hints if something is missing
if not categorical_cols:
    print("\nNote: No categorical columns detected. Consider adding yours to 'possible_categorical'.")
if not numeric_cols:
    print("\nNote: No numeric columns detected. Consider adding yours to 'possible_numeric'.")
if not id_cols:
    print("\nNote: No ID-like column detected. Add e.g. 'customer_id' to 'possible_id_like'.")
if label_col is None:
    print("\nNote: No label found – unsupervised setting will be supported.")


=== Schema Detection ===
categorical: ['country']
numeric    : ['amount']
id-like    : ['user_id']
label      : None

Numeric preview (describe):


Unnamed: 0,amount
count,1.0
mean,199.5
std,
min,199.5
25%,199.5
50%,199.5
75%,199.5
max,199.5



Top categories for 'country':
country
DE    1
Name: count, dtype: int64

Note: No label found – unsupervised setting will be supported.


## Block 5 – Encoding categorical features


In [26]:
# Block 5 – Encoding categorical features (version-safe + robust)

from sklearn.preprocessing import OneHotEncoder
import inspect
import numpy as np

# Pick only columns that actually exist
candidate_cats = ["country", "transaction_type", "device_type", "ts_daypart"]
categorical_cols = [c for c in candidate_cats if c in df.columns]

if not categorical_cols:
    print("No categorical columns found among:", candidate_cats)
    features_encoded = df.copy()
else:
    # Handle sklearn versions: sparse_output (>=1.2) vs sparse (<1.2)
    if "sparse_output" in inspect.signature(OneHotEncoder).parameters:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    else:
        encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")

    # Fit/transform
    encoded = encoder.fit_transform(df[categorical_cols])

    # Build encoded DataFrame
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=df.index
    )

    # Join to original (drop raw categorical cols)
    features_encoded = df.drop(columns=categorical_cols).join(encoded_df)

print("Original shape:", df.shape)
print("Encoded shape :", features_encoded.shape)
features_encoded.head()


Original shape: (1, 9)
Encoded shape : (1, 9)


Unnamed: 0,timestamp,user_id,amount,event_type,hour,day,weekday,amount_scaled,country_DE
0,2025-07-24 10:15:00,USER123,199.5,login,10,24,3,0.0,1.0


## Block 6 – Scaling numeric features


In [27]:
from sklearn.preprocessing import RobustScaler
import numpy as np

# Identify label column (to exclude from scaling)
possible_labels = ["is_fraud", "label", "y"]
label_col = next((c for c in possible_labels if c in features_encoded.columns), None)

# Exclude non-scalable columns
exclude_cols = set([label_col] if label_col else []).union(
    {"timestamp", "date", "user_id", "account_id", "customer_id", "merchant_id"}
)

# Find numeric columns eligible for scaling
numeric_candidates = features_encoded.select_dtypes(include=[np.number]).columns.tolist()
numeric_to_scale = [
    c for c in numeric_candidates
    if c not in exclude_cols and not c.endswith("_scaled")
]

if not numeric_to_scale:
    print("No numeric columns to scale.")
    features_scaled = features_encoded.copy()
else:
    scaler = RobustScaler()
    scaled_array = scaler.fit_transform(features_encoded[numeric_to_scale])

    # Create new DataFrame with "_scaled" suffix
    scaled_df = pd.DataFrame(
        scaled_array,
        columns=[f"{c}_scaled" for c in numeric_to_scale],
        index=features_encoded.index
    )

    # Drop any existing "_scaled" columns before joining (avoid duplicates)
    overlapping = set(features_encoded.columns) & set(scaled_df.columns)
    if overlapping:
        features_encoded = features_encoded.drop(columns=list(overlapping))

    features_scaled = features_encoded.join(scaled_df)

    # Persist scaler for later use
    try:
        joblib.dump(scaler, MODELS / "feature_scaler.pkl")
    except Exception as e:
        print("Warning: could not save scaler:", e)

print("Before scaling shape:", features_encoded.shape)
print("After scaling shape :", features_scaled.shape)
features_scaled.head()


Before scaling shape: (1, 8)
After scaling shape : (1, 13)


Unnamed: 0,timestamp,user_id,amount,event_type,hour,day,weekday,country_DE,amount_scaled,hour_scaled,day_scaled,weekday_scaled,country_DE_scaled
0,2025-07-24 10:15:00,USER123,199.5,login,10,24,3,1.0,0.0,0.0,0.0,0.0,0.0


## Block 7 – Build preprocessing pipeline & export features/metadata


In [28]:
# Block 7 – Build preprocessing pipeline & export features/metadata
# - Creates a unified ColumnTransformer (OneHotEncoder + RobustScaler)
# - Fits on engineered dataframe `df`
# - Exports feature matrix (Parquet -> CSV fallback)
# - Persists pipeline + feature names + config for training/inference

import json, inspect
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
import joblib
import numpy as np
import pandas as pd

# Re-detect label and column sets to be safe
possible_labels = ["is_fraud", "label", "y"]
label_col = next((c for c in possible_labels if c in df.columns), None)

# Categorical columns (reuse if defined earlier, else detect)
if "categorical_cols" in globals() and categorical_cols:
    cats = [c for c in categorical_cols if c in df.columns]
else:
    candidate_cats = ["country", "channel", "merchant_category", "device", "currency", "ts_daypart"]
    cats = [c for c in candidate_cats if c in df.columns]

# Numeric columns to scale (exclude IDs/timestamps/labels)
exclude = {"timestamp", "date", "user_id", "account_id", "customer_id", "merchant_id"}
if label_col: exclude.add(label_col)
nums_all = df.select_dtypes(include=[np.number]).columns.tolist()
nums = [c for c in nums_all if c not in exclude and not c.endswith("_scaled")]

# Version-safe OneHotEncoder
if "sparse_output" in inspect.signature(OneHotEncoder).parameters:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
else:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

scaler = RobustScaler()

# Define ColumnTransformer
preprocess = ColumnTransformer(
    transformers=[
        ("num", scaler, nums if nums else []),
        ("cat", ohe,   cats if cats else []),
    ],
    remainder="drop"
)

# Select fit dataframe (only the columns used by the transformer)
cols_for_fit = (nums if nums else []) + (cats if cats else [])
if not cols_for_fit:
    raise ValueError("No columns selected for preprocessing. Check your schema detection in earlier blocks.")

X_mat = preprocess.fit_transform(df[cols_for_fit])

# Build feature names
num_feature_names = nums
cat_feature_names = list(preprocess.named_transformers_["cat"].get_feature_names_out(cats)) if cats else []
feature_names = num_feature_names + cat_feature_names

# Assemble feature DataFrame
features_df = pd.DataFrame(X_mat, columns=feature_names, index=df.index)
if label_col:
    features_df[label_col] = df[label_col].values

# Export paths
features_parquet = DATA_PROCESSED / "features.parquet"
features_csv     = DATA_PROCESSED / "features.csv"
pipeline_path    = MODELS / "feature_pipeline.pkl"
featnames_path   = MODELS / "feature_names.json"
featcfg_path     = MODELS / "feature_config.json"

# Save feature matrix (Parquet preferred, CSV fallback)
try:
    import pyarrow  # noqa: F401
    features_df.to_parquet(features_parquet, index=False)
    saved_features_path = features_parquet
except Exception as e:
    print("Parquet export failed, falling back to CSV:", e)
    features_df.to_csv(features_csv, index=False)
    saved_features_path = features_csv

# Persist pipeline & metadata
joblib.dump(preprocess, pipeline_path)

with open(featnames_path, "w") as f:
    json.dump(feature_names, f, indent=2)

feature_config = {
    "categorical_cols": cats,
    "numeric_cols": nums,
    "label_col": label_col,
    "used_columns_for_fit": cols_for_fit
}
with open(featcfg_path, "w") as f:
    json.dump(feature_config, f, indent=2)

print("Saved feature matrix to:", saved_features_path)
print("Saved pipeline to:", pipeline_path)
print("Saved feature names to:", featnames_path)
print("Saved feature config to:", featcfg_path)

# Quick preview
display(features_df.head())
print("Feature matrix shape:", features_df.shape)


Saved feature matrix to: C:\Users\admin\Desktop\AI Sec Project\GitHub\secure-ai-fraud-detection-pipeline\notebooks\data\processed\features.parquet
Saved pipeline to: C:\Users\admin\Desktop\AI Sec Project\GitHub\secure-ai-fraud-detection-pipeline\notebooks\models\feature_pipeline.pkl
Saved feature names to: C:\Users\admin\Desktop\AI Sec Project\GitHub\secure-ai-fraud-detection-pipeline\notebooks\models\feature_names.json
Saved feature config to: C:\Users\admin\Desktop\AI Sec Project\GitHub\secure-ai-fraud-detection-pipeline\notebooks\models\feature_config.json


Unnamed: 0,amount,hour,day,weekday,country_DE
0,0.0,0.0,0.0,0.0,1.0


Feature matrix shape: (1, 5)


## Block 8 – Optional train/test split export (supervised or unsupervised)


In [29]:
# Block 8 – Optional train/test split export
# - If a label column exists: stratified train/test split and export
# - If no label: export a shuffled holdout split for unsupervised workflows

from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit, train_test_split

# Ensure we have the feature frame from Block 7
assert "features_df" in globals(), "features_df not found. Run Block 7 first."

# Paths
X_train_pq = DATA_PROCESSED / "X_train.parquet"
X_test_pq  = DATA_PROCESSED / "X_test.parquet"
y_train_csv = DATA_PROCESSED / "y_train.csv"
y_test_csv  = DATA_PROCESSED / "y_test.csv"

# Identify label if present
possible_labels = ["is_fraud", "label", "y"]
label_col = next((c for c in possible_labels if c in features_df.columns), None)

def _save_matrix(df: pd.DataFrame, path_parquet: Path) -> Path:
    """Save DataFrame to Parquet (preferred) with CSV fallback."""
    try:
        import pyarrow  # noqa: F401
        df.to_parquet(path_parquet, index=False)
        return path_parquet
    except Exception as e:
        csv_path = path_parquet.with_suffix(".csv")
        print(f"Parquet export failed for {path_parquet.name}; falling back to CSV:", e)
        df.to_csv(csv_path, index=False)
        return csv_path

if label_col:
    # Supervised case
    print(f"Label detected: {label_col} – creating stratified train/test split.")
    y = features_df[label_col].astype(int).values
    X = features_df.drop(columns=[label_col])

    # Guard: ensure both classes exist; if not, use standard split
    if len(np.unique(y)) > 1:
        splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
        idx_train, idx_test = next(splitter.split(X, y))
    else:
        print("Only one class present; using random split without stratification.")
        idx_train, idx_test = train_test_split(
            np.arange(len(X)), test_size=0.2, random_state=42, shuffle=True
        )

    X_train, X_test = X.iloc[idx_train], X.iloc[idx_test]
    y_train, y_test = y[idx_train], y[idx_test]

    # Persist
    saved_X_train = _save_matrix(X_train, X_train_pq)
    saved_X_test  = _save_matrix(X_test,  X_test_pq)
    pd.Series(y_train, name=label_col).to_csv(y_train_csv, index=False)
    pd.Series(y_test,  name=label_col).to_csv(y_test_csv,  index=False)

    print("Saved:")
    print(" -", saved_X_train)
    print(" -", saved_X_test)
    print(" -", y_train_csv)
    print(" -", y_test_csv)
    print("Shapes:", X_train.shape, X_test.shape, " | Class balance (train/test):",
          np.mean(y_train).round(4), "/", np.mean(y_test).round(4))
else:
    # Unsupervised case
    print("No label detected – exporting unsupervised split for development convenience.")
    # Shuffle and split 80/20 on rows
    X = features_df.sample(frac=1.0, random_state=42).reset_index(drop=True)
    n_test = max(1, int(0.2 * len(X)))
    X_train, X_test = X.iloc[:-n_test], X.iloc[-n_test:]

    saved_X_train = _save_matrix(X_train, X_train_pq)
    saved_X_test  = _save_matrix(X_test,  X_test_pq)

    print("Saved:")
    print(" -", saved_X_train)
    print(" -", saved_X_test)
    print("Shapes:", X_train.shape, X_test.shape)


No label detected – exporting unsupervised split for development convenience.
Saved:
 - C:\Users\admin\Desktop\AI Sec Project\GitHub\secure-ai-fraud-detection-pipeline\notebooks\data\processed\X_train.parquet
 - C:\Users\admin\Desktop\AI Sec Project\GitHub\secure-ai-fraud-detection-pipeline\notebooks\data\processed\X_test.parquet
Shapes: (0, 5) (1, 5)
