# 🔒 FraudGuard Training Notebook

**AD-RL-GNN Fraud Detection** | Full training pipeline with mini-batch processing

This notebook trains the FraudGuard model on the IEEE-CIS fraud detection dataset using:
- **NeighborLoader** for memory-efficient mini-batch training
- **FAISS** for similarity graph construction (GPU if available, CPU fallback)
- **Class-weighted CrossEntropyLoss** for class-imbalanced learning (the training cells below set it as the trainer's criterion)

## 1️⃣ Setup Environment

In [None]:
# Attach Google Drive to the Colab filesystem; later cells read the dataset
# from it and write models and logs back to it.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
# Clone repository
!git clone https://github.com/govind104/fraudguard.git
%cd fraudguard

In [None]:
# Install dependencies
# Note: faiss-gpu may not be available on Python 3.12
# The code will fallback to faiss-cpu automatically
# GNN training STILL runs on GPU - only graph building uses CPU FAISS
!pip install -q torch torch-geometric pandas numpy scikit-learn pyyaml structlog

# Try faiss-gpu first, fallback to faiss-cpu
import subprocess
result = subprocess.run(['pip', 'install', '-q', 'faiss-gpu'], capture_output=True)
if result.returncode != 0:
    print('‚ö†Ô∏è faiss-gpu not available, using faiss-cpu')
        print('   (Graph building on CPU, but GNN training still runs on GPU!)')
            !pip install -q faiss-cpu
            else:
                print('‚úì faiss-gpu installed')

                # Installing torch-scatter and torch-sparse for NeighborLoader
                import torch

                # 1. Get exact versions
                pt_version = torch.__version__.split('+')[0]  # e.g., 2.5.1
                cuda_version = "cu" + torch.version.cuda.replace('.', '')  # e.g., cu124
                wheel_url = f"https://data.pyg.org/whl/torch-{pt_version}+{cuda_version}.html"

                print(f"PyTorch: {pt_version}, CUDA: {cuda_version}")
                print(f"Downloading from: {wheel_url}")

                # 2. Install with visible output (force reinstall to fix broken partial installs)
                !pip install --force-reinstall torch-scatter torch-sparse -f $wheel_url

                # Install repo in editable mode
                !pip install -e .

                print('\n‚úì Environment setup complete')

In [None]:
# Smoke test: confirm the compiled PyG extensions and the project package import.
try:
    import torch_scatter
    import torch_sparse
    import fraudguard
except ImportError as e:
    # Re-run the installation cell only when this message appears.
    print(f"‚ùå Still missing libraries: {e}")
else:
    print("‚úÖ Success! Libraries are installed and loaded.")

## 2️⃣ Configuration

In [None]:
import os
import gc
import sys
import time
import numpy as np
import faiss

# --- Google Drive locations (adjust to your own Drive layout) ---
DATA_DIR = "/content/drive/MyDrive/ieee-fraud-detection"
MODELS_DIR = "/content/drive/MyDrive/fraudguard-models"
LOGS_DIR = "/content/drive/MyDrive/fraudguard-logs"

# --- Training hyperparameters ---
SAMPLE_FRAC = 1.0         # fraction of the dataset to load; 1.0 means all of it
MAX_EPOCHS = 30
BATCH_SIZE = 2_048        # batch size passed to NeighborLoader
NUM_NEIGHBORS = [25, 10]  # neighbours sampled at hop 1 and hop 2
LEARNING_RATE = 3e-3      # Adam learning rate
FRAUD_WEIGHT = 25.0       # loss weight of the fraud (minority) class
GRADIENT_CLIP = 1.0       # upper bound on the gradient norm

# --- MCD alpha for each arm of the A/B comparison ---
BASELINE_ALPHA = 0.0      # baseline runs without MCD
GOLD_ALPHA = 0.8          # aggressive MCD for AD-RL-GNN

# Make sure the output folders exist
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(LOGS_DIR, exist_ok=True)

# Echo the effective settings
print(f"Data: {DATA_DIR}", f"Models: {MODELS_DIR}", f"Logs: {LOGS_DIR}", sep="\n")
print(f"\nBatch size: {BATCH_SIZE}", f"Sample fraction: {SAMPLE_FRAC*100:.0f}%", sep="\n")

## 3️⃣ Verify GPU and FAISS

In [None]:
# Report which accelerators PyTorch and FAISS can see.
# torch is imported here because the only other import lives in the install
# cell; without it this cell raises NameError after a kernel restart in which
# installation was skipped.
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    print("\n‚úì GNN training will run on GPU")
else:
    print("\n‚ö†Ô∏è WARNING: No GPU detected. Go to Runtime > Change runtime type > GPU")

# Check FAISS GPU
# Builds without a `get_num_gpus` attribute are reported as having 0 GPUs.
faiss_gpus = faiss.get_num_gpus() if hasattr(faiss, 'get_num_gpus') else 0
print(f"\nFAISS GPUs: {faiss_gpus}")
if faiss_gpus == 0:
    print("   (Using CPU FAISS for graph building - this is OK)")

## 4️⃣ Load and Preprocess Data

In [None]:
sys.path.insert(0, '/content/fraudguard')

from pathlib import Path
from torch_geometric.data import Data
from torch_geometric.loader import NeighborLoader
from src.data.loader import FraudDataLoader
from src.utils.config import load_data_config, load_model_config
from src.utils.device_utils import set_seed, get_device
from src.training.trainer import FraudTrainer
from src.training.evaluator import Evaluator

# Fix the random seed, then pick the compute device
set_seed(42)
device = get_device()
print(f"Using device: {device}")

# Start from the project data config, but read raw files from the Drive
# folder configured in this notebook
data_cfg = load_data_config()
data_cfg.paths.raw_data_dir = Path(DATA_DIR)

# Load the (optionally sub-sampled) training table and split it three ways
loader = FraudDataLoader(config=data_cfg)
df = loader.load_train_data(sample_frac=SAMPLE_FRAC)
train_df, val_df, test_df = loader.create_splits(df)

# Summary of what was loaded
print(
    f"\nData loaded:",
    f"  Train: {len(train_df):,}",
    f"  Val: {len(val_df):,}",
    f"  Test: {len(test_df):,}",
    sep="\n",
)
print(f"  Fraud rate: {df['isFraud'].mean()*100:.2f}%")

# Metrics helper shared by both training cells and the final comparison
evaluator = Evaluator()

## 5️⃣ Run Full AD-RL-GNN Pipeline

We use the `FraudTrainer` class to orchestrate the full pipeline, including:
1. **AdaptiveMCD**: Intelligent majority downsampling
2. **RL Agent**: Dynamic subgraph selection (Random Walk, K-Hop, K-Ego)
3. **Graph Enhancement**: Adding semantic edges
4. **GNN Training**: class-weighted CrossEntropyLoss (fraud class weight = `FRAUD_WEIGHT`, 25x as configured above)

### Vanilla Baseline GNN

In [None]:
# Vanilla baseline arm of the A/B comparison: builds the graph and trains the
# GNN with the shared hyperparameters, but runs neither AdaptiveMCD nor the RL
# enhancement step. Produces best_gmeans_baseline, baseline_vram and
# baseline_p95 for the final comparison cell.
model_cfg = load_model_config()
data_cfg = load_data_config()
data_cfg.paths.raw_data_dir = Path(DATA_DIR)

# Aligned Hyperparameters (Strict Ceteris Paribus)
model_cfg.training["max_epochs"] = MAX_EPOCHS
model_cfg.training["learning_rate"] = LEARNING_RATE
model_cfg.adaptive_mcd["alpha"] = BASELINE_ALPHA  # No MCD
model_cfg.graph.similarity_threshold = 0.75

print("Initializing Vanilla Baseline (No MCD, No RL)...")
trainer = FraudTrainer(model_config=model_cfg, data_config=data_cfg, device=device)

# Preprocess
trainer._preprocess(train_df, val_df, test_df)
trainer._build_graph()
trainer._prepare_labels(train_df, val_df, test_df)

# CUDA-only bookkeeping is guarded so the cell also runs on a CPU runtime.
use_cuda = torch.cuda.is_available()

# Reset VRAM Monitor
if use_cuda:
    torch.cuda.reset_peak_memory_stats()

# Apply class weight
weights = torch.tensor([1.0, FRAUD_WEIGHT]).to(device)
trainer._init_model()
trainer.criterion = torch.nn.CrossEntropyLoss(weight=weights)
model = trainer.model
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)

# Loaders
optimized_data = Data(x=trainer.X_full, edge_index=trainer.edge_index, y=trainer.all_labels)
optimized_data.train_mask = trainer.train_mask
optimized_data.val_mask = trainer.val_mask
train_loader = NeighborLoader(optimized_data, num_neighbors=NUM_NEIGHBORS, batch_size=BATCH_SIZE, input_nodes=optimized_data.train_mask, shuffle=True)
val_loader = NeighborLoader(optimized_data, num_neighbors=NUM_NEIGHBORS, batch_size=BATCH_SIZE, input_nodes=optimized_data.val_mask, shuffle=False)

baseline_ckpt = f"{MODELS_DIR}/fraudguard_baseline.pt"

print(f"\n Starting Baseline Training ({MAX_EPOCHS} Epochs)...")
# Start below any reachable score so the first epoch always writes a
# checkpoint. Starting at 0 with a strict ">" would skip saving whenever
# G-Means stays at 0, and the reload below would then fail or pick up a stale
# file left in MODELS_DIR by an earlier run.
best_gmeans_baseline = float("-inf")

for epoch in range(MAX_EPOCHS):
    model.train()
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch.x, batch.edge_index)
        # Only the first `batch_size` rows are scored (the seed nodes of the mini-batch).
        loss = trainer.criterion(out[:batch.batch_size], batch.y[:batch.batch_size])
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=GRADIENT_CLIP)
        optimizer.step()

    # Eval every epoch
    model.eval()
    all_preds, all_true = [], []
    with torch.no_grad():
        for batch in val_loader:
            batch = batch.to(device)
            out = model(batch.x, batch.edge_index)
            pred = out[:batch.batch_size].argmax(dim=1)
            all_preds.extend(pred.cpu().numpy())
            all_true.extend(batch.y[:batch.batch_size].cpu().numpy())

    metrics = evaluator.compute_metrics(np.array(all_true), np.array(all_preds))
    gmeans = metrics['gmeans']
    print(f"Baseline Epoch {epoch+1:>2} | Spec: {metrics['specificity']*100:.2f}% | Recall: {metrics['recall']*100:.2f}% | G-Means: {gmeans*100:.2f}%")

    # Keep the weights of the best validation epoch
    if gmeans > best_gmeans_baseline:
        best_gmeans_baseline = gmeans
        torch.save(model.state_dict(), baseline_ckpt)

# Capture Baseline Metrics
baseline_vram = torch.cuda.max_memory_allocated() / 1e9 if use_cuda else 0.0

model.load_state_dict(torch.load(baseline_ckpt, map_location=device))
model.eval()
latencies_baseline = []
with torch.no_grad():
    for batch in val_loader:
        batch = batch.to(device)
        # CUDA work is asynchronous: wait for pending work before starting the
        # clock and for the forward pass before stopping it, otherwise only
        # the kernel launch is timed.
        if use_cuda:
            torch.cuda.synchronize()
        start = time.perf_counter()
        _ = model(batch.x, batch.edge_index)
        if use_cuda:
            torch.cuda.synchronize()
        latencies_baseline.append((time.perf_counter() - start) * 1000)

baseline_p95 = np.percentile(latencies_baseline, 95)

print(f"\n‚úÖ Baseline VRAM: {baseline_vram:.2f} GB")
print(f"‚úÖ Baseline P95 Latency: {baseline_p95:.2f} ms")
print(f"üèÅ Baseline Best G-Means: {best_gmeans_baseline*100:.2f}%")

# Clean up
del model, trainer, optimized_data, train_loader, val_loader
torch.cuda.empty_cache()
gc.collect()

### Improved AD-RL-GNN

In [None]:
# AD-RL arm of the A/B comparison: same graph construction and GNN training
# loop as the baseline, with AdaptiveMCD and the RL enhancement step run
# before the GNN is trained. Produces best_gmeans_gold, gold_vram and gold_p95
# for the final comparison cell.
model_cfg = load_model_config()
data_cfg = load_data_config()
data_cfg.paths.raw_data_dir = Path(DATA_DIR)

# Aligned Hyperparameters (Strict Ceteris Paribus)
model_cfg.training["max_epochs"] = MAX_EPOCHS
model_cfg.training["learning_rate"] = LEARNING_RATE
model_cfg.rl_agent["reward_scaling"] = 2.0
model_cfg.adaptive_mcd["alpha"] = GOLD_ALPHA  # Aggressive cleaning
model_cfg.graph.similarity_threshold = 0.75

print(f"\nüöÄ Initializing AD-RL (MCD=ON, RL=ON)...")
trainer = FraudTrainer(model_config=model_cfg, data_config=data_cfg, device=device)

# Re-process
trainer._preprocess(train_df, val_df, test_df)
trainer._build_graph()
trainer._prepare_labels(train_df, val_df, test_df)

# CUDA-only bookkeeping is guarded so the cell also runs on a CPU runtime.
use_cuda = torch.cuda.is_available()

# Reset Stats
# Done after preprocessing and graph building, the same point at which the
# baseline cell resets, so both peak-VRAM figures cover the same stages
# (plus the MCD and RL steps that only this arm runs).
if use_cuda:
    torch.cuda.reset_peak_memory_stats()

# Derived from GOLD_ALPHA so the log line cannot drift from the configured value.
print(f"\nüß† Training AdaptiveMCD (Alpha {GOLD_ALPHA:.2f})...")
trainer._train_mcd()

print("\nü§ñ Training RL Agent...")
trainer._train_rl_and_enhance()

# VRAM Flush
print("\nüßπ Flushing VRAM before GNN Training...")
torch.cuda.empty_cache()
gc.collect()

# Apply class weight
weights = torch.tensor([1.0, FRAUD_WEIGHT]).to(device)
trainer._init_model()
trainer.criterion = torch.nn.CrossEntropyLoss(weight=weights)
model = trainer.model
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)

# Loaders
optimized_data = Data(x=trainer.X_full, edge_index=trainer.edge_index, y=trainer.all_labels)
optimized_data.train_mask = trainer.train_mask
optimized_data.val_mask = trainer.val_mask
train_loader = NeighborLoader(optimized_data, num_neighbors=NUM_NEIGHBORS, batch_size=BATCH_SIZE, input_nodes=optimized_data.train_mask, shuffle=True)
val_loader = NeighborLoader(optimized_data, num_neighbors=NUM_NEIGHBORS, batch_size=BATCH_SIZE, input_nodes=optimized_data.val_mask, shuffle=False)

gold_ckpt = f"{MODELS_DIR}/fraudguard_AD_RL.pt"

print(f"\nüöÄ Starting AD-RL Training ({MAX_EPOCHS} Epochs)...")
# Start below any reachable score so the first epoch always writes a
# checkpoint. Starting at 0 with a strict ">" would skip saving whenever
# G-Means stays at 0, and the reload below would then fail or pick up a stale
# file left in MODELS_DIR by an earlier run.
best_gmeans_gold = float("-inf")

for epoch in range(MAX_EPOCHS):
    model.train()
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch.x, batch.edge_index)
        # Only the first `batch_size` rows are scored (the seed nodes of the mini-batch).
        loss = trainer.criterion(out[:batch.batch_size], batch.y[:batch.batch_size])
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=GRADIENT_CLIP)
        optimizer.step()

    # Eval every epoch
    model.eval()
    all_preds, all_true = [], []
    with torch.no_grad():
        for batch in val_loader:
            batch = batch.to(device)
            out = model(batch.x, batch.edge_index)
            pred = out[:batch.batch_size].argmax(dim=1)
            all_preds.extend(pred.cpu().numpy())
            all_true.extend(batch.y[:batch.batch_size].cpu().numpy())

    metrics = evaluator.compute_metrics(np.array(all_true), np.array(all_preds))
    gmeans = metrics['gmeans']
    print(f"Epoch {epoch+1:>3} | Spec: {metrics['specificity']*100:.2f}% | Recall: {metrics['recall']*100:.2f}% | G-Means: {gmeans*100:.2f}%")

    # Keep the weights of the best validation epoch
    if gmeans > best_gmeans_gold:
        best_gmeans_gold = gmeans
        torch.save(model.state_dict(), gold_ckpt)

# Capture AD-RL Metrics
gold_vram = torch.cuda.max_memory_allocated() / 1e9 if use_cuda else 0.0

model.load_state_dict(torch.load(gold_ckpt, map_location=device))
model.eval()
latencies_gold = []
with torch.no_grad():
    for batch in val_loader:
        batch = batch.to(device)
        # CUDA work is asynchronous: wait for pending work before starting the
        # clock and for the forward pass before stopping it, otherwise only
        # the kernel launch is timed.
        if use_cuda:
            torch.cuda.synchronize()
        start = time.perf_counter()
        _ = model(batch.x, batch.edge_index)
        if use_cuda:
            torch.cuda.synchronize()
        latencies_gold.append((time.perf_counter() - start) * 1000)

gold_p95 = np.percentile(latencies_gold, 95)

print(f"\n‚úÖ Gold VRAM: {gold_vram:.2f} GB")
print(f"‚úÖ Gold P95 Latency: {gold_p95:.2f} ms")
print(f"üèÅ Final Best G-Means: {best_gmeans_gold*100:.2f}%")

# Clean up
del model, trainer, optimized_data, train_loader, val_loader
torch.cuda.empty_cache()
gc.collect()

## 6️⃣ Evaluation & Claims Verification

In [None]:
# Side-by-side summary of the two training runs above.
# Compute improvement using Evaluator method
gmeans_improvement = evaluator.compute_gmeans_improvement(best_gmeans_baseline, best_gmeans_gold)


def _pct_reduction(before, after):
    """Return how much lower `after` is than `before`, as a percentage of `before`.

    Returns 0.0 when `before` is 0 (for example a peak-VRAM reading of zero)
    instead of raising ZeroDivisionError.
    """
    return (before - after) / before * 100 if before else 0.0


latency_reduction = _pct_reduction(baseline_p95, gold_p95)
vram_reduction = _pct_reduction(baseline_vram, gold_vram)

print("\n" + "="*60)
print("üéØ FINAL ARCHITECTURAL COMPARISON (Scientifically Aligned)")
print("="*60)
print(f"| Metric      | Baseline | Optimized | Improvement |")
print(f"|-------------|----------|-----------|-------------|")
# The "+" format flag prints the real sign, so a regression reads "-x.x%" rather than "+-x.x%".
print(f"| G-Means     | {best_gmeans_baseline*100:.1f}%    | {best_gmeans_gold*100:.1f}%     | {gmeans_improvement:+.1f}%        |")
print(f"| P95 Latency | {baseline_p95:.1f} ms  | {gold_p95:.1f} ms   | {latency_reduction:.1f}%         |")
print(f"| Peak VRAM   | {baseline_vram:.1f} GB   | {gold_vram:.1f} GB    | {vram_reduction:.1f}%         |")
print("="*60)

In [None]:
# Release the Colab runtime now that the notebook has finished.
from google.colab import runtime as colab_runtime

colab_runtime.unassign()