In [None]:
# Model Training Guide - Hub and Spoke Architecture

This notebook demonstrates how to train the Hub and Spoke models for the Enterprise Fraud Detection System.

## Table of Contents

1. [Training Data Preparation](#data-prep)
2. [Hub Model Training](#hub-training)
3. [Spoke Model Training](#spoke-training)
4. [Model Evaluation](#evaluation)
5. [Hyperparameter Optimization](#optimization)
6. [Model Deployment](#deployment)

---

## Overview

The training process involves:
- **Hub Model**: Trains on profile, behavioral, and network features to provide unified customer risk scores
- **Spoke Models**: Train on contextual features + hub scores to provide product-specific fraud detection

**Training Flow:**
```
Raw Data → Feature Engineering → Hub Model Training → Spoke Model Training → Evaluation → Deployment
```


In [None]:
# Setup and imports
import os
import sys
import warnings
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
# Silence library warnings so the notebook output stays readable.
# NOTE(review): this is a blanket filter and also hides deprecation warnings;
# narrow it (e.g. by category) when debugging.
warnings.filterwarnings('ignore')

# Add the src directory to Python path.
# The path is relative to the notebook's working directory. The membership
# check keeps the cell idempotent: re-running it no longer appends a duplicate
# '../src' entry to sys.path each time.
if '../src' not in sys.path:
    sys.path.append('../src')

# Import our custom modules (resolved through the '../src' path entry above)
from utils.config_manager import ConfigManager
from models.hub_model import HubModelManager, XGBoostHubModel, ModelTrainingConfig
from models.spoke_models import SpokeModelManager, PIXSpokeModel, CreditCardSpokeModel
from features.feature_store import FeatureStore

# Set up plotting: matplotlib style sheet plus seaborn colour palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("🔧 Training Environment Setup Complete")
print(f"📅 Training Session: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Initialize configuration
config = ConfigManager()
print(f"✅ Configuration loaded for environment: {config.environment}")


In [None]:
## Training Data Preparation

First, we need to prepare training datasets for both Hub and Spoke models. In a real scenario, this data would come from your data warehouse, but here we'll generate synthetic data that resembles production patterns.


In [None]:
# Generate comprehensive training data for Hub model
# Seed NumPy's legacy global RNG so the np.random draws made while generating
# the synthetic data are reproducible on a fresh Restart & Run All.
np.random.seed(42)

def generate_hub_training_data(n_samples=10000, random_state=None):
    """Generate synthetic training data for Hub model.

    Builds one row per synthetic customer from three feature groups (profile,
    behavioral, network), derives a fraud probability from a handful of risk
    flags, samples a binary label from it, and attaches an evenly spaced
    timestamp so the frame can be split by time.

    Args:
        n_samples: Number of customer rows to generate.
        random_state: Optional integer seed. When given, every draw comes from
            a private ``np.random.RandomState(random_state)``, so the result
            does not depend on (or disturb) NumPy's global RNG state. When
            ``None`` (the default) the function draws from the global
            ``np.random`` functions, so the output is governed by any earlier
            ``np.random.seed`` call.

    Returns:
        pandas.DataFrame indexed by customer id (``cust_00000000``, ...)
        holding the feature columns, a 0/1 ``is_fraud`` label and a
        ``label_timestamp`` column spanning 2022-01-01 to 2024-01-01 in
        ascending order.
    """
    # Both the np.random module and a RandomState expose the same sampling
    # functions, so the rest of the body is agnostic to which one is used.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    print(f"🏗️ Generating {n_samples:,} training samples for Hub model...")

    # Generate customer IDs
    customer_ids = [f'cust_{i:08d}' for i in range(n_samples)]

    # Pillar 1: Profile Features
    profile_features = {
        'customer_age': rng.normal(40, 15, n_samples).clip(18, 80).astype(int),
        'account_age_days': rng.exponential(800, n_samples).clip(1, 3650).astype(int),
        'total_products_count': rng.poisson(2.8, n_samples).clip(1, 8),
        'credit_score_internal': rng.normal(650, 120, n_samples).clip(300, 850).astype(int),
        'is_pep': rng.choice([0, 1], n_samples, p=[0.98, 0.02]),
        'kyc_completion_score': rng.beta(4, 1.5, n_samples),  # Skewed towards completion
    }

    # Pillar 2: Behavioral Features (various time windows)
    # window -> (Poisson mean of transaction count,
    #            log-mean of transaction volume,
    #            Poisson mean of channels used)
    # Defined once instead of rebuilding three lookup dicts per iteration.
    window_params = {
        '1h': (0.5, 4, 1),
        '6h': (2, 5.5, 1.2),
        '24h': (8, 7, 1.8),
        '7d': (45, 9, 2.5),
        '30d': (180, 11, 2.8),
    }

    behavioral_features = {}
    for window, (count_mean, volume_log_mean, channels_mean) in window_params.items():
        # Draw order (count, volume, channels) is kept so a given seed yields
        # the same random stream as before.
        behavioral_features[f'transaction_count_{window}'] = rng.poisson(
            count_mean, n_samples
        )
        behavioral_features[f'transaction_volume_{window}'] = rng.lognormal(
            volume_log_mean, 1.2, n_samples
        )
        behavioral_features[f'channels_used_{window}'] = rng.poisson(
            channels_mean, n_samples
        ).clip(0, 4)

    # Digital behavior
    behavioral_features.update({
        'login_count_7d': rng.poisson(12, n_samples),
        'avg_session_duration_30d': rng.exponential(20, n_samples),  # minutes
        'password_changes_90d': rng.poisson(0.3, n_samples),
    })

    # Pillar 3: Network Features
    network_features = {
        'customers_sharing_devices': rng.poisson(0.8, n_samples).clip(0, 20),
        'unique_devices_used': rng.poisson(2.2, n_samples).clip(1, 10),
        'unique_beneficiaries': rng.poisson(8, n_samples),
        'fraudulent_beneficiaries_count': rng.poisson(0.1, n_samples),
        'network_out_degree': rng.poisson(5, n_samples),
        'network_in_degree': rng.poisson(3, n_samples),
    }

    # Combine all features
    all_features = {**profile_features, **behavioral_features, **network_features}

    # Create DataFrame
    df = pd.DataFrame(all_features, index=customer_ids)

    # Per-row fraud probability: a 1% base rate plus an additive bump for each
    # risk flag the row triggers.
    fraud_prob = (
        0.01 +  # Base fraud rate
        0.02 * (df['credit_score_internal'] < 500) +  # Low credit score
        0.03 * (df['is_pep'] == 1) +  # PEP customers
        0.01 * (df['customers_sharing_devices'] > 5) +  # Device sharing
        0.02 * (df['fraudulent_beneficiaries_count'] > 0) +  # Risky network
        0.01 * (df['kyc_completion_score'] < 0.5) +  # Incomplete KYC
        0.02 * (df['transaction_count_24h'] > df['transaction_count_24h'].quantile(0.95))  # High activity
    )

    # One Bernoulli draw per row; the probability is capped at 0.8
    df['is_fraud'] = rng.binomial(1, fraud_prob.clip(0, 0.8))

    # Add timestamp for time series split (evenly spaced, ascending with row order)
    df['label_timestamp'] = pd.date_range(
        start='2022-01-01',
        end='2024-01-01',
        periods=n_samples
    )

    print(f"✅ Generated Hub training data:")
    print(f"   📊 Shape: {df.shape}")
    print(f"   🚨 Fraud rate: {df['is_fraud'].mean():.2%}")
    print(f"   📅 Date range: {df['label_timestamp'].min()} to {df['label_timestamp'].max()}")

    return df

# Build the 10,000-row synthetic dataset consumed by the Hub training cell
hub_training_data = generate_hub_training_data(n_samples=10000)

# Preview the first ten rows, rounded to three decimals for readability
print(
    "\n📋 Sample Hub Training Data:",
    hub_training_data.iloc[:10].round(3),
    sep="\n",
)


In [None]:
## Hub Model Training

The Hub model is trained on profile, behavioral, and network features to provide a unified customer risk assessment. This model answers the question: "What is the overall fraud risk of this customer right now?"


In [None]:
# Train Hub Model
# Fits a gradient-boosted classifier on the Hub feature set using a
# time-ordered train/validation split, then reports AUC and feature importance.
print("🎯 Starting Hub Model Training")
print("=" * 35)

# Prepare features and target: every column except the label and its timestamp
feature_cols = [col for col in hub_training_data.columns
                if col not in ['is_fraud', 'label_timestamp']]

X = hub_training_data[feature_cols].copy()
y = hub_training_data['is_fraud'].copy()

# X is still the full dataset at this point; the train/validation split happens
# below, so the label says "Dataset" rather than "Training set".
print(f"📊 Dataset shape: {X.shape}")
print(f"🎯 Target distribution: {y.value_counts().to_dict()}")

# TimeSeriesSplit splits purely by row position, so it only respects temporal
# order when the rows are already sorted by time. Fail loudly if they are not.
assert hub_training_data['label_timestamp'].is_monotonic_increasing, (
    "hub_training_data must be sorted by label_timestamp before a time-based split"
)

# Time series split to respect temporal order
tscv = TimeSeriesSplit(n_splits=3)
train_idx, val_idx = list(tscv.split(X))[-1]  # Use last split

X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

print(f"📈 Training set: {X_train.shape[0]:,} samples")
print(f"📊 Validation set: {X_val.shape[0]:,} samples")

# Create and train Hub model
# For demonstration, we'll use scikit-learn instead of the complex XGBoost setup
# (GradientBoostingClassifier and the metric helpers come from the setup cell).
hub_model = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    verbose=0
)

print("\n🔄 Training Hub Model...")
start_time = datetime.now()

# Train the model
hub_model.fit(X_train, y_train)

training_time = (datetime.now() - start_time).total_seconds()
print(f"✅ Training completed in {training_time:.1f} seconds")

# Evaluate the model: column 1 of predict_proba is the positive (fraud) class
y_train_pred = hub_model.predict_proba(X_train)[:, 1]
y_val_pred = hub_model.predict_proba(X_val)[:, 1]

train_auc = roc_auc_score(y_train, y_train_pred)
val_auc = roc_auc_score(y_val, y_val_pred)

print(f"\n📊 Hub Model Performance:")
print(f"   Training AUC: {train_auc:.4f}")
print(f"   Validation AUC: {val_auc:.4f}")
# Train-minus-validation AUC gap; a large positive value indicates overfitting
print(f"   Overfitting: {train_auc - val_auc:.4f}")

# Feature importance analysis, most important first
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': hub_model.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\n🔍 Top 10 Most Important Features:")
print(feature_importance.head(10).to_string(index=False))


In [None]:
# Visualize training results
# Draws a 2x2 diagnostic figure for the Hub model and then tabulates
# precision / recall / F1 on the validation fold at several score thresholds.
# All sklearn metric helpers used here are imported once in the setup cell
# rather than mid-cell or inside the threshold loop.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Hub Model Training Results', fontsize=16, fontweight='bold')

# 1. Feature Importance
top_features = feature_importance.head(15)
axes[0, 0].barh(range(len(top_features)), top_features['importance'])
axes[0, 0].set_yticks(range(len(top_features)))
axes[0, 0].set_yticklabels(top_features['feature'])
axes[0, 0].set_title('Top 15 Feature Importances')
axes[0, 0].set_xlabel('Importance')

# 2. Score Distribution by Class (computed on the training-fold predictions)
axes[0, 1].hist(y_train_pred[y_train == 0], bins=50, alpha=0.7, label='Legitimate', density=True)
axes[0, 1].hist(y_train_pred[y_train == 1], bins=50, alpha=0.7, label='Fraud', density=True)
axes[0, 1].set_title('Hub Model Score Distribution')
axes[0, 1].set_xlabel('Fraud Score')
axes[0, 1].set_ylabel('Density')
axes[0, 1].legend()

# 3. ROC Curve
fpr_train, tpr_train, _ = roc_curve(y_train, y_train_pred)
fpr_val, tpr_val, _ = roc_curve(y_val, y_val_pred)

axes[1, 0].plot(fpr_train, tpr_train, label=f'Training (AUC = {train_auc:.3f})', linewidth=2)
axes[1, 0].plot(fpr_val, tpr_val, label=f'Validation (AUC = {val_auc:.3f})', linewidth=2)
axes[1, 0].plot([0, 1], [0, 1], 'k--', alpha=0.5)  # chance-level diagonal
axes[1, 0].set_title('ROC Curve')
axes[1, 0].set_xlabel('False Positive Rate')
axes[1, 0].set_ylabel('True Positive Rate')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# 4. Precision-Recall Curve
precision_train, recall_train, _ = precision_recall_curve(y_train, y_train_pred)
precision_val, recall_val, _ = precision_recall_curve(y_val, y_val_pred)

axes[1, 1].plot(recall_train, precision_train, label='Training', linewidth=2)
axes[1, 1].plot(recall_val, precision_val, label='Validation', linewidth=2)
axes[1, 1].set_title('Precision-Recall Curve')
axes[1, 1].set_xlabel('Recall')
axes[1, 1].set_ylabel('Precision')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Model performance at different thresholds (validation fold only)
thresholds = [0.1, 0.2, 0.3, 0.5, 0.7, 0.8, 0.9]
performance_metrics = []

for threshold in thresholds:
    # Scores at or above the threshold are flagged as fraud
    y_pred_binary = (y_val_pred >= threshold).astype(int)

    # zero_division=0 reports 0 instead of warning when a threshold flags
    # no positives at all
    precision = precision_score(y_val, y_pred_binary, zero_division=0)
    recall = recall_score(y_val, y_pred_binary, zero_division=0)
    f1 = f1_score(y_val, y_pred_binary, zero_division=0)

    performance_metrics.append({
        'threshold': threshold,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    })

metrics_df = pd.DataFrame(performance_metrics)
print(f"\n🎯 Performance at Different Thresholds:")
print(metrics_df.round(3))
