# Fraud Detection - Model Training

Train, evaluate, explain (feature importance, ROC curve, SHAP), and save an XGBoost model for fraud detection.

In [None]:
# Put ../src (relative to the working directory) at the front of the import
# search path so modules stored there can be imported from this notebook.
import sys
sys.path.insert(0, '../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import warnings
# Suppress every warning for the rest of the session. This keeps cell output
# short, but it also hides any warnings emitted by the libraries used below.
warnings.filterwarnings('ignore')

## 1. Load Processed Data

In [None]:
# Read the pre-split feature and label tables written under ../data/processed.
def _read_processed(stem):
    """Return the parquet table named ``<stem>.parquet`` from the processed-data folder."""
    return pd.read_parquet(f'../data/processed/{stem}.parquet')


X_train = _read_processed('X_train')
X_test = _read_processed('X_test')
# The label files are tables; only their 'Class' column is kept.
y_train = _read_processed('y_train')['Class']
y_test = _read_processed('y_test')['Class']

# Quick sanity summary: split shapes and the mean of the training labels.
print(f'Training set: {X_train.shape}')
print(f'Test set: {X_test.shape}')
fraud_share = y_train.mean()
print(f'Fraud ratio (train): {fraud_share:.2%}')

## 2. Train XGBoost Model

In [None]:
from xgboost import XGBClassifier

# Calculate scale_pos_weight for imbalanced data: the ratio of negative (0) to
# positive (1) training labels, passed to XGBoost to re-weight the positive class.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    # Fail with an actionable message instead of a bare ZeroDivisionError.
    raise ValueError(
        'y_train contains no positive (Class == 1) samples; '
        'cannot compute scale_pos_weight'
    )
scale_pos_weight = n_negative / n_positive
print(f'Scale pos weight: {scale_pos_weight:.2f}')

# Train XGBoost
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,
    random_state=42,       # fixed seed for repeatable runs
    eval_metric='logloss',
    n_jobs=-1              # use all available CPU cores
)

xgb_model.fit(X_train, y_train)
print('XGBoost model trained!')

In [None]:
# Score the held-out test set with the trained XGBoost model.
y_pred_xgb = xgb_model.predict(X_test)
# Keep only the second predict_proba column (index 1) as the score used for ROC-AUC.
class_probabilities = xgb_model.predict_proba(X_test)
y_proba_xgb = class_probabilities[:, 1]

print('XGBoost Results:')
xgb_report = classification_report(y_test, y_pred_xgb)
print(xgb_report)
xgb_auc = roc_auc_score(y_test, y_proba_xgb)
print(f'ROC-AUC: {xgb_auc:.4f}')

## 3. Feature Importance

In [None]:
# Pair each training column with the importance score stored on the fitted
# model, then rank from most to least important.
importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': xgb_model.feature_importances_
})
importance = importance.sort_values('importance', ascending=False)

# Horizontal bar chart of the 15 highest-ranked features.
top_features = importance.head(15)
fig, ax = plt.subplots(figsize=(10, 8))
top_features.plot.barh(x='feature', y='importance', ax=ax, color='steelblue')
ax.set_title('Top 15 Feature Importance (XGBoost)')
# Flip the y-axis so the highest-ranked feature is drawn at the top.
ax.invert_yaxis()
plt.tight_layout()
plt.show()

## 4. ROC Curve

In [None]:
# Compute the ROC operating points from the predicted fraud scores.
fpr, tpr, thresholds = roc_curve(y_test, y_proba_xgb)

fig, ax = plt.subplots(figsize=(8, 6))
# Legend entry carries the AUC so the plot is self-describing.
auc_value = roc_auc_score(y_test, y_proba_xgb)
curve_label = f'XGBoost (AUC = {auc_value:.4f})'
ax.plot(fpr, tpr, label=curve_label, color='steelblue', lw=2)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], 'k--', lw=1)
ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve',
)
ax.legend(loc='lower right')
plt.tight_layout()
plt.show()

## 5. SHAP Explainability

In [None]:
import shap

# Build a tree explainer for the trained model and compute SHAP values for the
# first 100 test rows only.
explainer = shap.TreeExplainer(xgb_model)
shap_sample = X_test.head(100)
shap_values = explainer.shap_values(shap_sample)

# Bar-type summary plot; show=False so the title can be added before plt.show().
shap.summary_plot(shap_values, shap_sample, plot_type='bar', show=False)
plt.title('SHAP Feature Importance')
plt.tight_layout()
plt.show()

In [None]:
# Default (beeswarm) SHAP summary for the first 100 test rows, reusing the
# shap_values computed earlier in the notebook.
beeswarm_rows = X_test.head(100)
shap.summary_plot(shap_values, beeswarm_rows, show=False)
plt.title('SHAP Values Distribution')
plt.tight_layout()
plt.show()

## 6. Save Model

In [None]:
from pathlib import Path

import joblib

# Save model
model_path = Path('../models/xgboost_fraud_model.pkl')
# joblib.dump does not create missing folders, so make sure ../models exists;
# otherwise a fresh checkout fails here with FileNotFoundError after training.
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(xgb_model, model_path)
print('Model saved!')

In [None]:
# Final marker so a full top-to-bottom run visibly reaches the end of the notebook.
print('Model training complete!')