In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Banner printed at the top of the cell output.
banner_rule = "=" * 50
print("AML DETECTION WITH XGBOOST")
print(banner_rule)

# 1. Load data
# The transaction CSV is read from the current working directory.
csv_path = 'HI-Small_Trans.csv'
df = pd.read_csv(csv_path)

# Report the row count and how the target label is distributed.
n_transactions = df.shape[0]
label_counts = df['Is Laundering'].value_counts()
print(f"Data loaded: {n_transactions:,} transactions")
print(f"Target distribution:\n{label_counts}")

# 2. Feature Engineering (Same as Logistic Regression)
print("\nCreating AML-specific features...")

# --- Time features -----------------------------------------------------------
# Parse the timestamp once, then derive calendar parts from the .dt accessor.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
timestamp_parts = df['Timestamp'].dt
df['hour'] = timestamp_parts.hour
df['day_of_week'] = timestamp_parts.dayofweek
# pandas numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
df['is_weekend'] = df['day_of_week'].ge(5).astype(int)

# --- Amount features ---------------------------------------------------------
amount_received = df['Amount Received']
amount_paid = df['Amount Paid']
df['amount_difference'] = (amount_received - amount_paid).abs()
# The small epsilon keeps the ratio finite when the paid amount is zero.
df['amount_ratio'] = amount_received / (amount_paid + 1e-8)
df['log_amount_received'] = np.log1p(amount_received)
df['log_amount_paid'] = np.log1p(amount_paid)

# --- Currency features -------------------------------------------------------
df['currency_mismatch'] = df['Receiving Currency'].ne(df['Payment Currency']).astype(int)

# --- Bank features -----------------------------------------------------------
df['same_bank'] = df['From Bank'].eq(df['To Bank']).astype(int)

# --- Categorical encodings ---------------------------------------------------
# Each column gets its own fitted encoder; the encoders stay bound to
# module-level names so the integer codes can be mapped back later.
le_account = LabelEncoder()
df['account_encoded'] = le_account.fit_transform(df['Account'].astype(str))

le_account1 = LabelEncoder()
df['account1_encoded'] = le_account1.fit_transform(df['Account.1'].astype(str))

le_payment_format = LabelEncoder()
df['payment_format_encoded'] = le_payment_format.fit_transform(df['Payment Format'])

le_recv_currency = LabelEncoder()
df['recv_currency_encoded'] = le_recv_currency.fit_transform(df['Receiving Currency'])

le_pay_currency = LabelEncoder()
df['pay_currency_encoded'] = le_pay_currency.fit_transform(df['Payment Currency'])

# --- Risk indicators ---------------------------------------------------------
# "Round" means an exact multiple of 1000.
df['round_amount_received'] = amount_received.mod(1000).eq(0).astype(int)
df['round_amount_paid'] = amount_paid.mod(1000).eq(0).astype(int)
# Flag transactions stamped before 06:00 or in the 23:00 hour.
df['high_risk_hours'] = (df['hour'].lt(6) | df['hour'].gt(22)).astype(int)

# --- Additional XGBoost-friendly features ------------------------------------
# One categorical code per (sender bank, receiver bank) combination.
from_bank_text = df['From Bank'].astype(str)
to_bank_text = df['To Bank'].astype(str)
df['bank_pair'] = from_bank_text + '_' + to_bank_text
bank_pair_encoder = LabelEncoder()
df['bank_pair_encoded'] = bank_pair_encoder.fit_transform(df['bank_pair'])

# 3. Select features for XGBoost
feature_columns = [
    'From Bank', 'To Bank', 'bank_pair_encoded',
    'Amount Received', 'Amount Paid',
    'log_amount_received', 'log_amount_paid',
    'amount_difference', 'amount_ratio',
    'currency_mismatch', 'same_bank',
    'hour', 'day_of_week', 'is_weekend', 'high_risk_hours',
    'round_amount_received', 'round_amount_paid',
    'payment_format_encoded', 'recv_currency_encoded', 'pay_currency_encoded',
    'account_encoded', 'account1_encoded'
]

# Work on copies so later edits to X / y never write back into df.
X = df[feature_columns].copy()
y = df['Is Laundering'].copy()

print(f"Features selected: {len(feature_columns)}")
print(f"Class balance - Legitimate: {(y==0).sum():,}, Laundering: {(y==1).sum():,}")

# 4. Handle missing values and infinite values
# Infinite values are first mapped to NaN so they are imputed like any other
# missing entry. This step uses no data-dependent statistic, so it is safe to
# run before the split.
X = X.replace([np.inf, -np.inf], np.nan)

# 5. Train-test split
# Stratify on the label so both folds keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Impute AFTER splitting, using medians computed from the training rows only.
# Computing the medians on the full matrix would let held-out rows influence
# the training features (preprocessing leakage).
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)
# Keep X itself imputed as well, so any later consumer of X still sees the
# same fill values the model was trained with.
X = X.fillna(train_medians)

print(f"\nTrain set: {X_train.shape[0]:,} transactions")
print(f"Test set: {X_test.shape[0]:,} transactions")

# 6. Calculate scale_pos_weight for class imbalance
# The weight is the negative-to-positive ratio of the training labels; it is
# later handed to XGBoost as the scale_pos_weight hyperparameter.
num_negative = y_train.eq(0).sum()
num_positive = y_train.eq(1).sum()
scale_pos_weight = num_negative / num_positive

imbalance_report = [
    f"\nClass imbalance handling:",
    f"  • Negative samples: {num_negative:,}",
    f"  • Positive samples: {num_positive:,}",
    f"  • scale_pos_weight: {scale_pos_weight:.2f}",
]
for report_line in imbalance_report:
    print(report_line)

# 7. Train XGBoost Model
print("\nTraining XGBoost...")

# Hold out a validation fold from the TRAINING data for early stopping.
# Monitoring early stopping on (X_test, y_test) would tune the number of
# boosting rounds on the test set, so every metric reported on that set
# afterwards would be optimistically biased. The split is stratified, so the
# class ratio (and therefore scale_pos_weight, which was derived from y_train)
# is essentially unchanged for the rows actually used to fit.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
)

# XGBoost hyperparameters as per markdown guidance
xgb_model = xgb.XGBClassifier(
    n_estimators=500,            # upper bound; early stopping may use fewer
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,  # up-weights the positive class
    eval_metric='auc',           # metric tracked on eval_set for early stopping
    random_state=42,
    n_jobs=-1,
    early_stopping_rounds=50     # stop after 50 rounds without AUC improvement
)

# Fit on the reduced training fold; the validation fold only drives early
# stopping, and the test set stays untouched until evaluation.
xgb_model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# 8. Make predictions
# Column 1 of predict_proba holds the score for the positive class;
# predict() returns the hard 0/1 labels.
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]
y_pred = xgb_model.predict(X_test)

# 9. Evaluate performance
section_rule = "=" * 50
print("\n" + section_rule)
print("XGBOOST MODEL PERFORMANCE")
print(section_rule)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"\nROC AUC Score: {auc_score:.4f}")

# Confusion Matrix
# Rows are the true class and columns the predicted class, both ordered 0, 1.
cm = confusion_matrix(y_test, y_pred)
true_neg, false_pos = cm[0, 0], cm[0, 1]
false_neg, true_pos = cm[1, 0], cm[1, 1]
print(f"\nConfusion Matrix:")
print(f"True Negatives: {true_neg:,}")
print(f"False Positives: {false_pos:,}")
print(f"False Negatives: {false_neg:,}")
print(f"True Positives: {true_pos:,}")

# Calculate precision and recall for minority class
# Each ratio falls back to 0 when its denominator is empty.
predicted_positive = true_pos + false_pos
actual_positive = true_pos + false_neg
precision = true_pos / predicted_positive if predicted_positive > 0 else 0
recall = true_pos / actual_positive if actual_positive > 0 else 0
pr_sum = precision + recall
f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0

print(f"\nMinority Class (Laundering) Metrics:")
print(f"  • Precision: {precision:.4f}")
print(f"  • Recall: {recall:.4f}")
print(f"  • F1-Score: {f1:.4f}")

# 10. Feature Importance
heading_rule = "=" * 50
print("\n" + heading_rule)
print("FEATURE IMPORTANCE (TOP 15)")
print(heading_rule)

# Pair each feature name with the importance score reported by the fitted
# model, then order the table from most to least important.
importance_table = pd.DataFrame({
    'feature': feature_columns,
    'importance': xgb_model.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("\nTop 15 Most Important Features:")
top_fifteen = feature_importance.head(15)
ranked_pairs = zip(top_fifteen['feature'], top_fifteen['importance'])
for rank, (feature_name, score) in enumerate(ranked_pairs, start=1):
    print(f"{rank:2d}. {feature_name:<25} (importance: {score:.4f})")

# 11. Threshold Analysis
divider = "=" * 50
print("\n" + divider)
print("THRESHOLD ANALYSIS")
print(divider)

thresholds = [0.1, 0.3, 0.5, 0.7, 0.9]
print(f"\n🚨 ALERT VOLUME AT DIFFERENT RISK THRESHOLDS:")

# For every cut-off, count how many test transactions would be flagged and
# what precision / recall those flags achieve; the rows are collected so they
# can be tabulated and plotted afterwards.
n_scored = len(y_test)
threshold_results = []
for cutoff in thresholds:
    flagged = (y_pred_proba > cutoff).astype(int)
    alerts = flagged.sum()
    alert_rate = alerts / n_scored * 100

    # Calculate metrics for this threshold
    cutoff_cm = confusion_matrix(y_test, flagged)
    hits = cutoff_cm[1, 1]
    predicted_pos = hits + cutoff_cm[0, 1]
    actual_pos = hits + cutoff_cm[1, 0]
    cutoff_precision = hits / predicted_pos if predicted_pos > 0 else 0
    cutoff_recall = hits / actual_pos if actual_pos > 0 else 0

    print(f"  • Threshold {cutoff}: {alerts:,} alerts ({alert_rate:.2f}%) | Precision: {cutoff_precision:.3f} | Recall: {cutoff_recall:.3f}")

    threshold_results.append({
        'threshold': cutoff,
        'alerts': alerts,
        'alert_rate': alert_rate,
        'precision': cutoff_precision,
        'recall': cutoff_recall
    })

# 12. Visualizations
# One 2x3 dashboard; each panel is drawn through its own axes handle.
plt.figure(figsize=(15, 10))

# Panel 1: ROC curve with the chance diagonal for reference.
ax_roc = plt.subplot(2, 3, 1)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
ax_roc.plot(fpr, tpr, color='blue', linewidth=2, label=f'XGBoost (AUC = {auc_score:.3f})')
ax_roc.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random Classifier')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve - XGBoost AML Detection')
ax_roc.legend()
ax_roc.grid(True, alpha=0.3)

# Panel 2: ten highest-ranked features as horizontal bars.
ax_importance = plt.subplot(2, 3, 2)
top_features = feature_importance.head(10)
bar_positions = range(len(top_features))
ax_importance.barh(bar_positions, top_features['importance'])
ax_importance.set_yticks(bar_positions)
ax_importance.set_yticklabels(top_features['feature'])
ax_importance.set_xlabel('Feature Importance')
ax_importance.set_title('Top 10 Feature Importance')
ax_importance.invert_yaxis()  # put the top-ranked feature at the top

# Panel 3: share of transactions flagged at each threshold.
ax_alerts = plt.subplot(2, 3, 3)
thresh_df = pd.DataFrame(threshold_results)
ax_alerts.plot(thresh_df['threshold'], thresh_df['alert_rate'], 'o-', color='orange')
ax_alerts.set_xlabel('Risk Threshold')
ax_alerts.set_ylabel('Alert Rate (%)')
ax_alerts.set_title('Alert Volume by Threshold')
ax_alerts.grid(True, alpha=0.3)

# Panel 4: precision and recall against the threshold.
ax_tradeoff = plt.subplot(2, 3, 4)
ax_tradeoff.plot(thresh_df['threshold'], thresh_df['precision'], 'o-', label='Precision', color='green')
ax_tradeoff.plot(thresh_df['threshold'], thresh_df['recall'], 'o-', label='Recall', color='red')
ax_tradeoff.set_xlabel('Risk Threshold')
ax_tradeoff.set_ylabel('Score')
ax_tradeoff.set_title('Precision vs Recall by Threshold')
ax_tradeoff.legend()
ax_tradeoff.grid(True, alpha=0.3)

# Panel 5: predicted-probability histograms, one per true class.
ax_scores = plt.subplot(2, 3, 5)
ax_scores.hist(y_pred_proba[y_test == 0], bins=50, alpha=0.7, label='Legitimate', density=True)
ax_scores.hist(y_pred_proba[y_test == 1], bins=50, alpha=0.7, label='Laundering', density=True)
ax_scores.set_xlabel('Predicted Probability')
ax_scores.set_ylabel('Density')
ax_scores.set_title('Prediction Distribution')
ax_scores.legend()

# Panel 6: XGBoost built-in feature importance plot
ax_weight = plt.subplot(2, 3, 6)
xgb.plot_importance(xgb_model, max_num_features=10, importance_type='weight', ax=ax_weight)
ax_weight.set_title('XGBoost Feature Importance (Weight)')

plt.tight_layout()
plt.show()

# 13. Business Interpretation
print("\n" + "="*50)
print("BUSINESS INTERPRETATION")
print("="*50)

# Operational view at the default 0.5 decision threshold.
total_transactions = len(y_test)
flagged_transactions = (y_pred_proba > 0.5).sum()
flagged_rate = flagged_transactions / total_transactions * 100

print(f"\n📊 OPERATIONAL METRICS:")
print(f"  • Total transactions analyzed: {total_transactions:,}")
print(f"  • Transactions flagged as suspicious: {flagged_transactions:,} ({flagged_rate:.2f}%)")
print(f"  • Model accuracy: {(y_pred == y_test).mean()*100:.2f}%")
print(f"  • AUC Score: {auc_score:.4f}")

print(f"\n💡 KEY INSIGHTS:")
# feature_importance is sorted in descending order, so row 0 is the top feature.
top_risk_factor = feature_importance.iloc[0]
print(f"  • Most predictive feature: {top_risk_factor['feature']} (importance: {top_risk_factor['importance']:.4f})")
print(f"  • XGBoost automatically captures feature interactions")
print(f"  • Model handles class imbalance with scale_pos_weight = {scale_pos_weight:.2f}")

# Optimal threshold recommendation
# Among the evaluated thresholds that reach at least 80% recall, recommend the
# one with the highest precision. The emptiness check must come BEFORE
# idxmax(): calling idxmax() on an empty selection raises ValueError instead
# of returning NaN, so a post-hoc pd.isna() guard would never be reached.
optimal_idx = None
high_recall_rows = thresh_df[thresh_df['recall'] >= 0.8]
if not high_recall_rows.empty:
    optimal_idx = high_recall_rows['precision'].idxmax()
    optimal_threshold = thresh_df.loc[optimal_idx, 'threshold']
    optimal_precision = thresh_df.loc[optimal_idx, 'precision']
    optimal_recall = thresh_df.loc[optimal_idx, 'recall']
    print(f"  • Recommended threshold: {optimal_threshold} (Precision: {optimal_precision:.3f}, Recall: {optimal_recall:.3f})")
else:
    print(f"  • No evaluated threshold reaches recall >= 0.80; no threshold recommended")

print(f"\n🎯 ADVANTAGES OVER LOGISTIC REGRESSION:")
print(f"  • No feature scaling required")
print(f"  • Automatic interaction detection")
print(f"  • Better handling of non-linear patterns")
print(f"  • Built-in feature importance ranking")
print(f"  • Early stopping prevents overfitting")