# In [None]:
"""
Fraud Detection and Risk Analytics in Financial Transactions
Tech: Python, pandas, scikit-learn, XGBoost, Matplotlib, Seaborn (imbalanced-learn optional)

This pipeline provides:
 - Data loading (CSV with transaction features + fraud label)
 - Preprocessing (class balancing via SMOTE when imbalanced-learn is installed, then scaling)
 - Model training with RandomForest and XGBoost
 - Evaluation with precision, recall, F1, ROC-AUC
 - Feature importance visualization
 - Fraud risk trend visualization (matplotlib)

Usage:
 python fraud_detection_risk_analytics.py --data_file transactions.csv --label_col fraud
"""

import argparse
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

# imbalanced-learn is an optional dependency: record whether SMOTE could be
# imported so load_and_prepare_data can skip oversampling when it is missing.
try:
    from imblearn.over_sampling import SMOTE
    IMB_AVAILABLE = True
except ImportError:
    IMB_AVAILABLE = False


def load_and_prepare_data(file, label_col, test_size=0.2, scale=True, balance=True):
    """Load a labelled transactions CSV and return train/test splits.

    Only numeric and boolean columns are used as features. Any other column
    (for example a raw ``date`` string column) cannot be consumed by
    ``StandardScaler`` or SMOTE and would make them raise, so such columns
    are dropped with a printed notice.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.
        label_col: Name of the target column.
        test_size: Fraction of rows held out for the test split.
        scale: If true, standardise features with a scaler fitted on the
            training split only.
        balance: If true and imbalanced-learn is importable, oversample the
            training split with SMOTE. The test split is never resampled.

    Returns:
        ``(X_train, X_test, y_train, y_test, feature_names)`` where the two
        feature matrices are array-like and ``feature_names`` lists the
        feature columns in matrix column order.

    Raises:
        KeyError: If ``label_col`` is not a column of the CSV.
        ValueError: If no numeric feature column remains.
    """
    df = pd.read_csv(file)
    if label_col not in df.columns:
        raise KeyError(f"Label column {label_col!r} not found in data")

    y = df[label_col]
    candidates = df.drop(columns=[label_col])

    # Keep only columns the scaler / SMOTE / models can actually consume.
    X = candidates.select_dtypes(include=["number", "bool"])
    skipped = [col for col in candidates.columns if col not in X.columns]
    if skipped:
        print(f"Ignoring non-numeric feature columns: {skipped}")
    if X.shape[1] == 0:
        raise ValueError("No numeric feature columns available for training.")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=42
    )

    if balance:
        if IMB_AVAILABLE:
            # Resample the training split only, so the test set keeps the
            # original class distribution.
            sm = SMOTE(random_state=42)
            X_train, y_train = sm.fit_resample(X_train, y_train)
        else:
            print("imbalanced-learn is not installed; skipping SMOTE balancing.")

    if scale:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
    else:
        # np.asarray accepts both DataFrames and plain arrays, whichever the
        # resampler handed back.
        X_train, X_test = np.asarray(X_train), np.asarray(X_test)

    return X_train, X_test, y_train, y_test, list(X.columns)


def train_models(X_train, y_train):
    """Fit a RandomForest and an XGBoost classifier on the training split.

    Both models compensate for class imbalance: the forest through
    ``class_weight="balanced"`` and XGBoost through ``scale_pos_weight`` set
    to the negative/positive ratio of ``y_train``. When the training data is
    already balanced the ratio is 1.0, i.e. no re-weighting.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels; the positive class is the label equal to 1.

    Returns:
        Dict mapping ``"RandomForest"`` and ``"XGBoost"`` to the fitted models.
    """
    models = {}

    rf = RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1,
    )
    rf.fit(X_train, y_train)
    models["RandomForest"] = rf

    # Negative/positive ratio; fall back to 1.0 (no re-weighting) when either
    # class is absent so the ratio never divides by zero or collapses to 0.
    labels = np.asarray(y_train)
    n_pos = int(np.sum(labels == 1))
    n_neg = int(labels.size - n_pos)
    pos_weight = n_neg / n_pos if n_pos and n_neg else 1.0

    xgb_model = xgb.XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=pos_weight,
        eval_metric="logloss",
        random_state=42,
        n_jobs=-1,
    )
    xgb_model.fit(X_train, y_train)
    models["XGBoost"] = xgb_model

    return models


def evaluate_model(model, X_test, y_test, model_name):
    """Print evaluation metrics for a fitted classifier and plot its confusion matrix.

    Args:
        model: Fitted classifier exposing ``predict`` and, optionally,
            ``predict_proba`` or ``decision_function``.
        X_test: Test feature matrix.
        y_test: True test labels.
        model_name: Label used in the printed header and the plot title.

    Returns:
        Dict with keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"roc_auc"``.
    """
    preds = model.predict(X_test)

    # ROC-AUC needs a ranking score; hard predictions are only a last resort.
    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        probs = model.decision_function(X_test)
    else:
        probs = preds

    precision, recall, f1, _ = precision_recall_fscore_support(y_test, preds, average="binary")
    auc = roc_auc_score(y_test, probs)

    print(f"\n=== {model_name} Evaluation ===")
    print(classification_report(y_test, preds, digits=4))
    print("ROC-AUC:", round(auc, 4))

    # Draw on a dedicated figure: plt.show() does not close figures on every
    # backend, and without explicit axes the heatmap would be drawn on top of
    # whichever plot happens to be current.
    cm = confusion_matrix(y_test, preds)
    _, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set_title(f"{model_name} Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()

    return {"precision": precision, "recall": recall, "f1": f1, "roc_auc": auc}


def plot_feature_importances(model, feature_names, model_name, top_n=15):
    """Plot the largest feature importances of a fitted model as a bar chart.

    Models without a ``feature_importances_`` attribute are skipped silently.
    If the model has fewer than ``top_n`` features, all of them are shown.

    Args:
        model: Fitted estimator.
        feature_names: Feature names, indexed like ``feature_importances_``.
        model_name: Label used in the plot title.
        top_n: Maximum number of features to display.
    """
    if not hasattr(model, "feature_importances_"):
        return

    importances = np.asarray(model.feature_importances_)
    # Clamp to the number of features: the bar positions and bar heights
    # must have the same length.
    shown = min(top_n, len(importances))
    if shown <= 0:
        return

    order = np.argsort(importances)[::-1][:shown]
    positions = range(shown)

    plt.figure(figsize=(8, 6))
    plt.bar(positions, importances[order], align="center")
    plt.xticks(positions, [feature_names[i] for i in order], rotation=45, ha="right")
    plt.title(f"{model_name} Top {shown} Feature Importances")
    plt.tight_layout()
    plt.show()


def plot_fraud_trends(df, date_col="date", label_col="fraud"):
    """Plot the monthly mean of the fraud label over time.

    The input frame is left unmodified. Values in ``date_col`` that cannot be
    parsed as dates are coerced to NaT and therefore fall out of the monthly
    grouping.

    Args:
        df: Transactions DataFrame.
        date_col: Name of the column holding transaction dates.
        label_col: Name of the fraud label column.
    """
    if date_col not in df.columns:
        print("Date column not found in dataset; skipping trend plot.")
        return

    # Parse into a separate Series rather than overwriting the caller's column.
    parsed = pd.to_datetime(df[date_col], errors="coerce")
    trend = df.groupby(parsed.dt.to_period("M"))[label_col].mean()
    if trend.empty:
        print("No valid dates found in dataset; skipping trend plot.")
        return

    # Explicit axes keep the line off any figure that is still open.
    _, ax = plt.subplots()
    trend.plot(kind="line", marker="o", ax=ax)
    ax.set_ylabel("Fraud Rate")
    ax.set_title("Fraud Trends Over Time")
    plt.show()


def main(args):
    """Run the full pipeline: load data, train, evaluate and plot.

    Args:
        args: Namespace with ``data_file`` (CSV path) and ``label_col``
            (name of the fraud label column).

    Returns:
        Dict mapping each model name to the metrics dict returned by
        ``evaluate_model``, so callers can use the scores programmatically.
    """
    X_train, X_test, y_train, y_test, feature_names = load_and_prepare_data(args.data_file, args.label_col)

    models = train_models(X_train, y_train)
    results = {}

    for name, model in models.items():
        results[name] = evaluate_model(model, X_test, y_test, name)
        plot_feature_importances(model, feature_names, name)

    # Fraud trend visualization if date col exists. The CSV is re-read
    # because the prepared arrays no longer carry the raw columns.
    df = pd.read_csv(args.data_file)
    if "date" in df.columns:
        plot_fraud_trends(df, date_col="date", label_col=args.label_col)

    return results


if __name__ == "__main__":
    # Command-line entry point: both options are mandatory strings.
    cli = argparse.ArgumentParser(description="Fraud Detection and Risk Analytics")
    for flag, help_text in (
        ("--data_file", "Path to transactions CSV"),
        ("--label_col", "Column name for fraud label"),
    ):
        cli.add_argument(flag, type=str, required=True, help=help_text)
    main(cli.parse_args())
