In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    precision_recall_curve,
    auc
)
import os

def load_data(path, target_column):
    """Read a CSV file and separate the label column from the features.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.
        target_column: Name of the column holding the labels.

    Returns:
        A ``(X, y)`` pair: ``X`` is the frame with ``target_column`` removed,
        ``y`` is that column on its own.

    Raises:
        KeyError: If ``target_column`` is not a column of the file.
    """
    frame = pd.read_csv(path)
    features = frame.drop(target_column, axis=1)
    labels = frame[target_column]
    return features, labels

def evaluate_model(model, X_test, y_test, model_name="Model"):
    """Score a fitted classifier on held-out data and print a short report.

    Prints the confusion matrix, F1 score, area under the precision-recall
    curve and the full classification report, then returns the two headline
    numbers.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Held-out features.
        y_test: Held-out true labels.
        model_name: Label used in the printed header and the returned dict.

    Returns:
        Dict with keys ``"model"``, ``"f1_score"`` and ``"auc_pr"``.
    """
    predicted_labels = model.predict(X_test)
    # Second probability column is used as the score; presumably this is the
    # positive (fraud) class -- assumes a binary problem, confirm with callers.
    positive_scores = model.predict_proba(X_test)[:, 1]

    pr_precision, pr_recall, _thresholds = precision_recall_curve(
        y_test, positive_scores
    )
    # Trapezoidal area with recall on the x-axis and precision on the y-axis.
    pr_area = auc(pr_recall, pr_precision)
    f1_value = f1_score(y_test, predicted_labels)
    matrix = confusion_matrix(y_test, predicted_labels)

    print(f"\n🔍 Evaluation for {model_name}")
    print("Confusion Matrix:\n", matrix)
    print("F1 Score:", f1_value)
    print("AUC-PR:", pr_area)
    report_text = classification_report(y_test, predicted_labels)
    print("Classification Report:\n", report_text)

    summary = {"model": model_name, "f1_score": f1_value, "auc_pr": pr_area}
    return summary

def train_models(X_train, y_train):
    """Fit the three candidate classifiers on the same training data.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        Tuple ``(logistic_regression, random_forest, xgboost)`` of fitted
        estimators, in that order.
    """
    estimators = (
        # Raised iteration cap (1000) for the logistic regression solver.
        LogisticRegression(max_iter=1000),
        RandomForestClassifier(n_estimators=100, random_state=42),
        # NOTE(review): `use_label_encoder` looks deprecated in recent XGBoost
        # releases -- confirm against the pinned version before removing it.
        XGBClassifier(
            use_label_encoder=False, eval_metric="logloss", random_state=42
        ),
    )

    # Fit in declaration order: logistic regression, random forest, XGBoost.
    for estimator in estimators:
        estimator.fit(X_train, y_train)

    return estimators

def run_pipeline(data_path, target_col, dataset_name):
    """Load one dataset, train all models on it and evaluate each of them.

    Args:
        data_path: CSV file to load.
        target_col: Name of the label column in that file.
        dataset_name: Human-readable name, used only in the printed banner.

    Returns:
        List of the metric dicts returned by ``evaluate_model``, ordered
        Logistic Regression, Random Forest, XGBoost.
    """
    print(f"\n===== Processing {dataset_name} Dataset =====")

    features, labels = load_data(data_path, target_col)

    # 80/20 split, stratified on the labels, with a fixed seed.
    split_kwargs = {"test_size": 0.2, "stratify": labels, "random_state": 42}
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, **split_kwargs
    )

    logreg, rf, xgb = train_models(X_train, y_train)

    named_models = (
        (logreg, "Logistic Regression"),
        (rf, "Random Forest"),
        (xgb, "XGBoost"),
    )
    return [
        evaluate_model(fitted, X_test, y_test, label)
        for fitted, label in named_models
    ]

if __name__ == "__main__":
    # Preprocessed input files, given as relative paths.
    creditcard_path = "data/processed/creditcard_processed.csv"
    frauddata_path = "data/processed/fraud_data_processed.csv"

    # The label column is capitalised differently in the two files.
    creditcard_results = run_pipeline(
        data_path=creditcard_path,
        target_col="Class",
        dataset_name="CreditCard",
    )
    frauddata_results = run_pipeline(
        data_path=frauddata_path,
        target_col="class",
        dataset_name="Fraud_Data",
    )

    # Print every metrics dict: credit-card models first, then fraud-data.
    print("\n==== Summary of Results ====")
    for result in [*creditcard_results, *frauddata_results]:
        print(result)
