In [None]:
import io, os
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
import mlflow
from mlflow.models.signature import infer_signature
import mlflow.xgboost
import yaml
import time
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix

# Domino injects these environment variables; the fallbacks allow the
# notebook to run outside Domino as well.
domino_project_name = os.getenv("DOMINO_PROJECT_NAME", "my-local-project")
domino_working_dir = os.getenv("DOMINO_WORKING_DIR", ".")

# The data and artifact locations are derived from the working directory by
# textual substitution of "code".
# NOTE(review): str.replace swaps *every* occurrence of "code" in the path,
# not only a trailing directory name — confirm that is acceptable for the
# working directories used in practice.
domino_datasource_dir, domino_artifact_dir = (
    domino_working_dir.replace("code", sibling)
    for sibling in ("data", "artifacts")
)


def run_fraud_training(clean_filename: str) -> None:
    """Train an XGBoost fraud classifier on a cleaned CSV and log it to MLflow.

    The CSV is read from ``{domino_datasource_dir}/{domino_project_name}/``,
    rows without a ``Class`` label are dropped, and a stratified 80/20
    train/validation split is made. A single MLflow run is recorded under the
    "CC Fraud Classifier Training" experiment containing the key model
    parameters, validation metrics, diagnostic plots, a YAML parameter
    summary, and the fitted model (registered as
    "CC Fraud XGBoost Classifier").

    Args:
        clean_filename: File name of the cleaned transactions CSV, relative
            to the project's data directory.

    Raises:
        KeyError: If the CSV has no ``Class`` column.
    """
    experiment_name = "CC Fraud Classifier Training"
    mlflow.set_experiment(experiment_name)

    # The YAML summary and plots are written to this directory before being
    # uploaded, so make sure it exists (e.g. on a fresh workspace).
    if domino_artifact_dir:
        os.makedirs(domino_artifact_dir, exist_ok=True)

    # 1️⃣ Load cleaned data
    TARGET = "Class"
    full_clean_file_path = f"{domino_datasource_dir}/{domino_project_name}/{clean_filename}"
    df = pd.read_csv(full_clean_file_path)
    # Unlabelled rows cannot be used for supervised training.
    df = df[df[TARGET].notna()].copy()

    # 2️⃣ Define target & features
    # Every column except Time and the target is a feature (Amount is kept).
    drop_cols = ["Time", TARGET]
    FEATURES = [c for c in df.columns if c not in drop_cols]
    X = df[FEATURES]
    y = df[TARGET]

    # 3️⃣ Train/validation split
    RANDOM_STATE = 2018
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.20, random_state=RANDOM_STATE, stratify=y
    )
    print("Model input columns:", X.columns.tolist())

    def save_and_log_plot(filename):
        """Save the current pyplot figure to the artifact dir and log it to MLflow."""
        plot_path = f"{domino_artifact_dir}/{filename}"
        plt.savefig(plot_path)
        # Close the figure so repeated runs do not accumulate open figures.
        plt.close()
        mlflow.log_artifact(plot_path, artifact_path="plots")

    def train_and_log(model, name):
        """Fit ``model``, evaluate it on the validation split, and log one MLflow run.

        Args:
            model: Unfitted classifier exposing ``fit``, ``predict``,
                ``predict_proba`` and ``get_params``.
            name: MLflow run name; also used as the label in the printed summary.
        """
        with mlflow.start_run(run_name=name):
            # Only log key parameters
            all_params = model.get_params()
            key_params = {
                key: all_params.get(key)
                for key in ("n_estimators", "learning_rate", "max_depth", "subsample")
            }
            mlflow.log_params(key_params)
            mlflow.log_param("model_name", model.__class__.__name__)
            mlflow.log_param("clean_filename", clean_filename)
            mlflow.log_param("num_features", len(FEATURES))
            mlflow.log_param("num_rows", len(df))

            # Log human-readable parameters as YAML (only key params)
            params_yaml = {
                "model_name": model.__class__.__name__,
                "clean_filename": clean_filename,
                "num_features": len(FEATURES),
                "num_rows": len(df),
                "features": FEATURES,
                "model_params": key_params,
            }
            params_yaml_path = f"{domino_artifact_dir}/xgboost_params.yaml"
            with open(params_yaml_path, "w") as f:
                yaml.dump(params_yaml, f, default_flow_style=False)
            mlflow.log_artifact(params_yaml_path, artifact_path="params")

            # perf_counter is monotonic, so the duration cannot be skewed by
            # system clock adjustments during training.
            start_time = time.perf_counter()
            model.fit(X_train, y_train)
            fit_time = time.perf_counter() - start_time

            # Column 1 of predict_proba is the probability of the positive class.
            preds_proba = model.predict_proba(X_val)[:, 1]
            preds_label = model.predict(X_val)

            # Ranking metrics use probabilities; threshold metrics use hard labels.
            auc = roc_auc_score(y_val, preds_proba)
            pr_auc = average_precision_score(y_val, preds_proba)
            acc = accuracy_score(y_val, preds_label)
            precision = precision_score(y_val, preds_label, pos_label=1)
            recall = recall_score(y_val, preds_label, pos_label=1)
            f1 = f1_score(y_val, preds_label, pos_label=1)
            mlflow.log_metric("roc_auc", auc)
            mlflow.log_metric("pr_auc", pr_auc)
            mlflow.log_metric("accuracy", acc)
            mlflow.log_metric("precision_fraud", precision)
            mlflow.log_metric("recall_fraud", recall)
            mlflow.log_metric("f1_fraud", f1)
            mlflow.log_metric("fit_time_sec", fit_time)

            signature = infer_signature(X_val, preds_proba)
            input_example = X_val.iloc[:5]

            mlflow.xgboost.log_model(
                model,
                artifact_path="classifier_xgboost_model",
                signature=signature,
                registered_model_name="CC Fraud XGBoost Classifier",
                input_example=input_example
            )
            mlflow.set_tag("pipeline", "classifier_training_no_pca")
            mlflow.set_tag("model", "XGBoost")

            print(f"{name:12} AUC: {auc:.4f} | PR AUC: {pr_auc:.4f} | Acc: {acc:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")

            # --- PLOTS ---
            # ROC Curve
            fpr, tpr, _ = roc_curve(y_val, preds_proba)
            plt.figure(figsize=(7, 5))
            plt.plot(fpr, tpr, label=f'XGBoost (AUC={auc:.3f})')
            # Diagonal = performance of a random classifier, for reference.
            plt.plot([0, 1], [0, 1], '--', color='gray')
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('ROC Curve')
            plt.legend()
            save_and_log_plot("xgboost_roc_curve.png")

            # Precision-Recall Curve
            precision_curve, recall_curve, _ = precision_recall_curve(y_val, preds_proba)
            plt.figure(figsize=(7, 5))
            plt.plot(recall_curve, precision_curve, label='XGBoost')
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.title('Precision-Recall Curve')
            plt.legend()
            save_and_log_plot("xgboost_pr_curve.png")

            # Confusion Matrix (each row normalized over the actual class)
            cm = confusion_matrix(y_val, preds_label, normalize='true')
            plt.figure(figsize=(5, 4))
            sns.heatmap(cm, annot=True, fmt='.2f', cmap='Blues', xticklabels=[0, 1], yticklabels=[0, 1])
            plt.xlabel('Predicted')
            plt.ylabel('Actual')
            plt.title('Normalized Confusion Matrix')
            save_and_log_plot("xgboost_confusion_matrix.png")

            # Feature Importances
            if hasattr(model, 'feature_importances_'):
                importances = model.feature_importances_
                # Indices of the 15 largest importances, in descending order.
                indices = np.argsort(importances)[::-1][:15]
                plt.figure(figsize=(10, 5))
                plt.bar(range(len(indices)), importances[indices])
                plt.xticks(range(len(indices)), [FEATURES[i] for i in indices], rotation=45, ha='right')
                plt.title('Top 15 Feature Importances (XGBoost)')
                plt.tight_layout()
                save_and_log_plot("xgboost_feature_importance.png")

    # 4️⃣ XGBoost
    # NOTE(review): use_label_encoder is deprecated in newer XGBoost releases;
    # confirm it is still needed for the installed version.
    model = XGBClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        use_label_encoder=False,
        eval_metric="auc",
        random_state=RANDOM_STATE
    )
    train_and_log(model, "XGBoost (good)")

# Run the training pipeline on the cleaned transactions CSV.
run_fraud_training(clean_filename="cleaned_cc_transactions.csv")


2025/06/22 00:39:48 INFO mlflow.tracking.fluent: Experiment with name 'CC Fraud Classifier Training' does not exist. Creating a new experiment.


Model input columns: ['Amount', 'Age', 'Tenure', 'MerchantRisk', 'DeviceTrust', 'Txn24h', 'Avg30d', 'IPReputation', 'Latitude', 'Longitude', 'DistFromHome', 'CardPresent', 'TxType_payment', 'TxType_purchase', 'TxType_transfer', 'TxType_withdrawal', 'DeviceType_ATM', 'DeviceType_POS', 'DeviceType_desktop', 'DeviceType_mobile', 'DeviceType_web', 'MerchantCat_clothing', 'MerchantCat_electronics', 'MerchantCat_entertainment', 'MerchantCat_gas', 'MerchantCat_grocery', 'MerchantCat_restaurant', 'MerchantCat_travel', 'MerchantCat_utilities', 'Channel_chip', 'Channel_contactless', 'Channel_in-store', 'Channel_online']


Registered model 'CC Fraud XGBoost Classifier [no PCA]' already exists. Creating a new version of this model...
2025/06/22 00:40:04 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: CC Fraud XGBoost Classifier [no PCA], version 18
Created version '18' of model 'CC Fraud XGBoost Classifier [no PCA]'.


XGBoost (good) AUC: 0.9151 | PR AUC: 0.8756 | Acc: 0.8511 | Precision: 0.7785 | Recall: 0.8225 | F1: 0.7999
🏃 View run XGBoost (good) at: http://127.0.0.1:8768/#/experiments/1546/runs/f1ef8d2683b24a53a3b2ac5e395fb082
🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1546
