In [None]:
import io, os
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import AdaBoostClassifier
import mlflow
from mlflow.models.signature import infer_signature
import mlflow.sklearn
import yaml
import time
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix

# Runtime locations are read from DOMINO_* environment variables, with
# local fallbacks ("." and "my-local-project") when they are not set.
domino_project_name = os.environ.get("DOMINO_PROJECT_NAME", "my-local-project")
domino_working_dir = os.environ.get("DOMINO_WORKING_DIR", ".")

# The data and artifact directories are derived from the working directory by
# substituting every occurrence of the substring "code".
# NOTE(review): with the "." fallback no substitution happens, so all three
# directories resolve to the current directory — confirm this is intended.
domino_datasource_dir, domino_artifact_dir = (
    domino_working_dir.replace("code", sibling)
    for sibling in ("data", "artifacts")
)

def _to_yaml_friendly(value):
    """Return *value* unchanged if it is a plain scalar, otherwise ``str(value)``.

    Keeps the YAML parameter dump readable: arbitrary Python objects (for
    example a nested estimator) would otherwise be serialized with
    ``!!python/object`` tags.
    """
    if value is None or type(value) in (bool, int, float, str):
        return value
    return str(value)


def _save_and_log_plot(filename: str) -> None:
    """Save the current matplotlib figure to the artifact dir and log it to MLflow.

    The figure is always closed, even if saving fails, so that a failed run
    does not leave open figures behind.
    """
    plot_path = os.path.join(domino_artifact_dir, filename)
    try:
        plt.savefig(plot_path)
    finally:
        plt.close()
    mlflow.log_artifact(plot_path, artifact_path="plots")


def run_fraud_training(pca_filename: str) -> None:
    """Train an AdaBoost fraud classifier and log the run to MLflow.

    Reads ``<domino_datasource_dir>/<domino_project_name>/<pca_filename>`` as
    CSV, drops rows whose ``Class`` value is missing, and uses every other
    column as a feature. The data is split 80/20 (stratified on ``Class``),
    the model is fitted on the training part and evaluated on the validation
    part. Parameters, a YAML parameter summary, the ROC AUC, the fit time,
    the fitted model (registered as "CC Fraud ADA Classifier") and diagnostic
    plots are logged to the "CC Fraud Classifier Training [testing]"
    experiment.

    Args:
        pca_filename: Name of the CSV file inside the project's data
            directory. It must contain a ``Class`` column.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        KeyError: If the CSV has no ``Class`` column.
    """
    experiment_name = "CC Fraud Classifier Training [testing]"
    mlflow.set_experiment(experiment_name)

    # The YAML summary and the plots are written here before being logged;
    # make sure the directory exists so the first write cannot fail.
    os.makedirs(domino_artifact_dir, exist_ok=True)

    # 1. Load the cleaned data and keep only labelled rows.
    full_clean_file_path = os.path.join(
        domino_datasource_dir, domino_project_name, pca_filename
    )
    df = pd.read_csv(full_clean_file_path)
    df = df[df["Class"].notna()].copy()

    # 2. Define target & features.
    TARGET = "Class"
    FEATURES = [c for c in df.columns if c != TARGET]

    X = df[FEATURES]
    y = df[TARGET]

    # 3. Train/validation split (stratified so both parts keep the class ratio).
    RANDOM_STATE = 2018
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.20, random_state=RANDOM_STATE, stratify=y
    )

    def train_and_log(model, name):
        """Fit *model*, evaluate it on the validation split and log everything
        to a new MLflow run called *name*."""
        with mlflow.start_run(run_name=name):
            model_params = model.get_params()
            model_class_name = model.__class__.__name__

            # Log model parameters
            mlflow.log_params(model_params)
            mlflow.log_param("model_name", model_class_name)
            mlflow.log_param("pca_filename", pca_filename)
            mlflow.log_param("num_features", len(FEATURES))
            mlflow.log_param("num_rows", len(df))

            # Log human-readable parameters as YAML
            params_yaml = {
                "model_name": model_class_name,
                "pca_filename": pca_filename,
                "num_features": len(FEATURES),
                "num_rows": len(df),
                "features": FEATURES,
                "model_params": {
                    key: _to_yaml_friendly(val)
                    for key, val in model_params.items()
                },
            }
            params_yaml_path = os.path.join(
                domino_artifact_dir, "adaboost_params.yaml"
            )
            with open(params_yaml_path, "w", encoding="utf-8") as f:
                yaml.dump(params_yaml, f, default_flow_style=False)
            mlflow.log_artifact(params_yaml_path, artifact_path="params")

            # perf_counter is monotonic, so the measured duration cannot be
            # distorted by system clock adjustments.
            start_time = time.perf_counter()
            model.fit(X_train, y_train)
            fit_time = time.perf_counter() - start_time

            # Column 1 of predict_proba is taken as the positive-class score.
            preds = model.predict_proba(X_val)[:, 1]
            preds_label = model.predict(X_val)
            auc = roc_auc_score(y_val, preds)
            mlflow.log_metric("roc_auc", auc)
            mlflow.log_metric("fit_time_sec", fit_time)

            signature = infer_signature(X_val, preds)
            input_example = X_val.iloc[:5]

            mlflow.sklearn.log_model(
                model,
                artifact_path="classifier_adaboost_model",
                signature=signature,
                registered_model_name="CC Fraud ADA Classifier",
                input_example=input_example
            )
            mlflow.set_tag("pipeline", "classifier_training")
            mlflow.set_tag("model", "AdaBoost")

            print(f"{name:12} AUC: {auc:.4f}")

            # --- PLOTS ---
            # ROC Curve
            fpr, tpr, _ = roc_curve(y_val, preds)
            plt.figure(figsize=(7, 5))
            plt.plot(fpr, tpr, label=f'AdaBoost (AUC={auc:.3f})')
            plt.plot([0, 1], [0, 1], '--', color='gray')
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('ROC Curve')
            plt.legend()
            _save_and_log_plot("adaboost_roc_curve.png")

            # Precision-Recall Curve
            precision, recall, _ = precision_recall_curve(y_val, preds)
            plt.figure(figsize=(7, 5))
            plt.plot(recall, precision, label='AdaBoost')
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.title('Precision-Recall Curve')
            plt.legend()
            _save_and_log_plot("adaboost_pr_curve.png")

            # Confusion Matrix (each row normalized over the true class)
            cm = confusion_matrix(y_val, preds_label, normalize='true')
            plt.figure(figsize=(5, 4))
            sns.heatmap(cm, annot=True, fmt='.2f', cmap='Blues', xticklabels=[0, 1], yticklabels=[0, 1])
            plt.xlabel('Predicted')
            plt.ylabel('Actual')
            plt.title('Normalized Confusion Matrix')
            _save_and_log_plot("adaboost_confusion_matrix.png")

            # Feature Importances (if available)
            if hasattr(model, 'feature_importances_'):
                importances = model.feature_importances_
                # Indices of the 15 largest importances, largest first.
                indices = np.argsort(importances)[::-1][:15]
                plt.figure(figsize=(10, 5))
                plt.bar(range(len(indices)), importances[indices])
                plt.xticks(range(len(indices)), [FEATURES[i] for i in indices], rotation=45, ha='right')
                plt.title('Top 15 Feature Importances (AdaBoost)')
                plt.tight_layout()
                _save_and_log_plot("adaboost_feature_importance.png")

    # 4. AdaBoost
    # NOTE(review): the `algorithm` argument is deprecated in recent
    # scikit-learn releases — confirm against the pinned version before
    # upgrading.
    model = AdaBoostClassifier(
        algorithm='SAMME',
        n_estimators=50, learning_rate=0.5, random_state=RANDOM_STATE
    )
    train_and_log(model, "AdaBoost (bad)")

# Run the training pipeline on the cleaned credit-card transactions CSV.
run_fraud_training(pca_filename="cleaned_cc_transactions.csv")

