In [5]:
#!/usr/bin/env python3
import io, os, time, yaml, hashlib, base64, uuid
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from helpers.domino_short_id import domino_short_id
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    accuracy_score, precision_score, recall_score,
    f1_score, roc_curve, precision_recall_curve,
    confusion_matrix
)
from sklearn.ensemble import AdaBoostClassifier
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

# Directories
# Working directory and project name come from the Domino environment, with
# local fallbacks when the variables are not set.
domino_working_dir = os.getenv("DOMINO_WORKING_DIR", ".")
# The data and artifact roots are derived from the working directory by
# substituting the "code" substring of its path.
domino_data_dir, domino_artifact_dir = (
    domino_working_dir.replace("code", sibling) for sibling in ("data", "artifacts")
)
domino_project = os.getenv("DOMINO_PROJECT_NAME", "my-local-project")


def run_fraud_training(clean_filename: str):
    """Train a baseline AdaBoost fraud classifier and log the run to MLflow.

    The cleaned CSV is read from ``<domino_data_dir>/<domino_project>/``, rows
    without a ``Class`` label are dropped, and a stratified 80/20 train /
    validation split is made with a fixed seed. The fitted model, its key
    hyper-parameters, a YAML summary and the validation metrics are logged to
    an MLflow experiment named ``CC Fraud Classifier Training <short id>``.

    Args:
        clean_filename: File name of the cleaned transactions CSV inside the
            project data directory.

    Returns:
        None. Results are reported through MLflow and printed to stdout.
    """
    experiment_name = f"CC Fraud Classifier Training {domino_short_id()}"
    mlflow.set_experiment(experiment_name)

    # 1) Load cleaned data (rows with a missing label cannot be used)
    TARGET = "Class"
    clean_path = os.path.join(domino_data_dir, domino_project, clean_filename)
    df = pd.read_csv(clean_path).dropna(subset=[TARGET]).copy()

    # 2) Features + target
    drop_cols = ["Time", TARGET]             # keep scaled Amount
    FEATURES = [c for c in df.columns if c not in drop_cols]
    X, y = df[FEATURES], df[TARGET]

    # 3) Train/val split
    # The seed must be passed explicitly; otherwise every run evaluates on a
    # different validation set and logged metrics are not comparable.
    RANDOM_STATE = 2018
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
    )
    print("Model input columns:", X.columns.tolist())

    def train_and_log(model, name):
        """Fit ``model`` on the training split and log everything to one MLflow run.

        Args:
            model: Unfitted classifier exposing ``fit``, ``predict``,
                ``predict_proba`` and ``get_params``.
            name: MLflow run name, also used as the label of the printed summary.
        """
        # The context manager ends the run on exit, so no explicit
        # mlflow.end_run() is needed afterwards.
        with mlflow.start_run(run_name=name):
            model_params = model.get_params()
            model_name = model.__class__.__name__

            # Log key params
            key_params = {
                "n_estimators": model_params.get("n_estimators"),
                "learning_rate": model_params.get("learning_rate"),
            }
            mlflow.log_params(key_params)
            mlflow.log_param("model_name", model_name)
            mlflow.log_param("clean_filename", clean_filename)
            mlflow.log_param("num_features", len(FEATURES))
            mlflow.log_param("num_rows", len(df))

            # YAML summary of the run configuration, stored as an artifact
            params_yaml = {
                "model_name": model_name,
                "clean_filename": clean_filename,
                "num_features": len(FEATURES),
                "num_rows": len(df),
                "features": FEATURES,
                "model_params": key_params,
            }
            Path(domino_artifact_dir).mkdir(exist_ok=True, parents=True)
            yaml_path = os.path.join(domino_artifact_dir, "adaboost_params.yaml")
            with open(yaml_path, "w") as f:
                yaml.dump(params_yaml, f, default_flow_style=False)
            mlflow.log_artifact(yaml_path, artifact_path="params")

            # Fit & metrics
            start_time = time.time()
            model.fit(X_train, y_train)
            fit_time = time.time() - start_time

            proba = model.predict_proba(X_val)[:, 1]   # P(class == 1)
            pred = model.predict(X_val)
            auc = roc_auc_score(y_val, proba)
            pr_auc = average_precision_score(y_val, proba)
            acc = accuracy_score(y_val, pred)
            precision = precision_score(y_val, pred, pos_label=1)
            recall = recall_score(y_val, pred, pos_label=1)
            f1 = f1_score(y_val, pred, pos_label=1)
            mlflow.log_metrics({
                "roc_auc": auc,
                "pr_auc": pr_auc,
                "accuracy": acc,
                "precision_fraud": precision,
                "recall_fraud": recall,
                "f1_fraud": f1,
                "fit_time_sec": fit_time,
            })

            # Log model
            # The logged sklearn flavor serves `predict()` by default, so the
            # signature output is inferred from class labels, not probabilities.
            signature = infer_signature(X_val, pred)
            input_example = X_val.iloc[:5]
            mlflow.sklearn.log_model(
                model,
                artifact_path="adaboost_model",
                registered_model_name="CC Fraud AdaBoost Classifier",
                signature=signature,
                input_example=input_example
            )
            mlflow.set_tag("pipeline", "classifier_training_no_pca")
            mlflow.set_tag("model", "AdaBoost")

            print(f"{name:12} AUC: {auc:.4f} | PR AUC: {pr_auc:.4f} | Acc: {acc:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")

    # Instantiate and run (same seed as the split, for a fully reproducible run)
    ada = AdaBoostClassifier(
        n_estimators=10,
        learning_rate=0.1,
        random_state=RANDOM_STATE
    )
    train_and_log(ada, "AdaBoost (baseline)")

# Script driver: run the training once. A failure is reported rather than
# propagated so the surrounding cell/job keeps going, but the exception type
# and full traceback are printed so the cause is not lost.
for i in range(1):
    try:
        run_fraud_training(
            clean_filename="cleaned_cc_transactions.csv"
        )
    except Exception as e:
        import traceback

        print(f"Error in training run {i}: {type(e).__name__}: {e}")
        traceback.print_exc()

Model input columns: ['Amount', 'Age', 'Tenure', 'MerchantRisk', 'DeviceTrust', 'Txn24h', 'Avg30d', 'IPReputation', 'Latitude', 'Longitude', 'DistFromHome', 'CardPresent', 'TxType_payment', 'TxType_purchase', 'TxType_transfer', 'TxType_withdrawal', 'DeviceType_ATM', 'DeviceType_POS', 'DeviceType_desktop', 'DeviceType_mobile', 'DeviceType_web', 'MerchantCat_clothing', 'MerchantCat_electronics', 'MerchantCat_entertainment', 'MerchantCat_gas', 'MerchantCat_grocery', 'MerchantCat_restaurant', 'MerchantCat_travel', 'MerchantCat_utilities', 'Channel_chip', 'Channel_contactless', 'Channel_in-store', 'Channel_online']


Registered model 'CC Fraud AdaBoost Classifier' already exists. Creating a new version of this model...
2025/06/30 19:52:42 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: CC Fraud AdaBoost Classifier, version 66
Created version '66' of model 'CC Fraud AdaBoost Classifier'.


AdaBoost (baseline) AUC: 0.8711 | PR AUC: 0.7704 | Acc: 0.8208 | Precision: 0.7996 | Recall: 0.6739 | F1: 0.7314
🏃 View run AdaBoost (baseline) at: http://127.0.0.1:8768/#/experiments/1566/runs/6a573ed1511a488b87d395910352740a
🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1566
