In [6]:
import io, os
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import AdaBoostClassifier
import mlflow
from mlflow.models.signature import infer_signature
import mlflow.sklearn

# Read the working directory and project name from the environment, falling
# back to local defaults when the variables are unset.
domino_project_name = os.getenv("DOMINO_PROJECT_NAME", "my-local-project")
domino_working_dir = os.getenv("DOMINO_WORKING_DIR", ".")

# The data and artifact directories are derived from the working directory by
# substituting "code" in its path.
# NOTE(review): str.replace rewrites *every* occurrence of "code" in the path
# and leaves the path untouched when "code" is absent (e.g. the "." fallback)
# -- confirm this matches the intended directory layout.
domino_datasource_dir, domino_artifact_dir = (
    domino_working_dir.replace("code", folder) for folder in ("data", "artifacts")
)

def run_fraud_training(pca_filename: str):
    """Train an AdaBoost classifier on the PCA data and log the run to MLflow.

    Reads ``{domino_datasource_dir}/{domino_project_name}/{pca_filename}`` as
    CSV, discards rows whose ``Class`` label is missing, holds out a
    stratified 20% validation split, fits the classifier and records its
    parameters, validation ROC AUC, model signature, input example and tags
    under the "CC Fraud Classifier Training [testing]" experiment.

    Args:
        pca_filename: Name of the CSV file inside the project's data directory.

    Returns:
        None. The validation AUC is printed and logged as the ``roc_auc``
        metric of the MLflow run.
    """
    mlflow.set_experiment("CC Fraud Classifier Training [testing]")

    target_col = "Class"
    seed = 2018

    # Load the dataset and keep only the rows that carry a label.
    csv_path = f"{domino_datasource_dir}/{domino_project_name}/{pca_filename}"
    frame = pd.read_csv(csv_path)
    labelled = frame.loc[frame[target_col].notna()].copy()

    # Every column except the target is used as a feature.
    features = labelled.drop(columns=[target_col])
    labels = labelled[target_col]

    # Stratify on the label so both splits keep the same class balance.
    splits = train_test_split(
        features,
        labels,
        test_size=0.20,
        random_state=seed,
        stratify=labels,
    )
    train_features, val_features, train_labels, val_labels = splits

    classifier = AdaBoostClassifier(
        algorithm='SAMME',
        n_estimators=50,
        learning_rate=0.5,
        random_state=seed,
    )
    run_name = "AdaBoost (bad)"

    with mlflow.start_run(run_name=run_name):
        # Hyperparameters are logged before fitting, so they are recorded
        # even if training fails part-way through.
        mlflow.log_params(classifier.get_params())
        mlflow.log_param("model_name", type(classifier).__name__)

        classifier.fit(train_features, train_labels)

        # Score with the probability of the second class (column index 1).
        val_scores = classifier.predict_proba(val_features)[:, 1]
        auc = roc_auc_score(val_labels, val_scores)
        mlflow.log_metric("roc_auc", auc)

        # Store the model together with its input/output schema and a small
        # sample of validation rows as the input example.
        mlflow.sklearn.log_model(
            classifier,
            artifact_path="classifier_adaboost_model",
            signature=infer_signature(val_features, val_scores),
            input_example=val_features.iloc[:5],
        )
        for tag_key, tag_value in (
            ("pipeline", "classifier_training"),
            ("model", "AdaBoost"),
        ):
            mlflow.set_tag(tag_key, tag_value)

        print(f"{run_name:12} AUC: {auc:.4f}")


# Run the training pipeline on the cleaned credit-card transactions file.
run_fraud_training(pca_filename="cleaned_cc_transactions.csv")



AdaBoost (bad) AUC: 0.5447
🏃 View run AdaBoost (bad) at: http://127.0.0.1:8768/#/experiments/1538/runs/db96a69da3614e3bb02b06b5c1068a8c
🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1538
