In [None]:
# Hyperparameter sweep for XGBoost Classifier
import os
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
import mlflow
import mlflow.xgboost
import yaml
import numpy as np

# Domino workspace locations (mirrors the variables used in xgboost_fraud_training).
# The data and artifact directories are obtained by substituting "code" in the
# working-directory path; when the Domino environment variables are absent the
# current directory and a placeholder project name are used instead.
domino_working_dir = os.getenv("DOMINO_WORKING_DIR", ".")
domino_datasource_dir, domino_artifact_dir = (
    domino_working_dir.replace("code", sibling) for sibling in ("data", "artifacts")
)
domino_project_name = os.getenv("DOMINO_PROJECT_NAME", "my-local-project")

# Constants shared by the loading, splitting and modelling steps.
RANDOM_STATE = 2018
TARGET = "Class"

# Location of the cleaned transactions file inside the project's data directory.
pca_filename = "cleaned_cc_transactions.csv"
full_clean_file_path = "/".join(
    (domino_datasource_dir, domino_project_name, pca_filename)
)

# Load the data and keep only rows that actually carry a label.
df = pd.read_csv(full_clean_file_path)
df = df.loc[df[TARGET].notna()].copy()

# Every column other than the label is used as a model input.
FEATURES = df.columns.drop(TARGET).tolist()
X, y = df[FEATURES], df[TARGET]

# 80/20 hold-out split, stratified on the label and seeded for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Search space for the randomized hyperparameter sweep.
# The evenly spaced grids are converted with ``.tolist()`` so that every
# candidate value is a built-in Python float rather than a ``numpy.float64``.
# The numeric values are unchanged, but the sampled parameters can then be
# logged and serialized (e.g. to YAML) as plain numbers instead of NumPy objects.
param_dist = {
    "n_estimators": [100, 200, 300, 400],
    "learning_rate": np.linspace(0.01, 0.2, 10).tolist(),
    "max_depth": [3, 4, 5, 6, 7],
    "subsample": np.linspace(0.6, 1.0, 5).tolist(),
    "colsample_bytree": np.linspace(0.6, 1.0, 5).tolist(),
    "gamma": [0, 0.1, 0.2, 0.3, 0.4],
    "reg_alpha": [0, 0.01, 0.1, 1],
    "reg_lambda": [0.5, 1, 2, 5],
}

# Base estimator handed to the search; AUC is used as XGBoost's eval metric and
# the seed is fixed so repeated sweeps are comparable.
xgb = XGBClassifier(
    # NOTE(review): `use_label_encoder` is reported as deprecated by newer
    # XGBoost releases — confirm against the installed version before removing.
    use_label_encoder=False,
    eval_metric="auc",
    random_state=RANDOM_STATE,
)

mlflow.set_experiment("CC Fraud XGBoost Hyperparameter Sweep")

# A single MLflow run covers the whole sweep: the randomized search, evaluation
# of the winning model on the hold-out set, and logging of its parameters,
# metrics, parameter file and fitted model.
with mlflow.start_run(run_name="XGBoost Hyperparameter Sweep"):
    search = RandomizedSearchCV(
        estimator=xgb,
        param_distributions=param_dist,
        n_iter=25,
        scoring="roc_auc",
        n_jobs=-1,
        cv=3,
        verbose=2,
        random_state=RANDOM_STATE,
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    # Sampled values can be NumPy scalars (the search space may hold NumPy
    # arrays). Convert them to built-in Python numbers so they are logged and
    # written to YAML as plain values rather than Python-specific object tags.
    best_params = {
        name: value.item() if isinstance(value, np.generic) else value
        for name, value in search.best_params_.items()
    }

    # Evaluate on validation set
    preds_proba = best_model.predict_proba(X_val)[:, 1]
    preds_label = best_model.predict(X_val)
    auc = roc_auc_score(y_val, preds_proba)
    pr_auc = average_precision_score(y_val, preds_proba)
    acc = accuracy_score(y_val, preds_label)
    precision = precision_score(y_val, preds_label, pos_label=1)
    recall = recall_score(y_val, preds_label, pos_label=1)
    f1 = f1_score(y_val, preds_label, pos_label=1)

    # Log best params plus the run metadata in batched calls.
    mlflow.log_params(best_params)
    mlflow.log_params(
        {
            "model_name": best_model.__class__.__name__,
            "pca_filename": pca_filename,
            "num_features": len(FEATURES),
            "num_rows": len(df),
        }
    )

    # Persist the winning configuration as a standalone YAML artifact.
    params_yaml = {
        "model_name": best_model.__class__.__name__,
        "pca_filename": pca_filename,
        "num_features": len(FEATURES),
        "num_rows": len(df),
        "features": FEATURES,
        "model_params": best_params,
    }
    params_yaml_path = f"{domino_artifact_dir}/xgboost_best_params.yaml"
    # The artifact directory is not guaranteed to exist (e.g. on a local run).
    os.makedirs(os.path.dirname(params_yaml_path) or ".", exist_ok=True)
    with open(params_yaml_path, "w", encoding="utf-8") as f:
        # safe_dump emits only standard YAML types, so the file can be read
        # back with yaml.safe_load; it raises instead of silently writing
        # Python-specific tags if a non-basic object slips in.
        yaml.safe_dump(params_yaml, f, default_flow_style=False)
    mlflow.log_artifact(params_yaml_path, artifact_path="params")

    # Hold-out metrics for the selected model.
    mlflow.log_metrics(
        {
            "roc_auc": auc,
            "pr_auc": pr_auc,
            "accuracy": acc,
            "precision_fraud": precision,
            "recall_fraud": recall,
            "f1_fraud": f1,
        }
    )

    mlflow.xgboost.log_model(
        best_model,
        artifact_path="classifier_xgboost_model"
    )
    mlflow.set_tag("pipeline", "hyperparameter_sweep")
    mlflow.set_tag("model", "XGBoost")

    print("Best Params:", best_params)
    print(f"Validation AUC: {auc:.4f} | PR AUC: {pr_auc:.4f} | Acc: {acc:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")


2025/06/18 21:54:16 INFO mlflow.tracking.fluent: Experiment with name 'CC Fraud XGBoost Hyperparameter Sweep' does not exist. Creating a new experiment.


Fitting 3 folds for each of 25 candidates, totalling 75 fits
