In [None]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
import mlflow
from mlflow.models.signature import infer_signature
import mlflow.sklearn

# Route every run started below into one named MLflow experiment.
experiment_name = "Train Fraud Model [xgBoost] [testing2]"
mlflow.set_experiment(experiment_name)


# 1️⃣ Read the cleaned transactions CSV (the path is relative, so it is
#    resolved against the current working directory)
clean_file = Path("../../data/fsi-credit-fraud-demo/cleaned_cc_transaction_rolling_2d_window.csv")
df = pd.read_csv(clean_file)

# 2️⃣ Separate the label from the predictors: every column other than
#    TARGET is used as a feature
TARGET = "Class"
FEATURES = [column for column in df.columns if column != TARGET]

y = df[TARGET]
X = df[FEATURES]

# 3️⃣ Stratified 80/20 train/validation split, seeded for repeatability
RANDOM_STATE = 2018
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=RANDOM_STATE,
)

def train_and_log(model, name):
    """Fit ``model``, score it on the validation split, and record the run in MLflow.

    The data are not parameters: the function reads ``X_train``, ``y_train``,
    ``X_val`` and ``y_val`` from module scope.

    Inside a single MLflow run named ``name`` it logs, in this order:
    the dict returned by ``model.get_params()``, a ``model_name`` param
    holding the model's class name, the ``roc_auc`` metric computed from
    column 1 of ``predict_proba`` on the validation features, and the fitted
    model (via ``mlflow.xgboost.log_model``, artifact path ``"model"``)
    together with an inferred signature and the first five validation rows
    as the input example.

    Args:
        model: Estimator exposing ``get_params``, ``fit`` and
            ``predict_proba``. It is saved with the MLflow XGBoost flavor.
        name: MLflow run name; also the label of the printed summary line.

    Returns:
        None. The AUC is logged and printed, not returned.
    """
    with mlflow.start_run(run_name=name):
        # Parameters are recorded before fitting: hyperparameters first,
        # then the estimator's class name.
        mlflow.log_params(model.get_params())
        mlflow.log_param("model_name", model.__class__.__name__)

        model.fit(X_train, y_train)

        # Column 1 of the predict_proba output is the score fed to ROC AUC.
        val_scores = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, val_scores)
        mlflow.log_metric("roc_auc", auc)

        # Persist the fitted model along with its input/output schema and a
        # small sample of validation rows.
        mlflow.xgboost.log_model(
            model,
            artifact_path="model",
            signature=infer_signature(X_val, val_scores),
            input_example=X_val.iloc[:5],
        )

        print(f"{name:12} AUC: {auc:.4f}")

# 4️⃣ XGBoost: configure the classifier, then train, evaluate and log it as
#    one MLflow run
model = XGBClassifier(
    # Boosting schedule
    learning_rate=0.05,
    n_estimators=200,
    # Tree depth limit
    max_depth=4,
    # Sampling ratios
    colsample_bytree=0.8,
    subsample=0.8,
    # Evaluation metric and seed (same seed as the train/validation split)
    eval_metric="auc",
    random_state=RANDOM_STATE,
    # NOTE(review): use_label_encoder appears to be deprecated in newer
    # XGBoost releases — confirm against the installed version before removing.
    use_label_encoder=False,
)
train_and_log(model, name="XGBoost (good)")
