# Model Training with MLflow Integration

This notebook extends the original credit risk prediction model with MLflow tracking capabilities.

In [4]:
import glob
import os
import shutil

import eli5
import matplotlib.pyplot as plt
import mlflow
import pandas as pd
import ray
import xgboost as xgb
import xgboost_ray as xgbr
from ray import tune
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error

# Optional MLflow destination: uncomment and edit the two lines below to send
# runs to a specific tracking server and experiment.
#mlflow.set_tracking_uri('http://your-mlflow-server:5000')
#mlflow.set_experiment('credit-risk-prediction')

# Turn on MLflow's XGBoost autologging.
mlflow.xgboost.autolog()


def _project_path(leaf):
    """Return the path of `leaf` inside /mnt/data/<DOMINO_PROJECT_NAME>."""
    return os.path.join("/mnt/data", os.environ["DOMINO_PROJECT_NAME"], leaf)


# Locations used throughout the notebook: input data, saved models, Tune output.
DATA_ROOT = _project_path("data")
MODEL_ROOT = "/mnt/artifacts"
TUNE_ROOT = _project_path("ray_results")

In [5]:
# Ray sizing: number of training actors and CPUs given to each one.
RAY_ACTORS = 3
RAY_CPUS_PER_ACTOR = 4

# Connect to the Ray head node only when this kernel has no Ray connection yet,
# so the cell can be re-run safely.
if not ray.is_initialized():
    service_host = os.environ["RAY_HEAD_SERVICE_HOST"]
    service_port = os.environ["RAY_HEAD_SERVICE_PORT"]
    ray.init("ray://{}:{}".format(service_host, service_port))

In [6]:
# glob returns matches in a file-system-dependent order; sort them so the list
# of shard files handed to RayDMatrix is the same on every run.
train_files = sorted(glob.glob(os.path.join(DATA_ROOT, "train_data*")))
val_files = sorted(glob.glob(os.path.join(DATA_ROOT, "validation_data*")))
test_file = os.path.join(DATA_ROOT, "test_data.csv")
target_col = "credit"

# Fail fast with a clear message instead of passing an empty file list on to
# RayDMatrix / training.
if not train_files:
    raise FileNotFoundError(f"No files matching 'train_data*' found in {DATA_ROOT}")
if not val_files:
    raise FileNotFoundError(f"No files matching 'validation_data*' found in {DATA_ROOT}")

# Train/validation matrices are built from the file lists; the test set is also
# kept as a DataFrame because the evaluation cell reads its label column.
rdm_train = xgbr.RayDMatrix(train_files, label=target_col)
rdm_val = xgbr.RayDMatrix(val_files, label=target_col)
df_test = pd.read_csv(test_file)
rdm_test = xgbr.RayDMatrix(df_test, label=target_col)

## Initial Model Training with MLflow Tracking

In [None]:
with mlflow.start_run(run_name='initial_model') as run:
    # Baseline booster settings; logged to the run before training starts.
    param = dict(
        seed=1234,
        max_depth=3,
        eta=0.1,
        objective="binary:logistic",
        eval_metric=["logloss", "error"],
    )
    mlflow.log_params(param)

    # Ray resources for distributed training (also used by later cells).
    xgb_ray_params = xgbr.RayParams(num_actors=RAY_ACTORS, cpus_per_actor=RAY_CPUS_PER_ACTOR)

    # evals_result is filled in by xgbr.train with per-round metric values.
    evals_result = {}
    bst = xgbr.train(
        param,
        rdm_train,
        num_boost_round=50,
        evals=[(rdm_train, "train"), (rdm_val, "val")],
        evals_result=evals_result,
        verbose_eval=True,
        ray_params=xgb_ray_params,
    )

    # Record the last-round error of each evaluation set on the run.
    for metric_name, split in (("train_error", "train"), ("val_error", "val")):
        mlflow.log_metric(metric_name, evals_result[split]["error"][-1])

    print(f"Final training error: {evals_result['train']['error'][-1]:.4f}")
    print(f"Final validation error: {evals_result['val']['error'][-1]:.4f}")

## Hyperparameter Tuning with MLflow

In [None]:
# Tune search space: eta and max_depth are sampled per trial, the rest is fixed.
config = dict(
    seed=1234,
    eta=tune.loguniform(3e-3, 3e-1),
    max_depth=tune.randint(2, 6),
    objective="binary:logistic",
    eval_metric=["logloss", "error"],
)

In [None]:
def my_trainer(config):
    """Train one XGBoost-on-Ray model for a single parameter configuration.

    Opens an MLflow run (requested with ``nested=True``), logs ``config`` as the
    run's parameters, trains for 50 boosting rounds on ``rdm_train`` while
    evaluating on the train and validation matrices, logs the last-round error
    of both, then saves the booster to ``model.xgb`` in the current working
    directory and logs that file as a run artifact.

    Relies on the notebook globals ``rdm_train``, ``rdm_val`` and
    ``xgb_ray_params``.

    Args:
        config: XGBoost parameter dict passed to ``xgbr.train``.

    Returns:
        None.
    """
    with mlflow.start_run(nested=True):
        mlflow.log_params(config)

        # Populated by xgbr.train with per-round metrics for each eval set.
        history = {}
        booster = xgbr.train(
            params=config,
            dtrain=rdm_train,
            num_boost_round=50,
            evals=[(rdm_train, "train"), (rdm_val, "val")],
            evals_result=history,
            ray_params=xgb_ray_params,
        )

        for metric_name, split in (("train_error", "train"), ("val_error", "val")):
            mlflow.log_metric(metric_name, history[split]["error"][-1])

        model_file = "model.xgb"
        booster.save_model(model_file)
        mlflow.log_artifact(model_file)

In [None]:
with mlflow.start_run(run_name='hyperparameter_tuning') as run:
    # Run 10 sampled trials of my_trainer and pick the one with the lowest
    # "val-error".
    analysis = tune.run(
        my_trainer,
        config=config,
        resources_per_trial=xgb_ray_params.get_tune_resources(),
        local_dir=TUNE_ROOT,
        metric="val-error",
        mode="min",
        num_samples=10,
        verbose=1,
        progress_reporter=tune.JupyterNotebookReporter(overwrite=True)
    )

    mlflow.log_params({"best_" + k: v for k, v in analysis.best_config.items()})

    # The final evaluation cell loads MODEL_ROOT/tune_best.xgb, so persist the
    # best trial's model there; otherwise a fresh Run All has no such file.
    # NOTE(review): assumes each trial wrote model.xgb into its Tune log
    # directory and that TUNE_ROOT is readable from this kernel - confirm.
    os.makedirs(MODEL_ROOT, exist_ok=True)
    shutil.copy(
        os.path.join(analysis.best_logdir, "model.xgb"),
        os.path.join(MODEL_ROOT, "tune_best.xgb"),
    )

## Final Model Evaluation with MLflow Tracking

In [None]:
with mlflow.start_run(run_name='final_model_evaluation') as run:
    # Load the tuned booster (expects tune_best.xgb to already exist under
    # MODEL_ROOT) and attach the model file to this run.
    best_model_path = os.path.join(MODEL_ROOT, "tune_best.xgb")
    bst = xgb.Booster(model_file=best_model_path)
    mlflow.log_artifact(best_model_path)

    xgb_ray_params = xgbr.RayParams(
        num_actors=RAY_ACTORS,
        cpus_per_actor=RAY_CPUS_PER_ACTOR
    )

    # Predicted probabilities are thresholded at 0.5 to get class labels.
    predictions = xgbr.predict(bst, rdm_test, ray_params=xgb_ray_params)
    pred_class = (predictions > 0.5).astype("int")
    actuals = df_test[target_col]

    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round swaps precision and recall, so ground truth goes first.
    accuracy = accuracy_score(actuals, pred_class)
    precision = precision_score(actuals, pred_class)
    recall = recall_score(actuals, pred_class)
    f1 = f1_score(actuals, pred_class)

    mlflow.log_metrics({
        "test_accuracy": accuracy,
        "test_precision": precision,
        "test_recall": recall,
        "test_f1": f1
    })

    # Log feature importance plot
    fig, ax = plt.subplots(figsize=(10, 6))
    xgb.plot_importance(bst, importance_type="gain", max_num_features=10, ax=ax)
    ax.set_title("Feature Importance (Gain)")
    mlflow.log_figure(fig, "feature_importance.png")
    # Close this specific figure so it is neither re-rendered nor kept in memory.
    plt.close(fig)

    # Log the booster as an MLflow model under the artifact path "model".
    mlflow.xgboost.log_model(bst, "model")

    print(f"Accuracy on test: {accuracy:.2f}")
    print(f"Precision on test: {precision:.2f}")
    print(f"Recall on test: {recall:.2f}")
    print(f"F1 score on test: {f1:.2f}")