# Model Training with MLflow Integration

This notebook extends the original credit risk prediction model with MLflow tracking capabilities.

In [2]:
import os
import ray
import glob
import eli5
import mlflow
import xgboost_ray as xgbr
import xgboost as xgb
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error
from ray import tune

# Set up MLflow tracking URI - replace with your MLflow server URL
#mlflow.set_tracking_uri('http://your-mlflow-server:5000')
#mlflow.set_experiment('credit-risk-prediction')

# Enable MLflow autologging for XGBoost
mlflow.xgboost.autolog()

DATA_ROOT = os.path.join("/mnt/data", os.environ["DOMINO_PROJECT_NAME"], "data") 
MODEL_ROOT = "/mnt/artifacts"
TUNE_ROOT = os.path.join("/mnt/data", os.environ["DOMINO_PROJECT_NAME"], "ray_results")

In [3]:
# Ray setup
RAY_ACTORS = 3
RAY_CPUS_PER_ACTOR = 4

if ray.is_initialized() == False:
    service_host = os.environ["RAY_HEAD_SERVICE_HOST"]
    service_port = os.environ["RAY_HEAD_SERVICE_PORT"]
    ray.init(f"ray://{service_host}:{service_port}")

In [4]:
train_files = glob.glob(os.path.join(DATA_ROOT, "train_data*"))
val_files = glob.glob(os.path.join(DATA_ROOT, "validation_data*"))
test_file = os.path.join(DATA_ROOT, "test_data.csv")
target_col = "credit"

rdm_train = xgbr.RayDMatrix(train_files, label=target_col)
rdm_val = xgbr.RayDMatrix(val_files, label=target_col)
df_test = pd.read_csv(test_file)
rdm_test = xgbr.RayDMatrix(df_test, label=target_col)

## Initial Model Training with MLflow Tracking

In [5]:
with mlflow.start_run(run_name='initial_model') as run:
    param = {
        "seed": 1234,
        "max_depth": 3,
        "eta": 0.1,
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"]
    }
    
    mlflow.log_params(param)
    
    xgb_ray_params = xgbr.RayParams(
        num_actors=RAY_ACTORS,
        cpus_per_actor=RAY_CPUS_PER_ACTOR
    )
    
    evals_result = {}
    bst = xgbr.train(
        param,
        rdm_train,
        num_boost_round=50,
        verbose_eval=True,
        evals_result=evals_result,
        evals=[(rdm_train, "train"), (rdm_val, "val")],
        ray_params=xgb_ray_params
    )
    
    mlflow.log_metric("train_error", evals_result["train"]["error"][-1])
    mlflow.log_metric("val_error", evals_result["val"]["error"][-1])
    
    print(f"Final training error: {evals_result['train']['error'][-1]:.4f}")
    print(f"Final validation error: {evals_result['val']['error'][-1]:.4f}")

Use get_node_id() instead
  current_node_id = ray.get_runtime_context().node_id.hex()
[2m[36m(_wrapped pid=458)[0m 2025-01-22 12:57:10,401	INFO main.py:1047 -- [RayXGBoost] Created 3 new actors (3 total actors). Waiting until actors are ready for training.
[2m[36m(_wrapped pid=458)[0m 2025-01-22 12:57:21,182	INFO main.py:1092 -- [RayXGBoost] Starting XGBoost training.
[2m[36m(_RemoteRayXGBoostActor pid=173, ip=100.64.57.144)[0m [12:57:21] task [xgboost.ray]:124137787360064 got new rank 0
[2m[36m(_RemoteRayXGBoostActor pid=210, ip=100.64.68.246)[0m [12:57:21] task [xgboost.ray]:128020571211280 got new rank 1
[2m[36m(_RemoteRayXGBoostActor pid=172, ip=100.64.73.105)[0m [12:57:21] task [xgboost.ray]:134138783786944 got new rank 2


[2m[36m(_wrapped pid=458)[0m [0]	train-logloss:0.65890	train-error:0.22747	val-logloss:0.65631	val-error:0.20883
[2m[36m(_wrapped pid=458)[0m [1]	train-logloss:0.63109	train-error:0.22331	val-logloss:0.62514	val-error:0.20021
[2m[36m(_wrapped pid=458)[0m [2]	train-logloss:0.60870	train-error:0.22340	val-logloss:0.60081	val-error:0.20959
[2m[36m(_wrapped pid=458)[0m [3]	train-logloss:0.58690	train-error:0.18629	val-logloss:0.57680	val-error:0.15792
[2m[36m(_wrapped pid=458)[0m [4]	train-logloss:0.56898	train-error:0.19034	val-logloss:0.55650	val-error:0.16499
[2m[36m(_wrapped pid=458)[0m [5]	train-logloss:0.55210	train-error:0.20139	val-logloss:0.53913	val-error:0.17972
[2m[36m(_wrapped pid=458)[0m [6]	train-logloss:0.53646	train-error:0.18746	val-logloss:0.52258	val-error:0.16193
[2m[36m(_wrapped pid=458)[0m [7]	train-logloss:0.52353	train-error:0.18899	val-logloss:0.50922	val-error:0.16659
[2m[36m(_wrapped pid=458)[0m [8]	train-logloss:0.51097	train-error:0.

[2m[36m(_wrapped pid=458)[0m 2025-01-22 12:57:51,678	INFO main.py:1175 -- Training in progress (30 seconds since last restart).


[2m[36m(_wrapped pid=458)[0m [33]	train-logloss:0.34684	train-error:0.11448	val-logloss:0.33241	val-error:0.10723
[2m[36m(_wrapped pid=458)[0m [34]	train-logloss:0.34377	train-error:0.11517	val-logloss:0.32920	val-error:0.10568
[2m[36m(_wrapped pid=458)[0m [35]	train-logloss:0.33971	train-error:0.11360	val-logloss:0.32577	val-error:0.10412
[2m[36m(_wrapped pid=458)[0m [36]	train-logloss:0.33605	train-error:0.10761	val-logloss:0.32224	val-error:0.10097
[2m[36m(_wrapped pid=458)[0m [37]	train-logloss:0.33335	train-error:0.10382	val-logloss:0.31979	val-error:0.09627
[2m[36m(_wrapped pid=458)[0m [38]	train-logloss:0.32945	train-error:0.10460	val-logloss:0.31619	val-error:0.09786
[2m[36m(_wrapped pid=458)[0m [39]	train-logloss:0.32654	train-error:0.10505	val-logloss:0.31294	val-error:0.09787
[2m[36m(_wrapped pid=458)[0m [40]	train-logloss:0.32362	train-error:0.10278	val-logloss:0.31013	val-error:0.09549
[2m[36m(_wrapped pid=458)[0m [41]	train-logloss:0.32094	train

[2m[36m(_wrapped pid=458)[0m 2025-01-22 12:58:04,726	INFO main.py:1587 -- [RayXGBoost] Finished XGBoost training on training data with total N=2,100,000 in 55.32 seconds (43.53 pure XGBoost training time).


Final training error: 0.0941
Final validation error: 0.0876


## Hyperparameter Tuning with MLflow

In [6]:
config = {
    "seed": 1234,
    "eta": tune.loguniform(3e-3, 3e-1),
    "max_depth": tune.randint(2, 6),
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "error"]
}

In [7]:
def my_trainer(config):
    with mlflow.start_run(nested=True) as run:
        mlflow.log_params(config)
        
        evals_result = {}
        bst = xgbr.train(
            params=config,
            dtrain=rdm_train,
            num_boost_round=50,
            evals_result=evals_result,
            evals=[(rdm_train, "train"), (rdm_val, "val")],
            ray_params=xgb_ray_params
        )
        
        mlflow.log_metric("train_error", evals_result["train"]["error"][-1])
        mlflow.log_metric("val_error", evals_result["val"]["error"][-1])
        
        bst.save_model("model.xgb")
        mlflow.log_artifact("model.xgb")

In [8]:
with mlflow.start_run(run_name='hyperparameter_tuning') as run:
    analysis = tune.run(
        my_trainer,
        config=config,
        resources_per_trial=xgb_ray_params.get_tune_resources(),
        local_dir=TUNE_ROOT,
        metric="val-error",
        mode="min",
        num_samples=10,
        verbose=1,
        progress_reporter=tune.JupyterNotebookReporter(overwrite=True)
    )
    
    mlflow.log_params({"best_" + k: v for k, v in analysis.best_config.items()})

0,1
Current time:,2025-01-22 13:00:28
Running for:,00:00:46.06
Memory:,3.4/30.8 GiB

Trial name,# failures,error file
my_trainer_ca21d_00000,1,"/mnt/data/Demo-Credit-Default-Model/ray_results/my_trainer_2025-01-22_12-59-37/my_trainer_ca21d_00000_0_eta=0.1976,max_depth=2_2025-01-22_12-59-42/error.txt"
my_trainer_ca21d_00001,1,"/mnt/data/Demo-Credit-Default-Model/ray_results/my_trainer_2025-01-22_12-59-37/my_trainer_ca21d_00001_1_eta=0.0110,max_depth=2_2025-01-22_12-59-49/error.txt"
my_trainer_ca21d_00002,1,"/mnt/data/Demo-Credit-Default-Model/ray_results/my_trainer_2025-01-22_12-59-37/my_trainer_ca21d_00002_2_eta=0.0176,max_depth=4_2025-01-22_12-59-55/error.txt"
my_trainer_ca21d_00003,1,"/mnt/data/Demo-Credit-Default-Model/ray_results/my_trainer_2025-01-22_12-59-37/my_trainer_ca21d_00003_3_eta=0.2888,max_depth=5_2025-01-22_13-00-01/error.txt"
my_trainer_ca21d_00004,1,"/mnt/data/Demo-Credit-Default-Model/ray_results/my_trainer_2025-01-22_12-59-37/my_trainer_ca21d_00004_4_eta=0.1330,max_depth=4_2025-01-22_13-00-05/error.txt"
my_trainer_ca21d_00005,1,"/mnt/data/Demo-Credit-Default-Model/ray_results/my_trainer_2025-01-22_12-59-37/my_trainer_ca21d_00005_5_eta=0.0394,max_depth=4_2025-01-22_13-00-09/error.txt"
my_trainer_ca21d_00006,1,"/mnt/data/Demo-Credit-Default-Model/ray_results/my_trainer_2025-01-22_12-59-37/my_trainer_ca21d_00006_6_eta=0.0966,max_depth=4_2025-01-22_13-00-13/error.txt"
my_trainer_ca21d_00007,1,"/mnt/data/Demo-Credit-Default-Model/ray_results/my_trainer_2025-01-22_12-59-37/my_trainer_ca21d_00007_7_eta=0.2250,max_depth=5_2025-01-22_13-00-17/error.txt"
my_trainer_ca21d_00008,1,"/mnt/data/Demo-Credit-Default-Model/ray_results/my_trainer_2025-01-22_12-59-37/my_trainer_ca21d_00008_8_eta=0.0460,max_depth=3_2025-01-22_13-00-21/error.txt"
my_trainer_ca21d_00009,1,"/mnt/data/Demo-Credit-Default-Model/ray_results/my_trainer_2025-01-22_12-59-37/my_trainer_ca21d_00009_9_eta=0.0049,max_depth=2_2025-01-22_13-00-25/error.txt"

Trial name,status,loc,eta,max_depth
my_trainer_ca21d_00000,ERROR,100.64.73.105:252,0.19758,2
my_trainer_ca21d_00001,ERROR,100.64.68.246:293,0.0109623,2
my_trainer_ca21d_00002,ERROR,100.64.57.144:255,0.0176089,4
my_trainer_ca21d_00003,ERROR,100.64.73.105:351,0.288773,5
my_trainer_ca21d_00004,ERROR,100.64.73.105:450,0.133049,4
my_trainer_ca21d_00005,ERROR,100.64.38.234:1388,0.0394136,4
my_trainer_ca21d_00006,ERROR,100.64.38.234:1551,0.0966059,4
my_trainer_ca21d_00007,ERROR,100.64.73.105:549,0.224958,5
my_trainer_ca21d_00008,ERROR,100.64.68.246:392,0.0460114,3
my_trainer_ca21d_00009,ERROR,100.64.73.105:648,0.00488958,2


[2m[36m(run pid=804)[0m 2025-01-22 12:59:48,305	ERROR trial_runner.py:1450 -- Trial my_trainer_ca21d_00000: Error happened when processing _ExecutorEventType.TRAINING_RESULT.
[2m[36m(run pid=804)[0m ray.exceptions.RayTaskError(MlflowException): [36mray::ImplicitFunc.train()[39m (pid=252, ip=100.64.73.105, repr=my_trainer)
[2m[36m(run pid=804)[0m   File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/tune/trainable/trainable.py", line 384, in train
[2m[36m(run pid=804)[0m     raise skipped from exception_cause(skipped)
[2m[36m(run pid=804)[0m   File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/tune/trainable/function_trainable.py", line 336, in entrypoint
[2m[36m(run pid=804)[0m     return self._trainable_func(
[2m[36m(run pid=804)[0m   File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/tune/trainable/function_trainable.py", line 653, in _trainable_func
[2m[36m(run pid=804)[0m     output = fn()
[2m[36m(run pid=804)[0m   File "/tmp/ipykern

type: [36mray::run()[39m (pid=804, ip=100.64.38.234)
  File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/tune/tune.py", line 939, in run
    raise TuneError("Trials did not complete", incomplete_trials)
ray.tune.error.TuneError: ('Trials did not complete', [my_trainer_ca21d_00000, my_trainer_ca21d_00001, my_trainer_ca21d_00002, my_trainer_ca21d_00003, my_trainer_ca21d_00004, my_trainer_ca21d_00005, my_trainer_ca21d_00006, my_trainer_ca21d_00007, my_trainer_ca21d_00008, my_trainer_ca21d_00009])

## Final Model Evaluation with MLflow Tracking

In [None]:
with mlflow.start_run(run_name='final_model_evaluation') as run:
    bst = xgb.Booster(model_file=os.path.join(MODEL_ROOT, "tune_best.xgb"))
    mlflow.log_artifact(os.path.join(MODEL_ROOT, "tune_best.xgb"))
    
    xgb_ray_params = xgbr.RayParams(
        num_actors=RAY_ACTORS,
        cpus_per_actor=RAY_CPUS_PER_ACTOR
    )
    
    predictions = xgbr.predict(bst, rdm_test, ray_params=xgb_ray_params)
    pred_class = (predictions > 0.5).astype("int")
    actuals = df_test[target_col]
    
    accuracy = accuracy_score(pred_class, actuals)
    precision = precision_score(pred_class, actuals)
    recall = recall_score(pred_class, actuals)
    f1 = f1_score(pred_class, actuals)
    
    mlflow.log_metrics({
        "test_accuracy": accuracy,
        "test_precision": precision,
        "test_recall": recall,
        "test_f1": f1
    })
    
    # Log feature importance plot
    fig, ax = plt.subplots(figsize=(10, 6))
    xgb.plot_importance(bst, importance_type="gain", max_num_features=10, ax=ax)
    plt.title("Feature Importance (Gain)")
    mlflow.log_figure(fig, "feature_importance.png")
    plt.close()
    
    # Log model to MLflow model registry
    mlflow.xgboost.log_model(bst, "model")
    
    print(f"Accuracy on test: {accuracy:.2f}")
    print(f"Precision on test: {precision:.2f}")
    print(f"Recall on test: {recall:.2f}")
    print(f"F1 score on test: {f1:.2f}")