# Data Science / Data Wrangling

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Convert the data into DMatrix format (a specific data structure used by XGBoost)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

import mlflow
import mlflow.xgboost

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("my-only-experiment")

with mlflow.start_run() as run:
    # Set the hyperparameters for the XGBoost model
    params = {
        "objective": "multi:softmax",  # Classification task with multiple classes
        "num_class": 3,  # Number of classes in the target variable
        "max_depth": 10,  # Maximum depth of each tree
        "eta": 0.1,  # Learning rate (step size shrinkage)
        "subsample": 0.8,  # Subsample ratio of the training instances
        "colsample_bytree": 0.8,  # Subsample ratio of columns when constructing each tree
        "eval_metric": "merror",  # Evaluation metric for multi-class classification (classification error)
    }

    mlflow.log_params(params)

    # Train the XGBoost model
    num_rounds = 100  # Number of boosting rounds (iterations)
    model = xgb.train(params, dtrain, num_rounds)

    mlflow.xgboost.log_model(model, "my_xgboost")

    # Make predictions on the test set
    y_pred = model.predict(dtest)

    # Convert the predicted labels to integers (as XGBoost returns float predictions)
    y_pred = y_pred.astype(int)

    # Calculate and print the accuracy score
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    mlflow.log_metric("accuracy", accuracy)

2023/08/07 08:24:35 INFO mlflow.tracking.fluent: Experiment with name 'my-only-experiment' does not exist. Creating a new experiment.


Accuracy: 1.0



mlflow.register_model(
    f"runs:/{run.info.run_id}/my_xgboost", "XGBOOSTMODEL"
)

In [21]:
from mlflow import MlflowClient 

client = MlflowClient()
client.transition_model_version_stage(
    name="XGBOOSTMODEL", version=1, stage="Production"
)

<ModelVersion: aliases=[], creation_timestamp=1691396981590, current_stage='Production', description='', last_updated_timestamp=1691398767053, name='XGBOOSTMODEL', run_id='e52d2743f8a64d75ab50d96cb63ba400', run_link='', source='mlflow-artifacts:/334997387425955836/e52d2743f8a64d75ab50d96cb63ba400/artifacts/my_xgboost', status='READY', status_message='', tags={}, user_id='', version='1'>

# Now you can check the model on mlflow ui
http://127.0.0.1:5000/#/models


# CI/CD

In [11]:
# Model Registered on the UI

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_registry_uri("http://localhost:5000")


reloaded_model = mlflow.xgboost.load_model("models:/XGBOOSTMODEL/1")

y_pred = model.predict(dtest)
y_pred

array([1., 0., 2., 1., 1., 0., 1., 2., 1., 1., 2., 0., 0., 0., 0., 1., 2.,
       1., 1., 2., 0., 2., 0., 2., 2., 2., 2., 2., 0., 0.], dtype=float32)

In [19]:
# We've put v1 to Production

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_registry_uri("http://localhost:5000")


reloaded_model = mlflow.xgboost.load_model("models:/XGBOOSTMODEL/production")

y_pred = reloaded_model.predict(dtest)
reloaded_model

# next: Build release artifact (docker)

<xgboost.core.Booster at 0x7fcc47ca9480>

In [13]:
dtest

<xgboost.core.DMatrix at 0x7fcc75f24340>

In [14]:
reloaded_model_pyfunc = mlflow.pyfunc.load_model("models:/XGBOOSTMODEL/production")
reloaded_model_pyfunc

mlflow.pyfunc.loaded_model:
  artifact_path: my_xgboost
  flavor: mlflow.xgboost
  run_id: e52d2743f8a64d75ab50d96cb63ba400

In [15]:
# Now this eats a numpy array / pandas
reloaded_model_pyfunc.predict(X_test)

array([1., 0., 2., 1., 1., 0., 1., 2., 1., 1., 2., 0., 0., 0., 0., 1., 2.,
       1., 1., 2., 0., 2., 0., 2., 2., 2., 2., 2., 0., 0.], dtype=float32)

In [16]:
# Start predition service:
MLFLOW_TRACKING_URI=http://localhost:5000 mlflow models serve -p 5001 -m models:/XGBOOSTMODEL/production --no-conda

SyntaxError: invalid syntax (3142634073.py, line 2)

In [33]:
!curl \
  -H "content-type: application/json" \
  localhost:5001/invocations \
  -d '{"inputs": [[1, 2, 3, 4], [1, 2, 3, 4]]}'

{"predictions": [2.0, 2.0]}