## MLflow Integration with Hummingbird

In this notebook, we will load a dataset, fit a scikit-learn model on it, and use that model to make predictions. Then, we will use Hummingbird to convert the model into a PyTorch model and make predictions with that.

In [1]:
# Uncomment to install Hummingbird together with its optional extras:
# %pip install hummingbird_ml[extra]

In [2]:
# Adapted from the Databricks MLflow quick-start notebook: https://docs.databricks.com/_static/notebooks/mlflow/mlflow-quick-start-python.html

import mlflow
import mlflow.sklearn
import pandas as pd
import matplotlib.pyplot as plt

from numpy import savetxt
 
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from hummingbird.ml import convert

# Seed shared by the data split and the forest so that a fresh
# "Restart & Run All" reproduces the same split, model, and predictions.
RANDOM_SEED = 42

# Load the diabetes dataset and hold out a test split.
db = load_diabetes()
X = db.data
y = db.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED)

# Enable autolog().
# mlflow.sklearn.autolog() requires mlflow 1.11.0 or above.
mlflow.sklearn.autolog()
# mlflow.pytorch.autolog()

# With autolog() enabled, all model parameters, a model score, and the fitted
# model are automatically logged.
with mlflow.start_run():
    # Set the model parameters.
    n_estimators = 100
    max_depth = 6
    max_features = 3

    # Create and train the model.
    rf_model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        random_state=RANDOM_SEED,
    )
    rf_model.fit(X_train, y_train)

    # Use the model to make predictions on the test dataset.
    rf_pred = rf_model.predict(X_test)

    # Convert the fitted scikit-learn model to PyTorch with Hummingbird.
    torch_model = convert(rf_model, 'torch')

    # Predict with the converted model on the same test data.
    pred_cpu_hb = torch_model.predict(X_test)



In [3]:
import mlflow
import mlflow.sklearn
import pandas as pd
import matplotlib.pyplot as plt

from numpy import savetxt
 
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from hummingbird.ml import convert

# Fixed seed so the split and the forest are reproducible across kernel restarts.
RANDOM_SEED = 42

# Load the diabetes dataset and hold out a test split.
db = load_diabetes()
X = db.data
y = db.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED)

# Set the model parameters.
n_estimators = 100
max_depth = 6
max_features = 3

# Create and train the model. No explicit mlflow.start_run() is opened in this cell.
rf_model = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_features=max_features,
    random_state=RANDOM_SEED,
)
rf_model.fit(X_train, y_train)



2022/06/01 11:35:29 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '31d688a80cc843388d6161cf1bf9b745', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


In [4]:
%%timeit
# Time prediction with the scikit-learn random forest on the held-out test split.
rf_pred = rf_model.predict(X_test)

33.3 ms ± 8.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
# Convert the fitted scikit-learn model to a PyTorch model with Hummingbird.
torch_model = convert(rf_model, 'torch')  # 'torch' is the target backend for the conversion



In [6]:
%%timeit  
# Time prediction with the Hummingbird-converted model on the same test split.
pred_cpu_hb = torch_model.predict(X_test)

626 µs ± 93.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
