Install Domino Data SDK (for training set creation)

In [None]:
pip install --user dominodatalab-data

Import libraries

In [None]:
import mlflow
from mlflow.models import infer_signature

Load Data

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Fetch the California housing dataset (as_frame=True requests pandas objects).
cali_housing = fetch_california_housing(as_frame=True)

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    cali_housing.data,
    cali_housing.target,
    test_size=0.2,
    random_state=123,
)

Create the training set

In [None]:
from domino_data.training_sets import client, model
import pandas as pd

# Build one frame holding the training features plus the target column.
training_df = pd.DataFrame(data=X_train, columns=cali_housing.feature_names)
training_df[cali_housing.target_names[0]] = y_train

# Submit the frame as a version of the "cali_housing_regression" training set.
# No key columns, excluded columns, or monitoring column groups are declared.
tsv = client.create_training_set_version(
    training_set_name="cali_housing_regression",
    df=training_df,
    key_columns=[],
    target_columns=cali_housing.target_names,
    exclude_columns=[],
    meta={"experiment_id": "0.1"},
    monitoring_meta=model.MonitoringMeta(
        categorical_columns=[],
        timestamp_columns=[],
        ordinal_columns=[],
    ),
)

print(f"TrainingSetVersion {tsv.training_set_name}:{tsv.number}")

Create the scikit-learn model and wrap it as an MLflow Python model

In [None]:
from sklearn.linear_model import LinearRegression
from domino_data_capture.data_capture_client import DataCaptureClient
import uuid
import datetime

# Fit a linear regression on the training partition.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Derive the MLflow signature from the training inputs and the model's outputs on them.
predictions = lin_reg.predict(X_train)
signature = infer_signature(model_input=X_train, model_output=predictions)

# Capture client built from the dataset's feature and target names;
# HousingModel.predict reports every prediction through it.
data_capture_client = DataCaptureClient(
    cali_housing.feature_names,
    cali_housing.target_names,
)

class HousingModel(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper that predicts with a fitted model and captures each prediction.

    Every call to ``predict`` forwards the inputs to the wrapped model and then
    reports each (input row, prediction) pair through the module-level
    ``data_capture_client``.
    """

    def __init__(self, model):
        """Store the fitted estimator; ``predict`` calls its ``predict`` method."""
        self.model = model

    def predict(self, context, model_input, params=None):
        """Predict for a batch of rows, capture every prediction, and return the predictions.

        Args:
            context: Context object passed in by MLflow; not used here.
            model_input: Batch of feature rows, either a pandas DataFrame or a
                list of lists (one inner list per row).
            params: Optional inference parameters; not used here.

        Returns:
            The wrapped model's predictions for ``model_input``, one per row.
        """
        # A single UTC timestamp is shared by every row of this batch.
        event_time = datetime.datetime.now(datetime.timezone.utc).isoformat()
        prediction = self.model.predict(model_input)

        # Rows are captured as plain lists; the caller's object is left untouched.
        if isinstance(model_input, pd.DataFrame):
            rows = model_input.values.tolist()
        else:
            rows = model_input

        for row, predicted in zip(rows, prediction):
            # Each captured event gets its own freshly generated id.
            data_capture_client.capturePrediction(
                row,
                [predicted],
                event_id=uuid.uuid4(),
                timestamp=event_time,
            )

        # Hand the predictions back; without this the model would always answer None.
        return prediction

# Wrap the fitted regressor in the HousingModel pyfunc class defined above.
# NOTE(review): this rebinds `model`, shadowing the `model` module imported from
# domino_data.training_sets in the training-set cell; re-run that import before
# using model.MonitoringMeta again in this kernel.
model = HousingModel(lin_reg)

Create the Pyfunc and log it to MLflow

In [None]:
# Log the wrapped model with the pyfunc flavor: `python_model` is a parameter of
# mlflow.pyfunc.log_model, whereas mlflow.sklearn.log_model expects a scikit-learn
# estimator passed as `sk_model`.
with mlflow.start_run() as run:
    model_info = mlflow.pyfunc.log_model(
        artifact_path="sklearn-model",
        python_model=model,
        signature=signature,
        # Also registers the logged model under this name in the model registry.
        registered_model_name="sklearn-model",
    )

print(model_info)