In [1]:
import azureml.core

from azureml.core import Experiment, Workspace, Dataset, Datastore
from azureml.train.automl import AutoMLConfig
from notebookutils import mssparkutils
from azureml.data.dataset_factory import TabularDatasetFactory

StatementMeta(sparkpool01, 7, 1, Finished, Available)

In [2]:
# Name passed to mssparkutils to look up the Azure ML workspace
# (presumably a linked service configured in the Synapse workspace — confirm).
linkedService_name = "AzureMLService2"
# Experiment name; reused below to derive the dataset name, the MLflow
# experiment name and the artifact path.
experiment_name = "cs-eur-research-dp-synw-aw_forecast_product_sales-20220411063203"

# Resolve the workspace handle and bind the experiment to it.
ws = mssparkutils.azureML.getWorkspace(linkedService_name)
experiment = Experiment(ws, experiment_name)

StatementMeta(sparkpool01, 7, 2, Finished, Available)

In [3]:
# Load the full training table as a Spark DataFrame.
df = spark.sql("SELECT * FROM default.aw_forecast_product_sales")

# Register the DataFrame as a tabular dataset named "<experiment_name>-dataset"
# in the workspace's default datastore.
# NOTE: register_spark_dataframe reports itself as experimental in its own log output.
datastore = Datastore.get_default(ws)
dataset = TabularDatasetFactory.register_spark_dataframe(df, datastore, name = experiment_name + "-dataset")

StatementMeta(sparkpool01, 7, 3, Finished, Available)

Method register_spark_dataframe: This is an experimental method, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Validating arguments.
Arguments validated.
Writing spark dataframe to managed-dataset/bfdd7655-f299-4908-8620-4efcc06fb33e
Creating new dataset
Registering new dataset
Successfully created and registered a new dataset.

In [4]:
# AutoML regression configuration trained on the registered dataset.
# `sc` is not defined in this notebook; presumably it is the SparkContext
# injected by the Spark notebook runtime — confirm.
automl_config = AutoMLConfig(spark_context = sc,
                             task = "regression",
                             training_data = dataset,
                             # Column the model learns to predict.
                             label_column_name = "TotalOrderQty",
                             primary_metric = "spearman_correlation",
                             # 0.25 hours = 15 minutes.
                             experiment_timeout_hours = 0.25,
                             max_concurrent_iterations = 2,
                             # Needed because the best model is later fetched with
                             # get_output(return_onnx_model=True).
                             enable_onnx_compatible_models = True)

StatementMeta(sparkpool01, 7, 4, Finished, Available)

In [5]:
# Submit the AutoML configuration; `run` is the handle to the submitted run
# and is used by the following cells.
run = experiment.submit(automl_config)

StatementMeta(sparkpool01, 7, 5, Finished, Available)

Submitting spark run.

In [6]:
# Render a clickable link to this run in the Azure Machine Learning portal.
# The href value is quoted because a portal URL may contain characters
# (e.g. "=" in a query string) that are not valid in an unquoted HTML attribute.
displayHTML("<a href='{}' target='_blank'>Your experiment in Azure Machine Learning portal: {}</a>".format(run.get_portal_url(), run.id))

StatementMeta(sparkpool01, 7, 6, Finished, Available)

In [7]:
# Block until the submitted AutoML run has finished before reading its output.
run.wait_for_completion()

import onnxruntime
import mlflow
import mlflow.onnx

from mlflow.models.signature import ModelSignature
from mlflow.types import DataType
from mlflow.types.schema import ColSpec, Schema

# Get best model from automl run.
# return_onnx_model=True requests the ONNX form of the model, which is what
# gets logged and registered through mlflow.onnx further down.
best_run, onnx_model = run.get_output(return_onnx_model=True)

# Define utility functions to infer the schema of ONNX model
def _infer_schema(data):
    """Build an MLflow ``Schema`` from ONNX input/output descriptors.

    Args:
        data: Iterable of objects exposing a ``type`` string (for example
            ``"tensor(float)"``) and a ``name`` attribute. In this notebook
            these are the results of an onnxruntime session's
            ``get_inputs()`` / ``get_outputs()``.

    Returns:
        A ``Schema`` holding one ``ColSpec`` per element, in iteration order.

    Raises:
        Exception: If an element's type has no mapping to an MLflow DataType.
    """
    res = []
    for col in data:
        # Reduce e.g. "tensor(int64)" to the bare element type "int64".
        t = col.type.replace("tensor(", "").replace(")", "")
        if t in ["bool"]:
            dt = DataType.boolean
        elif t in ["int8", "uint8", "int16", "uint16", "int32"]:
            # Was `DateType.integer`, an undefined name that raised NameError
            # for every small-integer tensor.
            dt = DataType.integer
        elif t in ["uint32", "int64"]:
            # uint32 does not fit a signed 32-bit integer, so it maps to long.
            dt = DataType.long
        elif t in ["float16", "bfloat16", "float"]:
            dt = DataType.float
        elif t in ["double"]:
            dt = DataType.double
        elif t in ["string"]:
            dt = DataType.string
        else:
            raise Exception("Unsupported type: " + t)
        res.append(ColSpec(type=dt, name=col.name))
    return Schema(res)

def _infer_signature(onnx_model):
    """Derive an MLflow ``ModelSignature`` from an ONNX model.

    The model is serialized and loaded into an onnxruntime inference
    session; the session's input and output descriptors are each converted
    to a schema with ``_infer_schema``.

    Args:
        onnx_model: Model object providing ``SerializeToString()``.

    Returns:
        ``ModelSignature`` built from the input schema and the output schema.
    """
    session = onnxruntime.InferenceSession(onnx_model.SerializeToString())
    input_schema = _infer_schema(session.get_inputs())
    output_schema = _infer_schema(session.get_outputs())
    return ModelSignature(input_schema, output_schema)

# Infer signature of ONNX model
signature = _infer_signature(onnx_model)

# Artifact path and registry name are both derived from the experiment name
# so they stay in sync if the experiment is renamed.
artifact_path = experiment_name + "_artifact"
registered_model_name = experiment_name + "-Best"

# Point MLflow at the Azure ML workspace and reuse the AutoML experiment name.
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
mlflow.set_experiment(experiment_name)

# The context variable is deliberately NOT called `run`: that name holds the
# AutoML run used at the top of this cell, and rebinding it would make the
# cell fail when executed a second time.
with mlflow.start_run() as mlflow_run:
    # Save the model to the outputs directory for capture
    mlflow.onnx.log_model(onnx_model, artifact_path, signature=signature)

    # Register the model to AML model registry
    mlflow.register_model("runs:/" + mlflow_run.info.run_id + "/" + artifact_path, registered_model_name)

StatementMeta(sparkpool01, 7, 7, Finished, Available)

Received unrecognized parameter dataset_id
Received unrecognized parameter dataset_id
Received unrecognized parameter dataset_id
Successfully registered model 'cs-eur-research-dp-synw-aw_forecast_product_sales-20220411063203-Best'.
2022/04/11 18:51:45 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: cs-eur-research-dp-synw-aw_forecast_product_sales-20220411063203-Best, version 1
Created version '1' of model 'cs-eur-research-dp-synw-aw_forecast_product_sales-20220411063203-Best'.

<ModelVersion: creation_timestamp=1649703105662, current_stage='None', description='', last_updated_timestamp=1649703105662, name='cs-eur-research-dp-synw-aw_forecast_product_sales-20220411063203-Best', run_id='fac0e7be-11bf-40df-a7b0-e39acfbf4858', run_link='', source='azureml://experiments/cs-eur-research-dp-synw-aw_forecast_product_sales-20220411063203/runs/fac0e7be-11bf-40df-a7b0-e39acfbf4858/artifacts/cs-eur-research-dp-synw-aw_forecast_product_sales-20220411063203_artifact', status='READY', status_message='', tags={}, user_id='', version='1'>