In [None]:
# Install required Python packages
%pip install -Uqq azure-ai-ml azure-identity azure-keyvault-secrets azureml-sdk requests pyyaml

In [None]:
import os
import sys

# Directory where you want to clone the repository
repo_dir = 'gretel-mlops'

# Check if the directory exists
if not os.path.exists(repo_dir):
    # Directory does not exist, clone the repository
    !git clone https://github.com/gretelai/gretel-mlops.git
else:
    print(f"The directory '{repo_dir}' already exists.")

# Import Gretel MLOps modules
gretel_mlops_path = os.getcwd() + "/gretel-mlops/src/"
if gretel_mlops_path not in sys.path:
    sys.path.append(gretel_mlops_path)

In [None]:
# Import necessary libraries
from azure.ai.ml import MLClient, dsl, Input, Output
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from azureml.core import Workspace
from azure.ai.ml.entities import Environment
import yaml
import json
from gretel_mlops.azure.azureai import pipeline

## 1. Connect to Azure ML Workspace and create ML client


In [None]:
# Name of the environment the pipeline steps run in, and the Conda file describing it
pipeline_job_env_name = "gretel-mlops-pipeline"
requirements_file = gretel_mlops_path + "/gretel_mlops/azure/azureai/requirements.yaml"

# Read the workspace details from the default configuration file
ws = Workspace.from_config()

# Build an ML client bound to that workspace
ml_client = pipeline.create_ml_client(ws.subscription_id, ws.resource_group, ws.name)

# Describe the environment: a base Docker image plus the Conda dependencies on top of it
env_definition = Environment(
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
    conda_file=requirements_file,
    name=pipeline_job_env_name,
    description="Environment for Gretel MLOps pipeline",
)

# Register (or update) the environment; the returned object carries the assigned version
pipeline_job_env = ml_client.environments.create_or_update(env_definition)

# Report which environment name and version ended up registered
print(
    f"Environment with name {pipeline_job_env.name} is registered to workspace, "
    f"the environment version is {pipeline_job_env.version}"
)



## 2. Fetch and load Gretel MLOps configuration from a YAML file

The code below creates or updates a data asset in the Azure ML workspace from the config file.
It creates a datastore and will prompt for Azure Storage account details.

Note: An Azure Storage account name and key are required for datastore creation. These can be found in the Azure Portal under your storage account's "Access keys" section.

In [None]:
# Path of the pipeline configuration YAML inside the cloned repository.
# os.path.join avoids the doubled separator that string formatting produced,
# since gretel_mlops_path already ends with one.
config_file = os.path.join(gretel_mlops_path, "gretel_mlops", "configs", "config_stroke.yaml")

# Load the YAML file into a Python dictionary.
# The encoding is explicit so parsing does not depend on the platform's default locale.
with open(config_file, "r", encoding="utf-8") as file:
    config_dict = yaml.safe_load(file)

# Name the configuration asset after the dataset it describes
config_asset_name = f"pipeline-config-{config_dict['dataset']['name']}"

# Create or update the config asset in the Azure ML workspace
config_asset = pipeline.create_asset_from_config(
    ml_client, ws, config_dict, config_asset_name
)

print(f"Config asset path: {config_asset.path}")

# Echo the parsed configuration in block style, keeping the key order of the file
print("Config file:")
yaml.dump(config_dict, sys.stdout, default_flow_style=False, sort_keys=False)

## 3. Build the pipeline


In [None]:
# Local directory holding the source code of the pipeline components
src_dir = "gretel-mlops/src/gretel_mlops/azure/azureai/components/"

# Workspace coordinates plus the registered environment (name and version)
# that the components are defined against
component_settings = dict(
    subscription_id=ws.subscription_id,
    resource_group=ws.resource_group,
    workspace_name=ws.name,
    pipeline_job_env_name=pipeline_job_env_name,
    pipeline_job_env_version=pipeline_job_env.version,
    src_dir=src_dir,
)

# Build the components; the pipeline definition looks them up by step name
components = pipeline.define_pipeline_components(**component_settings)

In [None]:
# Define the Gretel pipeline; every step runs on serverless compute
@dsl.pipeline(
    compute="serverless",
    description="Gretel MLOps pipeline",
)
def gretel_pipeline(pipeline_job_config, gretel_api_key, pipeline_job_registered_model_name):
    """
    Wire the Gretel MLOps steps into a single Azure ML pipeline.

    Each step is looked up by name in the notebook-level `components` mapping and chained
    through its `output_dir` output: preprocess -> gretel -> train -> evaluate -> register.
    Every step receives the same `config` input.

    Args:
        pipeline_job_config (Input): Pipeline configuration passed to every step as `config`; the submission cell supplies a `uri_file` Input pointing at the config asset.
        gretel_api_key (str): The API key for authenticating with the Gretel services; forwarded to the `gretel` step only.
        pipeline_job_registered_model_name (str): Name handed to the `register` step as `model_display_name`.

    Returns:
        dict: Maps `pipeline_job_<step>_outputs` keys to the `output_dir` output of the preprocess, gretel, train, eval and register steps.
    """


    # NOTE(review): Azure ML appears to derive the pipeline's node names from the
    # local variable names below (preprocess_op, gretel_op, ...) — confirm before renaming them.

    # Preprocessing step: only needs the configuration
    preprocess_op = components["preprocess"](config=pipeline_job_config)

    # Gretel synthetic data generation step, fed by the preprocessing output
    # NOTE(review): the API key travels as an ordinary pipeline input here —
    # confirm it is not recorded in plain text with the submitted job.
    gretel_op = components["gretel"](
        input_dir=preprocess_op.outputs.output_dir,
        gretel_api_key=gretel_api_key,
        config=pipeline_job_config
    )

    # Training step: consumes both the preprocessed data and the Gretel step's output
    train_op = components["train"](
        input_dir=preprocess_op.outputs.output_dir, 
        gretel_dir=gretel_op.outputs.output_dir,
        config=pipeline_job_config
    )

    # Evaluation step: consumes the preprocessed data and the training output
    eval_op = components["evaluate"](
        input_dir=preprocess_op.outputs.output_dir, 
        model_dir=train_op.outputs.output_dir,
        config=pipeline_job_config
    )

    # Model registration step: consumes the evaluation and training outputs
    register_op = components["register"](
        eval_dir=eval_op.outputs.output_dir, 
        model_dir=train_op.outputs.output_dir, 
        model_display_name=pipeline_job_registered_model_name,
        config=pipeline_job_config
    )

    # Expose every step's output directory as a named pipeline output.
    # The report cell later downloads "pipeline_job_eval_outputs" by this exact key.
    return {
        "pipeline_job_preprocess_outputs": preprocess_op.outputs.output_dir,
        "pipeline_job_gretel_outputs": gretel_op.outputs.output_dir,
        "pipeline_job_train_outputs": train_op.outputs.output_dir,
        "pipeline_job_eval_outputs": eval_op.outputs.output_dir,
        "pipeline_job_register_outputs": register_op.outputs.output_dir,
    }

## 4. Submit the pipeline job

In [None]:
# Location of the Gretel API key: the secret's name and the Azure Key Vault holding it
gretel_secret_name = "GretelApiKey"
gretel_key_vault_name = "GretelVault"

# Look the key up at run time; only the vault and secret names are written in the notebook
gretel_api_key = pipeline.get_secret(gretel_secret_name, gretel_key_vault_name)

In [None]:
# Both names below are keyed on the dataset named in the config
dataset_name = config_dict['dataset']['name']

# Name under which the trained model is registered
registered_model_name = f"gretel-model-{dataset_name}"

# Experiment under which this pipeline run is recorded
experiment_name = f"gretel-model-{dataset_name}-new2"

# Instantiate the pipeline with its three inputs; the config is passed as a file reference
pipeline_run_spec = gretel_pipeline(
    pipeline_job_config=Input(type="uri_file", path=config_asset.path),
    gretel_api_key=gretel_api_key,
    pipeline_job_registered_model_name=registered_model_name,
)

# Submit the pipeline job to the workspace
pipeline_job = ml_client.jobs.create_or_update(
    job=pipeline_run_spec,
    experiment_name=experiment_name,
)

# Stream the logs of the submitted job
ml_client.jobs.stream(pipeline_job.name)

## 5. Inspect Evaluation Report

In [None]:
# Download the evaluation output of the pipeline job and show its report

# Name of the pipeline output to fetch, and the local directory to download it into
eval_outputs = "pipeline_job_eval_outputs"
download_dir = "./"

ml_client.jobs.download(
    name=pipeline_job.name,
    download_path=download_dir,
    output_name=eval_outputs
)

# Build the report path from the same directory that was passed to the download,
# so the two locations cannot get out of step if the download directory changes.
evaluation_report_file = os.path.join(
    download_dir, "named-outputs", eval_outputs, "evaluation.json"
)

# Read the report as UTF-8 explicitly rather than relying on the platform default
with open(evaluation_report_file, "r", encoding="utf-8") as json_file:
    evaluation_report = json.load(json_file)

# Print the JSON data nicely with indentation
print(json.dumps(evaluation_report, indent=4))