# Azure ML pipeline template

## Import libraries

In [2]:
import numpy as np
import pandas as pd

import sys, os
import mlflow  # install
import mlflow.azureml
import azureml.core  # install
from azureml.core import Workspace
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core import Experiment
from azureml.core import ComputeTarget

from azureml.core import ScriptRunConfig
import yaml
from azureml.core import Dataset

from azureml.core import Datastore
from distutils.dir_util import copy_tree

## Define workspace

In [None]:
# Load the workspace described by the .azureml/config.json file.
ws = Workspace.from_config()

# Report the library versions in use and which workspace was resolved.
print(f"SDK version: {azureml.core.VERSION}")
print(f"MLflow version: {mlflow.version.VERSION}")
print(*(ws.name, ws.resource_group, ws.location, ws.subscription_id), sep="\n")

# MLflow <-> AzureML integration: send MLflow tracking calls to the
# workspace's tracking URI (the azureml-mlflow package must be installed).
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())

## Define Experiment Name

In [None]:
# One name is shared by the MLflow experiment and the AzureML experiment.
experiment_name = "azureml-pipeline"
mlflow.set_experiment(experiment_name)  # make it the active MLflow experiment
exp = Experiment(workspace=ws, name=experiment_name)  # AzureML experiment handle


## Creating Pipeline

In [5]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
from azureml.core import Dataset

from azureml.pipeline.steps import HyperDriveStep, HyperDriveStepRun, PythonScriptStep
from azureml.pipeline.core import (
    Pipeline,
    PipelineData,
    TrainingOutput,
    PipelineParameter,
)
from azureml.data.output_dataset_config import OutputFileDatasetConfig
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig

## Define Data Path from input

In [None]:
from azureml.data.datapath import DataPath, DataPathComputeBinding, DataReference

# Example input: the "hr_dataset" folder on the workspace's default datastore.
path_on_datastore = "hr_dataset"
data_path = DataPath(ws.get_default_datastore(), path_on_datastore)

# Expose the path as a pipeline parameter named "representation", with the
# path above as its default value.
data_path_param = PipelineParameter("representation", default_value=data_path)

# Pair the parameter with a "mount" compute binding; this tuple is what the
# pipeline step receives as its input.
datapath_input = (data_path_param, DataPathComputeBinding("mount"))

### PythonScriptStep

In [None]:
from azureml.core.runconfig import RunConfiguration

# Docker image to use; the same run configuration can be shared by several steps.
base_image_name = "sleepiz.azurecr.io/sleepiz-sag-dl-base:V1.1.3"

run_config = RunConfiguration()
# Python dependencies are flagged as user-managed.
run_config.environment.python.user_managed_dependencies = True
# Run inside Docker, starting from the base image named above.
run_config.environment.docker.base_image = base_image_name
run_config.environment.docker.enabled = True

In [None]:
# dataset = Dataset.get_by_name(ws, name='hr_dataset',version =1) # dataset to use

# Reference to the directory shared between steps: this step writes into it
# and the following step reads from it. It targets the "results" path on the
# workspace's default datastore.
# NOTE: get_default_datastore must be *called* here -- `destination` takes a
# (datastore, path) tuple, so passing the bound method object is a bug.
augmented_data = OutputFileDatasetConfig(
    name="output_path", destination=(ws.get_default_datastore(), "results")
)

augmented_step = PythonScriptStep(
    name="Name step",
    script_name="step_script.py",  # file to use for preprocessing
    # keep files in different directories to re-use steps that have not changed
    source_directory="script_folder",
    compute_target=ComputeTarget(ws, "sag-cpu"),  # compute the step runs on
    arguments=[
        "--output_path",
        augmented_data,  # where to save data
        "--input-data",
        datapath_input,  # where to read data from
    ],
    inputs=[datapath_input],  # load the dataset as input directory
    allow_reuse=True,  # reuse the previous run if nothing changed
    runconfig=run_config,  # configuration (akin to Environment)
)

### Hyperdrive Step
1. Define the parameter sampling strategy
2. Choose the metric to optimize and its direction
3. Build the HyperDrive configuration

#### Training source

In [None]:
# Docker image to use for the training runs.
base_image_name = "sleepiz.azurecr.io/sleepiz-sag-dl-base:V1.1.3"

# Training environment: Docker-based, with user-managed Python dependencies.
env_training = Environment(name="sleepiz")
env_training.python.user_managed_dependencies = True
env_training.docker.base_image = base_image_name
env_training.docker.enabled = True
env_training.environment_variables = {"AZUREML_COMPUTE_USE_COMMON_RUNTIME": "false"}
# Optional tweaks, currently disabled:
# env_training.python.interpreter_path = '/root/miniconda3/envs/deep_breath/bin/python' # remove env from docker img
# env_training.python.conda_dependencies.add_pip_package('onnxruntime-gpu')

# The output of the previous step becomes the input of this one.
training_data = augmented_data.as_input(name="input_path")

# Training script, its arguments, and where/how it runs.
src = ScriptRunConfig(
    source_directory="training/",
    script="train.py",
    arguments=["--input-data", training_data],
    compute_target=ComputeTarget(workspace=ws, name="sag-gpu"),
    environment=env_training,
)


#### Hyperparameters tuning

In [8]:
from azureml.train.hyperdrive import (
    BayesianParameterSampling,
    choice,
    PrimaryMetricGoal,
    HyperDriveRun,
    HyperDriveConfig,
)

In [None]:
# Hyperparameter search space, explored with Bayesian sampling.
param_sampling = BayesianParameterSampling(
    {
        "--dense-units": choice(5, 10, 20),
        "--n-epochs": choice(2),
    }
)

# Metric to monitor (ideally this should be validation_prc) and the
# direction in which to optimize it.
primary_metric_goal = PrimaryMetricGoal.MAXIMIZE
primary_metric_name = "prc"

# Total number of runs for hyperparameter tuning.
max_total_runs = 2

hyperdrive_config = HyperDriveConfig(
    hyperparameter_sampling=param_sampling,  # parameter space
    primary_metric_name=primary_metric_name,
    primary_metric_goal=primary_metric_goal,
    max_total_runs=max_total_runs,
    # Must not be high, otherwise Bayesian sampling cannot use information
    # from past runs.
    max_concurrent_runs=2,
    policy=None,
    run_config=src,  # ScriptRunConfig with the training script
    # To start from the last iteration of a parent run (needs the same
    # hyperparameter space), set:
    # resume_child_runs =
)


# Metrics are saved to blob storage to automate picking the best model.
# (Not working at the moment.)
metrics_output_name = "metrics_output"
metrics_output = PipelineData(
    name="metrics_output",
    datastore=ws.get_default_datastore(),
)

# The best model found is saved to blob storage: the model file is taken from
# the run selected by the "prc" metric.
model_output_name = "model_output"
saved_model = PipelineData(
    name="saved_model",
    datastore=ws.get_default_datastore(),
    pipeline_output_name=model_output_name,
    training_output=TrainingOutput(
        type="Model",
        metric="prc",
        model_file="./outputs/model/model.h5",
    ),
)

# HyperDrive step: runs the tuning configuration as a pipeline step, reading
# the previous step's output and producing the saved model and metrics.
hd_step = HyperDriveStep(
    name="HP tuning",
    hyperdrive_config=hyperdrive_config,
    allow_reuse=True,
    inputs=[training_data],  # data produced by the previous step
    outputs=[saved_model],  # best model found
    metrics_output=metrics_output,
)

### Running pipeline

In [None]:
from azureml.pipeline.steps import HyperDriveStep, HyperDriveStepRun, PythonScriptStep
from azureml.pipeline.core import (
    Pipeline,
    PipelineData,
    TrainingOutput,
    PipelineParameter,
    PipelineRun,
)
from azureml.data.output_dataset_config import OutputFileDatasetConfig
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig


# Create the pipeline from the preprocessing step and the HyperDrive step.
pipeline = Pipeline(workspace=ws, steps=[augmented_step, hd_step])

# Validate the pipeline before submitting it.
pipeline.validate()

# Submit the pipeline to the experiment.
# NOTE(review): show_output may not be an option Experiment.submit acts on;
# confirm whether pipeline_run.wait_for_completion(show_output=True) was meant.
pipeline_run = exp.submit(config=pipeline, show_output=True)

For more details see the WIKI [page]