In this notebook, we will make use of AzureML storage and compute resources for the training process.

In [None]:
from pathlib import Path
import pandas as pd
import tempfile
from joblib import dump

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

import mlflow
from azureml.core import Dataset, Workspace, Experiment, Environment, ScriptRunConfig
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

In [None]:
from templates.experiment_tracking.train_baseline import create_decision_boundary_figure

In [None]:
# Local folder holding train.csv / test.csv; it is also the download target
# for the registered dataset later in the notebook.
DATA_DIR = Path("data/")

Create a dataset in AzureML

In [None]:
# Load the train and test splits from the local data folder.
df_train, df_test = (
    pd.read_csv(DATA_DIR / csv_name) for csv_name in ("train.csv", "test.csv")
)

In [None]:
# Load the workspace described by the local AzureML config file.
workspace = Workspace.from_config()
# Default datastore of the workspace; the dataset files are uploaded to it below.
datastore = workspace.get_default_datastore()

In [None]:
# Show the datasets that are already registered in this workspace.
workspace.datasets

In [None]:
# Name under which the dataset is registered in AzureML. It has to match the
# `--dataset_name` that download_dataset.py receives on the compute instance
# (its default and the submitted command both use "moon_dataset").
dataset_name = "moon_dataset"

In [None]:
# Upload the train/test frames to the default datastore and register them as a
# file dataset. `overwrite=True` and `create_new_version=True` keep this cell
# re-runnable: without them a second run collides with the files and the
# dataset name created by the first run.
with tempfile.TemporaryDirectory() as tmp_dir:
    # We can only upload from disk, so the frames are first written to a temporary directory.
    upload_dir = Path(tmp_dir)
    df_train.to_csv(upload_dir / "train.csv", index=False)
    df_test.to_csv(upload_dir / "test.csv", index=False)

    # Upload the files to the default datastore (blobstore container).
    # Inside the container they end up in the "datasets/moons" folder.
    blob_store_path = "datasets/moons"
    Dataset.File.upload_directory(
        tmp_dir, (datastore, blob_store_path), overwrite=True
    )

    # Create a dataset that points at the uploaded folder and register it by name.
    dataset = Dataset.File.from_files(path=[(datastore, blob_store_path)])
    dataset.register(workspace, dataset_name, create_new_version=True)

In [None]:
# Fetch the registered dataset and download its files into DATA_DIR.
# DATA_DIR already contains train.csv / test.csv (they were read from there
# earlier), so overwrite=True is needed for the download to succeed. The target
# is passed as a plain string path.
dataset = Dataset.get_by_name(workspace=workspace, name=dataset_name)
dataset.download(target_path=str(DATA_DIR), overwrite=True)

In [None]:
# Point MLflow at the tracking URI provided by the AzureML workspace.
azure_ws_mlflow_tracking_uri = workspace.get_mlflow_tracking_uri()
mlflow.set_tracking_uri(azure_ws_mlflow_tracking_uri)
# Echo the active tracking URI as the cell output to confirm the change.
mlflow.get_tracking_uri()

In [None]:
# Experiment name shared by the AzureML Experiment object and by MLflow.
experiment_name = "dev-train"

In [None]:
# Get a handle to the AzureML experiment (the training job is submitted to it
# at the end of the notebook) and select the experiment with the same name in MLflow.
experiment = Experiment(workspace, experiment_name)
mlflow.set_experiment(experiment_name)

Using compute engine for training the model. 

For this, we need:
- An AzureML compute instance (one has already been created for you).
- A custom python environment for your compute instance.
- Scripts containing the training process.

In [None]:
# Name of the AzureML compute target the training job is submitted to.
compute_name = "azureml-mlflow-dev"

In [None]:
# Look up the compute target by name. The object is only displayed as the cell
# output; later cells refer to the compute target through `compute_name`.
ComputeTarget(workspace=workspace, name=compute_name)

In [None]:
# pip requirements file the custom environment is built from (relative to this notebook).
requirements_path = "../../requirements.txt"
# Name given to the custom AzureML environment.
environment_name = "dev-environment"

In [None]:
# Build a new custom environment from the pip requirements file.
env = Environment.from_pip_requirements(name=environment_name, file_path=str(requirements_path))

# The interpreter version is pinned separately, through the environment's conda dependencies.
env.python.conda_dependencies.set_python_version("3.8")


We need to run three scripts:
1. Download the dataset to our compute instance.
2. Run the training with MLflow.
3. Register the model to AzureML models.

First, we will create a script to download the dataset.

In [None]:
%%writefile download_dataset.py
from argparse import ArgumentParser
from azureml.core import Workspace, Dataset
parser = ArgumentParser()
parser.add_argument(
    '--dataset_name', 
    type=str, 
    default='moon_dataset', 
    help='The name of the dataset to download. The dataset must be registered in AzureML.')
parser.add_argument(
    '--output_folder', 
    type=str, 
    default='data', 
    help='The folder to save the dataset to.')
args = parser.parse_args()

workspace = Workspace.from_config()

dataset = Dataset.get_by_name(workspace, args.dataset_name)
dataset.download(args.output_folder, overwrite=True)

Now, we will create a script for training the model with mlflow

In [None]:
%%writefile train_with_mlflow.py
import tempfile
from argparse import ArgumentParser
from typing import Any
from datetime import datetime

import mlflow
import pandas as pd
from joblib import dump
from azureml.core import Workspace
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

parser = ArgumentParser()
parser.add_argument("--train_dataset", type=str, default="data/train.csv")
parser.add_argument("--test_dataset", type=str, default="data/test.csv")
parser.add_argument("--n_cross_vals", type=int, default=5)
parser.add_argument("--max_depth", default=[2, 5, 10, None])
parser.add_argument("--n_estimators", default=[10, 25, 100])
parser.add_argument("--criterion", default=["gini", "entropy"])

args = parser.parse_args()
# This can be used to log all metrics and artifacts that are generated by the model.
# mlflow.autolog(log_models=True)
workspace = Workspace.from_config()

df_train = pd.read_csv(args.train_dataset)
df_test = pd.read_csv(args.test_dataset)

# We define the hyperparameters we want to tune
param_grid = {
    "n_estimators": args.n_estimators,
    "criterion": args.criterion,
    "max_depth": args.max_depth,
}
# We log the selected hyper-parameters to azureml using mlflow
# You can find the best hyper-parameters in the azureml UI under parameters.

mlflow.set_tracking_uri(workspace.get_mlflow_tracking_uri())

with mlflow.start_run(
    run_name=f"Moon_training-{datetime.now().strftime('%m/%d/%Y,%H:%M:%S')}"
) as run:
    run_id = run.info.run_id
    for param, value in param_grid.items():
        mlflow.log_param(f"gridsearch/{param}", str(value))

    model = RandomForestClassifier()
    grid_search = GridSearchCV(model, param_grid, cv=args.n_cross_vals, n_jobs=-1)

    # We train the model
    grid_search.fit(df_train[["x1", "x2"]], df_train["y"])
    model = grid_search.best_estimator_

    # Here we evaluate the model
    predictions = model.predict(df_test[["x1", "x2"]])
    test_accuracy = accuracy_score(df_test["y"], predictions)

    # We log the accuracy to azureml using mlflow
    # You can see the logged metrics in the azureml UI under the "Metrics" tab
    mlflow.log_metric("test_accuracy", test_accuracy)

    # We log the selected hyper-parameters to azureml using mlflow
    # You can find the best hyper-parameters in the azureml UI under parameters.
    for k, v in grid_search.best_params_.items():
        mlflow.log_param(f"selected/{k}", v)
    
    # Export the model and log it to azureml using mlflow
    with tempfile.TemporaryDirectory() as tmp_dir:
        dump(model, f"{tmp_dir}/model.joblib")
        mlflow.log_artifact(f"{tmp_dir}/model.joblib")


Finally, we will create a script to register the model.

In [None]:
%%writefile register_model.py
from argparse import ArgumentParser
import mlflow
from azureml.core import Workspace, Run

parser = ArgumentParser()
parser.add_argument(
    "--model_name", 
    type=str, 
    required=True, 
    help="Name of the model to register.")
parser.add_argument(
    "--run_id",
    type=str,
    default=None,
    required=False,
    help=(
        "ID AzureML has given the run."
        "You can find in the UI under raw json properties. "
        "If not provided, it will try to get it from the mlflow context."
    ),
)

args = parser.parse_args()

model_name = args.model_name
if args.run_id is not None:
    run_id =  args.run_id
run_id = Run.get_context(allow_offline=False)
run_id = run_id.id

ws = Workspace.from_config()
# Set the tracking URI to the AzureML workspace
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())

# the artifact path is the path where the model artifact is stored within the run.
artifact_path = "model.joblib"
model_uri = f"runs:/{run_id}/{artifact_path}"
mlflow.register_model(model_uri=model_uri, name=model_name)


Note that all three scripts create the Workspace object from a config file. We therefore need to keep the config file alongside the scripts, so that they can use it to authenticate against the Workspace. We can do this by writing the config to disk:

In [None]:
# Write the workspace configuration to disk so that the Workspace.from_config()
# calls inside the generated scripts have a config file to load.
workspace.write_config(file_name='./config.json')

Now that we have the scripts, we can start running them on our compute instance.

In [None]:
# Folder handed to ScriptRunConfig as the source directory of the job.
source_directory = "./"

In [None]:
# Shell commands that make up the job, in execution order:
# download the data, train the model, register the model.
# The dataset name is taken from `dataset_name` so the download step always asks
# for the dataset that this notebook actually registered.
commands = [
    f"python download_dataset.py --dataset_name {dataset_name} --output_folder data",
    "python train_with_mlflow.py --train_dataset data/train.csv --test_dataset data/test.csv",
    "python register_model.py --model_name moon_model",
]

In [None]:
# Chain the commands with "&&" into a single command line for the job.
job_command = " && ".join(commands)

# Bundle source folder, command line, compute target and environment into one run configuration.
script_run_config = ScriptRunConfig(
    source_directory=str(source_directory),
    command=job_command,
    compute_target=compute_name,
    environment=env,
)

In [None]:
# Submit the configuration as an experiment job to AzureML. The returned run
# handle is kept so that the submitted job itself can be linked, not only the
# experiment it belongs to.
run = experiment.submit(script_run_config)
print("You can follow the experiment here:")
print(experiment.get_portal_url())
print("You can follow the submitted run here:")
print(run.get_portal_url())