# Model Training Workflow in Azure Machine Learning

## Preparation

1. Download the workspace config file `config.json` from the Azure ML Workspace page
   and save it in this notebook's directory or one of its parent directories.
   It is used to connect to the workspace.
1. Make sure the training data is available as an Azure ML Data Asset,
   or download a new version and save it in the `data/` directory.
1. Make sure a compute target is available in the Azure ML Workspace.

Ref: [Tutorial: Create production ML pipelines with Python SDK v2 (preview) in a Jupyter notebook](https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-pipeline-python-sdk)

## Global configuration

In [None]:
""" Data registration
To upload new data, set REGISTER_DATA to True, DATA_VER to a new value.
Otherwise, set REGISTER_DATA to False, DATA_VER to an existing value.
"""
REGISTER_DATA = False
DATA_NAME = 'stock_prices'
DATA_VER = '1'
DATA_UPLOAD_PATH = './data/'

# --- Compute target ---------------------------------------------------------
COMPUTE_TARGET_NAME = 'george-mlops-compute-cluster'

# --- Environment ------------------------------------------------------------
# To create environment, set REGISTER_ENV to True and ENV_VER to a new value.
# Otherwise, set REGISTER_ENV to False and ENV_VER to an existing value.
REGISTER_ENV = False
ENV_NAME = 'tensorflow_sklean_cpu'
ENV_VER = '1.0'

# --- Scaler & model ---------------------------------------------------------
# The two settings below are intentionally left disabled; the component
# commands currently hard-code --window=50 and --test_ratio=0.2 instead.
# WINDOW=50
# TEST_RATIO=0.2
SCALER_FILE_NAME = 'scaler.pkl'
TRAIN_DATA_X_FILE_NAME = 'x_train.npy'
TRAIN_DATA_Y_FILE_NAME = 'y_train.npy'
TEST_DATA_X_FILE_NAME = 'x_test.npy'
TEST_DATA_Y_FILE_NAME = 'y_test.npy'

# --- Pipeline & experiment --------------------------------------------------
EXPERIMENT_NAME = 'stock-pred-model-train'

## Connect to the workspace

In [None]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

# Pick a credential: try DefaultAzureCredential first and fall back to an
# interactive browser login when it cannot be used.
try:
    credential = DefaultAzureCredential()
    # Request a token right away so that an unusable credential fails here,
    # not later in the middle of a workspace call.
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # Deliberately broad: whatever went wrong above, the fallback is the same.
    # Report the reason instead of discarding it, so a misconfigured login
    # can be diagnosed.
    print(
        'DefaultAzureCredential is not usable '
        f'({type(ex).__name__}: {ex}); '
        'falling back to InteractiveBrowserCredential'
    )
    credential = InteractiveBrowserCredential()

# Build the workspace client from the workspace config file (config.json).
ml_client = MLClient.from_config(credential=credential)

## Register data

In [None]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

if REGISTER_DATA:
    # Describe the local data folder as a folder-type data asset and register
    # it in the workspace under the configured name and version.
    train_data = ml_client.data.create_or_update(
        Data(
            path=DATA_UPLOAD_PATH,
            type=AssetTypes.URI_FOLDER,
            name=DATA_NAME,
            version=DATA_VER,
            description='Stock market prediction training data',
        )
    )

    # Echo the name/version reported back by the workspace.
    print(
        f'Dataset with name {train_data.name} was registered to workspace, '
        f'the dataset version is {train_data.version}'
    )

## Get a compute resource to run pipeline

In [None]:
# Look up the compute target by name. The pipeline below refers to the target
# by COMPUTE_TARGET_NAME rather than by this object, so this lookup mainly
# serves as an early existence check.
# NOTE(review): presumably the lookup raises when the target does not exist,
# in which case the message below is never printed - confirm.
cpu_cluster = ml_client.compute.get(COMPUTE_TARGET_NAME)
print(f'Compute target {COMPUTE_TARGET_NAME} found')

## Create or get a job environment to run pipeline

In [None]:
from azure.ai.ml.entities import Environment

if not REGISTER_ENV:
    # Reuse the environment version that is already in the workspace.
    my_env = ml_client.environments.get(name=ENV_NAME, version=ENV_VER)

    print(
        f'Environment with name {my_env.name} is found in workspace, '
        f'the environment version is {my_env.version}'
    )
else:
    # Register a new environment version: the conda specification from
    # ./env_azure.yml combined with the configured base image.
    my_env = ml_client.environments.create_or_update(
        Environment(
            image='mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest',
            conda_file='./env_azure.yml',
            name=ENV_NAME,
            version=ENV_VER,
            description='TensorFlow 2.x and scikit-learn 1.x on CPU',
        )
    )

    print(
        f'Environment with name {my_env.name} is registered to workspace, '
        f'the environment version is {my_env.version}'
    )

## Create components (pipeline steps)

In [None]:
from azure.ai.ml.entities import CommandComponent
from azure.ai.ml import Input, Output

# Preprocessing step: takes the raw data folder and writes the scaler plus
# the train/test arrays, each as a single file.
data_prep_comp = CommandComponent(
    name='stock_pred_data_prep',
    display_name='Preprocess data for training',
    description='reads raw price data, normalize and split the data',
    # NOTE: declaring test_ratio / window as typed inputs, i.e.
    #   'test_ratio': Input(type='number', default=TEST_RATIO),
    #   'window': Input(type='number', default=WINDOW)
    # causes the error "Input string was not in a correct format", so both
    # values are hard-coded in the command below instead.
    inputs=dict(data=Input(type='uri_folder', mode='ro_mount')),
    outputs={
        output_name: Output(type='uri_file')
        for output_name in (
            'scaler',
            'train_data_x',
            'train_data_y',
            'test_data_x',
            'test_data_y',
        )
    },
    # TODO: reorganize code to minimize the code context
    code='.',
    command='''PYTHONPATH=$PYTHONPATH:$(pwd) \
               python azure_pipeline/preproc_data/preproc_data.py \
                   --data=${{inputs.data}} --test_ratio=0.2 \
                   --window=50 \
                   --scaler=${{outputs.scaler}} \
                   --train_data_x=${{outputs.train_data_x}} --train_data_y=${{outputs.train_data_y}} \
                   --test_data_x=${{outputs.test_data_x}} --test_data_y=${{outputs.test_data_y}}
            ''',
    environment=f'{my_env.name}:{my_env.version}'
)

# Register the component in the workspace and keep the returned object.
data_prep_comp = ml_client.components.create_or_update(data_prep_comp)

In [None]:
# Training step: consumes the files written by the preprocessing step and
# produces a model folder and a TensorBoard folder.
train_comp = CommandComponent(
    name='stock_pred_model_train',
    display_name='Train model',
    description='train model and test model',
    # Every input is a single file.
    inputs={
        input_name: Input(type='uri_file')
        for input_name in (
            'scaler',
            'train_data_x',
            'train_data_y',
            'test_data_x',
            'test_data_y',
        )
    },
    # Both outputs are folders.
    outputs={
        output_name: Output(type='uri_folder')
        for output_name in ('model', 'tensorboard')
    },
    code='.',
    command='''PYTHONPATH=$PYTHONPATH:$(pwd) \
               python azure_pipeline/train_model/train_model.py \
                   --window=50 --epochs=15 --batch=20 \
                   --scaler=${{inputs.scaler}} \
                   --train_data_x=${{inputs.train_data_x}} --train_data_y=${{inputs.train_data_y}} \
                   --test_data_x=${{inputs.test_data_x}} --test_data_y=${{inputs.test_data_y}} \
                   --model=${{outputs.model}} \
                   --tensorboard=${{outputs.tensorboard}}
            ''',
    environment=f'{my_env.name}:{my_env.version}'
)

# Register the component in the workspace and keep the returned object.
train_comp = ml_client.components.create_or_update(train_comp)

## Create the pipeline from components

In [None]:
from azure.ai.ml import dsl, Input, Output

# Two-step training pipeline: preprocess the raw data, then train the model
# on the preprocessed files.
# NOTE(review): the pipeline DSL inspects this function; local variable names
# appear to be used as step names, so confirm before renaming them or adding
# a docstring / annotations here.
@dsl.pipeline(
    compute=COMPUTE_TARGET_NAME,
    description='Stock prediction lstm model training pipeline',
)
def model_train_pipeline(
    pipeline_data_input
):
    # Step 1: preprocessing, fed with the pipeline's only input.
    data_prep_job = data_prep_comp(
        data=pipeline_data_input
    )

    # Step 2: training, wired to all five outputs of the preprocessing step.
    train_job = train_comp(
        scaler=data_prep_job.outputs.scaler,
        train_data_x=data_prep_job.outputs.train_data_x,
        train_data_y=data_prep_job.outputs.train_data_y,
        test_data_x=data_prep_job.outputs.test_data_x,
        test_data_y=data_prep_job.outputs.test_data_y
    )

    # Return the fitted scaler and the trained model under the keys
    # 'scaler' and 'model'.
    return {
        'scaler': data_prep_job.outputs.scaler,
        'model': train_job.outputs.model
    }

## Run the pipeline

In [None]:
import webbrowser

# Fetch the registered data asset that feeds the pipeline.
data_input = ml_client.data.get(name=DATA_NAME, version=DATA_VER)

# Instantiate the pipeline with the asset's path as its folder input.
pipeline = model_train_pipeline(
    pipeline_data_input=Input(path=data_input.path, type='uri_folder'),
)

# Submit the pipeline under the configured experiment.
pipeline_job = ml_client.jobs.create_or_update(
    pipeline,
    experiment_name=EXPERIMENT_NAME,
)

# open the pipeline in web browser
webbrowser.open(pipeline_job.services['Studio'].endpoint)