# Azure ML Studio Pipeline Template

# Connect to your Workspace

In [None]:
import azureml.core
from azureml.core import Workspace

# Read the workspace connection details from the saved config file
ws = Workspace.from_config()
print(f'Ready to use Azure ML {azureml.core.VERSION} to work with {ws.name}')

# Prepare Datastore
### View Datastores

In [None]:
# Look up the workspace's default datastore
default_ds = ws.get_default_datastore()

# Print every datastore name and flag whether it is the default one
for ds_name in ws.datastores:
    is_default = (ds_name == default_ds.name)
    print(ds_name, "- Default =", is_default)

## TODO: Create New Datastore

### Upload Data to Datastore

In [None]:
list_of_files = ['./data/diabetes.csv', './data/diabetes2.csv']    # replace this!
datastore_folder = 'diabetes-data/'        # rename this!

# Upload the local files into the chosen folder of the default datastore
# (default_ds comes from the "View Datastores" cell).
default_ds.upload_files(files=list_of_files,           # local files to upload
                        target_path=datastore_folder,  # destination folder path in the datastore
                        overwrite=True,                # replace existing files of the same name
                        show_progress=True)

# Create Pipeline Scripts
### Create a folder for pipeline scripts

In [None]:
import os

# Create a folder for the pipeline step files.
# The %%writefile cells below write the step scripts into this folder.
experiment_folder = 'my_project'        # rename this!
os.makedirs(experiment_folder, exist_ok=True)  # no error if the folder already exists

print(experiment_folder)

### Model Training Script

In [None]:
%%writefile $experiment_folder/train_model.py
# Training script for the pipeline: loads CSV files from a data folder,
# trains a decision tree classifier, logs Accuracy and AUC to the run,
# and saves the fitted model as model.pkl in the output folder.

# Import libraries
import os
from azureml.core import Run
import argparse
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score


# Get the experiment run context
run = Run.get_context()

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument('--output_folder', type=str, dest='output_folder', 
                    default="trained_model", help='output folder')
parser.add_argument('--regularization', type=float, dest='reg_rate', 
                    default=0.01, help='regularization rate')
parser.add_argument('--data-folder', type=str, dest='data_folder', 
                    help='data folder reference')

args = parser.parse_args()
output_folder = args.output_folder
# NOTE: the regularization rate is parsed but not used by the decision tree below
reg = args.reg_rate

# load the diabetes data from the data reference
data_folder = args.data_folder
print("Loading data from", data_folder)
# Load all files and concatenate their contents as a single dataframe
all_files = os.listdir(data_folder)
training_data = pd.concat((pd.read_csv(os.path.join(data_folder,csv_file)) for csv_file in all_files))

# Separate features and labels
X, y = training_data[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure',
                      'TricepsThickness', 'SerumInsulin','BMI','DiabetesPedigree',
                      'Age']].values, training_data['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a decision tree model
print('Training a decision tree model')
model = DecisionTreeClassifier().fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
# Use the builtin float: the np.float alias was removed in NumPy 1.24
run.log('Accuracy', float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', float(auc))

# Save the trained model
os.makedirs(output_folder, exist_ok=True)
output_path = os.path.join(output_folder, "model.pkl")
joblib.dump(value=model, filename=output_path)

run.complete()

### Model Registration Script

In [None]:
%%writefile $experiment_folder/register_model.py
# Registration script for the pipeline: reads model.pkl from the model
# folder and registers it in the workspace of the current run.

# Import libraries
import argparse
import joblib
from azureml.core import Workspace, Model, Run

# Parse the command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument(
    '--model_folder',
    type=str,
    dest='model_folder',
    default="trained_model",
    help='model location',
)
args = parser.parse_args()
model_folder = args.model_folder

# Get the context of the run this script is executing in
run = Run.get_context()

# Load the serialized model (the loaded object is not used further in this script)
print(f"Loading model from {model_folder}")
model_file = f"{model_folder}/model.pkl"
model = joblib.load(model_file)

# Register the model file in the workspace that owns this run
Model.register(
    workspace=run.experiment.workspace,
    model_path=model_file,
    model_name='classifier_model',        ## rename this!
    tags={'Training context': 'Pipeline'},
)

run.complete()

# Configuration (Environment & Compute)

### Configure a Compute Cluster

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "aml-cluster"        ## rename this!

try:
    # Look up a compute target with this name in the workspace
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # The lookup failed, so provision a new cluster under this name
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',        ## customize this!
        max_nodes=4,
        idle_seconds_before_scaledown=1800,
    )
    pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Wait on the cluster (found or newly created), showing progress output
pipeline_cluster.wait_for_completion(show_output=True)

### Configure a Python Environment

In [None]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies

# Define the Python environment the pipeline steps will run in
project_env = Environment(name="my-project-env")        ## rename this! ##
project_env.python.user_managed_dependencies = False  # Let Azure ML manage dependencies
# NOTE(review): newer azureml-core releases may deprecate `docker.enabled` — confirm for your SDK version
project_env.docker.enabled = True  # Use a docker container

# Declare the package dependencies
## Note: Use pipreqs to get a list of packages and versions used
##       Use conda to install wherever possible
## To Do: add conda channels to CondaDependencies object
project_packages = CondaDependencies.create(
    conda_packages=['scikit-learn', 'pandas'],
    pip_packages=['azureml-sdk'],
)

# Attach the dependencies to the environment
project_env.python.conda_dependencies = project_packages

# Register the environment in the workspace so it can be retrieved by name later
project_env.register(workspace=ws)

# Build and Run a Pipeline
### Define a Run Configuration

In [None]:
from azureml.core.runconfig import RunConfiguration

# Start from an empty run configuration for the pipeline
pipeline_run_config = RunConfiguration()

# Run on the compute cluster configured earlier.
## TODO: GET CLUSTER FROM LIST OF WORKSPACE CLUSTERS
## (TREAT EACH OF THESE CELLS AS INDEPENDENT CODE SAMPLES)
pipeline_run_config.target = pipeline_cluster

# Fetch the registered environment by name and attach it to the run configuration
registered_env = Environment.get(workspace=ws, name='my-project-env')
pipeline_run_config.environment = registered_env

print("Run configuration created.")

### Define Pipeline Steps

In [None]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep
from azureml.train.estimator import Estimator

# Get the training dataset (only needed if the train step is switched to
# dataset input; the step below uses the datastore reference instead)
diabetes_ds = ws.datasets.get("diabetes dataset")
# OR get a reference to the datastore
data_ref = default_ds.path('diabetes-data').as_download(path_on_compute='diabetes_data')    ## rename data folder


# Create a PipelineData (temporary Data Reference) for the model folder;
# it is the output of step 1 and the input of step 2
model_folder = PipelineData("model_folder", datastore=ws.get_default_datastore())

estimator = Estimator(source_directory=experiment_folder,
                      compute_target = pipeline_cluster,
                      environment_definition=pipeline_run_config.environment,
                      entry_script='train_model.py')

# Step 1, run the estimator to train the model
train_step = EstimatorStep(name = "Train Model",
                           estimator=estimator, 
                           estimator_entry_script_arguments=['--output_folder', model_folder,
                                                             '--regularization', 0.1,      # regularization rate
                                                             '--data-folder', data_ref],   # files downloaded from the datastore
                           # The data reference used in the arguments must also be declared as a step input
                           inputs=[data_ref],
                           outputs=[model_folder],
                           compute_target = pipeline_cluster,
                           allow_reuse = True)

# Step 2, run the model registration script
register_step = PythonScriptStep(name = "Register Model",
                                 source_directory = experiment_folder,
                                 script_name = "register_model.py",
                                 arguments = ['--model_folder', model_folder],
                                 inputs=[model_folder],
                                 compute_target = pipeline_cluster,
                                 runconfig = pipeline_run_config,
                                 allow_reuse = True)

print("Pipeline steps defined")

### Create and Run the Pipeline

In [None]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Assemble the pipeline from the two steps, in order
pipeline_steps = [train_step, register_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit the pipeline as a run of a named experiment
experiment = Experiment(workspace=ws, name='my-training-pipeline')        # rename this!
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")

# Show the run details widget, then block until the run finishes
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion()

### Verify Model has been Registered Successfully

In [None]:
from azureml.core import Model

# Verify that the trained model is in the workspace model registry.
# If the model is already registered and is being retrained, run this
# before and after the pipeline, and verify that the model version 
# has been incremented.

# Print each registered model's name and version, followed by its tags and properties
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name, tag in model.tags.items():
        print ('\t',tag_name, ':', tag)
    for prop_name, prop in model.properties.items():
        print ('\t',prop_name, ':', prop)
    print('\n')

# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 
# APPENDIX

1. Create and Register Datasets
2. Run a Standalone Experiment
3. View Experiment Progress and History
4. Publish the Pipeline

## Create and Register Datasets

In [None]:
# VIEW DATASETS

# Imported here so this cell runs on its own (Dataset is used below)
from azureml.core import Dataset

# Print the name and version of every dataset registered in the workspace
print("Datasets:")
for dataset_name in ws.datasets:
    dataset = Dataset.get_by_name(ws, dataset_name)
    print("\t", dataset.name, 'version', dataset.version)

In [None]:
# CREATE A TABULAR DATASET

from azureml.core import Dataset

# Look up the workspace's default datastore
default_ds = ws.get_default_datastore()

# Build a tabular dataset from the CSV files under the datastore folder (this may take a short while)
csv_path = (default_ds, 'diabetes-data/*.csv')
tab_data_set = Dataset.Tabular.from_delimited_files(path=csv_path)

# Show the first 20 rows as a Pandas dataframe
tab_data_set.take(20).to_pandas_dataframe()

In [None]:
# CREATE A FILES DATASET

# Build a file dataset from the CSV files under the datastore folder (this may take a short while)
file_glob = (default_ds, 'diabetes-data/*.csv')
file_data_set = Dataset.File.from_files(path=file_glob)

# Print the path of each file in the dataset
for file_path in file_data_set.to_path():
    print(file_path)

In [None]:
# REGISTER DATASETS

# Register the tabular dataset; report success only if registration did not raise
try:
    tab_data_set = tab_data_set.register(workspace=ws, 
                                         name='diabetes dataset',
                                         description='diabetes data',
                                         tags = {'format':'CSV'},
                                         create_new_version=True)
except Exception as ex:
    # Best effort: show the error and carry on with the next dataset
    print(ex)
else:
    print('Tabular dataset registered')

# Register the file dataset; report success only if registration did not raise
try:
    file_data_set = file_data_set.register(workspace=ws,
                                           name='diabetes file dataset',
                                           description='diabetes files',
                                           tags = {'format':'CSV'},
                                           create_new_version=True)
except Exception as ex:
    print(ex)
else:
    print('File dataset registered')

In [None]:
# UPLOAD AND REGISTER DATASETS

from azureml.core import Dataset

default_ds = ws.get_default_datastore()

# Only upload and register when no dataset with this name exists in the workspace yet
if 'diabetes dataset' in ws.datasets:
    print('Dataset already registered.')
else:
    # Upload the diabetes csv files in /data into a folder in the datastore
    default_ds.upload_files(
        files=['./data/diabetes.csv', './data/diabetes2.csv'],
        target_path='diabetes-data/',
        overwrite=True,  # Replace existing files of the same name
        show_progress=True,
    )

    # Create a tabular dataset from the path on the datastore (this may take a short while)
    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))

    # Register the tabular dataset, printing any error instead of raising
    try:
        tab_data_set = tab_data_set.register(
            workspace=ws,
            name='diabetes dataset',
            description='diabetes data',
            tags={'format': 'CSV'},
            create_new_version=True,
        )
        print('Dataset registered.')
    except Exception as ex:
        print(ex)

## Run Standalone Experiment

In [None]:
# CONFIGURE AND SUBMIT AN EXPERIMENT (OUTSIDE OF PIPELINES)

# Environment, Experiment and Estimator are imported here so this cell
# does not rely on imports made by earlier cells
from azureml.core import Environment, Experiment
from azureml.train.estimator import Estimator
from azureml.train.sklearn import SKLearn
from azureml.widgets import RunDetails


# Arguments passed to the entry script
script_params = {'--reg_rate': 0.1}

# Get the training dataset
diabetes_ds = ws.datasets.get("diabetes dataset")

# Get the environment
diabetes_env = Environment.get(ws, 'diabetes-experiment-env')

# Create an estimator
estimator = Estimator(source_directory=experiment_folder,
                    entry_script='diabetes_training.py',
                    script_params=script_params,
                    compute_target = 'local',  # or cluster name
                    inputs=[diabetes_ds.as_named_input('diabetes')], 
                    environment_definition = diabetes_env
                   )

# Create an experiment
experiment_name = 'diabetes-training'
experiment = Experiment(workspace = ws, name = experiment_name)

# Run the experiment
run = experiment.submit(config=estimator)

# Show the run details while running
RunDetails(run).show()
run.wait_for_completion()

In [None]:
# REGISTER TRAINED MODEL

from azureml.core import Model

# Register the model, recording the run's logged AUC and Accuracy as properties.
# The metrics are fetched once so both values come from the same call.
run_metrics = run.get_metrics()
run.register_model(model_path='outputs/diabetes_model.pkl', model_name='diabetes_model',
                   tags={'Training context':'Estimator'},
                   properties={'AUC': run_metrics['AUC'], 'Accuracy': run_metrics['Accuracy']})

# List registered models with their tags and properties
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name, tag in model.tags.items():
        print ('\t',tag_name, ':', tag)
    for prop_name, prop in model.properties.items():
        print ('\t',prop_name, ':', prop)
    print('\n')

## View Experiment Progress and History

In [None]:
# VIEW EXPERIMENT PROGRESS

from azureml.widgets import RunDetails

# Display the run details widget for `run` (the run submitted in the
# standalone experiment cell)
RunDetails(run).show()

In [None]:
# VIEW EXPERIMENT RESULTS

import json

# Fetch and print the run details
details = run.get_details()
print(details)

# Fetch the logged metrics and the names of the run's files
metrics = run.get_metrics()
files = run.get_file_names()

# Pretty-print both as indented JSON: metrics first, then file names
for payload in (metrics, files):
    print(json.dumps(payload, indent=2))

In [None]:
# VIEW EXPERIMENT RUN HISTORY

from azureml.core import Experiment, Run

# Look up the experiment by the name used when the standalone experiment
# was created ('diabetes-training'), then print each run's logged metrics
diabetes_experiment = ws.experiments['diabetes-training']
for logged_run in diabetes_experiment.get_runs():
    print('Run ID:', logged_run.id)
    metrics = logged_run.get_metrics()
    for key, value in metrics.items():
        print('-', key, value)

## Publish the Pipeline

In [None]:
# PUBLISH THE PIPELINE AS A REST SERVICE

# Publish the pipeline built earlier under a name, description and version
published_pipeline = pipeline.publish(
    name="Diabetes_Training_Pipeline",
    description="Trains diabetes model",
    version="1.0",
)

# Keep the endpoint of the published pipeline for the REST call
rest_endpoint = published_pipeline.endpoint
print(rest_endpoint)

In [None]:
# GET AUTHORIZATION HEADER

from azureml.core.authentication import InteractiveLoginAuthentication

# Create an interactive login authentication object and get the
# authentication header from it; the header is sent with the REST request
interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()

In [None]:
# CALL THE REST INTERFACE

import requests
experiment_name = 'Run-diabetes-pipeline'

# POST to the published pipeline endpoint, naming the experiment to run under
response = requests.post(rest_endpoint, 
                         headers=auth_header, 
                         json={"ExperimentName": experiment_name},
                         timeout=60)  # requests has no default timeout; avoid hanging forever
# Raise the HTTP error itself rather than a confusing KeyError/JSON error below
response.raise_for_status()
run_id = response.json()["Id"]
run_id