In [42]:
# imports
import azureml.core

In [43]:
# Connect to the Azure ML workspace; later cells reach the service through `ws`.
from azureml.core import Workspace
# from_config() is called with no arguments, so the connection details are not
# in this notebook — presumably read from a local config file; TODO confirm.
ws = Workspace.from_config()
print('Current workspace:', ws.name)

Current workspace: ml-workspace


In [44]:
# Fetch the workspace's default datastore; the upload and the dataset
# definition in the following cells both go through `ds`.
ds = ws.get_default_datastore()
print('Current datastore:', ds.name)

Current datastore: workspaceblobstore


In [45]:
# Push the local CSV into the datastore under data-heart/, replacing any
# previous copy. The call is left as the cell's last expression so its
# return value is displayed.
ds.upload_files(
    files=['./data/heart.csv'],
    target_path='data-heart/',
    overwrite=True,
    show_progress=True,
)

Uploading an estimated of 1 files
Uploading ./data/heart.csv
Uploaded ./data/heart.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_e443c5dbeec54c4e9850737773070739

In [46]:
# Register the uploaded CSV as a tabular dataset (skipped when already present)
from azureml.core import Dataset

# Name the dataset is registered under
ds_name = 'heart dataset'

# Use ds_name for the membership test too, so the check and the registration
# can never drift apart.
if ds_name not in ws.datasets:
    # Build a tabular dataset from every CSV found under data-heart/
    heart_data = Dataset.Tabular.from_delimited_files(path=(ds, 'data-heart/*.csv'))

    # Register the dataset; failures are printed rather than raised
    try:
        heart_data = heart_data.register(workspace=ws,
                                         name=ds_name,
                                         description='Heart Attach Data',
                                         tags={'format': 'csv'},
                                         create_new_version=True)
        # '%s' for the version: the previous '%i' applied to str(version)
        # raised TypeError *after* a successful registration, and the except
        # below then printed that formatting error as if registration failed.
        print('Dataset %s registered with version %s.' % (heart_data.name, heart_data.version))
    except Exception as ex:
        print(ex)
else:
    print('Dataset already registered.')


Dataset already registered.


In [47]:
# Create a local folder for the pipeline step files

import os
# The %%writefile cells write the step scripts and the conda spec into this
# folder, and the pipeline steps use it as their source_directory.
experiment_folder = './heart_pipeline'
os.makedirs(experiment_folder, exist_ok=True)  # exist_ok: re-running the cell is harmless
print(experiment_folder)

./heart_pipeline


## Pipeline Step 1: Prep data

In [48]:
%%writefile $experiment_folder/01_prep_heart.py
# Import libraries
import os
import argparse
import pandas as pd
from azureml.core import Run
from sklearn.preprocessing import MinMaxScaler

# ---- Command-line arguments ------------------------------------------------
# --input-data is accepted but its value (raw_dataset_id) is never read; the
# data itself comes from the run's named input 'raw_data' further down.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--input-data', type=str, dest='raw_dataset_id', help='raw dataset')
parser.add_argument(
    '--prepped-data', type=str, dest='prepped_data',
    default='prepped_data', help='Folder for results')
args = parser.parse_args()
save_folder = args.prepped_data

# ---- Run context and raw data ----------------------------------------------
run = Run.get_context()

print('Loading Data...')
heart_df = run.input_datasets['raw_data'].to_pandas_dataframe()

# Record the size of the raw data
raw_rows, raw_cols = heart_df.shape
run.log('Raw rows', raw_rows)
run.log('Raw columns', raw_cols)

# ---- Cleaning --------------------------------------------------------------
# Drop incomplete rows and give the label column a descriptive name
heart_df = heart_df.dropna().rename(columns={'output': 'heart_attack'})

# ---- One-hot encoding ------------------------------------------------------
# (source column, labels for its dummy columns). Each label list must have
# exactly as many entries as the column has distinct values, otherwise the
# `.columns` assignment below raises.
# NOTE(review): labels are applied positionally to get_dummies' output, so
# 'Male' goes to the first `sex` dummy column — confirm this matches the
# dataset's coding.
dummy_labels = [
    ('sex', ['Male', 'Female']),
    ('cp', ['Chest Pain 1', 'Chest Pain 2', 'Chest Pain 3', 'Chest Pain 4']),
    ('exng', ['Exercise Angina: No', 'Exercise Angina: Yes']),
    ('slp', ['Slope 1', 'Slope 2', 'Slope 3']),
    ('caa', ['CAA 1', 'CAA 2', 'CAA 3', 'CAA 4', 'CAA 5']),
    ('thall', ['Thall 1', 'Thall 2', 'Thall 3', 'Thall 4']),
]

encoded_frames = []
for source_col, labels in dummy_labels:
    dummies = pd.get_dummies(data=heart_df[source_col])
    dummies.columns = labels
    encoded_frames.append(dummies)

# Dummy columns go first and the original frame last, then the encoded
# source columns are removed.
heart_df = pd.concat(encoded_frames + [heart_df], axis=1)
heart_df = heart_df.drop(labels=[source_col for source_col, _ in dummy_labels],
                         axis=1)

# ---- Scaling ---------------------------------------------------------------
# Rescale the listed columns with a MinMaxScaler fitted on this same data
norm_cols = ['age', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh',
             'oldpeak']
heart_df[norm_cols] = MinMaxScaler().fit_transform(heart_df[norm_cols])

# Record the size of the prepared data
prepped_rows, prepped_cols = heart_df.shape
run.log('Prepped rows', prepped_rows)
run.log('Prepped columns', prepped_cols)

# ---- Persist ---------------------------------------------------------------
print('Saving Data...')
os.makedirs(save_folder, exist_ok=True)
save_path = os.path.join(save_folder, 'heartdata_prepped.csv')
heart_df.to_csv(save_path, index=False, header=True)

# End the run
run.complete()


Overwriting ./heart_pipeline/01_prep_heart.py


## Pipeline Step 2: Train model

In [49]:
%%writefile $experiment_folder/02_train_heart.py

# Import libraries
from azureml.core import Run, Model
import argparse
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Get parameters: --training-data is the folder holding the prepped CSV
parser = argparse.ArgumentParser()
parser.add_argument('--training-data',
                    type=str,
                    dest='training_data',
                    help='training_data')
args = parser.parse_args()
training_data = args.training_data

# Get the experiment run context
run = Run.get_context()

# Load the data written by the prep step
print('Loading Data...')
file_path = os.path.join(training_data, 'heartdata_prepped.csv')
heart = pd.read_csv(file_path)

# Separate features from labels: the last column is taken as the label,
# every other column as a feature.
X, y = heart.iloc[:, 0:-1].values, heart.iloc[:, -1]

# Split data into train and test (70/30, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)

# Train model. The tree is seeded so repeated runs on the same data give the
# same model and metrics.
print('Training model...')
model = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
# Built-in float() instead of the np.float alias, which newer NumPy removed
run.log('Accuracy', float(acc))

# Calculate AUC from the predicted probability of the positive class
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test, y_scores[:, 1])
print('AUC:', auc)
run.log('AUC', float(auc))

# Plot ROC curve and attach it to the run
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:, 1])
fig = plt.figure(figsize=(6, 4))
plt.plot([0, 1], [0, 1], 'k--')  # diagonal 50% line
plt.plot(fpr, tpr, 'r--')
plt.xlabel('FP rate')
plt.ylabel('TP rate')
plt.title('ROC Curve')
run.log_image(name='ROC', plot=fig)
plt.show()

# Save trained model under ./outputs
print('Saving model...')
os.makedirs('./outputs', exist_ok=True)
model_file = os.path.join('./outputs', 'heart_model.pkl')
joblib.dump(value=model, filename=model_file)

# Register model, recording the evaluation metrics as properties
print('Registering model...')
Model.register(workspace=run.experiment.workspace,
               model_path=model_file,
               model_name='heart_model',
               tags={'Context': 'Pipeline',
                     'Purpose': 'DP100'},
               properties={'AUC': float(auc),
                           'Accuracy': float(acc)})

run.complete()


Overwriting ./heart_pipeline/02_train_heart.py


## Prep compute environment

Pipelines cannot run locally, so a remote compute target is required. The cell below reuses the cluster if it already exists and creates it otherwise.

In [58]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the compute cluster the pipeline steps run on
cluster_name = 'chryssCluster'

try:
    # Look the cluster up by name in the workspace
    pipeline_cluster = ComputeTarget(workspace=ws,
                                     name=cluster_name)
    print('Found existing cluster, using it.')
except ComputeTargetException:
    # Only a failed lookup means "create it". A bare `except:` here would also
    # swallow KeyboardInterrupt and unrelated errors and then try to provision.
    try:
        compute_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_DS11_V2',
            max_nodes=2)
        pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        pipeline_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Best-effort: report the provisioning failure and carry on. If this
        # branch is taken, `pipeline_cluster` is left unassigned.
        print(ex)

Found existing cluster, using it.


## Create conda environment config

This will be installed on the compute target.

In [52]:
%%writefile $experiment_folder/heart_experiment_env.yml
# Conda specification for the environment the pipeline steps run in.
name: heart_experiment_env
dependencies:
# Interpreter version is pinned exactly.
- python=3.6.2
# Conda packages; scikit-learn, matplotlib and pandas are imported by the step scripts.
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
# Packages installed through pip rather than conda.
- pip:
    - azureml-defaults
    - pyarrow

Overwriting ./heart_pipeline/heart_experiment_env.yml


## Create run configuration: Environment & Pipeline

In [59]:
from azureml.core import Environment
from azureml.core.runconfig import RunConfiguration

# Build the environment from the conda spec written into the experiment folder
heart_experiment_env = Environment.from_conda_specification(
    'heart_experiment_env',
    f'{experiment_folder}/heart_experiment_env.yml',
)

# Register it in the workspace, then read the registered copy back by name
heart_experiment_env.register(ws)
registered_env = Environment.get(workspace=ws, name='heart_experiment_env')

# Run configuration shared by the pipeline steps: which compute target to use
# and which environment to run in
pipeline_run_config = RunConfiguration()
pipeline_run_config.environment = registered_env
pipeline_run_config.target = pipeline_cluster

print('Run configuration created.')

Run configuration created.


## Define the steps and build the pipeline

In [60]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import Pipeline

# Get the registered dataset. `.get` returns None for an unknown name, so fail
# here with a clear message rather than with an AttributeError further down.
heart_ds = ws.datasets.get('heart dataset')
if heart_ds is None:
    raise ValueError("Dataset 'heart dataset' is not registered in this "
                     "workspace; run the dataset registration cell first.")

# Intermediate output handed from the prep step to the train step
prepped_data = OutputFileDatasetConfig('prepped_data')

# Step 1: run 01_prep_heart.py. The dataset is passed as the named input
# 'raw_data', which is the key the script reads from run.input_datasets.
prep_step = PythonScriptStep(name='Prepare Data',
                             source_directory=experiment_folder,
                             script_name='01_prep_heart.py',
                             arguments=['--input-data', heart_ds.as_named_input('raw_data'),
                                        '--prepped-data', prepped_data],
                             compute_target=pipeline_cluster,
                             runconfig=pipeline_run_config,
                             allow_reuse=True)

# Step 2: run 02_train_heart.py on the prep step's output
train_step = PythonScriptStep(name='Train and Register Model',
                              source_directory=experiment_folder,
                              script_name='02_train_heart.py',
                              arguments=['--training-data',
                                         prepped_data.as_input()],
                              compute_target=pipeline_cluster,
                              runconfig=pipeline_run_config,
                              allow_reuse=True)

print('Pipeline steps defined.')

# Construct pipeline from the two steps
pipeline_steps = [prep_step, train_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print('Pipeline is built.')


Pipeline steps defined.


ValueError: Please specify a remote compute_target. (Local execution is not supported for pipelines.)

## Run the pipeline as an experiment

In [56]:
from azureml.core import Experiment
from azureml.widgets import RunDetails

# Submit the pipeline under the 'heart_pipeline' experiment, asking for
# outputs to be regenerated rather than reused
experiment = Experiment(ws, 'heart_pipeline')
pipeline_run = experiment.submit(config=pipeline, regenerate_outputs=True)
print('Pipeline submitted for execution.')

# Show the run widget, then block until the run has finished
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=False)


Created step Prepare Data [a78d125a][630e597f-bda4-4e70-8cfd-192aba194723], (This step will run and generate new outputs)Created step Train and Register Model [485b1955][54fd40ec-95d2-4e80-a14c-e8bb8568b70c], (This step will run and generate new outputs)

Submitted PipelineRun 4064b1ee-5145-4bc7-9c29-6577b96a7c62
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/4064b1ee-5145-4bc7-9c29-6577b96a7c62?wsid=/subscriptions/1b50f243-9e15-4373-91f7-59060f79af8a/resourcegroups/DP-100/workspaces/ml-workspace&tid=186e418a-457a-46b9-aa3b-0336d2e46f5b
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 4064b1ee-5145-4bc7-9c29-6577b96a7c62
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/4064b1ee-5145-4bc7-9c29-6577b96a7c62?wsid=/subscriptions/1b50f243-9e15-4373-91f7-59060f79af8a/resourcegroups/DP-100/workspaces/ml-workspace&tid=186e418a-457a-46b9-aa3b-0336d2e46f5b
{'runId': '4064b1ee-5145-4bc7-9c29-6577b96a7c62', 'status': 'Completed', 'startTimeUtc': '2021-07-22T11:26:29.703057Z', 'endTimeUtc': '2021-07-22T11:33:58.562461Z', 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{}'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://mlworkspace1398408280.blob.core.windows.net/azureml/ExperimentRun/dcid.4064b1ee-5145-4bc7-9c29-6577b96a7c62/logs/azureml/executionlogs.txt?sv=2019-02-02&sr=b&sig=dDXncUz5EHggXMjCOIUfEe9NFxNgln8QlA%2Fskx2YDdE%3D&st=2021-07-22T11%3A16%3A52Z&se=2021-07-22T19%3A26%3A52Z&sp=r', 'logs/azureml/stderrlogs.txt': 'https:

'Finished'

## Examine Experiment run metrics

In [57]:
# Print every metric logged by each child (step) run of the pipeline run
for run in pipeline_run.get_children():
    print(run.name, ':')
    metrics = run.get_metrics()
    for metric_name, metric_value in metrics.items():
        print('\t', metric_name, ':', metric_value)

Train and Register Model :
	 Accuracy : 0.7692307692307693
	 AUC : 0.7678916827852998
	 ROC : aml://artifactId/ExperimentRun/dcid.cd8085b2-58e9-4a1e-ba14-29a8bb10533c/ROC_1626953622.png
Prepare Data :
	 Raw rows : 303
	 Raw columns : 14
	 Prepped rows : 303
	 Prepped columns : 28


In [24]:
# In the visible notebook, `Model` is only imported inside the training script
# written to disk, not in this kernel. Import it here so the cell also works
# after Restart & Run All.
from azureml.core import Model

# Show name, tags and properties of every registered 'heart_model'
for model in Model.list(workspace=ws, name='heart_model'):
    print(model.name)
    print(model.tags)
    print(model.properties)

heart_model
{'Context': 'Pipeline', 'Purpose': 'DP100'}
{'AUC': '0.745164410058027', 'Accuracy': '0.7472527472527473'}


## Publish the pipeline as REST service

In [25]:
# Publish the pipeline behind the completed run; the bare name on the last
# line displays the published pipeline's summary.
publish_settings = {
    'name': 'heart-training-pipeline',
    'description': 'Trains heart model',
    'version': '1.0',
}
pub_pipeline = pipeline_run.publish_pipeline(**publish_settings)
pub_pipeline

Name,Id,Status,Endpoint
heart-training-pipeline,e9ad0b74-22cf-444e-a29b-3bb1ad7d1e64,Active,REST Endpoint


In [27]:
# Show the REST endpoint URL of the published pipeline
rest_endpoint = pub_pipeline.endpoint
print(rest_endpoint)

https://westeurope.api.azureml.ms/pipelines/v1.0/subscriptions/1b50f243-9e15-4373-91f7-59060f79af8a/resourceGroups/DP-100/providers/Microsoft.MachineLearningServices/workspaces/ml-workspace/PipelineRuns/PipelineSubmit/e9ad0b74-22cf-444e-a29b-3bb1ad7d1e64


## Consume Endpoint

We need to get the authorization header in order to access the endpoint; this will be provided in the request.

In [38]:
from azureml.core.authentication import InteractiveLoginAuthentication

# Build the authentication header from an interactive login; it is sent as
# the `headers` of the REST request to the published pipeline.
interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()
print('Authentication header is ready.')

Authentication header is ready.


In [39]:
import requests

experiment_name = 'heart_pipeline'  # experiment the triggered run is filed under
rest_endpoint = pub_pipeline.endpoint  # REST endpoint of the published pipeline

# POST to the endpoint to start a run of the published pipeline
response = requests.post(rest_endpoint,
                         headers=auth_header,
                         json={'ExperimentName': experiment_name})
# Stop on an HTTP error (expired token, wrong endpoint, ...) instead of
# failing later with a confusing KeyError on 'Id'.
response.raise_for_status()
run_id = response.json()['Id']
print('RunID:', run_id)

RunID: b3259353-3971-42a4-b2dd-c9d32b917a55


We use this RunID to get the data from the PipelineRun.  
**Note:** This should complete quickly since each step was configured to allow output reuse.

In [40]:
from azureml.pipeline.core.run import PipelineRun

# Attach to the run that was started through the REST endpoint and wait for
# it, streaming its status output
pub_pipeline_run = PipelineRun(
    experiment=ws.experiments[experiment_name],
    run_id=run_id,
)
pub_pipeline_run.wait_for_completion(show_output=True)


PipelineRunId: b3259353-3971-42a4-b2dd-c9d32b917a55
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/b3259353-3971-42a4-b2dd-c9d32b917a55?wsid=/subscriptions/1b50f243-9e15-4373-91f7-59060f79af8a/resourcegroups/DP-100/workspaces/ml-workspace&tid=186e418a-457a-46b9-aa3b-0336d2e46f5b
PipelineRun Status: Running

PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': 'b3259353-3971-42a4-b2dd-c9d32b917a55', 'status': 'Completed', 'startTimeUtc': '2021-07-21T11:35:58.949229Z', 'endTimeUtc': '2021-07-21T11:36:02.265859Z', 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'Unavailable', 'runType': 'HTTP', 'azureml.parameters': '{}', 'azureml.pipelineid': 'e9ad0b74-22cf-444e-a29b-3bb1ad7d1e64'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://mlworkspace1398408280.blob.core.windows.net/azureml/ExperimentRun/dcid.b3259353-3971-42a4-b2dd-c9d32b917a55/logs/azureml/executionlogs.txt?sv=2019-02-02&

'Finished'

## Scheduling and getting the latest run

Notebook "08 - Create a Pipeline" details how to schedule in Python.

We will not set up a schedule here, but if we did, we would need to fetch the details of the latest run. This is done below.

In [41]:
pipeline_experiment = ws.experiments.get('heart_pipeline')

# Take only the first run that get_runs() yields instead of materialising the
# whole run history in a list just to index element 0.
# NOTE(review): this relies, as the original [0] did, on get_runs() returning
# the most recent run first — confirm against the SDK documentation.
latest_run = next(iter(pipeline_experiment.get_runs()), None)
if latest_run is None:
    # Same exception type the previous list(...)[0] raised on an empty history
    raise IndexError("experiment 'heart_pipeline' has no runs yet")
latest_run.get_details()

{'runId': 'b3259353-3971-42a4-b2dd-c9d32b917a55',
 'status': 'Completed',
 'startTimeUtc': '2021-07-21T11:35:58.949229Z',
 'endTimeUtc': '2021-07-21T11:36:02.265859Z',
 'properties': {'azureml.runsource': 'azureml.PipelineRun',
  'runSource': 'Unavailable',
  'runType': 'HTTP',
  'azureml.parameters': '{}',
  'azureml.pipelineid': 'e9ad0b74-22cf-444e-a29b-3bb1ad7d1e64'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'logs/azureml/executionlogs.txt': 'https://mlworkspace1398408280.blob.core.windows.net/azureml/ExperimentRun/dcid.b3259353-3971-42a4-b2dd-c9d32b917a55/logs/azureml/executionlogs.txt?sv=2019-02-02&sr=b&sig=7uOn%2FijFtVRdOriiEEkm2RaeMBniSvIpuj59Hk1C5uc%3D&st=2021-07-21T11%3A26%3A03Z&se=2021-07-21T19%3A36%3A03Z&sp=r',
  'logs/azureml/stderrlogs.txt': 'https://mlworkspace1398408280.blob.core.windows.net/azureml/ExperimentRun/dcid.b3259353-3971-42a4-b2dd-c9d32b917a55/logs/azureml/stderrlogs.txt?sv=2019-02-02&sr=b&sig=HRbFWZUTOW90S%2BpTQKA2OpN3L5Jm9GsdFzC3FXaWp%2B8%3