In [1]:
# Project 2 of the Udacity Nanodegree - Operationalizing Machine Learning
# The Bank Marketing data will be used - the challenge is to make a determination if the customer will make (or not make) a term deposit
# Gather all imports here
from azureml.core.workspace import Workspace
from azureml.core import Datastore
from azureml.core.compute import ComputeTarget
from azureml.exceptions import ComputeTargetException
from azureml.core.compute.amlcompute import AmlCompute
from azureml.core.experiment import Experiment
from azureml.core.dataset import Dataset
from azureml.core.run import Run

import pandas as pd

from azureml.train.automl.automlconfig import AutoMLConfig

from azureml.pipeline.steps.automl_step import AutoMLStep
from azureml.pipeline.core import PipelineData
from azureml.pipeline.core import TrainingOutput
from azureml.pipeline.core.pipeline import Pipeline
from azureml.pipeline.core import PublishedPipeline

from azureml.pipeline.core.run import PipelineRun

from azureml.widgets.run_details import RunDetails

from azureml.core.webservice import Webservice
from azureml.core.authentication import InteractiveLoginAuthentication

import requests

$Step$ $2$
# Start of Step 2 - Automated ML Experiment

In [2]:
# Various Initializations

# projct_folder is needed as part of teh AutoMLConfig
PROJECT_FOLDER = 'operationalizing-ml'
PROJECT_DEBUG_LOG = 'operationalizingml.log'
PROJECT_LABEL_COLUMN_NAME = 'y'

PROJECT_AUTOMLSTEP_NAME = 'AutoML Training Step'

PROJECT_EXPERIMENT_NAME_STEP2 = 'exp-project2-step2'
PROJECT_EXPERIMENT_NAME_STEP7 = 'exp-project2-step7'

PROJECT_DATASET_NAME = 'Marketing Bank Data'

PROJECT_DEPLOYED_MODEL_NAME = 'bank-term-deposit' # Deployed Model name (endpoine name) must be lower case and only include -

PROJECT_PIPELINEDATA_METRICS_NAME = 'PipelineData_Metrics' # can only contain letters, digits and _
PROJECT_PIPELINEDATA_MODEL_NAME = 'PipelineData_Model' # can only contain letters, digits and _
PROJECT_PIPELINE_OUTPUT_METRICS_NAME = 'Pipeline Metrics Output' # Must be unique in a pipeline
PROJECT_PIPELINE_OUTPUT_MODEL_NAME = 'Pipeline Model Output' # Must be unique in a pipeline
PROJECT_PIPELINE_DESCRIPTION = 'AutoML Pipeline to train model on the marketing bank data'
PROJECT_EXPERIMENT_NAME = 'AutoML Train Banking Data Experiment'

PROJECT_PUBLISHED_PIPELINE_NAME = 'Marketing Bank Training Pipeline'
PROJECT_PUBLISHED_PIPELINE_DESCRIPTION = 'This pipeline trains on teh Marketing Bank Data'
PROJECT_PUBLISHED_PIPELINE_VERSION='1.0'

PROJECT_CONSUME_PIPELINE_ENDPOINT_EXPERIMENT = 'exp-run-pipeline' # 

In [3]:
# Get the workspace object which will be needed subsequently for most operations
ws = Workspace.from_config()

# Print some basic information from the workdpace as a FYI
print(f'name:{ws.name}, location:{ws.location}\nresource group:{ws.resource_group}, subscription id:{ws.subscription_id}')


name:quick-starts-ws-153756, location:southcentralus
resource group:aml-quickstarts-153756, subscription id:f9d5a085-54dc-4215-9ba6-dad5d86e60a0


In [4]:
# Retrive the datastore (we will use the default datastore)
ds = ws.get_default_datastore()

In [5]:
# Next, let's use if it exists, or create if required, a compute cluster to be used by the ML
cc_name = "CPU-CC"  # CPU Compute Cluster

# Access the compute cluster. If it exists, we will have the compute object. If it does not exist, an exception will be thrown upon which the compute cluster can be created
try:
    project_cc = ComputeTarget(workspace=ws, name=cc_name)
    print(f'Compute Cluster target exists and we have a handle to the same')
except ComputeTargetException:
    # Failed to obtain the compute cluster object
    # In all likelihood, a compute cluster of that name has not been created
    # Attempt to create the compute cluster
    # First set up the configuration

    # Specify the configuration of the compute cluster
    cc_cfg = AmlCompute.provisioning_configuration(vm_size='Standard_DS12_v2', min_nodes=1, max_nodes=6)
    project_cc = ComputeTarget.create(workspace=ws, name=cc_name, provisioning_configuration=cc_cfg)

# At this point - we have access to the compute cluster object. Wait for the compute target to complete provisioing
project_cc.wait_for_completion(show_output='True')



InProgress.....
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded...............
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [6]:
# Create an experiment
proj_experiment_step2 = Experiment(workspace=ws, name=PROJECT_EXPERIMENT_NAME_STEP2)

In [7]:
proj_experiment_step2

Name,Workspace,Report Page,Docs Page
exp-project2-step2,quick-starts-ws-153756,Link to Azure Machine Learning studio,Link to Documentation


In [8]:
print(f'{proj_experiment_step2}')

Experiment(Name: exp-project2-step2,
Workspace: quick-starts-ws-153756)


In [9]:
# grab the data and create a dataset
# we will use logic similar to the compute target above
# See if the dataset already exists - if so, skip the Dataset creation pieces
# The Bank Marketing data may be found at - https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv
data_uri = 'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'

ds_name = PROJECT_DATASET_NAME
dsets = ws.datasets.keys()

if ds_name in dsets:
    # dataset exists
    proj_ds = dsets[ds_name]
else:
    # Data set not found. Must create it
    proj_ds = Dataset.Tabular.from_delimited_files(data_uri)
    # Register the dataset so tat on repeated runs, teh data does not have to be fethed evey time
    proj_ds.register(workspace=ws, name=ds_name, description='Marketing Bank data')

    

In [10]:
# Take a peek at the data by converting the same to a Pandas dataframe
proj_df = proj_ds.to_pandas_dataframe()

# print the data
proj_df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,57,technician,married,high.school,no,no,yes,cellular,may,mon,...,1,999,1,failure,-1.8,92.893,-46.2,1.299,5099.1,no
1,55,unknown,married,unknown,unknown,yes,no,telephone,may,thu,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.860,5191.0,no
2,33,blue-collar,married,basic.9y,no,no,no,cellular,may,fri,...,1,999,1,failure,-1.8,92.893,-46.2,1.313,5099.1,no
3,36,admin.,married,high.school,no,no,no,telephone,jun,fri,...,4,999,0,nonexistent,1.4,94.465,-41.8,4.967,5228.1,no
4,27,housemaid,married,high.school,no,yes,no,cellular,jul,fri,...,2,999,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32945,56,housemaid,married,basic.4y,no,no,yes,cellular,jul,mon,...,1,999,0,nonexistent,1.4,93.918,-42.7,4.960,5228.1,no
32946,37,management,married,university.degree,no,no,yes,cellular,jul,fri,...,7,999,0,nonexistent,1.4,93.918,-42.7,4.957,5228.1,no
32947,26,admin.,single,university.degree,no,no,no,cellular,may,tue,...,4,999,1,failure,-1.8,92.893,-46.2,1.266,5099.1,no
32948,31,blue-collar,single,basic.9y,no,no,no,cellular,apr,mon,...,1,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,no


In [11]:
proj_df.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0
mean,40.040212,257.335205,2.56173,962.17478,0.17478,0.076228,93.574243,-40.51868,3.615654,5166.859608
std,10.432313,257.3317,2.763646,187.646785,0.496503,1.572242,0.578636,4.623004,1.735748,72.208448
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,179.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,318.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


In [12]:
# Next - prepare for the AutoML experiment
# Essentially, an AutoML configuration needs to be set up
# That will have all the information required to train and
# produce a model

In [13]:
# THat's where we will start - with specifying an AutoMLConfig
automl_config = AutoMLConfig(
    task='classification',
    path=PROJECT_FOLDER,
    iterations=20,
    primary_metric='AUC_weighted',
    compute_target=project_cc,
    n_cross_validations=3,
    featurization='auto',
    max_concurrent_iterations=5,
    experiment_timeout_hours=0.5,
    enable_early_stopping=True,
    model_explainability=True,
    debug_log=PROJECT_DEBUG_LOG,
    training_data=proj_ds,
    label_column_name=PROJECT_LABEL_COLUMN_NAME,
)

In [14]:
# Submit/Run this experiment
automl_run = proj_experiment_step2.submit(
    config=automl_config,
    show_output=True
)

automl_run.wait_for_completion(show_output=True)


Submitting remote run.
No run_configuration provided, running on CPU-CC with default configuration
Running on remote compute: CPU-CC


Experiment,Id,Type,Status,Details Page,Docs Page
exp-project2-step2,AutoML_4ec0fe98-668d-4b7c-8431-f79c0fbfca8b,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of 

Experiment,Id,Type,Status,Details Page,Docs Page
exp-project2-step2,AutoML_4ec0fe98-668d-4b7c-8431-f79c0fbfca8b,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation




****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of samples in the training data|
|3692                             |yes                              |32950                                 |
+---------------------------------+---------------------------------+--------------------------------------+

********************************************

{'runId': 'AutoML_4ec0fe98-668d-4b7c-8431-f79c0fbfca8b',
 'target': 'CPU-CC',
 'status': 'Completed',
 'startTimeUtc': '2021-08-05T21:03:22.697534Z',
 'endTimeUtc': '2021-08-05T21:16:42.334083Z',
 'properties': {'num_iterations': '20',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '3',
  'target': 'CPU-CC',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"8ba41e22-1586-43ea-995a-d8c96f292969\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': 'False',
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.32.0", "azureml-train": "1.32.0", "azureml-train-restclients-hyperdrive": "1.32.0", "azureml-train-core": "1.32.0", "azureml-train-automl": "1.32.0", "azureml-train-automl-runtime": "1.32.0", "azureml-train-automl-client": "1.32.0", "azureml-

$Step$ $2$
# Capture appropriate screenshots for README

# End of Step 2 - Automated ML Experiment of the project

$Step$ $3$
# Start of Step 3 - Deploy the Best Model
Per various knowledge base articles - this is done inthe AutoML Studdio and not programmatically
Note that the deployed model will be associated with a name (like 'bank-term-deposit') and the name used will be reeferenced below
to turn on Application Insights

# End of Step 3

$Step$ $4$
# Start of Step 4 - Enable Logging (Application Insights)

In [16]:
# To enable ApplicationInsights on the service (webservice), 
# * first access the endpoint using the name assigned at the time of deployment
# * next update webservice parameters such as enabling application insights (enable_app_insights)

proj_webservice = Webservice(
    workspace = ws,
    name=PROJECT_DEPLOYED_MODEL_NAME
)

proj_webservice.update(
    enable_app_insights=True
)

# At this point application insights (logging is enabled) and can be
# checked in the GUI in AutoML studio


$Step$ $4$

# Next,
Run the provided logs.py (making the appropriate changes to the code to reference the correct URI) Ensure logs are displayed

# Capture appropriate screenshots for README
# End of Step 4 - Enable Logging (applicationInsights)

$Step$ $5$
# Start of Step 5 - Swagger Documentation

Step 5 is not implemented as part of this notebook. It is executed independently by
running the scripts as requested

# Capture appropriate screenshots for README

# End of Step 5 - Swagger Documentation

$Step$ $6$
# Start of Step 6 - Consume Model Endpoints
Step 6 also is not part of this notebook. The project instructions are to be followed and the
independent scripts use to complete this step.import

# Capture appropriate screenshots for README
# End of Step 6 - Consume Model endpoints

$Step$ $7$
# Start of Step 7 - Create, Publish and Consume a Pipeline
Step 7 is implemented afresh and is not being run from the Udacity provided notebook


$Step$ $7$
# Create a Pipeline

In [17]:
# Next - prepare for the AutoML experiment
# Essentially, a pipeline is created and then submitted to be run

# Creating the pipeline involves
# * identifying the steps
# * identifying the outputs
#
# Identify the pipeline steps:
# Identifyingthe pipeline steps in turn involves specifing an AUtoMLStep
# An AutoMLStep in turn is associated with the AutoML configuration

In [18]:
# The above AutoML config feeds to the AutoMLStep.
# The AutoMLStep needs outputs to be defined as well before the step iteslf can
# be specified
# Let's set up the two types of outputs - metrics and model
#
metrics_output = PipelineData(
    name=PROJECT_PIPELINEDATA_METRICS_NAME,
    datastore=ds,
    pipeline_output_name=PROJECT_PIPELINE_OUTPUT_METRICS_NAME,
    training_output=TrainingOutput(type='Metrics')
)

model_output = PipelineData(
    name=PROJECT_PIPELINEDATA_MODEL_NAME,
    datastore=ds,
    pipeline_output_name=PROJECT_PIPELINE_OUTPUT_MODEL_NAME,
    training_output=TrainingOutput(type='Model')
)

In [32]:
# Next - build up the AutoMLStep
automl_step=AutoMLStep(
    name=PROJECT_AUTOMLSTEP_NAME,
    automl_config=automl_config,
    ouputs=[metrics_output, model_output],
    allow_reuse=False
)

In [33]:
# Now that we have the step and outputs specfied - we are ready
# to prime the pipeline

proj_pipeline = Pipeline(
    workspace=ws,
    steps=automl_step,
    description=PROJECT_PIPELINE_DESCRIPTION,
    default_datastore=ds
)


In [34]:
# To perform teh training, the pipeline needs to be run
# That is accomplished by submitting a run
# NOte that any applicable pipeline parameers will be specified here
# The output of the submit is the PipelineRun object which may be queried
# for the run satus


proj_pipeline_run = proj_pipeline.submit(
    experiment_name=PROJECT_EXPERIMENT_NAME_STEP7
)

Created step AutoML Training Step [7fde48bf][5876b64e-27bc-4608-838e-8071cac70603], (This step will run and generate new outputs)
Submitted PipelineRun 3c1ebbd1-a006-45d4-980d-3d5ccaaf297e
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/3c1ebbd1-a006-45d4-980d-3d5ccaaf297e?wsid=/subscriptions/f9d5a085-54dc-4215-9ba6-dad5d86e60a0/resourcegroups/aml-quickstarts-153756/workspaces/quick-starts-ws-153756&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254


In [35]:
# use the RunDetails widget to gwt the details regarding the run in real time
# Note that the display is updatd asyncronously and control is returned back to run the subsequent code
RunDetails(proj_pipeline_run).show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

In [36]:
# Wait for the pipeline run to complte before we do anything
 
# Once complete, the staus is return and assert on the status of Finished
assert(proj_pipeline_run.wait_for_completion(show_output=True) == 'Finished')

# An AutoML training Pipeline run on the bank marketing data is completed

PipelineRunId: 3c1ebbd1-a006-45d4-980d-3d5ccaaf297e
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/3c1ebbd1-a006-45d4-980d-3d5ccaaf297e?wsid=/subscriptions/f9d5a085-54dc-4215-9ba6-dad5d86e60a0/resourcegroups/aml-quickstarts-153756/workspaces/quick-starts-ws-153756&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254
PipelineRun Status: Running


StepRunId: 88cce115-5026-4f5f-b44f-f794bf29b755
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/88cce115-5026-4f5f-b44f-f794bf29b755?wsid=/subscriptions/f9d5a085-54dc-4215-9ba6-dad5d86e60a0/resourcegroups/aml-quickstarts-153756/workspaces/quick-starts-ws-153756&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254
StepRun( AutoML Training Step ) Status: NotStarted
StepRun( AutoML Training Step ) Status: Running

StepRun(AutoML Training Step) Execution Summary
StepRun( AutoML Training Step ) Status: Finished



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '3c1ebbd1-a006-45d4-980d-3d5ccaaf297e', 'status': 'Comple

$Step$ $7$
# Publish a Pipeline

In [24]:
# Presumably, at some later time, we may need to run through the pipeline again
# Maybe the data has changed
# Maybe there are different algorithms at this point in time
# To facilitate this, the pipeline must first be published
proj_published_pipeline = proj_pipeline_run.publish_pipeline(
    name=PROJECT_PUBLISHED_PIPELINE_NAME,
    description=PROJECT_PUBLISHED_PIPELINE_DESCRIPTION,
    version=PROJECT_PUBLISHED_PIPELINE_VERSION
)


$Step$ $7$
# Consume a Pipeline
Effectively Run the pipeline again

In [25]:
# Once a pipeline is published - it may be consumed or run again

# Get a handle to the publlished pipeline
# Not ethat we will not use the handle obtained at the time of publishing the pipeline
# Presumably - this code is run much later and is disjointed from the publishing code

# Retrieve the pipeline - we have teh published pipeline name
# To start with, retrieve the list of published pipelines
ppls = PublishedPipeline.list(workspace=ws)
ppl_found = False
for ppl in ppls:
    if ppl.name == PROJECT_PUBLISHED_PIPELINE_NAME:
        ppl_found = True
        break
    
if not ppl_found:
    print(f'{PROJECT_PUBLISHED_PIPELINE_NAME} not found')

print(f'Published Pipeline {PROJECT_PUBLISHED_PIPELINE_NAME} found')

# Locate the endpoint
endpoint = ppl.endpoint

print(f'Published pipeline endpoint is {endpoint}')

Published Pipeline Marketing Bank Training Pipeline found
Published pipeline endpoint is https://southcentralus.api.azureml.ms/pipelines/v1.0/subscriptions/f9d5a085-54dc-4215-9ba6-dad5d86e60a0/resourceGroups/aml-quickstarts-153756/providers/Microsoft.MachineLearningServices/workspaces/quick-starts-ws-153756/PipelineRuns/PipelineSubmit/a3c55788-1d24-4569-b4b3-002fb4503a03


In [26]:
auth_header = InteractiveLoginAuthentication().get_authentication_header()


In [28]:
response = requests.post(
    url=endpoint,
    headers=auth_header,
    json={"ExperimentName":f'{PROJECT_CONSUME_PIPELINE_ENDPOINT_EXPERIMENT}'}
)

In [29]:
# Go through the reponse
try:
    response.raise_for_status()
except Exception:
    raise Exception(f'Received bad response from the endpoint {endpoint}\nResponse code: {response.status_code}\nResponse content: {response.content}\nResponse Headers: {response.headers}')

# Get the pipeline run IOd from teh response
proj_endpoint_consume_run_id = response.json().get('Id')
print(f'Pipeline invocation run id: {proj_endpoint_consume_run_id}')

Pipeline invocation run id: d74dde6f-45c5-46d1-85d2-70563fe92819


In [31]:
# With the run_id - we can now retrieve the pipeline run object and
# display the progress using RunDetails
proj_endpoint_pipeline_run = PipelineRun(
    experiment=ws.experiments[PROJECT_CONSUME_PIPELINE_ENDPOINT_EXPERIMENT],
    run_id=proj_endpoint_consume_run_id
)

RunDetails(proj_endpoint_pipeline_run).show()


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

$Step$ $7$

# Capture appropriate screenshots for README

# End of Step 7 - Create, Publish and Consume a Pipeline