# Optimizing an ML Pipeline in Azure

In [1]:
import os

import pandas as pd

from azureml.widgets import RunDetails
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core import Workspace, Dataset, Experiment
from azureml.core.authentication import InteractiveLoginAuthentication

from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.automl import AutoMLConfig
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform

from azureml.data.dataset_factory import TabularDatasetFactory

## Setup environment

In [2]:
# Connect to the Azure ML workspace.
#
# Inside Azure ML Studio the bare call is sufficient:
#     ws = Workspace.from_config()
#
# Outside Studio there are two options.
# Option 1 (used here): read the connection details from a local config.json.
ws = Workspace.from_config(path='./config.json')

# Option 2: authenticate interactively and name the workspace explicitly.
# The tenant id is listed under the Azure Active Directory service.
#     interactive_auth = InteractiveLoginAuthentication(tenant_id='<tenant-id>')
#     ws = Workspace.get(
#         name='<workspace-name>',
#         subscription_id='<subscription-id>',
#         resource_group='<resource-group>'
#     )

# Echo the identity of the workspace we connected to, one field per line.
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]))

Workspace name: quick-starts-ws-127478
Azure region: southcentralus
Subscription id: de8aba62-c352-42be-b980-2faedf08ead8
Resource group: aml-quickstarts-127478


In [3]:
# Create the training compute cluster, or reuse one that already exists
# under the same name in the workspace.
compute_name = 'train-cluster'
vm_size = 'Standard_D2_V2'
max_nodes = 4

if compute_name in ws.compute_targets:
    # A target with this name already exists: reuse it instead of provisioning.
    compute_target = ws.compute_targets[compute_name]
    if compute_target and isinstance(compute_target, AmlCompute):
        print('Found compute target:', compute_name)
    else:
        # Do not stay silent when the name is taken by something that is not
        # an AmlCompute cluster; later cells will still try to train on it.
        print(
            'Warning: compute target', compute_name,
            'exists but is not an AmlCompute cluster:',
            type(compute_target).__name__
        )
else:
    print('Creating a new compute target...')
    # min_nodes=0 lets the cluster scale down to zero nodes when idle.
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size=vm_size,
        min_nodes=0,
        max_nodes=max_nodes
    )

    # Create new compute target
    compute_target = ComputeTarget.create(
        ws,
        compute_name,
        provisioning_config
    )

    # Can poll for a minimum number of nodes and for a specific timeout.
    # If no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20
    )

    # For a more detailed view of current cluster status, use the 'status' property
    print(compute_target.status.serialize())

Creating a new compute target...
Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-11-23T04:56:01.422000+00:00', 'errors': None, 'creationTime': '2020-11-23T04:55:59.314651+00:00', 'modifiedTime': '2020-11-23T04:56:14.726983+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


## Build and optimize a model by Azure AutoML

In [4]:
# Public CSV holding the bank-marketing training data.
data_path = 'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'

# Build a TabularDataset directly from the remote delimited file.
ds = TabularDatasetFactory.from_delimited_files(path=data_path)

In [5]:
# clean_data lives in script/train.py, the same module the HyperDrive
# estimator further down runs as its entry script.
from script.train import clean_data

# Use the clean_data function to clean data.
# It returns two objects, unpacked here as x and y (the outputs below show
# x as a numeric frame and y as an int64 series named 'y').
x, y = clean_data(ds)

In [6]:
# Preview the first rows of x as returned by clean_data.
x.head()

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_cellular,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown
0,57,1,0,0,1,5,1,371,1,999,...,1,0,0,0,0,1,0,0,0,0
1,55,1,0,1,0,5,4,285,2,999,...,0,1,0,0,0,0,0,0,0,1
2,33,1,0,0,0,5,5,52,1,999,...,1,0,0,0,1,0,0,0,0,0
3,36,1,0,0,0,6,5,355,4,999,...,0,1,0,0,0,1,0,0,0,0
4,27,1,0,1,0,7,5,189,2,999,...,1,0,0,0,0,1,0,0,0,0


In [7]:
# Preview the first values of y as returned by clean_data.
y.head()

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. The split is stratified on y and
# uses a fixed seed so it is repeatable.
# NOTE(review): this cell has no execution count in the saved notebook, and the
# df_train outputs further down (32950 rows, head index 0..4) look like the
# unsplit data - confirm the split was actually applied before the upload.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=31,
    stratify=y,
)

In [8]:
# Put the label column next to the features so both travel in a single frame.
df_train = pd.concat(objs=[x_train, y_train], axis='columns')

In [9]:
# Column dtypes and non-null counts of the combined training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32950 entries, 0 to 32949
Data columns (total 40 columns):
age                              32950 non-null int64
marital                          32950 non-null int64
default                          32950 non-null int64
housing                          32950 non-null int64
loan                             32950 non-null int64
month                            32950 non-null int64
day_of_week                      32950 non-null int64
duration                         32950 non-null int64
campaign                         32950 non-null int64
pdays                            32950 non-null int64
previous                         32950 non-null int64
poutcome                         32950 non-null int64
emp.var.rate                     32950 non-null float64
cons.price.idx                   32950 non-null float64
cons.conf.idx                    32950 non-null float64
euribor3m                        32950 non-null float64
nr.employed        

In [10]:
# Preview the combined training frame; the label y is the last column.
df_train.head()

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,y
0,57,1,0,0,1,5,1,371,1,999,...,0,0,0,0,1,0,0,0,0,0
1,55,1,0,1,0,5,4,285,2,999,...,1,0,0,0,0,0,0,0,1,0
2,33,1,0,0,0,5,5,52,1,999,...,0,0,0,1,0,0,0,0,0,0
3,36,1,0,0,0,6,5,355,4,999,...,1,0,0,0,1,0,0,0,0,0
4,27,1,0,1,0,7,5,189,2,999,...,0,0,0,0,1,0,0,0,0,0


In [11]:
# Save the training frame as CSV so it can be uploaded to the datastore.
# to_csv does not create missing folders, so make sure ./data exists first.
os.makedirs('./data', exist_ok=True)

# index=False: with the default (index=True) the row index is written as an
# extra unnamed first column, which the tabular dataset built from this file
# would then carry along next to the real feature columns.
df_train.to_csv('./data/cleaned_bankmarketing_train.csv', index=False)

In [12]:
# AutoML on remote compute needs a real Dataset object, so push the CSV to the
# workspace's default datastore first.
datastore = ws.get_default_datastore()
datastore.upload_files(
    files=['./data/cleaned_bankmarketing_train.csv'],
    target_path='data',
    overwrite=True,
    show_progress=True,
)

# Point a TabularDataset at the uploaded copy.
ds = Dataset.Tabular.from_delimited_files(
    path=[(datastore, 'data/cleaned_bankmarketing_train.csv')]
)

Uploading an estimated of 1 files
Uploading ./data/cleaned_bankmarketing_train.csv
Uploaded ./data/cleaned_bankmarketing_train.csv, 1 files out of an estimated total of 1
Uploaded 1 files


In [13]:
# Register the uploaded dataset in the workspace under a readable name.
ds = ds.register(
    ws,
    'Cleaned Bank Marketing',
    description='Cleaned bank marketing dataset',
)

In [14]:
# AutoML settings, grouped by concern.
automl_config = AutoMLConfig(
    # What to learn: task can be one of classification, regression, forecasting.
    task='classification',
    label_column_name='y',
    primary_metric='accuracy',

    # How to search: 30-minute budget, 5-fold cross-validation, no ensembles.
    experiment_timeout_minutes=30,
    n_cross_validations=5,
    enable_voting_ensemble=False,
    enable_stack_ensemble=False,

    # Where to run: the remote cluster, one concurrent iteration per node.
    # (For a local run, pass training_data=df_train and drop the two
    # compute-related arguments.)
    training_data=ds,
    compute_target=compute_target,
    max_concurrent_iterations=max_nodes,
)

In [15]:
# Experiment that collects the AutoML runs.
exp = Experiment(ws, 'automl')

In [16]:
# Launch the AutoML run and stream its progress into the cell output.
run = exp.submit(config=automl_config, show_output=True)

Running on remote.
Running on remote compute: train-cluster
Parent Run ID: AutoML_49deb76b-aff8-43f2-918b-fe7d865e3de8

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of th

In [17]:
# Show the interactive run widget, then block until the AutoML run finishes.
RunDetails(run).show()

# Last expression of the cell, so the returned run summary is displayed.
run.wait_for_completion(show_output=True)

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…



****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of samples in the training data|
|3692                             |1                                |32950                                 |
+---------------------------------+---------------------------------+--------------------------------------+

********************************************

{'runId': 'AutoML_49deb76b-aff8-43f2-918b-fe7d865e3de8',
 'target': 'train-cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-11-23T05:01:18.603667Z',
 'endTimeUtc': '2020-11-23T05:42:41.83834Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'train-cluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"0a8eedc5-2e8a-46a3-a15c-3a5bfa0619c6\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"data/cleaned_bankmarketing_train.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-127478\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"de8aba62-c352-42be-b980-2faedf08ead8

In [18]:
# Fetch the best child run of the AutoML experiment together with its fitted model.
best_run, best_model = run.get_output()
if best_run is None:
    # RuntimeError instead of a bare Exception; still caught by `except Exception`.
    raise RuntimeError('No best run was found')

print('Best run:', best_run)
print('Best model:', best_model)

# Metrics logged by the best run (includes the primary metric 'accuracy').
best_run_metrics = best_run.get_metrics()
print('Best run metrics:', best_run_metrics)

# Full run record as returned by the service.
best_run_details = best_run.get_details()
print('Best run details:', best_run_details)

print('Accuracy:', best_run_metrics['accuracy'])

print('Best run id:', best_run.id)
# Files stored with the run; the model file downloaded in the next cell is among them.
print('Best files:', best_run.get_file_names())

Best run: Run(Experiment: automl,
Id: AutoML_49deb76b-aff8-43f2-918b-fe7d865e3de8_55,
Type: azureml.scriptrun,
Status: Completed)
Best model: Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('StandardScalerWrapper',
                 <azureml...
                                   colsample_bytree=1, eta=0.2, gamma=0.1,
                                   learning_rate=0.1, max_delta_step=0,
                                   max_depth=6, max_leaves=3,
                       

In [19]:
# Make sure the output folder exists. makedirs(exist_ok=True) is a no-op when
# the folder is already there, so no separate listdir() check is needed.
os.makedirs('./training', exist_ok=True)

# Save best automl model
best_run.download_file('outputs/model.pkl', './training/automl_best_model.pkl')

## Build a Scikit-learn Logistic Regression model and optimize its hyperparameters using Azure HyperDrive

In [20]:
# Random search space. Key order is kept as-is because a later cell reads the
# run arguments by position.
#   --C         sampled uniformly from [0.01, 100]
#   --max_iter  any integer from range(10, 500)
ps = RandomParameterSampling(
    parameter_space={
        '--C': uniform(0.01, 100),
        '--max_iter': choice(range(10, 500)),
    }
)

# Early termination: the policy is applied every time metrics are reported,
# starting at evaluation interval 5. Any run whose best metric is below
# 1/(1+0.1), i.e. about 91%, of the best performing run is terminated.
policy = BanditPolicy(
    evaluation_interval=1,
    delay_evaluation=5,
    slack_factor=0.1,
)

# SKLearn estimator that runs script/train.py on the training cluster.
est = SKLearn(
    source_directory=os.path.join('.', 'script'),
    entry_script='train.py',
    compute_target=compute_target,
)

# Tie estimator, sampler and policy together.
# primary_metric_name is case sensitive and must match the log entry name
# 'accuracy' exactly as it is defined in train.py.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=50,
    max_concurrent_runs=max_nodes,
)

In [21]:
# Experiment that collects the HyperDrive runs.
# There is no need to call exp.start_logging() here; the run is created by
# submitting the HyperDrive configuration to the experiment.
exp = Experiment(ws, 'hyperdrive')

In [22]:
# Launch the hyperparameter sweep in the experiment.
run = exp.submit(config=hyperdrive_config, show_output=True)



In [23]:
 # Show run details with the widget, then block until the HyperDrive run finishes.
RunDetails(run).show()

# Last expression of the cell, so the returned run summary is displayed.
run.wait_for_completion(show_output=True)

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_910464cf-78be-44c0-8463-f590a31dd86c
Web View: https://ml.azure.com/experiments/hyperdrive/runs/HD_910464cf-78be-44c0-8463-f590a31dd86c?wsid=/subscriptions/de8aba62-c352-42be-b980-2faedf08ead8/resourcegroups/aml-quickstarts-127478/workspaces/quick-starts-ws-127478

Streaming azureml-logs/hyperdrive.txt

"<START>[2020-11-23T05:53:16.606344][API][INFO]Experiment created<END>\n"<START>[2020-11-23T05:53:17.9498256Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>"<START>[2020-11-23T05:53:19.413687][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2020-11-23T05:53:19.741071][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2020-11-23T05:53:48.5288808Z][SCHEDULER][INFO]Scheduling job, id='HD_910464cf-78be-44c0-8463-f590a31dd86c_0'<END><START>[2020-11-23T05:53:48.5060436Z][SCHEDULER][INFO]Scheduling job, id='HD_910

{'runId': 'HD_910464cf-78be-44c0-8463-f590a31dd86c',
 'target': 'train-cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-11-23T05:53:15.903485Z',
 'endTimeUtc': '2020-11-23T06:17:55.186033Z',
 'properties': {'primary_metric_config': '{"name": "accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '6291570c-0e72-4781-8439-852fdac4c539',
  'score': '0.9128983308042489',
  'best_child_run_id': 'HD_910464cf-78be-44c0-8463-f590a31dd86c_34',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg127478.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_910464cf-78be-44c0-8463-f590a31dd86c/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=KvaOTT6rhwNxY13MFDZEfWugppkgq%2FPuN4CTD8wz%2FF4%3D&st=2020-11-23T06%3A07%3A55Z&se=2020-11-23T14%3A17%3A55Z&sp=r'}}

In [24]:
# Pick the child run with the best primary metric from the HyperDrive sweep.
best_run = run.get_best_run_by_primary_metric()
if best_run is None:
    # RuntimeError instead of a bare Exception; still caught by `except Exception`.
    raise RuntimeError('No best run was found')

best_run_metrics = best_run.get_metrics()
print('Best run metrics:', best_run_metrics)

# Fetch the run record once and reuse it (previously requested twice).
hyperdrive_run_details = best_run.get_details()
print('Best run details:', hyperdrive_run_details)
parameter_values = hyperdrive_run_details['runDefinition']['arguments']
print('Best run parameters:', parameter_values)

# The arguments arrive as a flat [flag, value, flag, value, ...] list. Pair
# them up so each value is looked up by its flag rather than by list position.
hyperparameters = dict(zip(parameter_values[0::2], parameter_values[1::2]))

print('Accuracy:', best_run_metrics['accuracy'])
print('Inverse of regularization strength (', '--C', '):', hyperparameters['--C'])
print('Maximum number of iterations (', '--max_iter', '):', hyperparameters['--max_iter'])

print('Best run id:', best_run.id)
# Files stored with the run; the model file downloaded in the next cell is among them.
print('Best files:', best_run.get_file_names())

Best run metrics: {'Regularization Strength': 65.44776652593455, 'Max iterations': 217, 'accuracy': 0.9128983308042489}
Best run details: {'runId': 'HD_910464cf-78be-44c0-8463-f590a31dd86c_34', 'target': 'train-cluster', 'status': 'Completed', 'startTimeUtc': '2020-11-23T06:10:45.878114Z', 'endTimeUtc': '2020-11-23T06:11:29.821815Z', 'properties': {'_azureml.ComputeTargetType': 'amlcompute', 'ContentSnapshotId': '6291570c-0e72-4781-8439-852fdac4c539', 'ProcessInfoFile': 'azureml-logs/process_info.json', 'ProcessStatusFile': 'azureml-logs/process_status.json'}, 'inputDatasets': [], 'outputDatasets': [], 'runDefinition': {'script': 'train.py', 'useAbsolutePath': False, 'arguments': ['--C', '65.44776652593455', '--max_iter', '217'], 'sourceDirectoryDataStore': None, 'framework': 'Python', 'communicator': 'None', 'target': 'train-cluster', 'dataReferences': {}, 'data': {}, 'outputData': {}, 'jobName': None, 'maxRunDurationSeconds': None, 'nodeCount': 1, 'priority': None, 'environment': {'n

In [25]:
# Make sure the output folder exists. makedirs(exist_ok=True) is a no-op when
# the folder is already there, so no separate listdir() check is needed.
os.makedirs('./training', exist_ok=True)

# Save the model from that run.
best_run.download_file('outputs/model.joblib', './training/hyperdrive_best_model.joblib')

In [26]:
# Delete compute cluster used for training
# Both the AutoML and the HyperDrive experiments above ran on this target, so
# run this cell only after both have finished and their models are downloaded.
compute_target.delete()