# Hyperdrive Run

In [1]:
# configure workspace and the experiment
from azureml.core import Workspace, Experiment

#ws = Workspace.get(name="quick-starts-ws-125537")
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-125892
Azure region: southcentralus
Subscription id: 0c5a644d-c5ce-4e3b-bf42-4cb265317817
Resource group: aml-quickstarts-125892


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

amlcompute_cluster_name = "cpu-cluster"

try:
    aml_compute = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing compute target')
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(vm_size="Standard_D2_v2", max_nodes=4)
    
    # create the cluster
    aml_compute = ComputeTarget.create(workspace=ws,
                                       name=amlcompute_cluster_name, 
                                       provisioning_configuration=compute_config)


aml_compute.wait_for_completion(show_output=True)

print(aml_compute.get_status().serialize())

Creating a new compute target...
Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-11-13T04:06:11.750000+00:00', 'errors': None, 'creationTime': '2020-11-13T04:06:08.916549+00:00', 'modifiedTime': '2020-11-13T04:06:24.917195+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


In [3]:
# import relevant packages and libraries
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os
import shutil
from azureml.core import Environment

# Specify parameter sampler
ps = RandomParameterSampling({
    '--C': uniform(0.001, 1.0),
    '--max_iter': choice(0, 10, 50, 100, 150, 200)
})

# Specify a Policy
policy = BanditPolicy(evaluation_interval=3, slack_factor=0.1, delay_evaluation=3)

if "outputs" not in os.listdir():
    os.mkdir("./outputs")

train_script = "./outputs"
# Create a SKLearn estimator for use with train.py
# Copy training script into train_model directory
shutil.copy('train.py', train_script)

est = SKLearn(
    source_directory=train_script,
    compute_target=aml_compute,
    entry_script='train.py',
    framework_version='0.20.3'
    )

                                                    
# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=4
)

In [4]:
# Submit your hyperdrive run to the experiment and show run details with the widget.
hyperdrive_run = exp.submit(hyperdrive_config)

RunDetails(hyperdrive_run).show()



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [5]:
hyperdrive_run.wait_for_completion(show_output=True)

RunId: HD_adc1474e-e550-4576-9d8f-ce17b38f5351
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_adc1474e-e550-4576-9d8f-ce17b38f5351?wsid=/subscriptions/0c5a644d-c5ce-4e3b-bf42-4cb265317817/resourcegroups/aml-quickstarts-125892/workspaces/quick-starts-ws-125892

Execution Summary
RunId: HD_adc1474e-e550-4576-9d8f-ce17b38f5351
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_adc1474e-e550-4576-9d8f-ce17b38f5351?wsid=/subscriptions/0c5a644d-c5ce-4e3b-bf42-4cb265317817/resourcegroups/aml-quickstarts-125892/workspaces/quick-starts-ws-125892



{'runId': 'HD_adc1474e-e550-4576-9d8f-ce17b38f5351',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-11-13T04:15:50.41092Z',
 'endTimeUtc': '2020-11-13T04:24:03.681236Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '8256c53c-41c6-4357-b3f1-d93647ef50d1',
  'score': '0.91442097596504',
  'best_child_run_id': 'HD_adc1474e-e550-4576-9d8f-ce17b38f5351_0',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg125892.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_adc1474e-e550-4576-9d8f-ce17b38f5351/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=vO7Tu75LkPbSH9VyJbabbSZEo3PohkCsOD%2BBKmZx65Y%3D&st=2020-11-13T04%3A14%3A23Z&se=2020-11-13T12%3A24%3A23Z&sp=r'}}

In [6]:
# evaluate the the run is indeed complete
assert(hyperdrive_run.get_status() == "Completed")

In [15]:
# get the best run
best_run = hyperdrive_run.get_best_run_by_primary_metric()
print(best_run.get_details()['runDefinition']['arguments'])

['--C', '0.80310034144554', '--max_iter', '200']


In [16]:
print(best_run.get_file_names())

['azureml-logs/55_azureml-execution-tvmps_72084ec6073b43b604692f7752b6c1719cb2a5069a522b53d04700be8b424c67_d.txt', 'azureml-logs/65_job_prep-tvmps_72084ec6073b43b604692f7752b6c1719cb2a5069a522b53d04700be8b424c67_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_72084ec6073b43b604692f7752b6c1719cb2a5069a522b53d04700be8b424c67_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/100_azureml.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'outputs/model.joblib']


In [17]:
# download the best run and register the model
best_run.download_file(name='outputs/model.joblib', output_file_path='./outputs')
model = best_run.register_model(model_name='hyperdrive_run', model_path='outputs/model.joblib')

----------------------------

--------------------------------------------------------------

# AutoML Run

In [19]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

dataset = TabularDatasetFactory.from_delimited_files("https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv")

In [20]:
from train import clean_data

# Use the clean_data function to clean your data.
x, y = clean_data(dataset)

In [21]:
dataset.take(3).to_pandas_dataframe()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,57,technician,married,high.school,no,no,yes,cellular,may,mon,...,1,999,1,failure,-1.8,92.893,-46.2,1.299,5099.1,no
1,55,unknown,married,unknown,unknown,yes,no,telephone,may,thu,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.86,5191.0,no
2,33,blue-collar,married,basic.9y,no,no,no,cellular,may,fri,...,1,999,1,failure,-1.8,92.893,-46.2,1.313,5099.1,no


In [22]:
# split data into Train and Test Sets

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

In [23]:
# combine the training features and the label
import pandas as pd
train_df = pd.concat([x_train, y_train.to_frame(name="label")], axis=1)
validation_df = pd.concat([x_test, y_test.to_frame(name="label")], axis=1)

In [24]:
from azureml.core.dataset import Dataset

if not os.path.isdir('automl_data'):
    os.mkdir('automl_data')
    
# Save the train and validation data to a csv to be uploaded to the datastore
train_df.to_csv("automl_data/train_data.csv", index=False)
validation_df.to_csv("automl_data/validation_data.csv", index=False)


ds = ws.get_default_datastore()
ds.upload(src_dir='./automl_data', target_path='bankmarketing', overwrite=True, show_progress=True)

# Upload the training data as a tabular dataset for access during training on remote compute
train_data = Dataset.Tabular.from_delimited_files(path=ds.path('bankmarketing/train_data.csv'))
# validation_data = Dataset.Tabular.from_delimited_files(path=ds.path('bankmarketing/train_data.csv'))

label = "label"

Uploading an estimated of 2 files
Uploading ./automl_data/train_data.csv
Uploaded ./automl_data/train_data.csv, 1 files out of an estimated total of 2
Uploading ./automl_data/validation_data.csv
Uploaded ./automl_data/validation_data.csv, 2 files out of an estimated total of 2
Uploaded 2 files


In [25]:
train_data.take(3).to_pandas_dataframe()

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,label
0,31,0,0,1,1,5,1,232,5,999,...,1,0,0,0,0,0,0,1,0,0
1,39,1,0,1,0,5,5,127,1,999,...,1,0,0,0,0,0,0,1,0,0
2,48,0,0,1,0,7,2,654,1,999,...,0,0,0,0,0,0,0,1,0,0


In [26]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.

automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task="classification",
    primary_metric="accuracy",
    training_data=train_data,
    label_column_name="label",
    compute_target=aml_compute,
    n_cross_validations=5)

In [27]:
# Submit your automl run
automl_run = exp.submit(automl_config, show_output=True)

Running on remote.
Running on remote compute: cpu-cluster
Parent Run ID: AutoML_763f64c0-d7f9-466f-8453-5bb7df788497

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--

In [28]:
# Retrieve and save your best automl model.
best_run, fitted_model = automl_run.get_output()

In [29]:
# save the model to local environment on azure
import joblib

filename = 'automl_model.joblib'
joblib.dump(fitted_model, './outputs/' + filename)

['./outputs/automl_model.joblib']

In [30]:
# save the scoring script
model_name = best_run.properties['model_name']

script_file_name = 'inference/score.py'

best_run.download_file('outputs/scoring_file_v_1_0_0.py', 'inference/score.py')

In [31]:
# register the model in Azure Workspace
description = 'AutoML trained model'
tags = None
model = automl_run.register_model(model_name = model_name, description = description, tags = tags)

print(automl_run.model_id) # This will be written to the script file later in the notebook.

AutoML763f64c0d27


In [32]:
# Delete compute cluster
aml_compute.delete()

Current provisioning state of AmlCompute is "Deleting"

