In [1]:
from azureml.core import Workspace, Experiment

# Attach to the workspace described by the saved workspace configuration
# (alternative kept for reference: Workspace.get(name="udacity-project")).
ws = Workspace.from_config()

# Every run submitted from this notebook is grouped under this experiment.
exp = Experiment(workspace=ws, name="exp-udacity-project")

# Report which workspace we are attached to, one property per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print("\n".join(workspace_summary))

# Open an interactive run on the experiment for logging from the notebook.
run = exp.start_logging()

Workspace name: quick-starts-ws-152072
Azure region: southcentralus
Subscription id: 1b944a9b-fdae-4f97-aeb1-b7eea0beac53
Resource group: aml-quickstarts-152072


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.exceptions import ComputeTargetException

# Create (or reuse) the compute cluster that the experiments run on.
# Template guidance: suggested vm_size is "Standard_D2_V2" and max_nodes
# should be no greater than 4.
# API reference:
#   https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.compute.computetarget?view=azure-ml-py
# VM size overview:
#   https://docs.microsoft.com/en-us/azure/machine-learning/concept-compute-target

cluster_name = "GPUCluster"

# Reuse the cluster when it already exists (for example when the notebook is
# re-run from the start); provision a new one only when the lookup fails.
try:
    compute_cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # The cluster was not found, so describe it and create it.
    # Alternative CPU sizes kept for reference: 'Standard_D2_v2' (max_nodes=4)
    # and 'Standard_DS2_v2' (max_nodes=1). The GPU size below is the one used.
    cluster_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_NC6',
        max_nodes=4,
        description='Compute Cluster created programatically',
    )
    compute_cluster = ComputeTarget.create(
        workspace=ws,
        name=cluster_name,
        provisioning_configuration=cluster_config,
    )

# Whether the cluster is new or pre-existing, wait for the pending
# provisioning operation to complete before using it.
compute_cluster.wait_for_completion(show_output=True)


Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
import os
from azureml.core import Experiment

In [4]:
from pathlib import Path

In [5]:

# Prepare the package layout needed to import training.train and to submit
# ./training as a source directory:
#   - the directory ./training exists,
#   - ./training/__init__.py exists,
#   - train.py lives in ./training.
#
# Every step is idempotent, so this cell can be re-run safely. __init__.py is
# created whenever it is missing, not only when the directory itself had to be
# created, so a pre-existing ./training without it is repaired as well.
#
# Note: the notebook kernel may need to be restarted before the interpreter
# accepts `training.train` as an import.

training_dir = Path("training")
training_dir.mkdir(exist_ok=True)
(training_dir / "__init__.py").touch()

train_script = training_dir / "train.py"
if not train_script.exists():
    # Fail with an explicit message instead of the bare rename error when the
    # script is in neither location.
    if not Path("train.py").exists():
        raise FileNotFoundError(
            "train.py was found neither in ./training nor in the notebook directory"
        )
    os.rename("train.py", train_script)


In [6]:
from azureml.core import ScriptRunConfig

In [7]:
from azureml.core.environment import Environment

In [8]:
from training.train import clean_data

In [9]:
# envs = Environment.list(workspace=ws)

# for env in envs:
#     if env.startswith("AzureML"):
#         print("Name",env)
#         print("packages", envs[env].python.conda_dependencies.serialize_to_string())

In [10]:

# Configure the Azure ML HyperDrive sweep.
#
# HyperDrive invokes the training script many times, each time with a
# different combination of hyperparameters (--C and --max_iter here).
# Per the original notes, the script logs the hyperparameters, the accuracy
# and the model for each invocation.
#
# The overall workflow is:
#   1. prepare the run configuration,
#   2. prepare the sampling, policy and metric settings,
#   3. submit the experiment and let HyperDrive do its work,
#   4. determine the best run,
#   5. retrieve and register the model from the best run.
#
# Reference: "Hyperparameter tuning a model with Azure Machine Learning"
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters

# Parameter expressions for the search space (only `choice` is used below).
from azureml.train.hyperdrive.parameter_expressions import normal, choice

# Search space: train.py takes --C and --max_iter (per the original notes it
# fits a scikit-learn LogisticRegression model). Each trial draws one value
# per argument at random from the discrete sets below.
search_space = {
    "--C": choice(0.001, 0.01, 0.1, 1, 10),
    "--max_iter": choice(100, 200, 300, 400, 500),
}
ps = RandomParameterSampling(search_space)

# Early-termination policy. BanditPolicy reference:
# https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.hyperdrive.banditpolicy?view=azure-ml-py
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2, delay_evaluation=5)

# Execution environment: use one of the curated environments shipped with
# Azure ML instead of building a custom one from a YAML file.
curated_env_name = 'AzureML-Tutorial'
curated_env = Environment.get(workspace=ws, name=curated_env_name)

# The SKLearn estimator is deprecated, so the run is described with a
# ScriptRunConfig: source directory, entry script, compute target, environment.
run_config = ScriptRunConfig(
    source_directory="./training",
    script="train.py",
    compute_target=compute_cluster,
    environment=curated_env,
)

# Combine run configuration, sampler and policy into the HyperDrive config.
hyperdrive_settings = dict(
    hyperparameter_sampling=ps,
    policy=policy,
    run_config=run_config,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
)
hyperdrive_config = HyperDriveConfig(**hyperdrive_settings)

In [11]:
# hyperdrive_config = HyperDriveConfig(
#     hyperparameter_sampling=ps,
#     policy=policy,
#     run_config=run_config,
#     primary_metric_name='Accuracy',
#     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
#     max_total_runs=4,
# )

In [12]:
# Submit the HyperDrive sweep to the experiment, then attach the notebook
# widget so the child runs can be followed while they execute.
hyperdrive_run = exp.submit(hyperdrive_config)

run_details_widget = RunDetails(hyperdrive_run)
run_details_widget.show()


_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

As can be seen above, quite a few runs (combinations of hyperparameters) achieved an **accuracy of 0.91365**.
<br>
As can be seen below, one of the argument combinations that led to this accuracy is
**--C 10 --max_iter 400**

In [13]:
# Block until the HyperDrive run has finished, showing its output in the notebook.
hyperdrive_run.wait_for_completion(show_output=True)

RunId: HD_8951fcb8-a0d9-43d5-9447-2d3008556991
Web View: https://ml.azure.com/runs/HD_8951fcb8-a0d9-43d5-9447-2d3008556991?wsid=/subscriptions/1b944a9b-fdae-4f97-aeb1-b7eea0beac53/resourcegroups/aml-quickstarts-152072/workspaces/quick-starts-ws-152072&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-07-21T22:11:23.031542][API][INFO]Experiment created<END>\n""<START>[2021-07-21T22:11:23.536132][GENERATOR][INFO]Trying to sample '20' jobs from the hyperparameter space<END>\n""<START>[2021-07-21T22:11:23.814538][GENERATOR][INFO]Successfully sampled '20' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_8951fcb8-a0d9-43d5-9447-2d3008556991
Web View: https://ml.azure.com/runs/HD_8951fcb8-a0d9-43d5-9447-2d3008556991?wsid=/subscriptions/1b944a9b-fdae-4f97-aeb1-b7eea0beac53/resourcegroups/aml-quickstarts-152072/workspaces/quick-starts-ws-152072&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

{
  "error": {
   

{'runId': 'HD_8951fcb8-a0d9-43d5-9447-2d3008556991',
 'target': 'GPUCluster',
 'status': 'Completed',
 'startTimeUtc': '2021-07-21T22:11:22.769931Z',
 'endTimeUtc': '2021-07-21T22:19:57.770102Z',
 'error': {'error': {'code': 'UserError',
   'message': 'User errors were found in at least one of the child runs.',
   'messageParameters': {},
   'details': []},
  'time': '0001-01-01T00:00:00.000Z'},
   'message': '{\n  "error": {\n    "code": "UserError",\n    "severity": null,\n    "message": "User errors were found in at least one of the child runs.",\n    "messageFormat": null,\n    "messageParameters": {},\n    "referenceCode": null,\n    "detailsUri": null,\n    "target": null,\n    "details": [],\n    "innerError": null,\n    "debugInfo": null,\n    "additionalInfo": null\n  },\n  "correlation": null,\n  "environment": null,\n  "location": null,\n  "time": "0001-01-01T00:00:00+00:00",\n  "componentName": null\n}'}],
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal

In [14]:
# Stop here if the sweep did not complete, and report the actual status so
# the failure is self-explanatory; later cells need a completed run.
hyperdrive_status = hyperdrive_run.get_status()
assert hyperdrive_status == "Completed", (
    f"HyperDrive run ended with status {hyperdrive_status!r}, expected 'Completed'"
)

In [15]:
import joblib
# Select the child run with the best primary metric and show the command-line
# arguments (the hyperparameters) it was launched with.
# NOTE(review): the template also asks to save the model from this run; that
# step is not implemented in this cell.
best_run = hyperdrive_run.get_best_run_by_primary_metric()

best_run_details = best_run.get_details()
best_run_arguments = best_run_details["runDefinition"]["arguments"]
print(best_run_arguments)


['--C', '10', '--max_iter', '400']


In [16]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Build an Azure ML TabularDataset directly from the public bank-marketing CSV.
# Per the original notes, this is the same source that train.py uses.
bank_marketing_url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

ds = TabularDatasetFactory.from_delimited_files(path=bank_marketing_url)


In [17]:
from training.train import clean_data

# Clean the raw dataset with the clean_data helper imported from training/train.py.
# The result is unpacked into x and y
# (presumably the features and the label -- confirm in train.py).
x, y = clean_data(ds)



In [18]:
# Plan for getting the cleaned data into Azure ML:
#   1. combine x and y into a single frame,
#   2. save that frame to disk as a CSV,
#   3. upload the CSV to a datastore,
#   4. load it back from the datastore as an Azure TabularDataset.

# Step 1: attach y to x as a new column named 'y' (this mutates x in place).
x['y'] = y


# The clean data is saved and reloaded so that Azure ML can use it.
# See https://stackoverflow.com/questions/60380154/upload-dataframe-as-dataset-in-azure-machine-learning


In [26]:
# The data must live in a folder before it can be uploaded to a datastore,
# so create that folder first. exist_ok=True makes the cell safe to re-run.
os.makedirs("my_data", exist_ok=True)

In [20]:
# Save the combined frame to disk for upload.
# index=False keeps the row index out of the file. Without it the index is
# written as an extra unnamed first column, which is then read back as a data
# column when the CSV is loaded as a TabularDataset and would be treated as a
# feature during training.
# (Assumes x is a pandas DataFrame, as its use with x['y'] = y suggests.)
x.to_csv('my_data/clean_data.csv', index=False)

In [21]:
# Get the workspace's default datastore; the cleaned CSV is uploaded to it
# in a later step.
datastore = ws.get_default_datastore()

In [22]:
# Upload the local my_data folder to the same path in the datastore.
# overwrite=True replaces a file left behind by an earlier execution; with
# the default (overwrite=False) an existing blob is kept, so re-running the
# notebook after changing the data would leave a stale CSV in the datastore.
datastore.upload(src_dir='my_data', target_path='my_data', overwrite=True)

Uploading an estimated of 1 files
Uploading my_data/clean_data.csv
Uploaded my_data/clean_data.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_2b2e9d74bd3d4a9cb20e94f77ca7e696

In [23]:
# Create the TabularDataset over the uploaded CSV; it is used later as the
# training data for the ML pipeline.
clean_data_reference = datastore.path('my_data/clean_data.csv')

clean_ds = TabularDatasetFactory.from_delimited_files(path=clean_data_reference)


In [24]:
from azureml.train.automl import AutoMLConfig
import logging

# Configure the AutoML run.
#
# Template note: the lab instance times out, so experiment_timeout_minutes
# must stay short (the template used 30 minutes; 15 is used here). Running
# longer requires your own Azure tenant, which incurs personal costs.
#
# Based on the examples at
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-train
#
# The dataset describes whether a user may be interested in a term deposit
# or not, which makes this a classification problem. The metric is accuracy,
# the same one the original notes say train.py uses.
automl_settings = dict(
    experiment_timeout_minutes=15,
    enable_early_stopping=True,
    iteration_timeout_minutes=5,
    max_concurrent_iterations=4,
    max_cores_per_iteration=-1,
    n_cross_validations=2,
    primary_metric='accuracy',  # 'AUC_weighted' is the alternative kept for reference
    verbosity=logging.INFO,
)

# Remaining configuration: where to run, what kind of task, and which data.
# No separate validation dataset is provided (cross-validation is configured
# above); one may be needed.
automl_config = AutoMLConfig(
    task='classification',
    compute_target=compute_cluster,
    training_data=clean_ds,
    label_column_name='y',
    **automl_settings
)


In [25]:
# Submit the AutoML run to the experiment; show_output=True prints the
# progress of the run in the notebook.
# Based on https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-train
automl_run = exp.submit(config=automl_config, show_output=True)

Submitting remote run.
No run_configuration provided, running on GPUCluster with default configuration
Running on remote compute: GPUCluster


Experiment,Id,Type,Status,Details Page,Docs Page
exp-udacity-project,AutoML_1b2da470-8bfc-4857-9274-766c8d5288e0,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of samples in the training data|
|3692                             |1                         

In [1]:
# define print_model - from https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-train

from pprint import pprint

def print_model(model, prefix=""):
    """Print every step of a pipeline-like model, descending into ensembles.

    Args:
        model: Object exposing ``steps``, a sequence of ``(name, step)`` pairs.
        prefix: Text prepended to each step name printed for this model.

    For each step the name is printed, followed by:
      * a step with ``estimators`` and ``weights``: the member names and the
        weights, then every member model printed recursively;
      * a step with ``_base_learners`` and ``_meta_learner``: the meta
        learner, then every base-learner model printed recursively;
      * any other step: the dictionary returned by ``get_params()``.
    Nested models are printed with the prefix ``"<member name> - "``.
    """
    for step in model.steps:
        step_name = step[0]
        print(prefix + step_name)
        step_obj = step[1]

        if hasattr(step_obj, 'estimators') and hasattr(step_obj, 'weights'):
            # Weighted ensemble: summarise members first, then recurse.
            member_names = [member[0] for member in step_obj.estimators]
            pprint({'estimators': member_names, 'weights': step_obj.weights})
            print()
            members = step_obj.estimators
        elif hasattr(step_obj, '_base_learners') and hasattr(step_obj, '_meta_learner'):
            # Stacked ensemble: show the meta learner, then recurse.
            print("\nMeta Learner")
            pprint(step_obj._meta_learner)
            print()
            members = step_obj._base_learners
        else:
            # Plain step: its parameters are all there is to show.
            pprint(step_obj.get_params())
            print()
            continue

        for member in members:
            print_model(member[1], member[0] + ' - ')

In [2]:
# Retrieve the best AutoML run together with its model, then print both.
# NOTE(review): automl_run is assigned in the AutoML submit cell; executing
# this cell in a kernel where that cell has not been run raises NameError.
# NOTE(review): best_run reuses the name assigned from the HyperDrive best run
# earlier in the notebook.
# The template also asks to save the model; that step is not done in this cell.

### YOUR CODE HERE ###
best_run, best_model = automl_run.get_output()

print(f'printing best run:\n{best_run}\n\nPrinting model:')

print_model(best_model)

NameError: name 'automl_run' is not defined

In [None]:
# Save and register the model