In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-131421
Azure region: southcentralus
Subscription id: 610d6e37-4747-4a20-80eb-3aad70a55f43
Resource group: aml-quickstarts-131421


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

cluster_name = "cpu-cluster"

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found an existing compute target.')
except ComputeTargetException:
    print('No existing compute target found. Creating a new compute target.')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)

    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

print(compute_target.get_status().serialize())

No existing compute target found. Creating a new compute target.
Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-12-22T23:33:51.833000+00:00', 'errors': None, 'creationTime': '2020-12-22T23:33:45.222485+00:00', 'modifiedTime': '2020-12-22T23:34:01.061548+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


In [3]:
from azureml.core import ScriptRunConfig, Environment
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, normal, uniform
import os

# Specify parameter sampler
ps = RandomParameterSampling(
     {
        '--C': choice(1,2,4), 
        '--max_iter': choice(10,50,100)
     }
)

# Specify a Policy
policy = BanditPolicy(
           slack_factor = 0.1,
           evaluation_interval = 2
)

if "training" not in os.listdir():
    os.mkdir("./training")

# Create an SKLearn estimator for use with train.py
est = SKLearn(source_directory='./', compute_target=compute_target, entry_script='train.py')

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(estimator = est,
                             hyperparameter_sampling=ps,
                             policy=policy,
                             primary_metric_name="Accuracy",
                             primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                             max_total_runs=40,
                             max_concurrent_runs=4)

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.


In [4]:
# Submit the hyperdrive run to the experiment and show run details with the widget.
hyperdrive_run = exp.submit(hyperdrive_config)

from azureml.widgets import RunDetails
RunDetails(hyperdrive_run).show()

hyperdrive_run.wait_for_completion(show_output=True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_c4540035-fead-4ce8-bf69-5b385a9a7239
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_c4540035-fead-4ce8-bf69-5b385a9a7239?wsid=/subscriptions/610d6e37-4747-4a20-80eb-3aad70a55f43/resourcegroups/aml-quickstarts-131421/workspaces/quick-starts-ws-131421

Streaming azureml-logs/hyperdrive.txt

"<START>[2020-12-22T23:36:03.255756][API][INFO]Experiment created<END>\n""<START>[2020-12-22T23:36:03.717067][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n"<START>[2020-12-22T23:36:04.1365591Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>"<START>[2020-12-22T23:36:04.013311][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_c4540035-fead-4ce8-bf69-5b385a9a7239
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_c4540035-fead-4ce8-bf69-5b385a9a7239?wsid=/subscriptions/610d6

{'runId': 'HD_c4540035-fead-4ce8-bf69-5b385a9a7239',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-12-22T23:36:02.931935Z',
 'endTimeUtc': '2020-12-22T23:45:32.940607Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '7e322bf7-f064-40dc-9485-5e534cfdb01c',
  'score': '0.9074355083459787',
  'best_child_run_id': 'HD_c4540035-fead-4ce8-bf69-5b385a9a7239_1',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg131421.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_c4540035-fead-4ce8-bf69-5b385a9a7239/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=bPKCEyeEMQfqEszZ4pTLDcykykKVVyd2ifajukMt2Ig%3D&st=2020-12-22T23%3A35%3A55Z&se=2020-12-23T07%3A45%3A55Z&sp=r'}}

In [5]:
import joblib

# Get the best run and save the model from that run.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()

print('Best run:', best_run)
print('Metrics:', best_run_metrics)

model = best_run.register_model(model_name='hyperdrive_model', model_path='./outputs/model.pkl')

Best run: Run(Experiment: udacity-project,
Id: HD_c4540035-fead-4ce8-bf69-5b385a9a7239_1,
Type: azureml.scriptrun,
Status: Completed)
Metrics: {'Regularization Strength:': 2.0, 'Max iterations:': 50, 'Accuracy': 0.9074355083459787}


In [6]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

data_url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(data_url)

In [7]:
import pandas as pd
from train import clean_data
from sklearn.model_selection import train_test_split

# Use the clean_data function to clean your data.
x, y = clean_data(ds)

# Join x and y since automl takes a dataset
dataset = x.join(y)

# Get the default datastore to be entered as a parameter in tabular dataset creation
datastore = ws.get_default_datastore()

# Change pandas dataframe into a tabular dataset to be used in automl
training_data = TabularDatasetFactory.register_pandas_dataframe(dataset, datastore, "training_data")



Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/cdc9b92e-d87d-4a5d-8bea-62156518bd4b/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [8]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    compute_target=compute_target,
    experiment_timeout_minutes = 30,
    task = 'classification',
    primary_metric = 'accuracy',
    training_data = training_data,
    label_column_name = 'y',
    n_cross_validations = 4)

In [9]:
print(type(training_data))

<class 'azureml.data.tabular_dataset.TabularDataset'>


In [10]:
# Submit the automl run

experiment = Experiment(ws, "automl_model")
print("Experiment created")

automl_run = experiment.submit(config=automl_config, show_output=True)
RunDetails(automl_run).show()
automl_run.wait_for_completion(show_output=True)

Experiment created
Running on remote.
No run_configuration provided, running on cpu-cluster with default configuration
Running on remote compute: cpu-cluster
Parent Run ID: AutoML_238968b1-ea32-4a49-ac00-fcc6c5698a45

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+------

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…



****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of samples in the training data|
|3692                             |1                                |32950                                 |
+---------------------------------+---------------------------------+--------------------------------------+

********************************************

{'runId': 'AutoML_238968b1-ea32-4a49-ac00-fcc6c5698a45',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-12-22T23:53:39.201225Z',
 'endTimeUtc': '2020-12-23T00:36:58.370731Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '4',
  'target': 'cpu-cluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"38609501-65ba-438c-b368-106e599bf541\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"managed-dataset/cdc9b92e-d87d-4a5d-8bea-62156518bd4b/\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-131421\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"610d6e37-4747-4a20-80e

In [13]:
# Retrieve and save the best automl model

best_run, fitted_model = automl_run.get_output()
print(best_run)
print(fitted_model)
best_run_metrics = best_run.get_metrics()
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)

joblib.dump(value=fitted_model, filename="fitted_automl_model.joblib")

automl_model = best_run.register_model(model_name='automl_model.pkl', 
                                       model_path = './outputs/')

Run(Experiment: automl_model,
Id: AutoML_238968b1-ea32-4a49-ac00-fcc6c5698a45_24,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                intercept_scaling=1,
                                                                                                l1_ratio=None,
                                      

In [15]:
# Delete the cluster instance
AmlCompute.delete(compute_target)

Current provisioning state of AmlCompute is "Deleting"

