### Connect to the AML workspace

In [1]:
import azureml.core
from azureml.core import Workspace

# Connect to the workspace described by the saved config file
ws = Workspace.from_config()

# Report the SDK version in use together with the name of the connected workspace
sdk_version, workspace_name = azureml.core.VERSION, ws.name
print('Ready to use Azure ML {} to work with {}'.format(sdk_version, workspace_name))

Ready to use Azure ML 1.47.0 to work with mlw-test


### Check whether the diabetes data asset is already registered

In [2]:
from azureml.core import Dataset
default_ds = ws.get_default_datastore()

# The hyperdrive cell further down looks the dataset up by this exact name,
# so report clearly whether it is available instead of staying silent when it is not.
if 'Diabetes data asset' in ws.datasets:
    print('Diabetes data asset already registered.')
else:
    print('Diabetes data asset is not registered in this workspace; register it before running the experiment.')

Daiabetes data asset already registered.


### Create a folder for the training script

In [3]:
import os

# Local directory that receives the training script and its environment file
experiment_folder = 'diabetes-training-hyperparameters'

# exist_ok=True keeps this cell safe to re-run when the directory is already there
os.makedirs(name=experiment_folder, exist_ok=True)
print('Folder ready.')

Folder ready.


### Create a training script
Now create the Python script to train the model. In this example, you'll use a *Gradient Boosting* algorithm to train a classification model. The script must include:

- An argument for each hyperparameter you want to optimize (in this case, the learning rate and number of estimators for the Gradient Boosting algorithm)
- Code to log the performance metric you want to optimize for (in this case, you'll log both AUC and accuracy, so you can choose to optimize the model for either of these)

In [4]:
%%writefile $experiment_folder/diabetes_training.py
# Import libraries
import argparse, joblib, os
from azureml.core import Run
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, roc_curve

# Get the experiment run context
run = Run.get_context()

# Get script arguments
parser = argparse.ArgumentParser()

# Input dataset. The parsed value is not read directly; the data itself is fetched
# below through run.input_datasets['training_data'].
parser.add_argument("--input-data", type=str, dest='input_data', help='training dataset')

# Hyperparameters
parser.add_argument('--learning_rate', type=float, dest='learning_rate', default=0.1, help='learning rate')
parser.add_argument('--n_estimators', type=int, dest='n_estimators', default=100, help='number of estimators')

# Add arguments to args collection
args = parser.parse_args()

# Log hyperparameter values.
# Use the built-in float/int: the np.float / np.int aliases were deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where they raise AttributeError.
run.log('learning_rate', float(args.learning_rate))
run.log('n_estimators', int(args.n_estimators))

# Load the diabetes dataset
print("Loading Data...")
diabetes = run.input_datasets['training_data'].to_pandas_dataframe() # Get the training data from the named dataset input

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set (70%) and test set (30%); fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a Gradient Boosting classification model with the specified hyperparameters
print('Training a classification model')
model = GradientBoostingClassifier(learning_rate=args.learning_rate,
                                   n_estimators=args.n_estimators).fit(X_train, y_train)

# Calculate accuracy: the fraction of test predictions that match the true labels
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', float(acc))

# Calculate AUC from the predicted probability of the positive class (column 1)
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', float(auc))

# Save the model in the run outputs
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()

Overwriting diabetes-training-hyperparameters/diabetes_training.py


### Create compute

In [5]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "compute-cluster-ds3-v2"

try:
    # Check for existing compute target
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS3_V2', max_nodes=2)
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Report the failure, then re-raise it. Swallowing the error here would leave
        # training_cluster undefined, and the problem would only resurface later as a
        # NameError in the cell that builds the script configuration.
        print(ex)
        raise

Found existing cluster, use it.


### Create a conda runtime environment for the training

In [6]:
%%writefile $experiment_folder/diabetes_hyperparameter_env.yml
# Conda environment specification for the hyperparameter tuning runs
name: diabetes-hyperparameter-env
dependencies:
  # Interpreter version for the run environment
  - python=3.6.2
  # Libraries imported by the training script
  # (joblib is not listed; presumably it arrives as a scikit-learn dependency - confirm)
  - scikit-learn
  - pandas
  - numpy
  # pip itself, followed by the packages installed through pip
  - pip
  - pip:
      - azureml-defaults

Overwriting diabetes-training-hyperparameters/diabetes_hyperparameter_env.yml


# Run a hyperparameter tuning experiment

Azure Machine Learning includes a hyperparameter tuning capability through *hyperdrive* experiments. These experiments launch multiple child runs, each with a different hyperparameter combination. The run producing the best model (as determined by the logged target performance metric for which you want to optimize) can be identified, and its trained model selected for registration and deployment.

> **Note**: In this example, we aren't specifying an early stopping policy. Such a policy is only relevant if the training script performs multiple training iterations, logging the primary metric for each iteration. This approach is typically employed when training deep neural network models over multiple *epochs*.

In [7]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.train.hyperdrive import GridParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice
from azureml.widgets import RunDetails

# Build the run environment from the conda specification file in the experiment folder
env_spec_path = experiment_folder + "/diabetes_hyperparameter_env.yml"
hyper_env = Environment.from_conda_specification("diabetes-hyperparameter-env", env_spec_path)

# Look up the registered training dataset by name
diabetes_ds = ws.datasets.get("Diabetes data asset")

# Non-hyperparameter script arguments - in this case, only the training dataset,
# exposed to the script under the input name 'training_data'
fixed_arguments = ['--input-data', diabetes_ds.as_named_input('training_data')]

# Describe how the training script is executed: which script, with which fixed
# arguments, in which environment and on which compute target
script_config = ScriptRunConfig(
    source_directory=experiment_folder,
    script='diabetes_training.py',
    arguments=fixed_arguments,
    environment=hyper_env,
    compute_target=training_cluster,
)

# Candidate values for each hyperparameter; hyperdrive adds the keys as script arguments
search_space = {
    '--learning_rate': choice(0.01, 0.1, 1.0),
    '--n_estimators': choice(10, 100),
}

# Grid sampling over 3 x 2 candidate values: hyperdrive will try 6 combinations
params = GridParameterSampling(search_space)

# Configure hyperdrive settings
hyperdrive = HyperDriveConfig(
    run_config=script_config,
    hyperparameter_sampling=params,
    policy=None,  # No early stopping policy
    primary_metric_name='AUC',  # Metric logged by the training script
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,  # Find the highest AUC
    max_total_runs=6,  # Restrict the experiment to 6 iterations
    max_concurrent_runs=2,  # Run up to 2 iterations in parallel
)

# Submit the hyperdrive configuration under a named experiment
experiment = Experiment(ws, 'train-diabetes-hyperdrive')
run = experiment.submit(hyperdrive)

# Show the monitoring widget in the notebook, then wait for the run to complete
details_widget = RunDetails(run)
details_widget.show()
run.wait_for_completion()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

{'runId': 'HD_a4cf771f-1120-40f0-b16e-b2d0597c5a32',
 'target': 'compute-cluster-ds3-v2',
 'status': 'Completed',
 'startTimeUtc': '2023-02-01T09:05:20.661428Z',
 'endTimeUtc': '2023-02-01T09:24:01.017218Z',
 'services': {},
 'properties': {'primary_metric_config': '{"name":"AUC","goal":"maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '6eab1da9-639a-422d-bc51-25046eeb2a48',
  'user_agent': 'python/3.8.5 (Linux-5.15.0-1022-azure-x86_64-with-glibc2.10) msrest/0.7.1 Hyperdrive.Service/1.0.0 Hyperdrive.SDK/core.1.47.0',
  'space_size': '6',
  'score': '0.9885804604667666',
  'best_child_run_id': 'HD_a4cf771f-1120-40f0-b16e-b2d0597c5a32_3',
  'best_metric_status': 'Succeeded',
  'best_data_container_id': 'dcid.HD_a4cf771f-1120-40f0-b16e-b2d0597c5a32_3'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'configuration': None,
  'attribution': None,
  'telemetryValues': {'amlCl

You can view the experiment run status in the widget above. You can also view the main Hyperdrive experiment run and its child runs in [Azure Machine Learning studio](https://ml.azure.com).

> **Note**: If a message is displayed indicating that a non-numeric value can't be visualized, you can ignore it.

## Determine the best performing run

When all of the runs have finished, you can find the best one based on the performance metric you specified (in this case, the one with the best AUC).

In [8]:
# List every child run, ordered by the primary metric
sorted_children = run.get_children_sorted_by_primary_metric()
for child_run in sorted_children:
    print(child_run)

# Pick the best run, then collect its logged metrics and the arguments it was started with
best_run = run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
best_run_details = best_run.get_details()
script_arguments = best_run_details['runDefinition']['arguments']

# Summarise the winning run
print('Best Run Id: ', best_run.id)
print(' -AUC:', best_run_metrics['AUC'])
print(' -Accuracy:', best_run_metrics['Accuracy'])
print(' -Arguments:',script_arguments)

{'run_id': 'HD_a4cf771f-1120-40f0-b16e-b2d0597c5a32_3', 'hyperparameters': '{"--learning_rate": 0.1, "--n_estimators": 100}', 'best_primary_metric': 0.9885804604667666, 'status': 'Completed'}
{'run_id': 'HD_a4cf771f-1120-40f0-b16e-b2d0597c5a32_5', 'hyperparameters': '{"--learning_rate": 1.0, "--n_estimators": 100}', 'best_primary_metric': 0.9857244419355496, 'status': 'Completed'}
{'run_id': 'HD_a4cf771f-1120-40f0-b16e-b2d0597c5a32_4', 'hyperparameters': '{"--learning_rate": 1.0, "--n_estimators": 10}', 'best_primary_metric': 0.982908128731084, 'status': 'Completed'}
{'run_id': 'HD_a4cf771f-1120-40f0-b16e-b2d0597c5a32_1', 'hyperparameters': '{"--learning_rate": 0.01, "--n_estimators": 100}', 'best_primary_metric': 0.9559393638830617, 'status': 'Completed'}
{'run_id': 'HD_a4cf771f-1120-40f0-b16e-b2d0597c5a32_2', 'hyperparameters': '{"--learning_rate": 0.1, "--n_estimators": 10}', 'best_primary_metric': 0.9516323866285732, 'status': 'Completed'}
{'run_id': 'HD_a4cf771f-1120-40f0-b16e-b2d

Now that you've found the best run, you can register the model it trained.

In [9]:
from azureml.core import Model

# Register the model file saved by the best run, tagging it with its training
# context and recording the metrics it achieved as properties
model_tags = {'Training context':'Hyperparemeter Tuning'}
model_properties = {
    'AUC': best_run_metrics['AUC'],
    'Accuracy': best_run_metrics['Accuracy'],
}
best_run.register_model(
    model_path='outputs/diabetes_model.pkl',
    model_name='diabetes-model',
    tags=model_tags,
    properties=model_properties,
)

# Print each registered model in the workspace with its tags and properties
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name, tag in model.tags.items():
        print ('\t',tag_name, ':', tag)
    for prop_name, prop in model.properties.items():
        print ('\t',prop_name, ':', prop)
    print('\n')

diabetes-model version: 8
	 Training context : Hyperparemeter Tuning
	 AUC : 0.9885804604667666
	 Accuracy : 0.9457777777777778


diabetes-model version: 7
	 Algorithm : Decision Tree Classifier
	 Training context : Pipeline
	 AUC : 0.8832546157718848
	 Accuracy : 0.898


diabetes-model version: 6
	 Algorithm : Logistic Regression
	 Training context : Tabular diabetes data asset
	 AUC : 0.8568650620553335
	 Accuracy : 0.7893333333333333
	 Regularization Rate : 0.1


diabetes-model version: 5
	 Algorithm : Decision Tree Classifier
	 Training context : Pipeline
	 AUC : 0.88375696004516
	 Accuracy : 0.8986666666666666


diabetes-model version: 4
	 Algorithm : Logistic Regression
	 Training context : Tabular diabetes data asset
	 AUC : 0.8568650620553335
	 Accuracy : 0.7893333333333333
	 Regularization Rate : 0.1


diabetes-model version: 3
	 Training context : Tabular diabetes data asset
	 AUC : 0.8568650620553335
	 Accuracy : 0.7893333333333333
	 Regularization Rate : 0.1


diabetes-mode