# Hyperparameter Tuning using HyperDrive

In [8]:
import joblib
import numpy as np

from azureml.core import Dataset, Workspace, Experiment, Model
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.model import InferenceConfig
from azureml.widgets import RunDetails
from azureml.core.webservice import AciWebservice, Webservice
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, randint
from azureml.core import Environment, ScriptRunConfig
import os

In [2]:
# Connect to the Azure ML workspace using the stored workspace configuration.
ws = Workspace.from_config()

# Every HyperDrive run submitted from this notebook is grouped under this experiment.
experiment_name = 'heart-failure-project-hyperdrive'
exp = Experiment(workspace=ws, name=experiment_name)

In [4]:
cluster_name = "heart-failure-compute"

# Reuse the compute cluster when it already exists in the workspace;
# otherwise provision a new one.
try:
    cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # The lookup failed, so the cluster has to be created first.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2",
        max_nodes=4,
    )
    cluster = ComputeTarget.create(ws, cluster_name, compute_config)
else:
    print("Found existing cluster, use it.")

# Block until the cluster is ready, streaming the provisioning status.
cluster.wait_for_completion(show_output=True)

InProgress.
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Dataset

The Heart Failure dataset from Kaggle (registered in the workspace as `Heart-Failure-Dataset`) is used to perform a binary classification task. The data covers clinical variables such as age, anaemia, diabetes, ejection fraction, serum creatinine, and serum sodium. In the classification task, the model should determine whether a patient died or not, as recorded in the `DEATH_EVENT` column.

In [5]:
# Look up the registered dataset by name and load it into a pandas DataFrame.
dataset = Dataset.get_by_name(workspace=ws, name='Heart-Failure-Dataset')
df = dataset.to_pandas_dataframe()

# Preview the first rows as a sanity check.
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


## Hyperdrive Configuration

We use scikit-learn's `GradientBoostingClassifier` to build and fit the model. We define a parameter sampler to tune two of its hyperparameters: the learning rate and the number of estimators. Random parameter sampling was chosen because it is an efficient, exploratory, and parallelizable method for hyperparameter tuning, which can potentially find better hyperparameters than other search methods. It is also robust to noise and other sources of variability in the training process. In addition, a bandit policy was applied: it is a popular early termination policy for hyperparameter tuning that saves computational resources by terminating poorly performing runs early.

With the script run configuration, the parameter sampler, and the early termination policy, we create a `HyperDriveConfig`, which is then submitted to the experiment.

In [11]:
# Search space: each HyperDrive trial draws one value per hyperparameter
# at random from the discrete choices below.
ps = RandomParameterSampling(
    parameter_space={
        "learning_rate": choice(0.01, 0.05, 0.1, 0.3, 1),
        "n_estimators": choice(1, 5, 10, 25, 55, 99),
    }
)

# Early termination: the bandit policy is applied every 3 evaluation intervals
# with a slack factor of 0.2 relative to the best-performing run.
policy = BanditPolicy(slack_factor=0.2, evaluation_interval=3)

# Ensure the ./training directory exists. makedirs(exist_ok=True) is safe to
# re-run and avoids the check-then-create gap of listing the directory first;
# it also fails loudly if a non-directory entry named "training" is in the way.
os.makedirs("./training", exist_ok=True)

# Build the training environment from the conda specification file.
sklearn_env = Environment.from_conda_specification(
    name='sklearn-env',
    file_path='conda_dependencies.yml',
)

# Describe a single training job: which script to run, from which directory,
# on which compute target, and in which environment.
src = ScriptRunConfig(
    source_directory="./",
    script="train.py",
    environment=sklearn_env,
    compute_target=cluster,
)

# Combine the job description with the sampler and the early termination policy.
# The sweep maximizes the "Accuracy" metric, runs at most 20 trials in total,
# and keeps at most 3 of them running at the same time.
hyperdrive_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name="Accuracy",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_concurrent_runs=3,
    max_total_runs=20,
)

In [12]:
# Launch the hyperparameter sweep; the returned run object tracks all child runs.
hyperdrive_run = exp.submit(config=hyperdrive_config)

## Run Details

In [13]:
# Show the interactive monitoring widget for the HyperDrive run.
run_details_widget = RunDetails(hyperdrive_run)
run_details_widget.show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

## Best Model

In [14]:
# The monitoring widget does not block, so under "Restart & Run All" this cell
# could execute while the sweep is still in progress. Wait for the sweep to
# finish first; this returns immediately if it has already completed.
hyperdrive_run.wait_for_completion()

# Get the best run of the sweep according to the primary metric.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
if best_run is None:
    # No child run logged the primary metric; fail with a clear message
    # instead of an AttributeError on the next line.
    raise RuntimeError(
        "HyperDrive run finished without any child run reporting the primary metric."
    )

# Get the best run's metrics
best_run_metrics = best_run.get_metrics()

print(best_run.id)
print(best_run_metrics)

HD_9f32cec3-94a2-42ae-9e41-71a18f17e2bf_2
{'Learning Rate:': 0.3, 'Number Estimators:': 25, 'Accuracy': 0.8}


In [19]:
# Register the model file produced by the best run in the workspace and attach
# the run's metrics as properties. The metric keys "Number Estimators:" and
# "Learning Rate:" include a trailing colon, as returned by get_metrics().
model = best_run.register_model(
    model_name="best_model_gradient_boosting",
    model_path="trained_model.pkl",
    properties={
        "Accuracy": best_run_metrics["Accuracy"],
        "Number Estimators": best_run_metrics["Number Estimators:"],
        "Learning Rate": best_run_metrics["Learning Rate:"],
    },
)