# Hyperparameter Tuning using HyperDrive

### 1. Import Dependencies:

In [29]:
import csv
import logging
import os

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import pkg_resources
from sklearn import datasets

import azureml.core
from azureml.core.workspace import Workspace
from azureml.core.experiment import Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn

from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive import TruncationSelectionPolicy
from azureml.train.hyperdrive import BayesianParameterSampling

from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice

# Report the installed Azure ML core SDK version so the run environment is recorded
sdk_version = azureml.core.VERSION
print("SDK version:", sdk_version)

SDK version: 1.20.0


### 2. Initialize Workspace

In [2]:
# Connect to the workspace via Workspace.from_config() and echo its identifying
# fields, one per line.
ws = Workspace.from_config()
for workspace_field in (ws.name, ws.resource_group, ws.location, ws.subscription_id):
    print(workspace_field)

quick-starts-ws-137441
aml-quickstarts-137441
southcentralus
9b72f9e6-56c5-4c16-991b-19c652994860


### 3. Initialize Experiment

In [3]:
# Reuse the workspace handle `ws` created in the "Initialize Workspace" cell
# instead of loading the configuration a second time.
experiment_name = 'hyperdrive-experiment'
experiment = Experiment(workspace=ws, name=experiment_name)
experiment

Name,Workspace,Report Page,Docs Page
hyperdrive-experiment,quick-starts-ws-137441,Link to Azure Machine Learning studio,Link to Documentation


### 4. Create Compute Cluster

In [4]:
cpu_cluster_name = "cpu-cluster"
vm_size='STANDARD_D2_V2'

# Reuse the cluster when it already exists; otherwise provision a new one.
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # Only a failed cluster lookup should trigger provisioning. Any other error
    # (authentication, networking, a typo in the code) must surface instead of
    # being silently turned into a "create cluster" request.
    compute_config = AmlCompute.provisioning_configuration(vm_size=vm_size, max_nodes=4)
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

# Can poll for a minimum number of nodes and for a specific timeout.
# If no min node count is provided it uses the scale settings for the cluster.
compute_target.wait_for_completion(show_output=True)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


### 5. Dataset



In [5]:
# Load the breast cancer dataset bundled with scikit-learn and print, in order,
# the feature matrix shape, the feature names and the dataset description.
data = datasets.load_breast_cancer()
for dataset_summary in (data.data.shape, data.feature_names, data.DESCR):
    print(dataset_summary)

(569, 30)
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - peri

In [6]:
# Relative frequency of each target class, to check how balanced the labels are
target_labels = pd.Series(data.target)
target_labels.value_counts(normalize=True)

1    0.627417
0    0.372583
dtype: float64

In [30]:
# Tabular view of the dataset: one column per feature, with the label appended
# as a final 'target' column.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)
df.head(5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


## Hyperdrive Configuration

### Why Bayesian Sampling: 

Bayesian sampling is based on the Bayesian optimization algorithm. It picks samples based on how previous samples performed, so that new samples improve the primary metric.

Bayesian sampling is recommended if you have enough budget to explore the hyperparameter space. For best results, we recommend a maximum number of runs greater than or equal to 20 times the number of hyperparameters being tuned.

The number of concurrent runs has an impact on the effectiveness of the tuning process. A smaller number of concurrent runs may lead to better sampling convergence, since the smaller degree of parallelism increases the number of runs that benefit from previously completed runs.

### Early Stopping: 
Bayesian sampling does not support early termination policies in HyperDrive, so no early stopping policy is configured for this run.

In [20]:
# Specify parameter sampler: Bayesian sampling over a discrete set of values
# for each script argument passed to train.py.
param_sampling = BayesianParameterSampling(
    parameter_space ={
        '--n_estimators' : choice(1,10,20,50,100,200,500),
        '--max_depth': choice(1, 5, 10, 20, 30, 50, 100),
        '--learning_rate': choice(1, 0.1, 0.01, 0.001)
        }
)

# Ensure the local "training" folder exists. exist_ok=True makes this a no-op
# when the folder is already there and avoids the check-then-create pattern.
os.makedirs("./training", exist_ok=True)

# Create a SKLearn estimator for use with train.py
estimator = SKLearn(source_directory = "./",
            compute_target=compute_target,
            entry_script="train.py")

# Create a HyperDriveConfig using the estimator and the hyperparameter sampler.
# Bayesian sampling does not support early termination, so the policy is left
# explicitly unset.
hyperdrive_run_config = HyperDriveConfig(hyperparameter_sampling=param_sampling,
                                     policy=None,
                                     primary_metric_name='Accuracy',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     estimator=estimator,
                                     max_total_runs=80,
                                     max_concurrent_runs=4)



## Run Details

* We trained a `GradientBoostingClassifier` model from sklearn with the different parameter values listed in the code above. Gradient boosting classifiers have shown strong results on many classification problems and are considered powerful classification algorithms. They are built on top of the decision tree algorithm.

* The model is expected to give different results for each combination of parameters. We will select the best-performing model.

In [21]:
# Launch the HyperDrive sweep under the experiment; the returned run handle is
# used by the following cells to monitor and query the sweep.
hyperdrive_run = experiment.submit(config=hyperdrive_run_config)

# Render the Jupyter widget that tracks the progress of the runs.
run_details_widget = RunDetails(hyperdrive_run)
run_details_widget.show()



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [22]:
# Block until the HyperDrive run finishes, streaming its log while waiting.
# The call's return value is displayed as the cell output.
# NOTE(review): the displayed details include signed log-file URLs; clear this
# output before sharing the notebook.
hyperdrive_run.wait_for_completion(show_output=True)

RunId: HD_0012b2d3-6a85-4b13-b71b-51329253e8b6
Web View: https://ml.azure.com/experiments/hyperdrive-experiment/runs/HD_0012b2d3-6a85-4b13-b71b-51329253e8b6?wsid=/subscriptions/9b72f9e6-56c5-4c16-991b-19c652994860/resourcegroups/aml-quickstarts-137441/workspaces/quick-starts-ws-137441

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-02-06T11:15:14.000735][API][INFO]Experiment created<END>\n""<START>[2021-02-06T11:15:14.506990][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-02-06T11:15:14.794366][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2021-02-06T11:15:15.1344551Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>

Execution Summary
RunId: HD_0012b2d3-6a85-4b13-b71b-51329253e8b6
Web View: https://ml.azure.com/experiments/hyperdrive-experiment/runs/HD_0012b2d3-6a85-4b13-b71b-51329253e8b6?wsid=/subscri

{'runId': 'HD_0012b2d3-6a85-4b13-b71b-51329253e8b6',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-02-06T11:15:13.873747Z',
 'endTimeUtc': '2021-02-06T11:50:07.024471Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'ecd88f96-9e30-4de6-864d-7cf857ca4da9',
  'score': '0.9790209790209791',
  'best_child_run_id': 'HD_0012b2d3-6a85-4b13-b71b-51329253e8b6_56',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg137441.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_0012b2d3-6a85-4b13-b71b-51329253e8b6/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=4SxyBrRT5RK0YR%2Ftm2mbLqq1bzE4MrHgQNuauryA4U4%3D&st=2021-02-06T11%3A40%3A14Z&se=2021-02-06T19%3A50%3A14Z&sp=r'},
 'submittedBy': 'ODL_User 137441'}

## Best Model

Get the best model from the hyperdrive experiments and display all the properties of the model.

In [28]:
from azureml.core.model import Model

# Pick the child run that achieved the best value of the primary metric.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
if best_run is None:
    # Without this guard the next line fails with an opaque AttributeError.
    raise RuntimeError("No child run reported the primary metric 'Accuracy'.")

best_run_metrics = best_run.get_metrics()['Accuracy']

# Script arguments arrive as a flat list: ['--flag', 'value', '--flag', 'value', ...]
parameter_values = best_run.get_details()['runDefinition']['arguments']

# Pair each flag with its value so that lookups are by name and do not depend
# on the order in which the arguments happen to be emitted.
best_hyperparameters = dict(zip(parameter_values[::2], parameter_values[1::2]))

print('Best Run Id: ', best_run.id)
print('Accuracy:', best_run_metrics)
for flag in ('--learning_rate', '--max_depth', '--n_estimators'):
    print(flag.lstrip('-') + ':', best_hyperparameters[flag])

Best Run Id:  HD_0012b2d3-6a85-4b13-b71b-51329253e8b6_56
Accuracy: 0.9790209790209791
learning_rate: 0.1
max_depth: 1
n_estimators: 500


In [27]:
# Show the best run's raw script arguments: a flat list alternating between
# flag names and their string values.
parameter_values

['--learning_rate', '0.1', '--max_depth', '1', '--n_estimators', '500']

In [26]:
# Save the best model: copy the serialized model from the best run's outputs
# to a local file.
best_run.download_file(
    name="/outputs/model.joblib",
    output_file_path="Hyperdrive.joblib",
)

## Model Deployment

Remember you have to deploy only one of the two models you trained. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

TODO: In the cell below, send a request to the web service you deployed to test it.

TODO: In the cell below, print the logs of the web service and delete the service