# Hyperparameter Tuning using HyperDrive

In [1]:
!pip install --upgrade scikit-learn

Requirement already up-to-date: scikit-learn in /anaconda/envs/azureml_py36/lib/python3.6/site-packages (0.24.2)


In [2]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset
from azureml.data.datapath import DataPath

from azureml.pipeline.steps import AutoMLStep

# Print the version of the installed azureml-core SDK so the notebook output
# records which SDK release produced the results below.
print("SDK version:", azureml.core.VERSION)

SDK version: 1.33.0


## Dataset

In [3]:
# Load the workspace from its configuration.
ws = Workspace.from_config()

# A dedicated experiment name keeps these HyperDrive runs separate from the
# AutoML experiment.
experiment_name = 'hyperdrive_h2o_potability'
project_folder = "hyperdrive_h2o_potability"

experiment = Experiment(workspace=ws, name=experiment_name)

In [4]:
# Retrieve the dataset from the AzureML Workspace, or upload and register it
# first when it is not registered yet.
# Prerequisite for the registration path: water_potability.csv (downloaded from
# Kaggle) must be in a "data" folder next to this notebook.
key = "water_potability"
description_text = "Water Potability Dataset from Kaggle"

# True when a dataset with this name is already registered in the workspace.
found = key in ws.datasets

if found:
    print("Dataset already registered!")
    dataset = ws.datasets[key]
else:
    print("Creating and registering dataset in Workspace")
    # Upload the local "data" folder to the workspace's default datastore.
    default_ds = ws.get_default_datastore()
    default_ds.upload(src_dir="data", target_path="data")
    # Build a tabular dataset from the uploaded CSV ...
    dataset = Dataset.Tabular.from_delimited_files(
        default_ds.path("data/water_potability.csv")
    )
    # ... and register it under `key` so later runs can look it up by name.
    dataset = dataset.register(
        workspace=ws,
        name=key,
        description=description_text,
    )

# Materialise the dataset as a pandas DataFrame and show summary statistics.
df = dataset.to_pandas_dataframe()
df.describe()

Dataset already registered!


Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,2785.0,3276.0,3276.0,3276.0,2495.0,3276.0,3276.0,3114.0,3276.0,3276.0
mean,7.080795,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786,0.39011
std,1.59432,32.879761,8768.570828,1.583085,41.41684,80.824064,3.308162,16.175008,0.780382,0.487849
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45,0.0
25%,6.093092,176.850538,15666.690297,6.127421,307.699498,365.734414,12.065801,55.844536,3.439711,0.0
50%,7.036752,196.967627,20927.833607,7.130299,333.073546,421.884968,14.218338,66.622485,3.955028,0.0
75%,8.062066,216.667456,27332.762127,8.114887,359.95017,481.792304,16.557652,77.337473,4.50032,1.0
max,14.0,323.124,61227.196008,13.127,481.030642,753.34262,28.3,124.0,6.739,1.0


## Compute Configuration

In [5]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# Name of the CPU cluster.
# NOTE: update it to match an existing cluster in your workspace.
cluster_name = "h2o-pred-cluster"

try:
    # Reuse the cluster when one with this name already exists in the workspace.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # No cluster with this name: provision a new one.
    # NOTE(review): min_nodes equals max_nodes, so the cluster presumably never
    # scales below 10 nodes (and keeps incurring cost while idle) - confirm
    # this is intended.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',  # for GPU, use "STANDARD_NC6"
        # vm_priority='lowpriority',  # optional
        min_nodes=10,
        max_nodes=10,
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Wait (up to 10 minutes) until at least one node is available.
compute_target.wait_for_completion(
    show_output=True,
    min_node_count=1,
    timeout_in_minutes=10,
)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Environment Configuration

In [6]:
%%writefile conda_dependencies.yml

# Conda specification used to build the environment in which the training
# script runs on the compute cluster.
dependencies:
- python=3.6.9
# NOTE(review): scikit-learn, pandas and numpy are unpinned. The trained model
# is later loaded with joblib inside this notebook, which presumably needs a
# compatible scikit-learn version - consider pinning these packages.
- scikit-learn
- pandas
- numpy
- pip:
  # Pip-installed Azure ML package for the training environment.
  - azureml-defaults

Writing conda_dependencies.yml


In [7]:
from azureml.core import Environment

# Build the training environment from the conda specification file
# (conda_dependencies.yml) in the notebook's working directory.
sklearn_env = Environment.from_conda_specification(
    name='sklearn-env-proj',
    file_path='./conda_dependencies.yml',
)

## Hyperdrive Configuration

The following cell will specify the hyperdrive configuration that will be used to find the best hyperparameters for the model. I will be using a Random Forest classifier to predict the water potability metric of the dataset.


The hyperparameter tuning considers two of the most important hyperparameters for Random Forests: `n_estimators` (the number of trees) and `max_depth` (the maximum number of levels in each tree). I also intended to add `criterion` to test whether this parameter makes a significant difference; note that it is not part of the sampling space defined in the cell below.

In [8]:
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
from azureml.train.hyperdrive import choice, uniform
from azureml.core import ScriptRunConfig

# Bandit early-termination policy: slack factor of 0.1, applied at every
# second evaluation interval.
early_termination_policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

# Random sampling over two discrete hyperparameters:
#   n_estimators - integers 50..299
#   max_depth    - integers 20..49
param_sampling = RandomParameterSampling({
    'n_estimators': choice(range(50, 300)),
    'max_depth': choice(range(20, 50)),
})

# Configuration of a single training run. No data arguments are passed:
# instead of mounting files, the training script accesses the workspace and
# the dataset directly.
run_config = ScriptRunConfig(
    source_directory=".",
    script="./train_randomforest.py",
    compute_target=compute_target,
    environment=sklearn_env,
)

# HyperDrive sweep: maximise AUC_weighted within at most 40 runs / 60 minutes,
# running up to 9 child runs concurrently.
hyperdrive_run_config = HyperDriveConfig(
    run_config=run_config,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name='AUC_weighted',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=40,
    max_concurrent_runs=9,
    max_duration_minutes=60,
)

In [9]:
# Submit the HyperDrive configuration to the experiment; the returned run
# handle is used by the following cells to monitor and query the sweep.
experiment_run = experiment.submit(config=hyperdrive_run_config)

## Run Details

In [10]:
from azureml.widgets import RunDetails

# Show the interactive widget that tracks the HyperDrive run.
RunDetails(experiment_run).show()

# Block until the HyperDrive run has finished. Without this, executing the
# notebook with "Run All" would query the best run, its metrics and its output
# files while the sweep is still in progress. Progress is already visible in
# the widget above, so log streaming is disabled; the trailing semicolon
# suppresses the returned status object in the cell output.
experiment_run.wait_for_completion(show_output=False);

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

## Best Model

The following cells retrieve the best-performing model by first identifying the run with the best primary metric and then downloading the files associated with that run. The training script persisted the model in the `outputs` directory, which is automatically uploaded to the run in the workspace.

In [11]:
# Retrieve the child run that achieved the best value of the primary metric
# and display it as the cell output.
best_run = experiment_run.get_best_run_by_primary_metric()
best_run

Experiment,Id,Type,Status,Details Page,Docs Page
hyperdrive_h2o_potability,HD_1f774934-e00e-433c-91db-31cfd92c894a_30,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [12]:
# Show the command-line arguments recorded in the best run's run definition.
best_run.get_details()['runDefinition']['arguments']

['--max_depth', '21', '--n_estimators', '226']

In [13]:
# Show the metrics recorded for the best run.
best_run.get_metrics()

{'Number of estimators:': 226,
 'Max depth:': 21,
 'AUC_weighted': 0.6326742575780889}

In [14]:
# Download the best run's files whose path starts with "outputs".
best_run.download_files(prefix="outputs")

In [15]:
import joblib
import glob

# Locate the persisted model among the downloaded run outputs.
model_paths = glob.glob("outputs/model*.joblib")
if not model_paths:
    # Fail with an actionable message instead of a bare IndexError when the
    # download produced no model file.
    raise FileNotFoundError(
        "No file matching 'outputs/model*.joblib' was found; "
        "make sure the best run's outputs were downloaded first."
    )

# Load the first match.
model = joblib.load(model_paths[0])

In [16]:
# Check which estimator class the loaded object is.
type(model)

sklearn.ensemble._forest.RandomForestClassifier

## Model Deployment

Because the model trained with HyperDrive performed worse than the model obtained with AutoML, I will not deploy the HyperDrive model; I will only register it in the workspace.

In [17]:
from azureml.core.model import Model

# Register the downloaded model file in the workspace.
# NOTE: this rebinds `model` from the loaded scikit-learn estimator to the
# object returned by Model.register; re-run the loading cell if the estimator
# is needed again afterwards.
model = Model.register(model_path=glob.glob("outputs/model*.joblib")[0],
                       model_name="water_potability_hyperdrive",
                       # Fixed the misspelled tag value ("potatibility").
                       tags={"data": "water potability kaggle", "model": "classification"},
                       description="Random Forest Classifier that predicts water potability. Hyperparameters tuned with HyperDrive",
                       workspace=ws)

Registering model water_potability_hyperdrive
