# Hyperparameter Tuning using HyperDrive

In [2]:
import os
import sys
import json
import azureml
import logging
import requests
import pandas as pd
import numpy as np
from io import BytesIO
from sklearn.externals import joblib

from azureml.core.workspace import Workspace
from azureml.core.experiment import Experiment
from azureml.core import ScriptRunConfig
from azureml.core import Environment

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.estimator import Estimator

from azureml.core.dataset import Dataset
from azureml.widgets import RunDetails
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.sampling import BayesianParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, quniform, choice

from azureml.core.runconfig import RunConfiguration
from azureml.core.runconfig import EnvironmentDefinition
from azureml.core.runconfig import CondaDependencies

from azureml.core.model import Model

from azureml.core.webservice import AciWebservice
from azureml.core.model import Model, InferenceConfig

from azureml.train.automl import constants

from train import *

import warnings
warnings.filterwarnings("ignore")

# Check system and core SDK version number
print("System version: {}".format(sys.version))
print("SDK version:", azureml.core.VERSION)

System version: 3.6.13 |Anaconda, Inc.| (default, Feb 23 2021, 12:58:59) 
[GCC Clang 10.0.0 ]
SDK version: 1.23.0


In [3]:
ws = Workspace.from_config()
experiment_name = 'online_news_hyperdrive'
experiment=Experiment(ws, experiment_name)
experiment

Name,Workspace,Report Page,Docs Page
online_news_hyperdrive,quick-starts-ws-141910,Link to Azure Machine Learning studio,Link to Documentation


In [4]:
dic_data = {'Workspace name': ws.name,
            'Azure region': ws.location,
            'Subscription id': ws.subscription_id,
            'Resource group': ws.resource_group,
            'Experiment Name': experiment.name}

df_data = pd.DataFrame.from_dict(data = dic_data, orient='index')

df_data.rename(columns={0:''}, inplace = True)
df_data

Unnamed: 0,Unnamed: 1
Workspace name,quick-starts-ws-141910
Azure region,southcentralus
Subscription id,976ee174-3882-4721-b90a-b5fef6b72f24
Resource group,aml-quickstarts-141910
Experiment Name,online_news_hyperdrive


## Create or Attach an AmlCompute cluster

In [5]:
# Define CPU cluster name
compute_target_name = "cpu-cluster"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=compute_target_name)
    print("Found existing cpu-cluster. Use it.")
except ComputeTargetException:
    # Specify the configuration for the new cluster
    compute_config = AmlCompute.provisioning_configuration(vm_size="STANDARD_DS12_V2",
                                                           min_nodes=1, 
                                                           max_nodes=4) 
    # Create the cluster with the specified name and configuration
    compute_target = ComputeTarget.create(ws, compute_target_name, compute_config)

compute_target.wait_for_completion(show_output=True)

# For a more detailed view of current AmlCompute status, use get_status()
print(compute_target.get_status().serialize())

Found existing cpu-cluster. Use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 1, 'targetNodeCount': 1, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 1, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-04-04T00:55:49.998000+00:00', 'errors': None, 'creationTime': '2021-04-04T00:47:24.890236+00:00', 'modifiedTime': '2021-04-04T00:47:56.167795+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 1, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_DS12_V2'}


## Dataset

The dataset used in this project is a dataset made available on UCI Machine Learning Repository called [Online News Popularity Data Set](https://archive.ics.uci.edu/ml/datasets/Online+News+Popularity#).

The dataset summarizes heterogeneous set of features about the articles published by Mashable between 2013 and 2015.

- Number of Instances: 39797
- Number of Attributes: 61 
    - 58 predictive attributes 
    - 2 non-predictive (`url` and `timedelta`) 
    - 1 target column
    
We will also apply the `Boruta` step for feature selection before exporting the data to our defined Datastore

In [6]:
DATA_LOC = "https://raw.githubusercontent.com/franckess/AzureML_Capstone/main/data/OnlineNewsPopularity.csv"
BORUTA_LOC = "https://github.com/franckess/AzureML_Capstone/releases/download/1.0/boruta_model.pkl"

# Loading data
df = pd.read_csv(DATA_LOC)

# Removing space character in the feature names
df.columns=df.columns.str.replace(' ','')

# Drop URL column
df = df.drop(['url'], axis=1)

# Perform Data pre-processing
df = corr_drop_cols(df)
df = create_label(df)
df = scaling_num(df)
df = feature_selection(df, BORUTA_LOC)
    
# Split train data into train & test
X_train, X_test, y_train, y_test = split_train_test(df)

m, k = X_train.shape
print("{} x {} table of data:".format(m, k))
X_train.info()

31757 x 36 table of data:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 31757 entries, 7887 to 39643
Data columns (total 36 columns):
n_tokens_content                 31757 non-null float64
n_unique_tokens                  31757 non-null float64
num_hrefs                        31757 non-null float64
num_self_hrefs                   31757 non-null float64
num_imgs                         31757 non-null float64
average_token_length             31757 non-null float64
num_keywords                     31757 non-null float64
data_channel_is_entertainment    31757 non-null float64
data_channel_is_bus              31757 non-null float64
data_channel_is_socmed           31757 non-null float64
data_channel_is_tech             31757 non-null float64
data_channel_is_world            31757 non-null float64
kw_min_min                       31757 non-null float64
kw_max_min                       31757 non-null float64
kw_min_max                       31757 non-null float64
kw_avg_max            

## Upload data to Azure Datatore

In [7]:
datastore = ws.get_default_datastore()
datastore.upload_files(files = ['./data/OnlineNewsPopularity.csv'], target_path = 'dataset/', show_progress = True)

Uploading an estimated of 1 files
Target already exists. Skipping upload for dataset/OnlineNewsPopularity.csv
Uploaded 0 files


$AZUREML_DATAREFERENCE_962a4a5a2de34844935295abab24a083

In [8]:
print(
    "Datastore type: " + datastore.datastore_type,
    "Account name: " + datastore.account_name,
    "Container name: " + datastore.container_name,
    sep="\n",
)

Datastore type: AzureBlob
Account name: mlstrg141910
Container name: azureml-blobstore-a8749507-2159-41e6-b4ad-125957a5d2db


In [9]:
# Get data reference object for the data path
ds_data = datastore.path('dataset/')
print(ds_data)

$AZUREML_DATAREFERENCE_65fcfcc61bf844c3b3ab2dfacbf372e1


In [11]:
from azureml.core.dataset import Dataset

df_temp = Dataset.Tabular.from_delimited_files(path=datastore.path('dataset/OnlineNewsPopularity.csv'))
df_temp = df_temp.to_pandas_dataframe()
df_temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39644 entries, 0 to 39643
Data columns (total 61 columns):
url                               39644 non-null object
 timedelta                        39644 non-null object
 n_tokens_title                   39644 non-null object
 n_tokens_content                 39644 non-null object
 n_unique_tokens                  39644 non-null object
 n_non_stop_words                 39644 non-null object
 n_non_stop_unique_tokens         39644 non-null object
 num_hrefs                        39644 non-null object
 num_self_hrefs                   39644 non-null object
 num_imgs                         39644 non-null object
 num_videos                       39644 non-null object
 average_token_length             39644 non-null object
 num_keywords                     39644 non-null object
 data_channel_is_lifestyle        39644 non-null object
 data_channel_is_entertainment    39644 non-null object
 data_channel_is_bus              39644 non-null ob

In [12]:
del df_temp

## HyperDrive Configuration

### Tune hyperparameters using `HyperDrive`

In Tthe following blcok, I tune my set of hyperparameters for the `LightGBM` model. The ranges of parameters for the `LightGBM` used were chosen considering the parameters tuning guides for different scenarios provided here.

The code below does a parallel search of the hyperparameter space using a Bayesian sampling method which does not support termination policy. Therefore, `'policy = None'`.

For Bayesian Sampling we recommend using a maximum number of runs (at least 20 times the number of hyperparameters being tuned). We set the maximum number of child runs of HyperDrive max_total_runs to 20 to reduce the running time.

In order to compare the performance of HyperDrive with the one of AutoML we chose as objective metric of `LightGBM` __accuracy score__.

In [13]:
udacityEnv = Environment.from_pip_requirements(name = 'udacity-env', file_path = './udacity_env.yml')
udacityEnv

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/intelmpi2018.3-ubuntu16.04:20210129.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": "2g"
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "udacity-env",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
                "conda-forge"


In [14]:
if "training" not in os.listdir():
    os.mkdir("./training")

est = Estimator( 
    source_directory='./',
    compute_target=compute_target,
    entry_script='train.py',
    script_params={"--data-folder": ds_data.as_mount()},
    environment_definition=udacityEnv
)

# Specify hyperparameter space
param_sampling = BayesianParameterSampling(
    {
        "--num-leaves": quniform(8, 128, 1),
        "--min-data-in-leaf": quniform(20, 500, 10),
        "--learning-rate": choice(
            1e-4, 1e-3, 5e-3, 1e-2, 1.5e-2, 2e-2, 3e-2, 5e-2, 1e-1
        ),
        "--feature-fraction": uniform(0.2, 1),
        "--bagging-fraction": uniform(0.1, 1),
        "--bagging-freq": quniform(1, 20, 1),
        "--max-rounds": quniform(50, 2000, 10)
    }
)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=param_sampling,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=50, 
    max_concurrent_runs=4,
    policy=None
)

'Estimator' is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or an Azure ML curated environment.


In [15]:
# Submit hyperdrive run to the experiment 
hyperdrive_run = experiment.submit(config = hyperdrive_config, show_output=True)



## Run Details

In [16]:
# Show run details with the Jupyter widget
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)
hyperdrive_run.get_metrics()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_487dc78d-4039-4679-92a5-8bd8fc21c2bd
Web View: https://ml.azure.com/experiments/online_news_hyperdrive/runs/HD_487dc78d-4039-4679-92a5-8bd8fc21c2bd?wsid=/subscriptions/976ee174-3882-4721-b90a-b5fef6b72f24/resourcegroups/aml-quickstarts-141910/workspaces/quick-starts-ws-141910

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-04-04T02:50:34.229329][API][INFO]Experiment created<END>\n""<START>[2021-04-04T02:50:34.780660][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-04-04T02:50:34.941795][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2021-04-04T02:50:35.4633364Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>
{
  "azureml-logs/20_image_build_log.txt": "https://mlstrg141910.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_487dc78d-4039-4679-92a5-8bd8fc21c2bd_preparation/azureml-logs/20_image_

ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "Environment preparation run marked as Failed by execution service due to UserError.",
        "messageParameters": {},
        "details": []
    },
    "time": "0001-01-01T00:00:00.000Z"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"Environment preparation run marked as Failed by execution service due to UserError.\",\n        \"messageParameters\": {},\n        \"details\": []\n    },\n    \"time\": \"0001-01-01T00:00:00.000Z\"\n}"
    }
}