# Hyperparameter Tuning using HyperDrive

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [3]:
import logging
import os

import numpy as np
import pandas as pd

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace

In [4]:
# List the files and directories in the notebook's current working directory.
os.listdir(os.curdir)


['.ipynb_aml_checkpoints',
 'conda_dependencies.yml',
 'hyperparameter_tuning (1).ipynb',
 'keras_train.py',
 'scoring.py']

# Initialize the Workspace and an Experiment


In [5]:
# Names used throughout the notebook: the experiment to log under and the folder
# that holds the training script.
experiment_name = 'keras_housing'
project_folder = '.'
os.makedirs(project_folder, exist_ok=True)

# Connect to the workspace and print its identifying details, one per line.
# NOTE(review): the subscription id ends up in the saved cell output; clear it before sharing.
ws = Workspace.from_config()
print(
    *(getattr(ws, field) for field in ('name', 'resource_group', 'location', 'subscription_id')),
    sep='\n',
)

experiment = Experiment(workspace=ws, name=experiment_name)
experiment

Performing interactive authentication. Please follow the instructions on the terminal.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code RS29HA6BS to authenticate.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.
quick-starts-ws-127505
aml-quickstarts-127505
southcentralus
ebee3a56-4c54-406a-b732-174015826780


Name,Workspace,Report Page,Docs Page
keras_housing,quick-starts-ws-127505,Link to Azure Machine Learning studio,Link to Documentation


# Create or Attach a Compute Resource

In [6]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException


amlcompute_cluster_name = "compute-gpu"

# Reuse the cluster if the workspace already has one under this name;
# otherwise provision a new one.
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
except ComputeTargetException:
    # STANDARD_NC6 is the VM size to use for GPU training.
    # vm_priority='lowpriority' may optionally be added here.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

compute_target.wait_for_completion(show_output=True)

Found existing cluster, use it.

Running


# Environment Set Up

In [7]:
from azureml.core import Environment

# Build the training environment from the conda specification file in this folder.
keras_env = Environment.from_conda_specification(name = 'keras-2.3.1', file_path = 'conda_dependencies.yml')

# Run inside Docker on a GPU base image (CUDA 10.0 / cuDNN 7, per the image tag).
keras_env.docker.enabled = True
keras_env.docker.base_image = 'mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.0-cudnn7-ubuntu18.04'

In [8]:
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
from azureml.core import ScriptRunConfig
from azureml.train.hyperdrive import choice, loguniform
from azureml.widgets import RunDetails

## Dataset

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the housing data; header=0 takes the column names from the first row of the CSV.
data = pd.read_csv("housing.csv", header=0)

def prepare_data(data):
    """One-hot encode ``ocean_proximity`` and separate features from the target.

    Args:
        data: DataFrame containing at least the ``ocean_proximity`` and
            ``median_house_value`` columns. It is not modified.

    Returns:
        Tuple ``(x, y)``: ``x`` is the feature DataFrame with ``ocean_proximity``
        replaced by ``ocp_``-prefixed indicator columns appended at the end and
        without the target column; ``y`` is the ``median_house_value`` Series.
    """
    category = "ocean_proximity"
    target = "median_house_value"

    # Indicator columns are appended after the existing columns, then the raw
    # categorical column is removed.
    indicators = pd.get_dummies(data[category], prefix='ocp')
    encoded = data.join(indicators).drop(columns=category)

    return encoded.drop(columns=target), encoded[target]

def split_scale(x, y, test_size=0.2, random_state=42):
    """Split into train/test sets, impute missing values, and min-max scale.

    Both the imputation means and the scaler are derived from the training
    split only and then applied to the test split, so no test-set statistics
    are used during preprocessing.

    Args:
        x: Feature DataFrame.
        y: Target Series aligned with ``x``.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        random_state: Passed to ``train_test_split`` as ``random_state``.

    Returns:
        Tuple ``(scaled_x_train, scaled_x_test, y_train, y_test)``; the scaled
        features are NumPy arrays and the targets are the ``.values`` of the
        split Series.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )  # returns DataFrames and Series

    # Fill gaps in BOTH splits with the training means; using the test split's
    # own means would leak test information into preprocessing.
    train_means = x_train.mean()
    x_train = x_train.fillna(train_means)
    x_test = x_test.fillna(train_means)

    # Fit the scaler on the training split only, then apply it to both splits.
    scaler_x = MinMaxScaler().fit(x_train)
    scaled_x_train = scaler_x.transform(x_train)  # numpy array
    scaled_x_test = scaler_x.transform(x_test)

    return scaled_x_train, scaled_x_test, y_train.values, y_test.values
# Encode the raw frame into features/target, then produce the scaled train/test arrays.
x,y = prepare_data(data)
x_train, x_test, y_train, y_test  = split_scale(x,y)

In [16]:
# Wrap the scaled arrays back into DataFrames labelled with the feature column names.
x_train_frame, x_test_frame = (
    pd.DataFrame(scaled, columns=x.columns) for scaled in (x_train, x_test)
)
x_train_frame.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocp_<1H OCEAN,ocp_INLAND,ocp_ISLAND,ocp_NEAR BAY,ocp_NEAR OCEAN
0,0.729084,0.017021,0.627451,0.079455,0.097145,0.06438,0.102286,0.190322,0.0,0.0,0.0,0.0,1.0
1,0.616534,0.129787,0.941176,0.085966,0.121974,0.036744,0.124157,0.228452,0.0,0.0,0.0,0.0,1.0
2,0.385458,0.224468,0.058824,0.048197,0.05121,0.025561,0.05509,0.252162,0.0,0.0,0.0,0.0,1.0
3,0.721116,0.014894,0.686275,0.03609,0.056797,0.039659,0.058214,0.099488,0.0,0.0,0.0,0.0,1.0
4,0.453187,0.45,0.823529,0.060532,0.066729,0.024412,0.062325,0.210638,0.0,1.0,0.0,0.0,0.0


In [17]:
# Append the target so each frame carries features and label together.
x_train_frame["median_house_value"] = y_train
x_test_frame["median_house_value"] = y_test

# A bare expression in the middle of a cell is evaluated and discarded, so print
# the shapes explicitly to make them appear in the output.
print(x_train_frame.shape, x_test_frame.shape)
x_train_frame.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocp_<1H OCEAN,ocp_INLAND,ocp_ISLAND,ocp_NEAR BAY,ocp_NEAR OCEAN,median_house_value
0,0.729084,0.017021,0.627451,0.079455,0.097145,0.06438,0.102286,0.190322,0.0,0.0,0.0,0.0,1.0,103000.0
1,0.616534,0.129787,0.941176,0.085966,0.121974,0.036744,0.124157,0.228452,0.0,0.0,0.0,0.0,1.0,382100.0
2,0.385458,0.224468,0.058824,0.048197,0.05121,0.025561,0.05509,0.252162,0.0,0.0,0.0,0.0,1.0,172600.0
3,0.721116,0.014894,0.686275,0.03609,0.056797,0.039659,0.058214,0.099488,0.0,0.0,0.0,0.0,1.0,93400.0
4,0.453187,0.45,0.823529,0.060532,0.066729,0.024412,0.062325,0.210638,0.0,1.0,0.0,0.0,0.0,96500.0


In [18]:
# Persist the train and test frames as pickle files in the working directory.
x_train_frame.to_pickle('train.pkl')
x_test_frame.to_pickle('test.pkl')

In [19]:
# Read the training pickle back and preview it as a sanity check.
# Only unpickle files you produced yourself; unpickling untrusted data can execute code.
train=pd.read_pickle("train.pkl")
train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocp_<1H OCEAN,ocp_INLAND,ocp_ISLAND,ocp_NEAR BAY,ocp_NEAR OCEAN,median_house_value
0,0.729084,0.017021,0.627451,0.079455,0.097145,0.06438,0.102286,0.190322,0.0,0.0,0.0,0.0,1.0,103000.0
1,0.616534,0.129787,0.941176,0.085966,0.121974,0.036744,0.124157,0.228452,0.0,0.0,0.0,0.0,1.0,382100.0
2,0.385458,0.224468,0.058824,0.048197,0.05121,0.025561,0.05509,0.252162,0.0,0.0,0.0,0.0,1.0,172600.0
3,0.721116,0.014894,0.686275,0.03609,0.056797,0.039659,0.058214,0.099488,0.0,0.0,0.0,0.0,1.0,93400.0
4,0.453187,0.45,0.823529,0.060532,0.066729,0.024412,0.062325,0.210638,0.0,1.0,0.0,0.0,0.0,96500.0


In [20]:
# Upload the training pickle to the workspace's default datastore.
datastore = ws.get_default_datastore()
# overwrite=True so that re-running the notebook replaces a previously uploaded
# copy instead of leaving stale data in the datastore (the default is False).
datastore.upload_files(['train.pkl'], overwrite=True)

Uploading an estimated of 1 files
Uploading train.pkl
Uploaded train.pkl, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_workspaceblobstore

In [21]:
# Upload the test pickle; overwrite=True replaces any copy left by an earlier run
# (the default, overwrite=False, would keep the stale file).
datastore.upload_files(['test.pkl'], overwrite=True)

Uploading an estimated of 1 files
Uploading test.pkl
Uploaded test.pkl, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_workspaceblobstore

In [22]:
# Re-list the working directory to check which files are present locally.
os.listdir(os.curdir)

['.ipynb_aml_checkpoints',
 'conda_dependencies.yml',
 'housing.csv',
 'hyperparameter_tuning (1).ipynb',
 'keras_train.py',
 'scoring.py',
 'test.pkl',
 'train.pkl']

## Hyperdrive Configuration

TODO: Explain the model you are using and the reasons for choosing the different hyperparameters, termination policy, and config settings.

In [23]:
# Early termination: Bandit policy with a slack factor of 0.1, an evaluation
# interval of 2 and no evaluation delay. (An early termination policy is not
# required when using Bayesian sampling.)
policy = BanditPolicy(
    slack_factor=0.1,
    slack_amount=None,
    evaluation_interval=2,
    delay_evaluation=0,
)

# Random sampling over discrete choices. The keys look like command-line options,
# presumably the ones keras_train.py parses; verify against the script.
ps = RandomParameterSampling({
    '--batch-size': choice(25, 50, 100),
    '--number-epochs': choice(5, 10, 15),
    '--first-layer-neurons': choice(10, 50, 200, 300, 500),
    '--second-layer-neurons': choice(10, 50, 200, 500),
})

# Run configuration: which script to execute, where, and in which environment.
src = ScriptRunConfig(
    source_directory=project_folder,
    script='keras_train.py',
    environment=keras_env,
    compute_target=compute_target,
)

# HyperDrive settings: minimise the 'MAE' metric over at most 8 runs in total,
# with up to 4 running concurrently.
hyperdrive_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='MAE',
    primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
    max_total_runs=8,
    max_concurrent_runs=4,
)

In [24]:
# Submit the HyperDrive configuration to the experiment, then show the RunDetails
# widget for the resulting run.
hyperdrive_run = experiment.submit(hyperdrive_config, show_output=True)
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [25]:
# Block until the HyperDrive run finishes; show_output=True prints its progress and summary.
hyperdrive_run.wait_for_completion(show_output=True)

RunId: HD_49c0cea0-acdf-4a12-8b7c-c495033cff38
Web View: https://ml.azure.com/experiments/keras_housing/runs/HD_49c0cea0-acdf-4a12-8b7c-c495033cff38?wsid=/subscriptions/ebee3a56-4c54-406a-b732-174015826780/resourcegroups/aml-quickstarts-127505/workspaces/quick-starts-ws-127505

Execution Summary
RunId: HD_49c0cea0-acdf-4a12-8b7c-c495033cff38
Web View: https://ml.azure.com/experiments/keras_housing/runs/HD_49c0cea0-acdf-4a12-8b7c-c495033cff38?wsid=/subscriptions/ebee3a56-4c54-406a-b732-174015826780/resourcegroups/aml-quickstarts-127505/workspaces/quick-starts-ws-127505

{
  "error": {
    "code": "UserError",
    "severity": null,
    "message": "User errors were found in at least one of the child runs.",
    "messageFormat": null,
    "messageParameters": {},
    "referenceCode": null,
    "detailsUri": null,
    "target": null,
    "details": [],
    "innerError": null,
    "debugInfo": null
  },
  "correlation": null,
  "environment": null,
  "location": null,
  "time": "0001-01-01T0

{'runId': 'HD_49c0cea0-acdf-4a12-8b7c-c495033cff38',
 'target': 'compute-gpu',
 'status': 'Completed',
 'startTimeUtc': '2020-11-23T08:44:47.301535Z',
 'endTimeUtc': '2020-11-23T08:49:10.504375Z',
 'error': {'error': {'code': 'UserError',
   'message': 'User errors were found in at least one of the child runs.',
   'messageParameters': {},
   'details': []},
  'time': '0001-01-01T00:00:00.000Z'},
   'message': '{\n  "error": {\n    "code": "UserError",\n    "severity": null,\n    "message": "User errors were found in at least one of the child runs.",\n    "messageFormat": null,\n    "messageParameters": {},\n    "referenceCode": null,\n    "detailsUri": null,\n    "target": null,\n    "details": [],\n    "innerError": null,\n    "debugInfo": null\n  },\n  "correlation": null,\n  "environment": null,\n  "location": null,\n  "time": "0001-01-01T00:00:00+00:00",\n  "componentName": null\n}'}],
 'properties': {'primary_metric_config': '{"name": "MAE", "goal": "minimize"}',
  'resume_from':

## Best Model

TODO: In the cell below, get the best model from the hyperdrive experiments and display all the properties of the model.

In [26]:
# Select the child run that is best on the primary metric and fetch the metrics it logged.
best_run=hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()

In [28]:
# Display the metrics dictionary of the best run.
best_run_metrics

{'Batch Size': 50,
 'Epochs': 15,
 'Loss': 5399614188.155039,
 'MAE': 53041.2109375}

In [29]:
# Report the id and MAE of the best run (label typo "Bets" corrected to "Best").
print('Best Run ID', best_run.id)
print('\n MAE', best_run_metrics['MAE'])

Bets Run ID HD_49c0cea0-acdf-4a12-8b7c-c495033cff38_6

 MAE 53041.2109375


In [33]:
# Display the best run object.
best_run

Experiment,Id,Type,Status,Details Page,Docs Page
keras_housing,HD_49c0cea0-acdf-4a12-8b7c-c495033cff38_6,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [30]:
# List the working directory again after the HyperDrive run.
os.listdir(os.curdir)

['.ipynb_aml_checkpoints',
 'conda_dependencies.yml',
 'housing.csv',
 'hyperparameter_tuning (1).ipynb',
 'keras_train.py',
 'scoring.py',
 'test.pkl',
 'train.pkl']

In [44]:
# Register the best run's model files, taken from ./outputs/model in that run.
# NOTE(review): the previous comment here named keras_housing_train.py and
# ./outputs_keras/model/, but this folder holds keras_train.py and the path
# registered below is ./outputs/model; confirm against the training script.
model = best_run.register_model(
    model_path='./outputs/model',
    model_name='keras-housing-model',
)

In [46]:
# Show the name of the registered model.
model.name

'keras-housing-model'