# Hyperparameter Tuning using HyperDrive

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [1]:
import logging
import os
import csv
import time
import shutil
import joblib
import requests

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

from azureml.core.workspace import Workspace
from azureml.core.dataset import Dataset

from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException


from azureml.core.experiment import Experiment
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling, BayesianParameterSampling
from azureml.core import ScriptRunConfig 
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
from azureml.widgets import RunDetails


from azureml.core.environment import Environment
from azureml.core.model import InferenceConfig, Model
from azureml.core.webservice import AciWebservice, Webservice
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.model import Model, InferenceConfig

from azureml.train.sklearn import SKLearn

#This is used to import kaggle data
!pip install opendatasets
import opendatasets as od



SDK version: 1.34.0
Collecting opendatasets
  Downloading opendatasets-0.1.20-py3-none-any.whl (14 kB)
Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[K     |████████████████████████████████| 58 kB 2.2 MB/s eta 0:00:011
Collecting python-slugify
  Downloading python_slugify-5.0.2-py2.py3-none-any.whl (6.7 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 6.8 MB/s  eta 0:00:01
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l- \ done
[?25h  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73053 sha256=160f08f4c10ae11d2b1ebd9aab018863db28efef70813d60ca80a0afdad539fa
  Stored in directory: /home/azureuser/.cache/pip/wheels/77/47/e4/44a4ba1b7dfd53faaa35f59f1175e123b213ff401a8a56876b
Successfully built kaggle
Installing collected packages: text-unidecode, python-slugify, kaggle, opendatasets
Successfully inst

## Dataset

TODO: Get data. In the cell below, write code to access the data you will be using in this project. Remember that the dataset needs to be external.

In [3]:


def download_data(url, dsname, downloadpath, file):
    if not os.path.exists(downloadpath):
        os.makedirs(downloadpath)

    od.download(url)
    data_downloadpath = os.path.join(dsname, file)
    data_finalpath = os.path.join(downloadpath, file)
    
    os.rename(data_downloadpath, data_finalpath)
    while not os.path.exists(data_finalpath):
        time.sleep(1)
    shutil.rmtree(f"./{dsname}")
    
    return data_finalpath


ws = Workspace.from_config()
dataurl = 'https://www.kaggle.com/rupeshraundal/marketcheck-automotive-data-us-canada/download'
kaggledsname = 'marketcheck-automotive-data-us-canada'
filenm = 'ca-dealers-used.csv'
downloadpath = 'Data'
dskey = 'UsedCarPrices'
dsdesc = 'Used Car Price Training Data'
found = False

dataset = None
if dskey in ws.datasets.keys(): 
    print(f"Retrieving registered dataset with key {dskey}")
    found = True
    dataset = ws.datasets[dskey] 

if not found:
    print("Downloading Canadian used car prices dataset from Kaggle")
    filepath = downloadpath

    
    if not os.path.exists(os.path.join(downloadpath, filenm)):
        datapath = download_data(dataurl, kaggledsname, downloadpath, filenm)
    
    
    # get the datastore to upload prepared data
    datastore = ws.get_default_datastore()
    
    
    print("Uploading to azure storage")
    # upload the local file from src_dir to the target_path in datastore
    datastore.upload(src_dir=downloadpath, target_path=downloadpath)
    print("Converting to azure dataset")
    dataset = Dataset.Tabular.from_delimited_files(path=[(datastore, (os.path.join(filepath, filenm)))], separator=',', header=True, encoding='utf8')
    
    print("Registering dataset")
    dataset = dataset.register(workspace=ws,
                                 name=dskey,
                                 description=dsdesc)
    # dataset = Dataset.Tabular.from_delimited_files(os.join(datapath, filenm), separator=',', header=True, encoding='utf8')
    # https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset?view=azure-ml-py
    # https://docs.microsoft.com/en-us/azure/machine-learning/how-to-create-register-datasets#explore-data
    # https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-with-datasets


print(f"Dataset ID: {dataset.id}")
print(f"Dataset name: {dataset.name}")
print(f"Dataset description: {dataset.description}")


Downloading Canadian used car prices dataset from Kaggle
Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:Your Kaggle Key:Downloading marketcheck-automotive-data-us-canada.zip to ./marketcheck-automotive-data-us-canada

Uploading to azure storage
Uploading an estimated of 1 files
Uploading Data/ca-dealers-used.csv
Uploaded Data/ca-dealers-used.csv, 1 files out of an estimated total of 1
Uploaded 1 files
Converting to azure dataset
Registering dataset
Dataset ID: 738d46fe-734a-40b3-8630-b917b098f945
Dataset name: UsedCarPrices
Dataset description: Used Car Price Training Data


100%|██████████| 420M/420M [00:09<00:00, 45.5MB/s] 


In [4]:
ws = Workspace.from_config()
experiment_name = 'PCAP_Exp_CarPrices_HDrive'
exp=Experiment(ws, experiment_name)

print("Configuring compute cluster")
amlcompute_cluster_name = "PCAP-Clus-DS12v2"
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS12_V2',# for GPU, use "STANDARD_NC6"
                                                           #vm_priority = 'lowpriority', # optional
                                                           min_nodes = 3, idle_seconds_before_scaledown = 600,
                                                           max_nodes=6)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True)




Configuring compute cluster
InProgress....
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded......................
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Hyperdrive Configuration

TODO: Explain the model you are using and the reason for chosing the different hyperparameters, termination policy and config settings.

In [5]:
# TODO: Create an early termination policy. This is not required if you are using Bayesian sampling.
# This will not be used since bayesianparametersampling will be used
# early_termination_policy = BanditPolicy(slack_factor = 0.1, evaluation_interval=1, delay_evaluation=5)

#TODO: Create the different params that you will be using during training
# Criterion are specific to 0.24 sklearn
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor
param_sampling = BayesianParameterSampling({
        "n_estimators": choice(10, 100),
        'max_Depth': choice(5, 10, 30),
        'criterion': choice("mse","mae"), 
        'max_Features': choice('sqrt','log2','auto') 
        }
)


myenv = Environment.get(workspace=ws, name="AzureML-sklearn-0.24-ubuntu18.04-py37-cpu")

dir_output = './outputs'
src = ScriptRunConfig(source_directory='.',
                      script='train.py',
                      arguments=['--output_dir', dir_output, '--input_data', dskey],#dataset.as_named_input('UsedAutoPrices')],
                      compute_target=compute_target,
                      environment=myenv)

hyperdrive_run_config = HyperDriveConfig(run_config=src,
                         hyperparameter_sampling=param_sampling,
                         # policy=bandpolicy,
                         primary_metric_name="r2_score",
                         primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                         max_total_runs=60, 
                         max_concurrent_runs=5)

For best results with Bayesian Sampling we recommend using a maximum number of runs greater than or equal to 20 times the number of hyperparameters being tuned. Recommendend value:80.


In [6]:
#TODO: Submit your experiment

hdrive_run = exp.submit(hyperdrive_run_config, show_output = False)


## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [7]:
RunDetails(hdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

## Best Model

TODO: In the cell below, get the best model from the hyperdrive experiments and display all the properties of the model.

In [8]:

# Get your best run and save the model from that run.

### YOUR CODE HERE ###
hdrive_run.wait_for_completion(show_output=True)
best_run = hdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()['runDefinition']['arguments']

print('Best Run Id: ', best_run.id)
print('\n R2 Score:', best_run_metrics['r2_score'])
print('\n Number of estimators:',parameter_values[11])
print('Max Depth:',parameter_values[7])
print('Criterion:',parameter_values[5])
print('Max Features:',parameter_values[9])
print('\n FileNames:', best_run.get_file_names())

foldername="BestHdriveModel"
#joblib.dump(value=best_run, filename=filename)


RunId: HD_ea515564-49b9-4050-87cf-5c83cf088ab4
Web View: https://ml.azure.com/runs/HD_ea515564-49b9-4050-87cf-5c83cf088ab4?wsid=/subscriptions/5a4ab2ba-6c51-4805-8155-58759ad589d8/resourcegroups/aml-quickstarts-160752/workspaces/quick-starts-ws-160752&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

Execution Summary
RunId: HD_ea515564-49b9-4050-87cf-5c83cf088ab4
Web View: https://ml.azure.com/runs/HD_ea515564-49b9-4050-87cf-5c83cf088ab4?wsid=/subscriptions/5a4ab2ba-6c51-4805-8155-58759ad589d8/resourcegroups/aml-quickstarts-160752/workspaces/quick-starts-ws-160752&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

Best Run Id:  HD_ea515564-49b9-4050-87cf-5c83cf088ab4_2

 R2 Score: 0.9690785896792928

 Number of estimators: sqrt
Max Depth: 30
Criterion: 100
Max Features: mse

 FileNames: ['azureml-logs/55_azureml-execution-tvmps_44fd9cdc2ac6c2ee7da7bb9c62ffe624d415cdf0c465c8a3c9d0dc59948fc32b_d.txt', 'azureml-logs/65_job_prep-tvmps_44fd9cdc2ac6c2ee7da7bb9c62ffe624d415cdf0c465c8a3c9d0dc59948fc32b_d.txt'

In [9]:
#TODO: Save the best model
for f in best_run.get_file_names():
    if f.startswith(f"{dir_output.replace('./','')}/model"):
        output_file_path = os.path.join(f'./model/{foldername}', f.split('/')[-1])
        print('Downloading from {} to {} ...'.format(f, output_file_path))
        best_run.download_file(name=f, output_file_path=output_file_path)
        print('Path: ' + output_file_path)
        modeldetails = joblib.load(output_file_path)
        print('Model details:')
        print(modeldetails)


Downloading from outputs/model/RF-AutoPrices_hdrive_model.joblib to ./model/BestHdriveModel/RF-AutoPrices_hdrive_model.joblib ...
Path: ./model/BestHdriveModel/RF-AutoPrices_hdrive_model.joblib
Model details:
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=30, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)


Trying to unpickle estimator DecisionTreeRegressor from version 0.24.1 when using version 0.22.2.post1. This might lead to breaking code or invalid results. Use at your own risk.
Trying to unpickle estimator RandomForestRegressor from version 0.24.1 when using version 0.22.2.post1. This might lead to breaking code or invalid results. Use at your own risk.


## Model Deployment

Remember you have to deploy only one of the two models you trained.. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

In [10]:
model = best_run.register_model(model_name='RFHD-AutoPrices', model_path=f'{dir_output}/model/RF-AutoPrices_hdrive_model.joblib')

In [11]:
# model = Model.get_model_path('RFHD-AutoPrices')
# ws = Workspace.from_config()
# model = Model(ws, name="RFHD-AutoPrices", version=1)

model

Model(workspace=Workspace.create(name='quick-starts-ws-160752', subscription_id='5a4ab2ba-6c51-4805-8155-58759ad589d8', resource_group='aml-quickstarts-160752'), name=RFHD-AutoPrices, id=RFHD-AutoPrices:1, version=1, tags={}, properties={})

In [12]:
# https://stackoverflow.com/questions/61803031/azure-ml-include-additional-files-during-model-deployment
# Disabled since the rubric implies a specification 'file' is required, suggesting a custom yaml file
# myenv_inference = Environment(workspace=ws, name="AzureML-sklearn-0.24.1-ubuntu18.04-py37-cpu-inference") 
mysklearn_env = Environment.from_conda_specification(name='mysklearn-env', file_path='myenv_dependencies_hdrive.yml')
# from azureml.core import Image
# build = mysklearn_env.build(workspace=ws)
# build.wait_for_completion(show_output=True)

# Combine scoring script & environment in Inference configuration
dummy_inference_config = InferenceConfig(entry_script="score_hdrive.py", environment=mysklearn_env, source_directory='./')

# Set deployment configuration
deployment_config = AciWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 3, auth_enabled=True)

ws_name = "svc-autoprices-hdrive"
service = Model.deploy(
    ws,
    ws_name,
    [model],
    inference_config = dummy_inference_config,
    deployment_config = deployment_config,
    overwrite=True
)
#service.wait_for_deployment(show_output=True)

No Python version provided, defaulting to "3.6.2"


In [13]:
# service.wait_for_deployment(show_output=True)
service.wait_for_deployment(show_output=True)
print(f"Service Health: {service.state}")
print(f"Scoring URI: {service.scoring_uri}")
primary, secondary = service.get_keys()
print(f"Key: {primary}")

#Enable app insights
print("Enabling App Insights")
service.update(enable_app_insights=True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-10-11 20:08:49+00:00 Creating Container Registry if not exists..
2021-10-11 20:18:49+00:00 Registering the environment.
2021-10-11 20:18:51+00:00 Building image..
2021-10-11 20:24:35+00:00 Generating deployment configuration.
2021-10-11 20:24:36+00:00 Submitting deployment to compute..
2021-10-11 20:24:40+00:00 Checking the status of deployment svc-autoprices-hdrive..
2021-10-11 20:29:54+00:00 Checking the status of inference endpoint svc-autoprices-hdrive.
Succeeded
ACI service creation operation finished, operation "Succeeded"
Service Health: Healthy
Scoring URI: http://05182ee3-06e7-4a09-8a08-6648125f0ce7.southcentralus.azurecontainer.io/score
Key: l0uztjcPw6mqvyndZXdcLFFtI195DybV
Enabling App Insights


TODO: In the cell below, send a request to the web service you deployed to test it.

In [32]:
import json
service = Webservice(workspace=ws, name=ws_name)



# URL for the web service
scoring_uri = service.scoring_uri
# If the service is authenticated, set the key or token
key = primary

# Two sets of data to score, so we get two results back

#Generally, I figure we would send in all the columns from the original dataset and 
#then run it through the clean function to whittle it down, but that's not necessary for this exercise.
data = {
    "data":
    [
        {
            'miles': "51000",
            'year': "2018",
            'make': "Ford",
            'model': "F-150",
            'engine_size': "5",
            'body_type': "Pickup",
            'vehicle_type': "Truck",
            'drivetrain': "4WD",
            'transmission': "Automatic",
            'fuel_type': "Gas",
            'state': "ON"
        }
    ,
        {
            'miles': "58900",
            'year': "2018",
            'make': "Toyota",
            'model': "Tundra",
            'engine_size': "2",
            'body_type': "Sedan",
            'vehicle_type': "Car",
            'drivetrain': "FWD",
            'transmission': "Automatic",
            'fuel_type': "Gas",
            'state': "AB"
        }
    ]
}
# Convert to JSON string
input_data = json.dumps(data)

# Set the content type
headers = {'Content-Type': 'application/json'}
# If authentication is enabled, set the authorization header
headers['Authorization'] = f'Bearer {key}'

# Make the request and display the response
resp = requests.post(scoring_uri, input_data, headers=headers)
print(resp.text)

[107761.09, 116513.98]


TODO: In the cell below, print the logs of the web service and delete the service

In [33]:
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-log-view-metrics

print(service.get_logs())



2021-10-11T20:39:15,035422300+00:00 - iot-server/run 
2021-10-11T20:39:15,053836300+00:00 - gunicorn/run 
Dynamic Python package installation is disabled.
Starting HTTP server
2021-10-11T20:39:15,057760800+00:00 - rsyslog/run 
2021-10-11T20:39:15,090211400+00:00 - nginx/run 
EdgeHubConnectionString and IOTEDGE_IOTHUBHOSTNAME are not set. Exiting...
2021-10-11T20:39:15,645963500+00:00 - iot-server/finish 1 0
2021-10-11T20:39:15,648260000+00:00 - Exit code 1 is normal. Not restarting iot-server.
Starting gunicorn 20.1.0
Listening at: http://127.0.0.1:31311 (80)
Using worker: sync
worker timeout is set to 300
Booting worker with pid: 106
SPARK_HOME not set. Skipping PySpark Initialization.
Initializing logger
2021-10-11 20:39:19,305 | root | INFO | Starting up app insights client
logging socket was found. logging is available.
logging socket was found. logging is available.
2021-10-11 20:39:19,308 | root | INFO | Starting up request id generator
2021-10-11 20:39:19,308 | root | INFO | Sta

In [34]:
service.delete()