# Hyperparameter Tuning using HyperDrive

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [6]:
### REVIEWED

import pandas as pd
import numpy as np
import os
import sys
import json
import azureml
import requests 

from azureml.core.workspace import Workspace
from azureml.core.experiment import Experiment
from azureml.core import ScriptRunConfig

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.estimator import Estimator

from azureml.core.dataset import Dataset
from azureml.widgets import RunDetails
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.sampling import BayesianParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, quniform, choice

from azureml.core.runconfig import RunConfiguration
from azureml.core.runconfig import EnvironmentDefinition
from azureml.core.runconfig import CondaDependencies

from azureml.core.model import Model

from azureml.core.webservice import AciWebservice
from azureml.core.model import Model, InferenceConfig


# onnx

from azureml.automl.runtime.onnx_convert import OnnxConverter
from azureml.automl.core.onnx_convert import OnnxConvertConstants
from azureml.train.automl import constants
import onnxruntime
from azureml.automl.runtime.onnx_convert import OnnxInferenceHelper

import warnings
warnings.filterwarnings("ignore")

from train_v050221 import *

# Check system and core SDK version number
print("System version: {}".format(sys.version))
print("Azure ML SDK version:", azureml.core.VERSION)

System version: 3.6.9 |Anaconda, Inc.| (default, Jul 30 2019, 19:07:31) 
[GCC 7.3.0]
Azure ML SDK version: 1.20.0


# Initialize workspace and create an Azure ML experiment

To start we need to initialize our workspace and create a Azule ML experiment. It is also to remember that accessing the Azure ML workspace requires authentication with Azure.

Make sure the config file is present at `.\config.json`. This file can be downloaded from home of Azure Machine Learning Studio.

In [7]:
#Define the workspace
ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')

quick-starts-ws-137316
aml-quickstarts-137316
southcentralus
2c48c51c-bd47-40d4-abbe-fb8eabd19c8c


In [8]:
#Create an experiment
experiment_name = 'hyper-lgbm-walmart-forecasting'
experiment = Experiment(ws, experiment_name)
experiment

Name,Workspace,Report Page,Docs Page
hyper-lgbm-walmart-forecasting,quick-starts-ws-137316,Link to Azure Machine Learning studio,Link to Documentation


In [9]:
dic_data = {'Workspace name': ws.name,
            'Azure region': ws.location,
            'Subscription id': ws.subscription_id,
            'Resource group': ws.resource_group,
            'Experiment Name': experiment.name}

df_data = pd.DataFrame.from_dict(data = dic_data, orient='index')

df_data.rename(columns={0:''}, inplace = True)
df_data

Unnamed: 0,Unnamed: 1
Workspace name,quick-starts-ws-137316
Azure region,southcentralus
Subscription id,2c48c51c-bd47-40d4-abbe-fb8eabd19c8c
Resource group,aml-quickstarts-137316
Experiment Name,hyper-lgbm-walmart-forecasting


# Create or Attach an AmlCompute cluster

In [10]:
# Define CPU cluster name
compute_target_name = "cpu-cluster"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=compute_target_name)
    print("Found existing cpu-cluster. Use it.")
except ComputeTargetException:
    # Specify the configuration for the new cluster
    compute_config = AmlCompute.provisioning_configuration(vm_size="STANDARD_DS12_V2",
                                                           min_nodes=1, # when innactive
                                                           max_nodes=4) # when busy
    # Create the cluster with the specified name and configuration
    compute_target = ComputeTarget.create(ws, compute_target_name, compute_config)

compute_target.wait_for_completion(show_output=True)

# For a more detailed view of current AmlCompute status, use get_status()
print(compute_target.get_status().serialize())

Found existing cpu-cluster. Use it.

Running
{'errors': [], 'creationTime': '2021-02-05T13:42:35.322335+00:00', 'createdBy': {'userObjectId': '6dc4b7b2-c1fe-4224-9968-0de504d06907', 'userTenantId': '660b3398-b80e-49d2-bc5b-ac1dc93b5254', 'userName': None}, 'modifiedTime': '2021-02-05T13:45:08.140318+00:00', 'state': 'Running', 'vmSize': 'STANDARD_DS12_V2'}


# Configure Docker environment

The remote compute will need to create a [Docker image](https://docs.docker.com/get-started/) for running the script. The Docker image is an encapsulated environment with necessary dependencies installed. In the following cell, we specify the conda packages and Python version that are needed for running the script.

In [11]:
env = EnvironmentDefinition()
env.python.user_managed_dependencies = False
env.python.conda_dependencies = CondaDependencies.create(
    conda_packages=["pandas", "numpy", "scipy", "scikit-learn", "lightgbm", "joblib"],
    python_version="3.6.2",
)
env.python.conda_dependencies.add_channel("conda-forge")
env.docker.enabled = True

# Dataset

TODO: Get data. In the cell below, write code to access the data you will be using in this project. Remember that the dataset needs to be external.

## Overview

The dataset used in this project is a small subset of a much bigger dataset made available at Kaggle's competition [M5 Forecasting - Accuracy Estimate the unit sales of Walmart retail goods](https://www.kaggle.com/c/m5-forecasting-accuracy/overview/description).

The complete dataset covers stores in three US States (California, Texas, and Wisconsin) and includes item level, department, product categories, and store details. In addition, it has explanatory variables such as price, promotions, day of the week, and special events. **The task is to forecast daily sales for the next 28 days.**

In order to demonstrate the use of Azure ML in forecasting we used the available data consisting of the following files and create a reduced dataset with **10 products of the 3 Texas stores of Walmart**. 

* **calendar.csv** - Contains information about the dates on which the products are sold.
* **sell_prices.csv** - Contains information about the price of the products sold per store and date.
* **sales_train_evaluation.csv** - Includes sales [d_1 - d_1941] (labels used for the Public leaderboard)

Details on how the new dataset was created can be seen in notebook [01-walmart_data_preparation](http://localhost:8888/notebooks/Capstone%20Project/notebooks/01-walmart_data_preparation.ipynb).


In [12]:
time_column_name = 'date'
data = pd.read_csv("./data/walmart_tx_stores_10_items_with_day.csv",parse_dates=[time_column_name])
# data = pd.read_csv("https://raw.githubusercontent.com/dpbac/Forecasting-Walmart-sales-with-Azure/master/data/walmart_tx_stores_10_items_with_day.csv?token=AEBB67N7Y3QIIH36FY5PBEDADK6WQ", parse_dates=[time_column_name])
data.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,demand,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_TX,sell_price
0,HOBBIES_2_001_TX_1_evaluation,HOBBIES_2_001,HOBBIES_2,HOBBIES,TX_1,TX,d_1,0,2011-01-29,11101,,,,,0,
1,HOBBIES_2_002_TX_1_evaluation,HOBBIES_2_002,HOBBIES_2,HOBBIES,TX_1,TX,d_1,0,2011-01-29,11101,,,,,0,1.97
2,HOBBIES_2_003_TX_1_evaluation,HOBBIES_2_003,HOBBIES_2,HOBBIES,TX_1,TX,d_1,0,2011-01-29,11101,,,,,0,
3,HOBBIES_2_004_TX_1_evaluation,HOBBIES_2_004,HOBBIES_2,HOBBIES,TX_1,TX,d_1,0,2011-01-29,11101,,,,,0,
4,HOBBIES_2_005_TX_1_evaluation,HOBBIES_2_005,HOBBIES_2,HOBBIES,TX_1,TX,d_1,0,2011-01-29,11101,,,,,0,


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58230 entries, 0 to 58229
Data columns (total 16 columns):
id              58230 non-null object
item_id         58230 non-null object
dept_id         58230 non-null object
cat_id          58230 non-null object
store_id        58230 non-null object
state_id        58230 non-null object
day             58230 non-null object
demand          58230 non-null int64
date            58230 non-null datetime64[ns]
wm_yr_wk        58230 non-null int64
event_name_1    4740 non-null object
event_type_1    4740 non-null object
event_name_2    120 non-null object
event_type_2    120 non-null object
snap_TX         58230 non-null int64
sell_price      52938 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(3), object(11)
memory usage: 7.1+ MB


## Prepare Data

In [14]:
forecast_horizon = 28
gap = 0

data = create_features(data,forecast_horizon)

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41988 entries, 10951 to 58229
Data columns (total 42 columns):
id                          41988 non-null category
item_id                     41988 non-null category
dept_id                     41988 non-null category
cat_id                      41988 non-null category
store_id                    41988 non-null category
state_id                    41988 non-null category
day                         41988 non-null category
demand                      41988 non-null int64
date                        41988 non-null datetime64[ns]
wm_yr_wk                    41988 non-null int64
event_name_1                41988 non-null category
event_type_1                41988 non-null category
event_name_2                41988 non-null category
event_type_2                41988 non-null category
snap_TX                     41988 non-null int64
sell_price                  41988 non-null float64
lag_t28                     41988 non-null float64
lag_t29 

In [16]:
# Create a training/testing split

df_train, df_test = split_train_test(data,forecast_horizon, gap)

# Separate features and labels
    
X_train=df_train.drop(['demand'],axis=1)
y_train=df_train['demand']
X_test=df_test.drop(['demand'],axis=1)
y_test=df_test['demand']
    
X_train.drop(columns='date',inplace=True)
X_test.drop(columns='date',inplace=True)

First day training dataset:2012-01-29 00:00:00
Last day training dataset:2016-04-24 00:00:00
First day test dataset:2016-04-25 00:00:00
Last day test dataset:2016-05-22 00:00:00


## Upload Data to Datastore

In [17]:
# save data locally
    
path_data = './data_walmart_tx.csv'
path_train = './train.csv'
path_test = './test.csv'

data.to_csv(path_data, index = None, header=True)
df_train.to_csv(path_train, index = None, header=True)
df_test.to_csv(path_test, index = None, header=True)

datastore = ws.get_default_datastore()
datastore.upload_files(files = ['./data_walmart_tx.csv','./train.csv', './test.csv'], 
                       target_path = 'dataset/', 
                       overwrite = True,
                       show_progress = True)

Uploading an estimated of 3 files
Uploading ./test.csv
Uploaded ./test.csv, 1 files out of an estimated total of 3
Uploading ./train.csv
Uploaded ./train.csv, 2 files out of an estimated total of 3
Uploading ./data_walmart_tx.csv
Uploaded ./data_walmart_tx.csv, 3 files out of an estimated total of 3
Uploaded 3 files


$AZUREML_DATAREFERENCE_d53c6dcf0ff747de84866d578042173a

In [18]:
# # save data locally

# path_data = './data_walmart_tx.csv'
# path_train = './train.csv'
# path_test = './test.csv'

# data.to_csv(path_data, index = None, header=True)
# df_train.to_csv(path_train, index = None, header=True)
# df_test.to_csv(path_test, index = None, header=True)

# datastore = ws.get_default_datastore()

# path_on_datastore = "dataset/"
# datastore.upload_files(files = ['./walmart_data_tx.csv','./train.csv', './test.csv'], 
#                        target_path = 'dataset/', 
#                        overwrite = True,
#                        show_progress = True)

In [19]:
print(
    "Datastore type: " + datastore.datastore_type,
    "Account name: " + datastore.account_name,
    "Container name: " + datastore.container_name,
    sep="\n",
)

Datastore type: AzureBlob
Account name: mlstrg137316
Container name: azureml-blobstore-7ac6038e-4c9d-4861-bb6c-c42d8ceb1c3c


In [20]:
# Get data reference object for the data path
ds_data = datastore.path('dataset/')
print(ds_data)

$AZUREML_DATAREFERENCE_f2df6451435947c2a8e535e81a9ffc33


In [21]:
type(ds_data.as_mount())

azureml.data.data_reference.DataReference

In [22]:
from azureml.core.dataset import Dataset

df_temp = Dataset.Tabular.from_delimited_files(path=datastore.path('dataset/train.csv'))
df_temp = df_temp.to_pandas_dataframe()

In [23]:
type(datastore.path('dataset/train.csv'))

azureml.data.data_reference.DataReference

In [24]:
df_temp.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,demand,date,wm_yr_wk,...,day_of_week,week,month,year,is_month_start,is_month_end,is_weekend,lag_revenue_t1,rolling_revenue_std_t28,rolling_revenue_mean_t28
0,HOBBIES_2_002_TX_1_evaluation,HOBBIES_2_002,HOBBIES_2,HOBBIES,TX_1,TX,d_366,2,2012-01-29,11201,...,6,4,1,2012,0,0,1,0.0,1.86,0.63
1,HOBBIES_2_007_TX_1_evaluation,HOBBIES_2_007,HOBBIES_2,HOBBIES,TX_1,TX,d_366,0,2012-01-29,11201,...,6,4,1,2012,0,0,1,0.0,0.31,0.1
2,HOBBIES_2_009_TX_1_evaluation,HOBBIES_2_009,HOBBIES_2,HOBBIES,TX_1,TX,d_366,0,2012-01-29,11201,...,6,4,1,2012,0,0,1,0.0,7.39,3.39
3,HOBBIES_2_001_TX_2_evaluation,HOBBIES_2_001,HOBBIES_2,HOBBIES,TX_2,TX,d_366,0,2012-01-29,11201,...,6,4,1,2012,0,0,1,0.0,1.72,0.59
4,HOBBIES_2_002_TX_2_evaluation,HOBBIES_2_002,HOBBIES_2,HOBBIES,TX_2,TX,d_366,0,2012-01-29,11201,...,6,4,1,2012,0,0,1,0.0,2.76,2.04


In [25]:
df_temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41148 entries, 0 to 41147
Data columns (total 42 columns):
id                          41148 non-null object
item_id                     41148 non-null object
dept_id                     41148 non-null object
cat_id                      41148 non-null object
store_id                    41148 non-null object
state_id                    41148 non-null object
day                         41148 non-null object
demand                      41148 non-null int64
date                        41148 non-null datetime64[ns]
wm_yr_wk                    41148 non-null int64
event_name_1                41148 non-null object
event_type_1                41148 non-null object
event_name_2                41148 non-null object
event_type_2                41148 non-null object
snap_TX                     41148 non-null int64
sell_price                  41148 non-null float64
lag_t28                     41148 non-null float64
lag_t29                     41148 

In [26]:
del df_temp

# Hyperdrive Configuration

Before configuring HyperDrive, we will check if the remote compute target is successfully created by submitting a job to the target. This compute target will be used by HyperDrive for hyperparameter tuning later.

In [27]:
# ds_data.as_mount()

In [28]:
# env

In [29]:
# # script_params = {"--data-folder": ds_data.as_mount(), "--bagging-fraction": 0.8}

# # create an estimator to specify details of the job

# est = Estimator( 
#     source_directory='./', # directory containing experiment configuration files (train.py)
#     compute_target=compute_target, # compute target where training will happen
#     entry_script='train_v050221.py',
#     use_docker=True,
#     script_params={"--data-folder": ds_data.as_mount(), "--bagging-fraction": 0.8},
#     environment_definition=env, #remove if there is an error
# )

# # Submit job to remote compute
# run_remote = experiment.submit(config=est)

In [30]:
# check the status of the job.
# RunDetails(run_remote).show()

In [31]:
# # Get validation metric value after the job finishes
# run_remote.wait_for_completion()
# run_remote.get_metrics()

TODO: Explain the model you are using and the reason for chosing the different hyperparameters, termination policy and config settings.

**REVIEW AND EDIT**

Now we are ready to tune hyperparameters of the LightGBM forecast model by launching multiple runs on the cluster. In the following cell, we define the configuration of a HyperDrive job that does a parallel search of the hyperparameter space using a Bayesian sampling method. HyperDrive also supports random sampling of the parameter space.

It is recommended that the maximum number of runs should be greater than or equal to 20 times the number of hyperparameters being tuned, for best results with Bayesian sampling. Specifically, it should be no less than 180 in the following case as we have 9 hyperparameters to tune. Nevertheless, we find that even with a very small amount of runs Bayesian search can achieve decent performance. Thus, the maximum number of child runs of HyperDrive `max_total_runs` is set as `20` to reduce the running time.

## Tune Hyperparameters using HyperDrive

The following code tune hyperparameters for the LightGBM forecast model.

The code does a parallel search of the hyperparameter space using a `Bayesian sampling method` which does not support `termination policy`. Therefore, `policy=None`.

For Bayesian Sampling we recommend using a `maximum number of runs` greater than or equal to 20 times the number of hyperparameters being tuned. The recommendend value is 140. We set the maximum number of child runs of HyperDrive `max_total_runs` to `20` to reduce the running time. 





**If I have the time:**
Therefore, we set the maximum number of child runs of HyperDrive `max_total_runs` to `140`.



In [32]:
# Increase this value if you want to achieve better performance
max_total_runs = 20


est = Estimator( 
    source_directory='./', # directory containing experiment configuration files (train.py)
    compute_target=compute_target, # compute target where training will happen
    entry_script='train_v050221.py',
    use_docker=True,
    script_params={"--data-folder": ds_data.as_mount()},
    environment_definition=env, #remove if there is an error
)



# Specify hyperparameter space
param_sampling = BayesianParameterSampling(
    {
        "--num-leaves": quniform(8, 128, 1),
        "--min-data-in-leaf": quniform(20, 500, 10),
        "--learning-rate": choice(
            1e-4, 1e-3, 5e-3, 1e-2, 1.5e-2, 2e-2, 3e-2, 5e-2, 1e-1
        ),
        "--feature-fraction": uniform(0.2, 1),
        "--bagging-fraction": uniform(0.1, 1),
        "--bagging-freq": quniform(1, 20, 1),
        "--max-rounds": quniform(50, 2000, 10),
#         "--max-lag": quniform(3, 40, 1),
#         "--window-size": quniform(3, 40, 1),
    }
)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.

hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=param_sampling,
    primary_metric_name='MAE',# mean_absolute_error
    primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
    max_total_runs=max_total_runs, 
    max_concurrent_runs=4,
    policy=None, #Bayesian sampling does not support early termination policies.
)

'Estimator' is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or an Azure ML curated environment.


In [33]:
# Submit hyperdrive run to the experiment 

hyperdrive_run = experiment.submit(config = hyperdrive_config)



## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [34]:
# Show run details with the Jupyter widget
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)
hyperdrive_run.get_metrics()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_548e5166-7963-4ae4-b542-372e2cb63714
Web View: https://ml.azure.com/experiments/hyper-lgbm-walmart-forecasting/runs/HD_548e5166-7963-4ae4-b542-372e2cb63714?wsid=/subscriptions/2c48c51c-bd47-40d4-abbe-fb8eabd19c8c/resourcegroups/aml-quickstarts-137316/workspaces/quick-starts-ws-137316

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-02-05T14:54:13.245206][API][INFO]Experiment created<END>\n""<START>[2021-02-05T14:54:13.805004][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-02-05T14:54:14.126613][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2021-02-05T14:54:14.1895733Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>

Execution Summary
RunId: HD_548e5166-7963-4ae4-b542-372e2cb63714
Web View: https://ml.azure.com/experiments/hyper-lgbm-walmart-forecasting/runs/HD_548e5166-7963-4ae4-b542-372e2cb6

{'HD_548e5166-7963-4ae4-b542-372e2cb63714_18': {'MAE': 26.904761904761905},
 'HD_548e5166-7963-4ae4-b542-372e2cb63714_19': {'MAE': 26.90476190476238},
 'HD_548e5166-7963-4ae4-b542-372e2cb63714_17': {'MAE': 26.904761904761905},
 'HD_548e5166-7963-4ae4-b542-372e2cb63714_15': {'MAE': 26.904761904761905},
 'HD_548e5166-7963-4ae4-b542-372e2cb63714_16': {'MAE': 26.904761904761905},
 'HD_548e5166-7963-4ae4-b542-372e2cb63714_13': {'MAE': 26.89167107394025},
 'HD_548e5166-7963-4ae4-b542-372e2cb63714_14': {'MAE': 26.904406467359514},
 'HD_548e5166-7963-4ae4-b542-372e2cb63714_11': {'MAE': 26.905350296710157},
 'HD_548e5166-7963-4ae4-b542-372e2cb63714_10': {'MAE': 26.912664721455815},
 'HD_548e5166-7963-4ae4-b542-372e2cb63714_12': {'MAE': 26.904761904761905},
 'HD_548e5166-7963-4ae4-b542-372e2cb63714_8': {'MAE': 26.904761904761905},
 'HD_548e5166-7963-4ae4-b542-372e2cb63714_9': {'MAE': 26.904761904761905},
 'HD_548e5166-7963-4ae4-b542-372e2cb63714_6': {'MAE': 26.904535831753577},
 'HD_548e5166-796

## Retrieve and Save Best Model

TODO: In the cell below, get the best model from the hyperdrive experiments and display all the properties of the model.

In [35]:
# Retrieve the best model and its hyperparameter values

best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()["runDefinition"]["arguments"]


print('Best Run Id: ', best_run.id)
print('MAE:', best_run_metrics['MAE'])
print('Best model hyperparameter values', parameter_values)


Best Run Id:  HD_548e5166-7963-4ae4-b542-372e2cb63714_3
MAE: 26.785714285714285
Best model hyperparameter values ['--data-folder', '$AZUREML_DATAREFERENCE_f2df6451435947c2a8e535e81a9ffc33', '--num-leaves', '73', '--min-data-in-leaf', '370', '--learning-rate', '0.1', '--feature-fraction', '0.27889223304553756', '--bagging-fraction', '0.7171466404677583', '--bagging-freq', '13', '--max-rounds', '250']


In [36]:
# Save the best model
model = best_run.register_model(
    model_name="hd_lgbm_walmart_forecast", 
    model_path="./outputs/model",
    description='Best HyperDrive Walmart forecasting model'
)
print("Model successfully saved.")

Model successfully saved.


## Model Deployment

**REVIEW AND EDIT**

Remember you have to deploy only one of the two models you trained.. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

Now we are ready to deploy the model as a web service running in Azure Container Instance [ACI](https://azure.microsoft.com/en-us/services/container-instances/). Azure Machine Learning accomplishes this by constructing a Docker image with the scoring logic and model baked in.

### Create score.py

First, we will create a scoring script that will be invoked by the web service call.

* Note that the scoring script must have two required functions, `init()` and `run(input_data)`.
    - In `init()` function, you typically load the model into a global object. This function is executed only once when the Docker container is started.
    - In `run(input_data)` function, the model is used to predict a value based on the input data. The input and output to run typically use JSON as serialization and de-serialization format but you are not limited to that.

In [38]:
%%writefile score.py
import os
import json
import numpy as np
import pandas as pd
import lightgbm as lgb


def init():
    global bst
    model_root = os.getenv("AZUREML_MODEL_DIR")
    # The name of the folder in which to look for LightGBM model files
    lgbm_model_folder = "model"
    bst = lgb.Booster(
        model_file=os.path.join(model_root, lgbm_model_folder, "best-model.txt")
    )


def run(raw_data):
    columns = bst.feature_name()
    data = np.array(json.loads(raw_data)["data"])
    test_df = pd.DataFrame(data=data, columns=columns)
    # Make prediction
    out = bst.predict(test_df)
    return out.tolist()

Writing score.py


### Create myenv.yml

We also need to create an environment file so that Azure Machine Learning can install the necessary packages in the Docker image which are required by your scoring script. In this case, we need to specify packages `numpy`, `pandas`, and `lightgbm`.

In [39]:
print(pd.__version__)
print(np.__version__)
print(lgb.__version__)

0.25.3
1.18.5
2.3.0


In [40]:
cd = CondaDependencies.create()
cd.add_conda_package("numpy=1.18.5")
cd.add_conda_package("pandas=0.25.3")
cd.add_conda_package("lightgbm=2.3.0")
cd.save_to_file(base_directory="./", conda_file_path="myenv.yml")

print(cd.serialize_to_string())

# Conda environment specification. The dependencies defined in this file will
# be automatically provisioned for runs with userManagedDependencies=False.

# Details about the Conda environment file format:
# https://conda.io/docs/user-guide/tasks/manage-environments.html#create-env-file-manually

name: project_environment
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2

- pip:
  - azureml-defaults~=1.20.0
- numpy=1.18.5
- pandas=0.25.3
- lightgbm=2.3.0
channels:
- anaconda
- conda-forge



### Deploy to ACI

We are almost ready to deploy. In the next cell, we first create the inference configuration and deployment configuration. Then, we deploy the model to ACI. This cell will run for several minutes.

In [41]:
best_run.get_environment()

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/intelmpi2018.3-ubuntu16.04:20210104.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": true,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "Experiment hyper-lgbm-walmart-forecasting Environment",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "

In [42]:
%%time

inference_config = InferenceConfig(runtime="python", entry_script="score.py", conda_file="myenv.yml")

# inference_config = InferenceConfig(environment = best_run.get_environment(), 
#                                    entry_script = "score.py")

aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, 
                                               memory_gb = 2, 
                                               auth_enabled=True, 
                                               enable_app_insights=True,
                                               tags = {'type': "hd-lgbm-forecasting"},
                                               description = "LightGBM model on Walmart Texas stores data")


# aciconfig = AciWebservice.deploy_configuration(cpu_cores=1,
#                                                memory_gb=2,
#                                                tags={"name": "walmart_tx_data", "framework": "LightGBM"},
#                                                description="LightGBM model on Walmart Texas stores data")

aci_service_name = 'hd-walmart-forecast-04'
service = Model.deploy(workspace=ws, 
                       name=aci_service_name, 
                       models=[model], 
                       inference_config=inference_config, 
                       deployment_config=aciconfig)

service.wait_for_deployment(True)
print(service.state)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running.............................................................................................................
Failed


ERROR:azureml.core.webservice.webservice:Service deployment polling reached non-successful terminal state, current service state: Failed
Operation ID: 6d2a8206-318f-4834-bea9-3cf9ee4c0b7d
More information can be found using '.get_logs()'
Error:
{
  "code": "AciDeploymentFailed",
  "message": "Aci Deployment failed with exception: Your container application crashed. This may be caused by errors in your scoring file's init() function.\nPlease check the logs for your container instance: hd-walmart-forecast-04. From the AML SDK, you can run print(service.get_logs()) if you have service object to fetch the logs. \nYou can interactively debug your scoring file locally. Please refer to https://docs.microsoft.com/azure/machine-learning/how-to-debug-visual-studio-code#debug-and-troubleshoot-deployments for more information.\nYou can also try to run image 7ac6038e4c9d4861bb6cc42d8ceb1c3c.azurecr.io/azureml/azureml_fe903c61cbd08fcf01c09b246bd411ed locally. Please refer to https://aka.ms/debugimag

WebserviceException: WebserviceException:
	Message: Service deployment polling reached non-successful terminal state, current service state: Failed
Operation ID: 6d2a8206-318f-4834-bea9-3cf9ee4c0b7d
More information can be found using '.get_logs()'
Error:
{
  "code": "AciDeploymentFailed",
  "message": "Aci Deployment failed with exception: Your container application crashed. This may be caused by errors in your scoring file's init() function.\nPlease check the logs for your container instance: hd-walmart-forecast-04. From the AML SDK, you can run print(service.get_logs()) if you have service object to fetch the logs. \nYou can interactively debug your scoring file locally. Please refer to https://docs.microsoft.com/azure/machine-learning/how-to-debug-visual-studio-code#debug-and-troubleshoot-deployments for more information.\nYou can also try to run image 7ac6038e4c9d4861bb6cc42d8ceb1c3c.azurecr.io/azureml/azureml_fe903c61cbd08fcf01c09b246bd411ed locally. Please refer to https://aka.ms/debugimage#service-launch-fails for more information.",
  "details": [
    {
      "code": "CrashLoopBackOff",
      "message": "Your container application crashed. This may be caused by errors in your scoring file's init() function.\nPlease check the logs for your container instance: hd-walmart-forecast-04. From the AML SDK, you can run print(service.get_logs()) if you have service object to fetch the logs. \nYou can interactively debug your scoring file locally. Please refer to https://docs.microsoft.com/azure/machine-learning/how-to-debug-visual-studio-code#debug-and-troubleshoot-deployments for more information.\nYou can also try to run image 7ac6038e4c9d4861bb6cc42d8ceb1c3c.azurecr.io/azureml/azureml_fe903c61cbd08fcf01c09b246bd411ed locally. Please refer to https://aka.ms/debugimage#service-launch-fails for more information."
    },
    {
      "code": "AciDeploymentFailed",
      "message": "Your container application crashed. Please follow the steps to debug:\n1. From the AML SDK, you can run print(service.get_logs()) if you have service object to fetch the logs. Please refer to https://aka.ms/debugimage#dockerlog for more information.\n2. If your container application crashed. This may be caused by errors in your scoring file's init() function. You can try debugging locally first. Please refer to https://aka.ms/debugimage#debug-locally for more information.\n3. You can also interactively debug your scoring file locally. Please refer to https://docs.microsoft.com/azure/machine-learning/how-to-debug-visual-studio-code#debug-and-troubleshoot-deployments for more information.\n4. View the diagnostic events to check status of container, it may help you to debug the issue. {\"restartCount\":3,\"currentState\":{\"state\":\"Waiting\",\"startTime\":null,\"exitCode\":null,\"finishTime\":null,\"detailStatus\":\"CrashLoopBackOff: Back-off 40s restarting failed\"},\"previousState\":{\"state\":\"Terminated\",\"startTime\":\"2021-02-05T15:22:06Z\",\"exitCode\":111,\"finishTime\":\"2021-02-05T15:22:08Z\",\"detailStatus\":\"Error\"},\"events\":[{\"count\":2,\"firstTimestamp\":\"2021-02-05T15:19:38Z\",\"lastTimestamp\":\"2021-02-05T15:21:22Z\",\"name\":\"Pulling\",\"message\":\"pulling image \\\"7ac6038e4c9d4861bb6cc42d8ceb1c3c.azurecr.io/azureml/azureml_fe903c61cbd08fcf01c09b246bd411ed\\\"\",\"type\":\"Normal\"},{\"count\":2,\"firstTimestamp\":\"2021-02-05T15:21:16Z\",\"lastTimestamp\":\"2021-02-05T15:21:22Z\",\"name\":\"Pulled\",\"message\":\"Successfully pulled image \\\"7ac6038e4c9d4861bb6cc42d8ceb1c3c.azurecr.io/azureml/azureml_fe903c61cbd08fcf01c09b246bd411ed\\\"\",\"type\":\"Normal\"},{\"count\":2,\"firstTimestamp\":\"2021-02-05T15:21:16Z\",\"lastTimestamp\":\"2021-02-05T15:21:22Z\",\"name\":\"Created\",\"message\":\"Created container\",\"type\":\"Normal\"},{\"count\":1,\"firstTimestamp\":\"2021-02-05T15:21:16Z\",\"lastTimestamp\":\"2021-02-05T15:21:16Z\",\"name\":\"Started\",\"message\":\"Started container\",\"type\":\"Normal\"}]}"
    }
  ]
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Service deployment polling reached non-successful terminal state, current service state: Failed\nOperation ID: 6d2a8206-318f-4834-bea9-3cf9ee4c0b7d\nMore information can be found using '.get_logs()'\nError:\n{\n  \"code\": \"AciDeploymentFailed\",\n  \"message\": \"Aci Deployment failed with exception: Your container application crashed. This may be caused by errors in your scoring file's init() function.\\nPlease check the logs for your container instance: hd-walmart-forecast-04. From the AML SDK, you can run print(service.get_logs()) if you have service object to fetch the logs. \\nYou can interactively debug your scoring file locally. Please refer to https://docs.microsoft.com/azure/machine-learning/how-to-debug-visual-studio-code#debug-and-troubleshoot-deployments for more information.\\nYou can also try to run image 7ac6038e4c9d4861bb6cc42d8ceb1c3c.azurecr.io/azureml/azureml_fe903c61cbd08fcf01c09b246bd411ed locally. Please refer to https://aka.ms/debugimage#service-launch-fails for more information.\",\n  \"details\": [\n    {\n      \"code\": \"CrashLoopBackOff\",\n      \"message\": \"Your container application crashed. This may be caused by errors in your scoring file's init() function.\\nPlease check the logs for your container instance: hd-walmart-forecast-04. From the AML SDK, you can run print(service.get_logs()) if you have service object to fetch the logs. \\nYou can interactively debug your scoring file locally. Please refer to https://docs.microsoft.com/azure/machine-learning/how-to-debug-visual-studio-code#debug-and-troubleshoot-deployments for more information.\\nYou can also try to run image 7ac6038e4c9d4861bb6cc42d8ceb1c3c.azurecr.io/azureml/azureml_fe903c61cbd08fcf01c09b246bd411ed locally. Please refer to https://aka.ms/debugimage#service-launch-fails for more information.\"\n    },\n    {\n      \"code\": \"AciDeploymentFailed\",\n      \"message\": \"Your container application crashed. Please follow the steps to debug:\\n1. From the AML SDK, you can run print(service.get_logs()) if you have service object to fetch the logs. Please refer to https://aka.ms/debugimage#dockerlog for more information.\\n2. If your container application crashed. This may be caused by errors in your scoring file's init() function. You can try debugging locally first. Please refer to https://aka.ms/debugimage#debug-locally for more information.\\n3. You can also interactively debug your scoring file locally. Please refer to https://docs.microsoft.com/azure/machine-learning/how-to-debug-visual-studio-code#debug-and-troubleshoot-deployments for more information.\\n4. View the diagnostic events to check status of container, it may help you to debug the issue. {\\\"restartCount\\\":3,\\\"currentState\\\":{\\\"state\\\":\\\"Waiting\\\",\\\"startTime\\\":null,\\\"exitCode\\\":null,\\\"finishTime\\\":null,\\\"detailStatus\\\":\\\"CrashLoopBackOff: Back-off 40s restarting failed\\\"},\\\"previousState\\\":{\\\"state\\\":\\\"Terminated\\\",\\\"startTime\\\":\\\"2021-02-05T15:22:06Z\\\",\\\"exitCode\\\":111,\\\"finishTime\\\":\\\"2021-02-05T15:22:08Z\\\",\\\"detailStatus\\\":\\\"Error\\\"},\\\"events\\\":[{\\\"count\\\":2,\\\"firstTimestamp\\\":\\\"2021-02-05T15:19:38Z\\\",\\\"lastTimestamp\\\":\\\"2021-02-05T15:21:22Z\\\",\\\"name\\\":\\\"Pulling\\\",\\\"message\\\":\\\"pulling image \\\\\\\"7ac6038e4c9d4861bb6cc42d8ceb1c3c.azurecr.io/azureml/azureml_fe903c61cbd08fcf01c09b246bd411ed\\\\\\\"\\\",\\\"type\\\":\\\"Normal\\\"},{\\\"count\\\":2,\\\"firstTimestamp\\\":\\\"2021-02-05T15:21:16Z\\\",\\\"lastTimestamp\\\":\\\"2021-02-05T15:21:22Z\\\",\\\"name\\\":\\\"Pulled\\\",\\\"message\\\":\\\"Successfully pulled image \\\\\\\"7ac6038e4c9d4861bb6cc42d8ceb1c3c.azurecr.io/azureml/azureml_fe903c61cbd08fcf01c09b246bd411ed\\\\\\\"\\\",\\\"type\\\":\\\"Normal\\\"},{\\\"count\\\":2,\\\"firstTimestamp\\\":\\\"2021-02-05T15:21:16Z\\\",\\\"lastTimestamp\\\":\\\"2021-02-05T15:21:22Z\\\",\\\"name\\\":\\\"Created\\\",\\\"message\\\":\\\"Created container\\\",\\\"type\\\":\\\"Normal\\\"},{\\\"count\\\":1,\\\"firstTimestamp\\\":\\\"2021-02-05T15:21:16Z\\\",\\\"lastTimestamp\\\":\\\"2021-02-05T15:21:16Z\\\",\\\"name\\\":\\\"Started\\\",\\\"message\\\":\\\"Started container\\\",\\\"type\\\":\\\"Normal\\\"}]}\"\n    }\n  ]\n}"
    }
}

In [43]:
print(service.get_logs())

2021-02-05T15:22:06,721075333+00:00 - gunicorn/run 
2021-02-05T15:22:06,721381334+00:00 - iot-server/run 
2021-02-05T15:22:06,721464734+00:00 - rsyslog/run 
2021-02-05T15:22:06,733445967+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_870d9ca4f210a7fb9d1d542842cbdba0/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_870d9ca4f210a7fb9d1d542842cbdba0/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_870d9ca4f210a7fb9d1d542842cbdba0/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_870d9ca4f210a7fb9d1d542842cbdba0/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_870d9ca4f210a7fb9d1d542842cbdba0/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
EdgeHubC

In [44]:
print("Deployment state: " + service.state)
print("Scoring URI: " + service.scoring_uri)
print("Authetication Key: " + service.get_keys()[0])
print("Swagger URI: " + service.swagger_uri)

Deployment state: Failed


TypeError: must be str, not NoneType

### Test the deployed model

Let's test the deployed model. We create a few test data points and send them to the web service hosted in ACI. Note here we are using the run API in the SDK to invoke the service. You can also make raw HTTP calls using any HTTP tool such as curl.

After the invocation, we print the returned predictions each of which represents the forecasted sales of a target store, brand in a given week as specified by `store, brand, week` in `used_columns`.

In [None]:
# test features (28 days)
X_test.reset_index(drop=True, inplace=True)
X_test

In [None]:
# 28 days in the test features dataset
X_test['day'].unique()

In [None]:
# features
X_test.columns

In [None]:
y_test.reset_index(drop=True, inplace = True)
y_test

In [None]:
# Pick a few test data points
test_samples = json.dumps({"data": np.array(X_test.iloc[:3]).tolist()})
test_samples = bytes(test_samples, encoding="utf8")
test_samples

In [None]:
# Predict using the deployed model
result = service.run(input_data=test_samples)
print("prediction:", result)

We can also send raw HTTP request to the service.

In [None]:
headers = {"Content-Type": "application/json"}

resp = requests.post(service.scoring_uri, test_samples, headers=headers)

print("POST to url", service.scoring_uri)
print("")
print("input data:", test_samples)
print("")
print("prediction:", resp.text)

TODO: In the cell below, print the logs of the web service and delete the service

## Clean up

After finishing the tests, you can delete the ACI deployment with a simple delete API call as follows.

In [None]:
service.get_logs()

In [None]:
service.delete()

# ONNX model

## Retrieve and save the best ONNX model

In [60]:
#Retrieve and save the best model

best_run, onnx_model = hyperdrive_run.get_output(return_onnx_model=True)
onnx_model_path = "results/best_model.onnx"
OnnxConverter.save_onnx_model(onnx_model, onnx_model_path)

AttributeError: 'HyperDriveRun' object has no attribute 'get_output'

## Predict with the ONNX model

In [None]:
if sys.version_info < OnnxConvertConstants.OnnxIncompatiblePythonVersion:
    python_version_compatible = True
else:
    python_version_compatible = False

def get_onnx_res(run):
    res_path = 'onnx_resource.json'
    run.download_file(name=constants.MODEL_RESOURCE_PATH_ONNX, output_file_path=res_path)
    with open(res_path) as f:
        onnx_res = json.load(f)
    return onnx_res

if python_version_compatible:
    test_df = test_data.to_pandas_dataframe()
    mdl_bytes = onnx_mdl.SerializeToString()
    onnx_res = get_onnx_res(best_run)

    onnxrt_helper = OnnxInferenceHelper(mdl_bytes, onnx_res)
    pred_onnx, pred_prob_onnx = onnxrt_helper.predict(test_df)

    print(pred_onnx)
    print(pred_prob_onnx)
else:
    print('Use Python version 3.6 or 3.7 to run the inference helper.')