# ML Workspace set-up

In [1]:
import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.core.dataset import Dataset

import logging


# Print the version of the installed azureml-core SDK
print("SDK version:", azureml.core.VERSION)

SDK version: 1.20.0


## Create Workspace object for the existing ML Workspace, from the configuration file `config.json`.

In [2]:
# Build the Workspace object via Workspace.from_config(), then print its
# name, resource group, location and subscription id, one per line.
# NOTE(review): the subscription id ends up in the saved cell output — consider clearing it before sharing.
ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')

quick-starts-ws-137658
aml-quickstarts-137658
southcentralus
2c48c51c-bd47-40d4-abbe-fb8eabd19c8c


## Create a *new* ML Experiment in the ML Workspace.

In [3]:
# Name of the experiment under which the runs of this notebook are grouped
experiment_name = 'azure-nd-project-capstone'
# Local folder that holds the working files of the runs
project_folder = './automl-run-capstone-project'

# Experiment handle bound to the workspace `ws`
experiment = Experiment(ws, experiment_name)

## Create a new cluster for model training.
If the cluster with the specified name already exists, use it.
The desired model is based on Regression analysis, without usage of Deep Learning.

In [4]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# Name of the AmlCompute cluster to reuse, or to create when it is missing.
cluster_name = "auto-ml"

try:
    # Reuse the cluster when one with this name is already attached to the workspace.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # No such cluster yet: provision a new one (1 to 4 nodes).
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS12_V2',
        min_nodes=1,
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

# Wait (up to 10 minutes) for the cluster to finish provisioning.
# For a more detailed view of current AmlCompute status, use get_status().
compute_target.wait_for_completion(
    show_output=True,
    min_node_count=None,
    timeout_in_minutes=10,
)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Upload and register the dataset

### Clean-up the dataset
Since we are comparing the results of the AutoML run and the HyperDrive run, it's better to work on the same data set, to be able to compare the performance of each model without the influence of feature engineering. So we are cleaning the dataset using a python script.

In [None]:
from scripts.cleansing import clean_data
import pandas as pd

def get_cleaned_dataset():
    """Return the cleaned OpenFoodFacts dataset, creating and registering it if needed.

    Looks up the dataset named ``openfoodfacts`` in the workspace ``ws``. When it
    is not registered yet, the raw JSON-lines file is read, cleaned with
    ``clean_data``, saved locally as parquet and registered in the workspace's
    default datastore.

    Returns:
        The dataset found in, or newly registered with, the workspace.
    """
    # Imported locally so the function does not depend on a later notebook cell having run.
    from azureml.data.dataset_factory import TabularDatasetFactory

    # NOTE(review): another cell of this notebook registers the *raw* data under the
    # same name; if that happened first, the lookup below returns the raw dataset
    # and no cleaning is performed. Consider a distinct name for the cleaned data.
    ds_key = "openfoodfacts"
    description_text = "Data extracted from OpenFoodFacts open source database."

    registered = ws.datasets
    if ds_key in registered:
        return registered[ds_key]

    # Otherwise, create it from the file.
    # Reading a JSON-lines file into a DataFrame
    data = pd.read_json('./eda/foods-features-v3.json', lines=True)
    # DataFrame with cleaned data
    data_cleaned = clean_data(data)
    exported_df = 'cleaned-openfoodfacts.parquet'
    data_cleaned.to_parquet(exported_df)
    # Register Dataset in Workspace using experimental functionality to upload
    # and register a pandas dataframe at once
    return TabularDatasetFactory.register_pandas_dataframe(
        dataframe=data_cleaned,
        target=(ws.get_default_datastore(), exported_df),
        name=ds_key,
        description=description_text,
        show_progress=True,
    )


In [14]:
# Default datastore of the workspace; the prepared data file is uploaded to it.
blob_store = ws.get_default_datastore()

# Upload the local file under the 'capstone' folder of the datastore.
# The call returns a DataReference, which is displayed as the cell output.
blob_store.upload_files(
    files=['./eda/foods-features-v3.json'],
    relative_root='eda',
    target_path='capstone',
    show_progress=True,
)

Uploading an estimated of 1 files
Uploading ./eda/foods-features-10000-str.json
Uploaded ./eda/foods-features-10000-str.json, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_1f736f1fbe9a401a9fff1c01141805a6

In [17]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Try to load the dataset from the Workspace. Otherwise, create it from the
# uploaded file and register it.
key = "openfoodfacts"
description_text = "Data extracted from OpenFoodFacts open source database."

found = key in ws.datasets
if found:
    print(f'Dataset {key} found, use it.')
    dataset = ws.datasets[key]
else:
    # Create tabular dataset from the JSON-lines file uploaded to the datastore
    blob_file = blob_store.path('capstone/foods-features-v3.json')
    dataset = TabularDatasetFactory.from_json_lines_files(path=blob_file)
    # Register Dataset in Workspace
    dataset = dataset.register(workspace=ws,
                               name=key,
                               description=description_text,
                               create_new_version=True)


In [None]:
# Load the dataset into a pandas DataFrame for local exploration
df = dataset.to_pandas_dataframe()

In [None]:
# Explore the data: preview the first 5 rows
df.head(5)

In [18]:
# Obtain summary statistics for every column (include='all' also covers non-numeric columns)
df.describe(include='all')


Unnamed: 0,_id,additives_n,nutriscore_grade,popularity_key,salt_level,fat_level,saturated_fat_level,sugar_level,vegan,vegeterian,palm_oil,brand,packagin_shape,packagin_material,serving_quantity_g,categories_list
0,8710908960864,1,d,119999900233,high,moderate,moderate,moderate,False,False,False,knorr,brick,cardboard,100.0,"plant-based-foods-and-beverages, plant-based-f..."
1,5413548283128,3,e,119999000228,low,high,high,high,False,False,True,kinder,,pp-polypropylene,5.0,"snacks, sweet-snacks"
2,3274080005003,0,a,19999996494,low,low,low,low,True,False,False,cristaline,bottle,pet-polyethylene-terephthalate,,"beverages, waters"
3,7622210449283,3,c,19999994551,low,moderate,low,moderate,False,False,True,lu,film,plastic,20.0,"snacks, sweet-snacks"
4,3017620422003,1,e,19999993283,low,high,high,high,False,False,True,ferrero,jar,glass,15.0,"spreads, breakfasts"


In [None]:
# Distribution of the data for each numeric column, divided into 3 bins
hist = df.hist(bins=3)

## AutoML run

In [None]:
# Rebind `dataset` to the dataset returned by get_cleaned_dataset(); it is the training data of the AutoML run below
dataset = get_cleaned_dataset()

In [9]:
import shutil
import os

""" Creates(if doesn't exist) a new folder with 'folder_name' under 'project_folder'
    and copies 'file_names_to_copy' from the 'project_folder' into 'folder_name'
"""
def create_folder(project_folder, folder_name, *file_names_to_copy):
    new_folder = os.path.join(project_folder, folder_name)
    os.makedirs(new_folder, exist_ok=True)

    if file_names_to_copy:
        for name in file_names_to_copy:
            shutil.copy(name, new_folder)
    
    return new_folder

In [19]:
from azureml.train.automl import AutoMLConfig
# Local working folder of the AutoML run, created under `project_folder` when missing.
auto_ml_directory_name = 'auto_ml_run'
auto_ml_directory = create_folder(project_folder, auto_ml_directory_name)

# Settings that are forwarded to AutoMLConfig as keyword arguments.
automl_settings = dict(
    experiment_timeout_minutes=20,  # 15 minutes is the minimum
    enable_early_stopping=True,
    primary_metric='r2_score',  # the same as hyperdrive
    featurization='auto',
    verbosity=logging.DEBUG,
    n_cross_validations=5,
)

# Regression task predicting the 'popularity_key' column on the remote cluster.
automl_config = AutoMLConfig(
    task="regression",
    training_data=dataset,
    label_column_name="popularity_key",
    compute_target=compute_target,
    max_concurrent_iterations=3,  # 4 nodes
    path=auto_ml_directory,
    debug_log="automl_errors.log",
    enable_onnx_compatible_models=True,
    **automl_settings
)

In [20]:
# Submit the AutoML configuration to the experiment; show_output=True streams the progress into the cell output
auto_ml_run = experiment.submit(automl_config, show_output=True)

Running on remote.
No run_configuration provided, running on auto-ml with default configuration
Running on remote compute: auto-ml
Parent Run ID: AutoML_f9cbc5c2-03b1-4f55-bdce-6ce77f90673b

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Cross validation
STATUS:       DONE
DESCRIPTION:  Each iteration of the trained model was validated through cross-validation.
              
DETAILS:      
+---------------------------------+
|Number of folds                  |
|3                                |
+---------------------------------+

****************************************************************************************************

TYPE:         Missing feature values imputation


        45   SparseNormalizer GradientBoosting              0:01:17       0.0086    0.0083
        44   MaxAbsScaler ExtremeRandomTrees                0:01:42       0.0090    0.0083
        46   MaxAbsScaler RandomForest                      0:01:34       0.0086    0.0083
        48   MaxAbsScaler RandomForest                      0:01:13       0.0086    0.0083
        47   MaxAbsScaler RandomForest                      0:01:36       0.0085    0.0083
        49   MaxAbsScaler RandomForest                      0:01:05       0.0084    0.0083
        50   MaxAbsScaler GradientBoosting                  0:01:06       0.0083    0.0083
        51   MaxAbsScaler ExtremeRandomTrees                0:01:29       0.0086    0.0083
        52   MaxAbsScaler ExtremeRandomTrees                0:01:12       0.0086    0.0083
        53   MaxAbsScaler ExtremeRandomTrees                0:01:17       0.0092    0.0083
        54   MaxAbsScaler LightGBM                          0:01:05       0.0088    0.0083

In [21]:
# Wait for the AutoML run to complete; the returned run details are displayed as the cell output
auto_ml_run.wait_for_completion()

{'runId': 'AutoML_f9cbc5c2-03b1-4f55-bdce-6ce77f90673b',
 'target': 'auto-ml',
 'status': 'Completed',
 'startTimeUtc': '2021-02-06T19:26:41.526021Z',
 'endTimeUtc': '2021-02-06T20:15:49.81895Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'normalized_root_mean_squared_error',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': None,
  'target': 'auto-ml',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"6b03bd03-875b-4e52-b877-599eedd2d1c0\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"capstone/foods-features-10000-str.json\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-137658\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"2c48c51c-bd47-40d4-

## Examine Results


In [22]:
# Retrieve the best run and the fitted model.
# NOTE(review): the recorded output below reports training package versions (1.21.0)
# that differ from the local ones (1.20.0) — confirm the fitted model behaves as expected.
best_run, fitted_model = auto_ml_run.get_output()

Package:azureml-automl-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-core, training version:1.21.0.post1, current version:1.20.0
Package:azureml-dataprep, training version:2.8.2, current version:2.7.3
Package:azureml-dataprep-native, training version:28.0.0, current version:27.0.0
Package:azureml-dataprep-rslex, training version:1.6.0, current version:1.5.0
Package:azureml-dataset-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-defaults, training version:1.21.0, current version:1.20.0
Package:azureml-interpret, training version:1.21.0, current version:1.20.0
Package:azureml-pipeline-core, training version:1.21.0, current version:1.20.0
Package:azureml-telemetry, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-client, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-runtime, training version:1.21.0, current version:1.20.0


### Examine performed featurization

In [25]:
import pandas as pd
# Retrieve automatic featurization details (FeaturizationInfoProvider):
#  'datatransformer' - for regression and classification
#  'timeseriestransformer' - for forecasting
featurizer = fitted_model.named_steps['datatransformer']
# is_user_friendly=False to get more detailed information
featurization_summary = featurizer.get_featurization_summary(is_user_friendly=False)
pd.DataFrame(data=featurization_summary)

Unnamed: 0,RawFeatureName,TypeDetected,Dropped,EngineeredFeatureCount,Transformations,TransformationParams
0,_id,Hashes,Yes,0,[],"{'Transformer1': {'Input': ['_id'], 'Transform..."
1,additives_n,Categorical,No,18,[StringCast-CharGramCountVectorizer],"{'Transformer1': {'Input': ['additives_n'], 'T..."
2,nutriscore_grade,Categorical,No,5,[StringCast-CharGramCountVectorizer],{'Transformer1': {'Input': ['nutriscore_grade'...
3,salt_level,Categorical,No,4,[StringCast-CharGramCountVectorizer],"{'Transformer1': {'Input': ['salt_level'], 'Tr..."
4,fat_level,Categorical,No,4,[StringCast-CharGramCountVectorizer],"{'Transformer1': {'Input': ['fat_level'], 'Tra..."
5,saturated_fat_level,Categorical,No,4,[StringCast-CharGramCountVectorizer],{'Transformer1': {'Input': ['saturated_fat_lev...
6,sugar_level,Categorical,No,4,[StringCast-CharGramCountVectorizer],"{'Transformer1': {'Input': ['sugar_level'], 'T..."
7,vegan,Categorical,No,1,[ModeCatImputer-StringCast-LabelEncoder],"{'Transformer1': {'Input': ['vegan'], 'Transfo..."
8,palm_oil,Categorical,No,1,[ModeCatImputer-StringCast-LabelEncoder],"{'Transformer1': {'Input': ['palm_oil'], 'Tran..."
9,packagin_shape,Categorical,No,29,[StringCast-CharGramCountVectorizer],"{'Transformer1': {'Input': ['packagin_shape'],..."


In [27]:
# Retrieve column stats and feature type summary from the featurizer and display them as a DataFrame
stats_n_ft_summary = featurizer.get_stats_feature_type_summary()
pd.DataFrame(data=stats_n_ft_summary)

In [28]:
from pprint import pprint

# Helper function copied from Azure tutorial 
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-features#scaling-and-normalization
def print_model(model, prefix=""):
    """Print every step of ``model`` together with its parameters.

    ``model.steps`` is expected to be a sequence of ``(name, object)`` pairs.
    A step object that exposes both ``estimators`` and ``weights`` is treated
    as an ensemble: its estimator names and weights are printed, then each
    member is printed recursively with ``"<member name> - "`` as prefix.
    Any other step object is printed via its ``get_params()``.
    """
    for step in model.steps:
        step_name, step_obj = step[0], step[1]
        print(prefix + step_name)

        is_ensemble = hasattr(step_obj, 'estimators') and hasattr(step_obj, 'weights')
        if not is_ensemble:
            pprint(step_obj.get_params())
            print()
            continue

        member_names = [member[0] for member in step_obj.estimators]
        pprint({'estimators': member_names, 'weights': step_obj.weights})
        print()
        for member in step_obj.estimators:
            print_model(member[1], member[0] + ' - ')



In [29]:
# Print the steps of the fitted model with their parameters, using the print_model helper
print('Model step details:\n')
print_model(fitted_model)

Model step details:

datatransformer
{'enable_dnn': None,
 'enable_feature_sweeping': None,
 'feature_sweeping_config': None,
 'feature_sweeping_timeout': None,
 'featurization_config': None,
 'force_text_dnn': None,
 'is_cross_validation': None,
 'is_onnx_compatible': None,
 'logger': None,
 'observer': None,
 'task': None,
 'working_dir': None}

MaxAbsScaler
{'copy': True}

GradientBoostingRegressor
{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'huber',
 'max_depth': 1,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 0.15874989977926784,
 'min_samples_split': 0.02180025323490051,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'presort': 'deprecated',
 'random_state': None,
 'subsample': 0.7999999999999999,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}



### Retrieve the best model in ONNX format

In [None]:
# Retrieve the best model in ONNX format (return_onnx_model=True); this also rebinds `best_run`
best_run, onnx_model = auto_ml_run.get_output(return_onnx_model=True)

In [None]:
from azureml.automl.runtime.onnx_convert import OnnxConverter
# Save the model locally (where the notebook is running) by downloading
# 'outputs/model.onnx' from the best run.
# Alternative kept for reference: write the in-memory `onnx_model` with OnnxConverter.
#onnx_file_path = "./best_model_openfoodfacts.onnx"
#OnnxConverter.save_onnx_model(onnx_model, onnx_file_path)
best_run.download_file('outputs/model.onnx')

### Register the Best Model

In [None]:

# `Model` is needed for Model.Framework below; import it here so the cell
# also works on a fresh kernel (Restart & Run All).
from azureml.core.model import Model

# Register the ONNX file produced by the best run as a model named
# 'best-model-openfoodfacts-onnx' under the workspace
model_automl = best_run.register_model(model_name='best-model-openfoodfacts-onnx',
                                       model_path='outputs/model.onnx',
                                       sample_input_dataset=dataset,
                                       model_framework=Model.Framework.ONNX,  # Framework used to create the model.
                                       model_framework_version='1.3',         # Version of ONNX used to create the model.
                                       description='Onnx openfoodfacts model')





### Deploy the best model

In [None]:
# Import `Model` here so the cell also works on a fresh kernel (Restart & Run All).
from azureml.core.model import Model

# Deploy the registered model as a web service without an explicit
# inference or deployment configuration.
service_name = 'onnx-openfoodfacts-service'
service = Model.deploy(ws, service_name, [model_automl])

In [None]:
from azureml.core.webservice import Webservice, AciWebservice
from azureml.core.model import Model

print("Prepare ACI deployment configuration")
# ACI deployment configuration: 1 CPU core, 1 GB of memory,
# with Application Insights and authentication enabled
config = AciWebservice.deploy_configuration(cpu_cores = 1, 
                                            memory_gb = 1,
                                            enable_app_insights=True,
                                            auth_enabled=True)


In [None]:
from azureml.core import Environment
from azureml.core.model import InferenceConfig

import os

# get the 'AzureML-AutoML' curated environment used to create the model
env = Environment.get(workspace=ws, name="AzureML-AutoML")

# The scoring file generated by the AutoML run lives in the run's outputs;
# download it into the local AutoML folder when it is not there yet, so the
# entry script actually exists on a fresh checkout.
scoring_script = auto_ml_directory + '/outputs/scoring_file_v_1_0_0.py'
if not os.path.exists(scoring_script):
    os.makedirs(os.path.dirname(scoring_script), exist_ok=True)
    best_run.download_file('outputs/scoring_file_v_1_0_0.py',
                           output_file_path=scoring_script)

# creating inference configuration with the scoring file generated by AutoML run
inference_config = InferenceConfig(entry_script=scoring_script, environment=env)

In [None]:
print("Deploy the model to ACI")
# Deploy the registered model to ACI using the prepared deployment
# configuration and the inference configuration (environment + scoring script).
service = Model.deploy(
    workspace=ws,
    name='best-model-service',
    models=[model_automl],
    inference_config=inference_config,
    deployment_config=config,
    overwrite=True,
)



In [None]:
# Wait for the deployment to finish (show_output=True prints progress), then print the service state
service.wait_for_deployment(show_output = True)
print(service.state)

In [None]:
# Print the scoring and Swagger URIs of the deployed service
print(f"Scoring URI: {service.scoring_uri}")
print(f"Swagger URI: {service.swagger_uri}")

In [None]:
# Fetch the service logs and print them one line at a time.
logs = service.get_logs()

print(*logs.split('\n'), sep='\n')