# ML Workspace set-up

In [1]:
import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.core.dataset import Dataset

import logging


# Check core SDK version number.
# Printing it keeps the local SDK version visible in the saved notebook output.
print("SDK version:", azureml.core.VERSION)

SDK version: 1.20.0


## Create Workspace object for the existing ML Workspace, from the configuration file `config.json`.

In [2]:
# Build the Workspace object from the local configuration (per the notebook text: `config.json`).
ws = Workspace.from_config()
# Echo the workspace identity, one field per line, to confirm which workspace was loaded.
# NOTE(review): this writes the subscription ID into the saved cell output — clear the
# output (or drop that field) before sharing the notebook.
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')

quick-starts-ws-138687
aml-quickstarts-138687
southcentralus
81cefad3-d2c9-4f77-a466-99a7f541c7bb


## Create a *new* ML Experiment in the ML Workspace.

In [3]:
# Name of the experiment inside the workspace.
experiment_name = 'azure-nd-project-capstone'
# Local root folder; the AutoML working directory is created underneath it later on.
project_folder = './automl-run-capstone-project'

# Experiment handle bound to workspace `ws`; the AutoML run is submitted through it.
experiment = Experiment(ws, experiment_name)

## Create a new cluster for model training.
If a cluster with the specified name already exists, it is reused.
The desired model is based on regression analysis and does not use deep learning.

In [6]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# Name of the compute cluster to look up or, if missing, to create.
cluster_name = "cluster-nd-cs"
# Alternative cluster name, kept for reference:
#cluster_name = "auto-ml"

# Reuse the cluster if it already exists; otherwise provision a new one.
try:
    # Raises ComputeTargetException when the lookup fails, which triggers creation below.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # Cluster size: between 1 and 4 nodes of size STANDARD_DS12_V2.
    # NOTE(review): min_nodes=1 requests one node up front (see the "Minimum number of
    # nodes requested have been provisioned" output); consider min_nodes=0 if idle cost matters.
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS12_V2',
                                                           #vm_priority = 'lowpriority', # optional
                                                           max_nodes=4, min_nodes=1)
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

# Runs for both the existing and the newly created cluster; show_output=True prints progress.
compute_target.wait_for_completion(show_output=True)
# For a more detailed view of current AmlCompute status, use get_status().

Creating
Succeeded..................
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Upload and register the dataset

### Clean-up the dataset
Since we are comparing the results of the AutoML run and the HyperDrive run, both should be trained on the same dataset, so that the performance of each model can be compared without the influence of differing feature engineering. We therefore clean the dataset once, using a Python script (the `register_ds` module).

In [7]:
from register_ds import get_cleaned_dataset

# get_cleaned_dataset is defined in the local register_ds module (not shown in this notebook).
# Its result is used further down as a dataset object (to_pandas_dataframe(), AutoML training_data).
# NOTE(review): judging by the captured output, the call also uploads and registers the
# cleaned data in the workspace — confirm against register_ds.
dataset = get_cleaned_dataset(ws)

Rows*columns=209*38


Method register_pandas_dataframe: This is an experimental method, and may change at any time.<br/>For more information, see https://aka.ms/azuremlexperimental.


Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to cleaned-machine-cpu.parquet/c800cb47-530b-4324-bec0-9bd028324f1e/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [8]:
# Create a pandas DataFrame from the dataset for local exploration.
df = dataset.to_pandas_dataframe()

In [9]:
# Explore the data: display the first 5 rows.
df.head(5)

Unnamed: 0,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP,vendor_adviser,vendor_amdahl,...,vendor_microdata,vendor_nas,vendor_ncr,vendor_nixdorf,vendor_perkin-elmer,vendor_prime,vendor_siemens,vendor_sperry,vendor_sratus,vendor_wang
0,125,256,6000,256,16,128,198,199,1,0,...,0,0,0,0,0,0,0,0,0,0
1,29,8000,32000,32,8,32,269,253,0,1,...,0,0,0,0,0,0,0,0,0,0
2,29,8000,32000,32,8,32,220,253,0,1,...,0,0,0,0,0,0,0,0,0,0
3,29,8000,32000,32,8,32,172,253,0,1,...,0,0,0,0,0,0,0,0,0,0
4,29,8000,16000,32,8,16,132,132,0,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Obtain summary statistics for each column (include='all' covers every column type).
df.describe(include='all')


Unnamed: 0,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP,vendor_adviser,vendor_amdahl,...,vendor_microdata,vendor_nas,vendor_ncr,vendor_nixdorf,vendor_perkin-elmer,vendor_prime,vendor_siemens,vendor_sperry,vendor_sratus,vendor_wang
count,209.0,209.0,209.0,209.0,209.0,209.0,209.0,209.0,209.0,209.0,...,209.0,209.0,209.0,209.0,209.0,209.0,209.0,209.0,209.0,209.0
mean,203.822967,2867.980861,11796.15311,25.205742,4.698565,18.267943,105.62201,99.330144,0.004785,0.043062,...,0.004785,0.090909,0.062201,0.014354,0.014354,0.023923,0.057416,0.062201,0.004785,0.009569
std,260.262926,3878.742758,11726.564377,40.628722,6.816274,25.997318,160.830733,154.757102,0.069171,0.203485,...,0.069171,0.28817,0.2421,0.119231,0.119231,0.153178,0.233195,0.2421,0.069171,0.097588
min,17.0,64.0,64.0,0.0,0.0,0.0,6.0,15.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50.0,768.0,4000.0,0.0,1.0,5.0,27.0,28.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,110.0,2000.0,8000.0,8.0,2.0,8.0,50.0,45.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,225.0,4000.0,16000.0,32.0,6.0,24.0,113.0,101.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1500.0,32000.0,64000.0,256.0,52.0,176.0,1150.0,1238.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Distribution of the data for each column, divided into 3 bins.
hist = df.hist(bins=3)

## AutoML run

In [12]:
import shutil
import os

def create_folder(project_folder, folder_name, *file_names_to_copy):
    """Create ``folder_name`` under ``project_folder`` and copy files into it.

    The folder (and any missing parent directories) is created when it does
    not exist yet; an already existing folder is reused unchanged.

    Args:
        project_folder: Path of the parent directory.
        folder_name: Name of the sub-folder to create inside ``project_folder``.
        *file_names_to_copy: Optional paths of files to copy into the folder.
            Each path is used exactly as given (a relative path is resolved
            against the current working directory, not ``project_folder``).

    Returns:
        The path of the folder, i.e. ``os.path.join(project_folder, folder_name)``.

    Raises:
        OSError: If the folder cannot be created or a file cannot be copied
            (for example ``FileNotFoundError`` for a missing source file).
    """
    new_folder = os.path.join(project_folder, folder_name)
    os.makedirs(new_folder, exist_ok=True)

    # Iterating over an empty tuple is a no-op, so no emptiness guard is needed.
    for name in file_names_to_copy:
        shutil.copy(name, new_folder)

    return new_folder

In [13]:
from azureml.train.automl import AutoMLConfig
# Working directory for the AutoML run, created under `project_folder`.
auto_ml_directory_name = 'auto_ml_run'

auto_ml_directory = create_folder(project_folder, auto_ml_directory_name)

# Settings passed to AutoMLConfig as keyword arguments (via ** below).
automl_settings = {
    "experiment_timeout_minutes": 60, 
    "iteration_timeout_minutes": 15, # 15 minutes is the minimum (per original author — TODO confirm against the AutoMLConfig docs)
    "enable_early_stopping": True,
    "primary_metric": 'r2_score', # the same as hyperdrive
    "featurization": 'auto',
    "verbosity": logging.DEBUG,
    # NOTE(review): the saved model printout later in this notebook shows an ExtraTreesRegressor,
    # which is not in this list — the saved outputs may stem from a run with different settings; confirm.
    "allowed_models": ['ElasticNet', 'XGBoostRegressor', 'LassoLars', 'GradientBoosting'], # restrict to well-performing models
    "n_cross_validations": 5
}

# Regression task predicting the "ERP" column of `dataset` on the remote compute target.
automl_config = AutoMLConfig(compute_target=compute_target,
                             max_concurrent_iterations=3, # cluster is configured with max_nodes=4
                             task= "regression",
                             training_data=dataset,
                             label_column_name="ERP",
                             debug_log = "automl_errors.log",
                             path = auto_ml_directory,
                             enable_onnx_compatible_models=True,
                             model_explainability=True,
                             
                             **automl_settings
                            )

In [14]:
# Submit the AutoML configuration to the experiment; show_output=True prints run progress into the cell output.
auto_ml_run = experiment.submit(automl_config, show_output=True)

Running on remote.
No run_configuration provided, running on cluster-nd-cs with default configuration
Running on remote compute: cluster-nd-cs
Parent Run ID: AutoML_4f008fdc-b67a-48d7-b347-93ce8bba9536

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

****************************************************************************************************

TYPE:         High cardinality feature detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and no high cardinality features were detected.
              Learn more about high card

In [None]:
# Wait for the AutoML run to complete before inspecting its results.
auto_ml_run.wait_for_completion()

In [15]:
from azureml.widgets import RunDetails
# Display the run-details widget for the AutoML run.
RunDetails(auto_ml_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

## Examine Results


In [16]:
# Retrieve the best run and the fitted model.
# NOTE(review): the captured output reports package version differences between training
# (1.21.0) and the local environment (1.20.0); confirm the model loads correctly locally.
best_run, fitted_model = auto_ml_run.get_output()

Package:azureml-automl-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-core, training version:1.21.0.post1, current version:1.20.0
Package:azureml-dataprep, training version:2.8.2, current version:2.7.3
Package:azureml-dataprep-native, training version:28.0.0, current version:27.0.0
Package:azureml-dataprep-rslex, training version:1.6.0, current version:1.5.0
Package:azureml-dataset-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-defaults, training version:1.21.0, current version:1.20.0
Package:azureml-interpret, training version:1.21.0, current version:1.20.0
Package:azureml-pipeline-core, training version:1.21.0, current version:1.20.0
Package:azureml-telemetry, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-client, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-runtime, training version:1.21.0, current version:1.20.0


### Examine performed featurization

In [17]:
import pandas as pd
# Retrieve automatic featurization details (FeaturizationInfoProvider):
#  'datatransformer' - for regression and classification
#  'timeseriestransformer' - for forecasting
featurizer = fitted_model.named_steps['datatransformer']
# is_user_friendly=False to get more detailed information
featurization_summary = featurizer.get_featurization_summary(is_user_friendly=False)
# Wrap the summary in a DataFrame so it renders as a table.
pd.DataFrame(data=featurization_summary)

Unnamed: 0,RawFeatureName,TypeDetected,Dropped,EngineeredFeatureCount,Transformations,TransformationParams
0,MYCT,Numeric,No,1,[MeanImputer],"{'Transformer1': {'Input': ['MYCT'], 'Transfor..."
1,MMIN,Numeric,No,1,[MeanImputer],"{'Transformer1': {'Input': ['MMIN'], 'Transfor..."
2,MMAX,Numeric,No,1,[MeanImputer],"{'Transformer1': {'Input': ['MMAX'], 'Transfor..."
3,CACH,Numeric,No,1,[MeanImputer],"{'Transformer1': {'Input': ['CACH'], 'Transfor..."
4,CHMIN,Numeric,No,1,[MeanImputer],"{'Transformer1': {'Input': ['CHMIN'], 'Transfo..."
5,CHMAX,Numeric,No,1,[MeanImputer],"{'Transformer1': {'Input': ['CHMAX'], 'Transfo..."
6,PRP,Numeric,No,1,[MeanImputer],"{'Transformer1': {'Input': ['PRP'], 'Transform..."
7,vendor_adviser,Categorical,No,1,[ModeCatImputer-StringCast-LabelEncoder],"{'Transformer1': {'Input': ['vendor_adviser'],..."
8,vendor_amdahl,Categorical,No,1,[ModeCatImputer-StringCast-LabelEncoder],"{'Transformer1': {'Input': ['vendor_amdahl'], ..."
9,vendor_apollo,Categorical,No,1,[ModeCatImputer-StringCast-LabelEncoder],"{'Transformer1': {'Input': ['vendor_apollo'], ..."


In [None]:
# Retrieve column stats and feature type summary from the featurizer
# and wrap it in a DataFrame so it renders as a table.
stats_n_ft_summary = featurizer.get_stats_feature_type_summary()
pd.DataFrame(data=stats_n_ft_summary)

In [18]:
from pprint import pprint

# Helper function copied from Azure tutorial 
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-features#scaling-and-normalization
def print_model(model, prefix=""):
    """Print every step of a pipeline-like model, descending into ensembles.

    For each entry of ``model.steps`` the step name is printed, preceded by
    ``prefix``. A step that exposes both ``estimators`` and ``weights`` is
    treated as an ensemble: the member names and the weights are
    pretty-printed, then each member is printed recursively with the prefix
    ``"<member name> - "``. For any other step the result of its
    ``get_params()`` is pretty-printed. A blank line follows each printout.
    """
    for entry in model.steps:
        step_name, step_obj = entry[0], entry[1]
        print(prefix + step_name)

        is_ensemble = hasattr(step_obj, 'estimators') and hasattr(step_obj, 'weights')
        if not is_ensemble:
            # Plain step: show its parameters and move on.
            pprint(step_obj.get_params())
            print()
            continue

        # Ensemble step: summarise members first, then expand each one.
        member_names = [member[0] for member in step_obj.estimators]
        pprint({'estimators': member_names, 'weights': step_obj.weights})
        print()
        for member in step_obj.estimators:
            print_model(member[1], member[0] + ' - ')



In [19]:
# Print every step of the fitted model (including ensemble members) using the print_model helper.
print('Model step details:\n')
print_model(fitted_model)

Model step details:

datatransformer
{'enable_dnn': None,
 'enable_feature_sweeping': None,
 'feature_sweeping_config': None,
 'feature_sweeping_timeout': None,
 'featurization_config': None,
 'force_text_dnn': None,
 'is_cross_validation': None,
 'is_onnx_compatible': None,
 'logger': None,
 'observer': None,
 'task': None,
 'working_dir': None}

prefittedsoftvotingregressor
{'estimators': [('33',
                 Pipeline(memory=None,
         steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('extratreesregressor',
                 ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0,
                                     criterion='mse', max_depth=None,
                                     max_features=0.5, max_leaf_nodes=None,
                                     max_samples=None,
                                     min_impurity_decrease=0.0,
                                     min_impurity_split=None,
                                     min_s

### Retrieve the best model in ONNX/PKL format

In [None]:
# Retrieve the best model in ONNX format (the AutoMLConfig sets enable_onnx_compatible_models=True).
# Note: this rebinds `best_run` as well.
best_run, onnx_model = auto_ml_run.get_output(return_onnx_model=True)

In [None]:
from azureml.automl.runtime.onnx_convert import OnnxConverter
# Save the model locally (where the notebook is running).
# Alternative kept for reference: write the in-memory ONNX model with OnnxConverter.
# NOTE(review): the file name below looks like a leftover from another project — confirm before re-enabling.
#onnx_file_path = "./best_model_openfoodfacts.onnx"
#OnnxConverter.save_onnx_model(onnx_model, onnx_file_path)
# Currently used instead: download the best run's 'outputs/model.onnx' file.
best_run.download_file('outputs/model.onnx')

In [None]:
# Print the retrieved ONNX model object.
# NOTE(review): this may produce a very large text dump — confirm it is wanted in the saved notebook.
print(onnx_model)

In [None]:
# Same get_output() call as the one that produced `fitted_model` earlier; it rebinds `best_run`
# and stores the returned model under the name `pkl_model`.
best_run, pkl_model = auto_ml_run.get_output()

### Register the Best Model

In [None]:
from azureml.core.model import Model
# Register the best run's 'outputs/model.onnx' file as a model named
# 'best-model-machine-cpu-automl-onnx' under the workspace.
# NOTE(review): `dataset` (passed as sample input) is the training dataset and still contains
# the label column "ERP" — confirm this is the intended sample input.
model_automl_onnx = best_run.register_model(model_name='best-model-machine-cpu-automl-onnx',
                                    model_path='outputs/model.onnx',
                                    sample_input_dataset = dataset,
                                    model_framework=Model.Framework.ONNX, # Framework used to create the model.
                                    model_framework_version='1.3',      # Version of ONNX used to create the model.
                                    description='Onnx machine-cpu model')


In [None]:
from azureml.core.model import Model
# Register the best run's 'outputs/model.pkl' file as a model named
# 'best-model-machine-cpu-automl' under the workspace.
# NOTE(review): `dataset` (passed as sample input) is the training dataset and still contains
# the label column "ERP" — confirm this is the intended sample input.
model_automl_pkl = best_run.register_model(model_name='best-model-machine-cpu-automl',
                                    model_path='outputs/model.pkl',
                                    sample_input_dataset = dataset,                                    
                                    description='Machine-cpu model')

In [None]:
# Download the best run's files under the 'outputs' prefix into the local AutoML working directory.
best_run.download_files(prefix='outputs', output_directory=auto_ml_directory)