# Automated ML

# Dependencies

All the dependencies needed to complete the project appear here


In [35]:

import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset
from azureml.data.dataset_factory import TabularDatasetFactory

from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

from azureml.pipeline.steps import AutoMLStep

from azureml.widgets import RunDetails

import joblib

from azureml.core.environment import Environment 
from azureml.core.model import InferenceConfig 
from azureml.core.webservice import AciWebservice, Webservice
from azureml.core.model import Model

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.22.0


# Workspace
The config.json file is downloaded from Azure environment and has to be in the project folder in order for this cell to run

In [32]:
ws = Workspace.get(name="quick-starts-ws-139954",
               subscription_id='f9d5a085-54dc-4215-9ba6-dad5d86e60a0',
               resource_group='aml-quickstarts-139954')
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')

quick-starts-ws-139954
aml-quickstarts-139954
southcentralus
f9d5a085-54dc-4215-9ba6-dad5d86e60a0


# Create an Azure ML experiment

I am creating an experiment named machine-failure-experiment and a folder to hold the training scripts. The script runs will be recorded under the experiment in Azure.

The best practice is to use separate folders for scripts and its dependent files for each step and specify that folder as the source_directory for the step. This helps reduce the size of the snapshot created for the step (only the specific folder is snapshotted). Since changes in any files in the source_directory would trigger a re-upload of the snapshot, this helps keep the reuse of the step when there are no changes in the source_directory of the step.

In [3]:

# Choose a name for the run history container in the workspace.

experiment_name = 'machine-failure-prediction'
project_folder = './capstone-project'

experiment = Experiment(ws, experiment_name)
experiment

run = experiment.start_logging()

# Create or Attach a cluster

We will need to create a compute target for the AutoML run. In case the compute target (named compute-cluster in this script) is not found, a new one is created using the default AmlCompute as the training compute resource

#Configuring the Compute Cluster

In [4]:

# max_nodes should be no greater than 4.

# Choose a name for the cluster
cpu_cluster_name = "cpu-cluster"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    print('Creating a new compute cluster...')
    # Poll for a minimum number of nodes (min_nodes = 1). 
    # If no min node count is provided it uses the scale settings for the cluster.
    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_DS3_v2', min_nodes=1, max_nodes=7)
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True)

# use get_status() to get a detailed status for the current cluster. 
print(compute_target.get_status().serialize())

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 1, 'targetNodeCount': 1, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 1, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-03-06T17:18:15.888000+00:00', 'errors': None, 'creationTime': '2021-03-06T17:16:37.261588+00:00', 'modifiedTime': '2021-03-06T17:16:52.718986+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 1, 'maxNodeCount': 8, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_DS3_V2'}


## Dataset

### Overview
The dataset is taken from kaggle : "https://www.kaggle.com/c/machine-failure-prediction/data?select=train.csv".



In [12]:

data = pd.read_csv('./train.csv')

found = False
key = "machine-failure-dataset"
description_text = "Prediction of machine failure using various parameters"

if key in ws.datasets.keys(): 
        found = True
        dataset = ws.datasets[key] 

if not found:
        # Create AML Dataset and register it into Workspace
        my_dataset = 'https://www.kaggle.com/c/machine-failure-prediction/data?select=train.csv'
        dataset = Dataset.Tabular.from_delimited_files(my_dataset)        
        # Register Dataset in Workspace
        dataset = dataset.register(workspace=ws,
                                   name=key,
                                   description=description_text)
                                
# Preview of the first five rows
print(data.head())

# Explore data
print(data.describe())

df = dataset.to_pandas_dataframe()
df.describe()

# Data columns
df.columns = ['ID', 'Date', 'Temperature', 'Humidity','Operator','Measure1', 'Measure2','Measure3','Measure4','Measure5','Measure6','Measure7','Measure8','Measure9','Measure10','Measure11','Measure12','Measure13','Measure14','Measure15','Hours Since Previous Failure','Failure','?Date.year','?Date.month','?Date.day-of-month','?Date.day-of-week','?Date.hour','?Date.minute','?Date.second']
x = df[['ID', 'Date', 'Temperature', 'Humidity','Operator', 'Measure1', 'Measure2','Measure3','Measure4','Measure5','Measure6','Measure7','Measure8','Measure9','Measure10','Measure11','Measure12','Measure13','Measure14','Measure15','Hours Since Previous Failure','?Date.year','?Date.month','?Date.day-of-month','?Date.day-of-week','?Date.hour','?Date.minute','?Date.second']]
y = df[['Failure']]

   ID              Date  Temperature  Humidity   Operator  Measure1  Measure2  \
0   1  01-01-2016 00:00           67        82  Operator1       291         1   
1   2  01-01-2016 01:00           68        77  Operator1      1180         1   
2   3  01-01-2016 02:00           64        76  Operator1      1406         1   
3   4  01-01-2016 03:00           63        80  Operator1       550         1   
4   5  01-01-2016 04:00           65        81  Operator1      1928         1   

   Measure3  Measure4  Measure5  ...  Measure15  Hours Since Previous Failure  \
0         1      1041       846  ...       1842                            90   
1         1      1915      1194  ...        748                            91   
2         1       511      1577  ...       1689                            92   
3         1      1754      1834  ...        711                            93   
4         2      1326      1082  ...        507                            94   

   Failure  ?Date.year  ?D

## AutoML Configuration


AutoML Configuration
Here is an overview of the automl settings and configuration I used for the AutoML run:

"n_cross_validations": 2

This parameter sets how many cross validations to perform, based on the same number of folds (number of subsets). As one cross-validation could result in overfit, in my code I chose 2 folds for cross-validation; thus the metrics are calculated with the average of the 2 validation metrics.

"primary_metric": 'accuracy'

I chose accuracy as the primary metric as it is the default metric used for classification tasks.

"enable_early_stopping": True

It defines to enable early termination if the score is not improving in the short term. In this experiment, it could also be omitted because the _experiment_timeoutminutes is already defined below.

"max_concurrent_iterations": 4

It represents the maximum number of iterations that would be executed in parallel.

"experiment_timeout_minutes": 20

This is an exit criterion and is used to define how long, in minutes, the experiment should continue to run. To help avoid experiment time out failures, I used the value of 20 minutes.

"verbosity": logging.INFO

The verbosity level for writing to the log file.

compute_target = compute_target

The Azure Machine Learning compute target to run the Automated Machine Learning experiment on.

task = 'classification'

This defines the experiment type which in this case is classification. Other options are regression and forecasting.

training_data = dataset

The training data to be used within the experiment. It should contain both training features and a label column - see next parameter.

label_column_name = 'DEATH_EVENT'

The name of the label column i.e. the target column based on which the prediction is done.

path = project_folder

The full path to the Azure Machine Learning project folder.

featurization = 'auto'

This parameter defines whether featurization step should be done automatically as in this case (auto) or not (off).

debug_log = 'automl_errors.log

The log file to write debug information to.

enable_onnx_compatible_models = False

I chose not to enable enforcing the ONNX-compatible models at this stage. However, I will try it in the future. For more info on Open Neural Network Exchange (ONNX), please see here.

In [37]:

# Automl settings

automl_settings = {"n_cross_validations": 2,
                    "primary_metric": 'accuracy',
                    "enable_early_stopping": True,
                    "max_concurrent_iterations": 4,
                    "experiment_timeout_minutes": 20,
                    "verbosity": logging.INFO
                    }

# Parameters for AutoMLConfig

automl_config = AutoMLConfig(compute_target = compute_target,
                            task='classification',
                            training_data=dataset,
                            label_column_name='Failure',
                            path = project_folder,
                            featurization= 'auto',
                            debug_log = "automl_errors.log",
                            enable_onnx_compatible_models=False,
                            blocked_models=['XGBoostClassifier'],
                            **automl_settings
                            )

In [38]:

# Submit the experiment

remote_run = experiment.submit(automl_config, show_output = True)
remote_run.wait_for_completion()

Running on remote.
No run_configuration provided, running on cpu-cluster with default configuration
Running on remote compute: cpu-cluster
Parent Run ID: AutoML_3b267046-c349-4020-a32f-ceb49235dc7e

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a fal

{'runId': 'AutoML_3b267046-c349-4020-a32f-ceb49235dc7e',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-03-06T19:11:03.5873Z',
 'endTimeUtc': '2021-03-06T19:26:54.894778Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '2',
  'target': 'cpu-cluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"0ce5c253-4931-442f-9429-895e8af1de39\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"UI/03-06-2021_051746_UTC/train.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-139954\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"f9d5a085-54dc-4215-9ba6-dad5d86e60a0\\\\\\"

## Run Details


In [39]:
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [40]:
# Fetch the latest status of the run. It should show 'Completed'

print("Run Status: ",remote_run.get_status())

Run Status:  Completed


# Run Details
In the cell below, I use the RunDetails widget and show the children runs of the experiment.

In [41]:
RunDetails(remote_run).show()

# Get details from each run
for child_run in remote_run.get_children():
    print('===================================================')
    print(child_run)

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

Run(Experiment: machine-failure-prediction,
Id: AutoML_3b267046-c349-4020-a32f-ceb49235dc7e_37,
Type: azureml.scriptrun,
Status: Completed)
Run(Experiment: machine-failure-prediction,
Id: AutoML_3b267046-c349-4020-a32f-ceb49235dc7e_36,
Type: azureml.scriptrun,
Status: Completed)
Run(Experiment: machine-failure-prediction,
Id: AutoML_3b267046-c349-4020-a32f-ceb49235dc7e_35,
Type: azureml.scriptrun,
Status: Canceled)
Run(Experiment: machine-failure-prediction,
Id: AutoML_3b267046-c349-4020-a32f-ceb49235dc7e_34,
Type: azureml.scriptrun,
Status: Canceled)
Run(Experiment: machine-failure-prediction,
Id: AutoML_3b267046-c349-4020-a32f-ceb49235dc7e_33,
Type: azureml.scriptrun,
Status: Canceled)
Run(Experiment: machine-failure-prediction,
Id: AutoML_3b267046-c349-4020-a32f-ceb49235dc7e_32,
Type: azureml.scriptrun,
Status: Completed)
Run(Experiment: machine-failure-prediction,
Id: AutoML_3b267046-c349-4020-a32f-ceb49235dc7e_31,
Type: azureml.scriptrun,
Status: Completed)
Run(Experiment: machine

## Best Model




In [42]:


automl_best_run, fitted_automl_model = remote_run.get_output() 
print("Fitted model: ", fitted_automl_model.steps[-1])
print("---"*5)
print("AutoML best run accuracy: ", automl_best_run.get_metrics(name="accuracy"))
print("---"*5)
print("AutoML run Summary: ", remote_run.summary())

Package:azureml-automl-runtime, training version:1.23.0, current version:1.22.0
Package:azureml-core, training version:1.23.0, current version:1.22.0
Package:azureml-dataprep, training version:2.10.1, current version:2.9.1
Package:azureml-dataprep-native, training version:30.0.0, current version:29.0.0
Package:azureml-dataprep-rslex, training version:1.8.0, current version:1.7.0
Package:azureml-dataset-runtime, training version:1.23.0, current version:1.22.0
Package:azureml-defaults, training version:1.23.0, current version:1.22.0
Package:azureml-interpret, training version:1.23.0, current version:1.22.0
Package:azureml-mlflow, training version:1.23.0, current version:1.22.0
Package:azureml-pipeline-core, training version:1.23.0, current version:1.22.0
Package:azureml-telemetry, training version:1.23.0, current version:1.22.0
Package:azureml-train-automl-client, training version:1.23.0, current version:1.22.0
Package:azureml-train-automl-runtime, training version:1.23.0, current versio

Fitted model:  ('SGDClassifierWrapper', SGDClassifierWrapper(alpha=5.3061693877551015, class_weight=None, eta0=0.01,
                     fit_intercept=False, l1_ratio=0.7959183673469387,
                     learning_rate='invscaling', loss='modified_huber',
                     max_iter=1000, n_jobs=1, penalty='none', power_t=0,
                     random_state=None, tol=0.0001))
---------------
AutoML best run accuracy:  {'accuracy': 0.9940541558146275}
---------------
AutoML run Summary:  [['StackEnsemble', 1, 0.9926627114547348], ['VotingEnsemble', 1, 0.993927637596004], ['Failed', 2, nan], ['ExtremeRandomTrees', 7, 0.9905120617662391], ['SGD', 3, 0.9940541558146275], ['LightGBM', 14, 0.9917771479356119], ['LogisticRegression', 3, 0.9935481789569958], ['RandomForest', 6, 0.9905120617662391], ['GradientBoosting', 1, 0.9905120617662391]]


In [47]:
automl_best_run.get_file_names()

# Download the yaml file that includes the environment dependencies
automl_best_run.download_file('outputs/conda_env_v_1_0_0.yml', 'env.yml')

In [14]:
# Download the model file

best_run.download_file('outputs/model.pkl', 'Automl_model.pkl')

PreFittedSoftVotingClassifier(classification_labels=None,
                              estimators=[('0',
                                           Pipeline(memory=None,
                                                    steps=[('maxabsscaler',
                                                            MaxAbsScaler(copy=True)),
                                                           ('lightgbmclassifier',
                                                            LightGBMClassifier(boosting_type='gbdt',
                                                                               class_weight=None,
                                                                               colsample_bytree=1.0,
                                                                               importance_type='split',
                                                                               learning_rate=0.1,
                                                                               max_

In [46]:
automl_best_run.download_file('outputs/model.pkl', 'Automl_model1.pkl')

In [48]:
print(fitted_automl_model)

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('MaxAbsScaler', MaxAbsScaler(copy=True)),
                ('SGDClassifierWrapper',
                 SGDClassifierWrapper(alpha=5.3061693877551015,
                                      class_weight=None, eta0=0.01,
                                      fit_intercept=False,
                                      l1_ratio=0.7959183673469387,
                                      learning_rate='invscaling',
                     

In [50]:
automl_best_run

Experiment,Id,Type,Status,Details Page,Docs Page
machine-failure-prediction,AutoML_3b267046-c349-4020-a32f-ceb49235dc7e_5,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [51]:
# Save the best model
automl_best_run.register_model(model_name = "best_run_automl.pkl", model_path = './outputs/')

print(automl_best_run)

Run(Experiment: machine-failure-prediction,
Id: AutoML_3b267046-c349-4020-a32f-ceb49235dc7e_5,
Type: azureml.scriptrun,
Status: Completed)


## Model Deployment



As the best model coming from AutoML run has better accuracy than the one coming from the HyperDrive run, I deploy it in the cell below, register it, create an inference config and deploy the model as a web service.

In [53]:

model = remote_run.register_model(model_name = 'best_run_automl.pkl')
print(remote_run.model_id)

# https://knowledge.udacity.com/questions/463620

environment = automl_best_run.get_environment()
entry_script='inference/scoring.py'
automl_best_run.download_file('outputs/scoring_file_v_1_0_0.py', entry_script)


inference_config = InferenceConfig(entry_script = entry_script, environment = environment)

# Deploying the model via ACI WebService
# https://github.com/MicrosoftDocs/azure-docs/blob/master/articles/machine-learning/how-to-deploy-azure-container-instance.md

deployment_config = AciWebservice.deploy_configuration(cpu_cores = 1, 
                                                    memory_gb = 1, 
                                                    auth_enabled= True, 
                                                    enable_app_insights= True)

service = Model.deploy(ws, "aciservice", [model], inference_config, deployment_config)
service.wait_for_deployment(show_output = True)

best_run_automl.pkl
Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running....................................
Succeeded
ACI service creation operation finished, operation "Succeeded"


In [54]:

# Getting the service state
# The scorig URI & the primary authentication key are copied to the endpoint.py file in order to test the deployed service.
# The Swagger URI can be used in Swagger UI: https://petstore.swagger.io/ For more info, please see the relevant part in the README file.

# Authentication is enabled, so I use the get_keys method to retrieve the primary and secondary authentication keys:
primary, secondary = service.get_keys()

print('Service state: ' + service.state)
print('Service scoring URI: ' + service.scoring_uri)
print('Service Swagger URI: ' + service.swagger_uri)
print('Service primary authentication key: ' + primary)

Service state: Healthy
Service scoring URI: http://0374820d-e61c-400f-8887-1162cd267bea.southcentralus.azurecontainer.io/score
Service Swagger URI: http://0374820d-e61c-400f-8887-1162cd267bea.southcentralus.azurecontainer.io/swagger.json
Service primary authentication key: jX8tIlEfd0SLlf6aTKf8tWCRWrxAbIyX


In [59]:
# Sending a request to the deployed web service to test it: consuming model endpoint

%run endpoint.py

{"result": [false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, f

In [60]:

# Printing the logs
logs = service.get_logs()

for line in logs.split('\n'):
    print(line)

2021-03-06T19:37:29,060438000+00:00 - rsyslog/run 
2021-03-06T19:37:29,063052500+00:00 - gunicorn/run 
2021-03-06T19:37:29,060210700+00:00 - iot-server/run 
2021-03-06T19:37:29,110836700+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_661474bbe74e96b5d8added5888dfc85/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_661474bbe74e96b5d8added5888dfc85/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_661474bbe74e96b5d8added5888dfc85/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_661474bbe74e96b5d8added5888dfc85/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_661474bbe74e96b5d8added5888dfc85/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
rsyslogd

# Deleting the service
Putting the deletion of the service in a separate cell to avoid accidentally running the cell before finishing the tasks

In [None]:
service.delete()

In [None]:
conda install-c anaconda py-xgboost==0.90