# Automated ML

### 1. Import Dependencies

In [1]:
import csv
import os
from pprint import pprint

import numpy as np
import pandas as pd
import pkg_resources
from matplotlib import pyplot as plt
from sklearn import datasets

import azureml.core
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.train.sklearn import SKLearn
from azureml.widgets import RunDetails

# Report which Azure ML core SDK version this kernel is running.
print(f"SDK version: {azureml.core.VERSION}")

SDK version: 1.20.0


### 2. Initialize Workspace

In [2]:
# Load the workspace via Workspace.from_config() and list its identifying
# fields, one per line.
ws = Workspace.from_config()
for workspace_field in (ws.name, ws.resource_group, ws.location, ws.subscription_id):
    print(workspace_field)

quick-starts-ws-137761
aml-quickstarts-137761
southcentralus
81cefad3-d2c9-4f77-a466-99a7f541c7bb


### 3. Initialize Experiment

In [3]:
# `ws` is already loaded by the "Initialize Workspace" cell above, so it is
# reused here instead of calling Workspace.from_config() a second time.
experiment_name = 'auto-experiment'
experiment = Experiment(workspace=ws, name=experiment_name)

# Bare last expression so the notebook renders the experiment summary table.
experiment

Name,Workspace,Report Page,Docs Page
auto-experiment,quick-starts-ws-137761,Link to Azure Machine Learning studio,Link to Documentation


### 4. Create Compute Cluster

In [6]:
cpu_cluster_name = "hyperdrive-compu"
vm_size = 'STANDARD_D14_V2'
max_nodes = 10  # upper bound on the number of nodes the cluster may scale to

# Reuse the cluster when it already exists; otherwise provision a new one.
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # Only the "compute target not found" failure should lead to creating a
    # cluster. A bare `except:` would also swallow authentication/network
    # errors and KeyboardInterrupt and then try to provision new compute.
    compute_config = AmlCompute.provisioning_configuration(vm_size=vm_size, max_nodes=max_nodes)
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

# Can poll for a minimum number of nodes and for a specific timeout.
# If no min node count is provided it uses the scale settings for the cluster.
compute_target.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


### 5. Dataset

In [7]:
# Load the breast-cancer dataset bundled with scikit-learn, then show its
# feature-matrix shape, feature names and full description.
data = datasets.load_breast_cancer()
for summary_item in (data.data.shape, data.feature_names, data.DESCR):
    print(summary_item)

(569, 30)
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - peri

In [8]:
# One column per feature, plus the class label appended as `target`.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [9]:
# Relative frequency of each class label in the dataset.
class_share = pd.Series(data.target).value_counts(normalize=True)
class_share

1    0.627417
0    0.372583
dtype: float64

**Our dataset is slightly imbalanced: about 63% of the samples belong to class 1 and 37% to class 0.**

In [10]:
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.data.datapath import DataPath
# Upload the DataFrame under `datapath` on the workspace's default datastore
# and register it as a TabularDataset named `UCI_ML_Breast_Cancer`.
def_blob_store = ws.get_default_datastore()
print(f"Default datastore's name: {def_blob_store.name}")
data_path = DataPath(datastore=def_blob_store, path_on_datastore='datapath')
ds = TabularDatasetFactory.register_pandas_dataframe(
    df,
    target=data_path,
    name='UCI_ML_Breast_Cancer',
)

Method register_pandas_dataframe: This is an experimental method, and may change at any time.<br/>For more information, see https://aka.ms/azuremlexperimental.


Default datastore's name: workspaceblobstore
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to datapath/5e6d4177-8965-4e99-83a2-a9190e1be837/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


## AutoML Configuration:

Our task is `classification`: the model predicts the binary `target` column, and runs are ranked by `accuracy`.

In [22]:
# Run-level limits and the metric AutoML optimizes for.
automl_settings = dict(
    experiment_timeout_minutes=25,
    max_concurrent_iterations=10,
    primary_metric='accuracy',
)

# Classification over the registered dataset, predicting the `target` column
# on the compute cluster created earlier.
automl_config = AutoMLConfig(
    task='classification',
    compute_target=compute_target,
    training_data=ds,
    label_column_name="target",
    **automl_settings
)

## Run Details

In [23]:
# Submit the AutoML configuration as a new run of the experiment.
remote_run = experiment.submit(automl_config)

# Show the monitoring widget for the submitted run.
run_details_widget = RunDetails(remote_run)
run_details_widget.show()

Running on remote.


_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [24]:
# Wait for the AutoML run to finish, streaming its status messages to the cell
# output; the returned run-details dict is displayed as the cell result.
remote_run.wait_for_completion(show_output=True)


Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Cross validation
STATUS:       DONE
DESCRIPTION:  Each iteration of the trained model was validated through cross-validation.
              
DETAILS:      
+---------------------------------+
|Number of folds                  |
|10                               |
+---------------------------------+

****************************************************************************************************

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalanced

{'runId': 'AutoML_180ef008-9bb3-46e4-8e6e-64a66d835c40',
 'target': 'hyperdrive-compu',
 'status': 'Completed',
 'startTimeUtc': '2021-02-07T07:37:04.267154Z',
 'endTimeUtc': '2021-02-07T08:09:01.40534Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': None,
  'target': 'hyperdrive-compu',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"99baf4dd-e47f-485a-b409-20ff01bfc546\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"datapath/5e6d4177-8965-4e99-83a2-a9190e1be837/\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-137761\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"81cefad3-d2c9-4f77-

## Best Model

TODO: In the cell below, get the best model from the automl experiments and display all the properties of the model.



In [30]:
# Fetch the best child run together with its fitted pipeline.
best_run, fitted_model = remote_run.get_output()

# Metrics logged by the best run.
print(f"Best run metrics : {best_run.get_metrics()}")

# Details dictionary describing the best run.
print(f"Best run details : {best_run.get_details()}")

Package:azureml-automl-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-core, training version:1.21.0.post1, current version:1.20.0
Package:azureml-dataprep, training version:2.8.2, current version:2.7.3
Package:azureml-dataprep-native, training version:28.0.0, current version:27.0.0
Package:azureml-dataprep-rslex, training version:1.6.0, current version:1.5.0
Package:azureml-dataset-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-defaults, training version:1.21.0, current version:1.20.0
Package:azureml-interpret, training version:1.21.0, current version:1.20.0
Package:azureml-pipeline-core, training version:1.21.0, current version:1.20.0
Package:azureml-telemetry, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-client, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-runtime, training version:1.21.0, current version:1.20.0


Best run metrics : {'balanced_accuracy': 0.9854676440849343, 'f1_score_micro': 0.987625313283208, 'precision_score_micro': 0.987625313283208, 'AUC_macro': 0.9963579512143124, 'precision_score_weighted': 0.9883351195972386, 'AUC_weighted': 0.9963579512143124, 'recall_score_weighted': 0.987625313283208, 'average_precision_score_weighted': 0.9967590444840608, 'precision_score_macro': 0.988327067669173, 'average_precision_score_micro': 0.9964311117596619, 'weighted_accuracy': 0.9897722338333474, 'recall_score_micro': 0.987625313283208, 'f1_score_macro': 0.9864051261050873, 'log_loss': 0.06840767880916371, 'recall_score_macro': 0.9854676440849343, 'f1_score_weighted': 0.9875504672580133, 'norm_macro_recall': 0.9709352881698686, 'matthews_correlation': 0.9737259340055987, 'accuracy': 0.987625313283208, 'AUC_micro': 0.9963578569701195, 'average_precision_score_macro': 0.9965934318774204, 'accuracy_table': 'aml://artifactId/ExperimentRun/dcid.AutoML_180ef008-9bb3-46e4-8e6e-64a66d835c40_188/acc

In [31]:
# Show the last step of the fitted pipeline (the voting ensemble). This goes
# through the public `steps` list instead of the private `_final_estimator`
# attribute, which is an implementation detail that may change between versions.
fitted_model.steps[-1][1]

PreFittedSoftVotingClassifier(classification_labels=None,
                              estimators=[('166',
                                           Pipeline(memory=None,
                                                    steps=[('standardscalerwrapper',
                                                            <azureml.automl.runtime.shared.model_wrappers.StandardScalerWrapper object at 0x7fb08c774d68>),
                                                           ('logisticregression',
                                                            LogisticRegression(C=6866.488450042998,
                                                                               class_weight=None,
                                                                               dual=False,
                                                                               fit_intercept=True,
                                                                               intercept_scaling=1,...
             

In [32]:
# Plain-text repr of the whole fitted pipeline (data transformer + ensemble).
print(fitted_model)

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                        gamma='scale',
                                                                                        kernel='rbf',
                                                                                        max_iter=-1,
                                                                                      

In [33]:
# List every metric recorded for the best run as one `name value` pair per line.
best_run_metrics = best_run.get_metrics()
for metric_name, metric in best_run_metrics.items():
    print(metric_name, metric)

balanced_accuracy 0.9854676440849343
f1_score_micro 0.987625313283208
precision_score_micro 0.987625313283208
AUC_macro 0.9963579512143124
precision_score_weighted 0.9883351195972386
AUC_weighted 0.9963579512143124
recall_score_weighted 0.987625313283208
average_precision_score_weighted 0.9967590444840608
precision_score_macro 0.988327067669173
average_precision_score_micro 0.9964311117596619
weighted_accuracy 0.9897722338333474
recall_score_micro 0.987625313283208
f1_score_macro 0.9864051261050873
log_loss 0.06840767880916371
recall_score_macro 0.9854676440849343
f1_score_weighted 0.9875504672580133
norm_macro_recall 0.9709352881698686
matthews_correlation 0.9737259340055987
accuracy 0.987625313283208
AUC_micro 0.9963578569701195
average_precision_score_macro 0.9965934318774204
accuracy_table aml://artifactId/ExperimentRun/dcid.AutoML_180ef008-9bb3-46e4-8e6e-64a66d835c40_188/accuracy_table
confusion_matrix aml://artifactId/ExperimentRun/dcid.AutoML_180ef008-9bb3-46e4-8e6e-64a66d835c40

In [34]:
# Print detailed parameters of the fitted model
from pprint import pprint
def print_model(model, prefix=""):
    """Print each step of a pipeline-like model along with its parameters.

    Args:
        model: Object exposing ``steps``, an iterable of ``(name, step)`` pairs.
        prefix: Text prepended to every step name printed at this level.

    A step that has both ``estimators`` and ``weights`` attributes is treated
    as an ensemble: its member names and weights are printed, then each member
    is printed recursively with ``"<member name> - "`` as the prefix. Any other
    step is printed via its ``get_params()`` dictionary.
    """
    for step_name, step_obj in model.steps:
        print(prefix + step_name)

        is_ensemble = hasattr(step_obj, 'estimators') and hasattr(step_obj, 'weights')
        if not is_ensemble:
            pprint(step_obj.get_params())
            print()
            continue

        member_names = [member_name for member_name, _ in step_obj.estimators]
        pprint({'estimators': member_names, 'weights': step_obj.weights})
        print()
        # Members are labelled with their own name only; the caller's prefix
        # is not carried into the nested level.
        for member_name, member in step_obj.estimators:
            print_model(member, member_name + ' - ')

# Dump every step of the fitted pipeline, descending into the ensemble members.
print_model(fitted_model)

datatransformer
{'enable_dnn': None,
 'enable_feature_sweeping': None,
 'feature_sweeping_config': None,
 'feature_sweeping_timeout': None,
 'featurization_config': None,
 'force_text_dnn': None,
 'is_cross_validation': None,
 'is_onnx_compatible': None,
 'logger': None,
 'observer': None,
 'task': None,
 'working_dir': None}

prefittedsoftvotingclassifier
{'estimators': ['166', '119', '113', '111', '112', '13', '88', '83', '116'],
 'weights': [0.18181818181818182,
             0.09090909090909091,
             0.09090909090909091,
             0.09090909090909091,
             0.09090909090909091,
             0.18181818181818182,
             0.09090909090909091,
             0.09090909090909091,
             0.09090909090909091]}

166 - standardscalerwrapper
{'class_name': 'StandardScaler',
 'copy': True,
 'module_name': 'sklearn.preprocessing._data',
 'with_mean': True,
 'with_std': True}

166 - logisticregression
{'C': 6866.488450042998,
 'class_weight': None,
 'dual': False,
 'fi

## Model Deployment

Remember that you have to deploy only one of the two models you trained. Perform the steps in the rest of this notebook only if you wish to deploy this model.

In the cell below, register the model, create an inference config and deploy the model as a web service.

In [35]:
# Register the best run's serialized model and record its accuracy as a property.
model = best_run.register_model(
    model_name='automl_breast_cancer_predictor',
    model_path='outputs/model.pkl',
    properties={'Accuracy': best_run_metrics['accuracy']},
    tags={'Training context': 'Auto ML'},
)

print(model)

Model(workspace=Workspace.create(name='quick-starts-ws-137761', subscription_id='81cefad3-d2c9-4f77-a466-99a7f541c7bb', resource_group='aml-quickstarts-137761'), name=automl_breast_cancer_predictor, id=automl_breast_cancer_predictor:1, version=1, tags={'Training context': 'Auto ML'}, properties={'Accuracy': '0.987625313283208'})


In [36]:
# Fetch the generated scoring script and conda environment file from the best
# run's outputs, saving each under a short local name.
for remote_path, local_path in (
    ('outputs/scoring_file_v_1_0_0.py', 'score.py'),
    ('outputs/conda_env_v_1_0_0.yml', 'envFile.yml'),
):
    best_run.download_file(remote_path, local_path)

In [37]:
from azureml.core.model import InferenceConfig

# Pair the downloaded scoring script with the environment of the best run.
scoring_environment = best_run.get_environment()
inference_config = InferenceConfig(environment=scoring_environment, entry_script='score.py')

In [38]:
from azureml.core.webservice import AciWebservice
# ACI deployment settings: request 1 CPU core and 1 GB of memory for the service.
deployment_config = AciWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 1)

In [39]:
from azureml.core import Model
# Deploy the registered model as a web service and wait until it is up.
service_name = 'breast-cancer-endpoint'
service = Model.deploy(
    workspace=ws,
    name=service_name,
    models=[model],
    inference_config=inference_config,
    deployment_config=deployment_config,
)
service.wait_for_deployment(show_output=True)

# State, scoring endpoint and swagger document location, one per line.
print(service.state, service.scoring_uri, service.swagger_uri, sep='\n')

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running............................................
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy
http://2099fe8f-747d-425d-b77b-751b8871d5a9.southcentralus.azurecontainer.io/score
http://2099fe8f-747d-425d-b77b-751b8871d5a9.southcentralus.azurecontainer.io/swagger.json


In [40]:
# Bare expression so the notebook displays the deployed service object's repr.
service

AciWebservice(workspace=Workspace.create(name='quick-starts-ws-137761', subscription_id='81cefad3-d2c9-4f77-a466-99a7f541c7bb', resource_group='aml-quickstarts-137761'), name=breast-cancer-endpoint, image_id=None, compute_type=None, state=ACI, scoring_uri=Healthy, tags=http://2099fe8f-747d-425d-b77b-751b8871d5a9.southcentralus.azurecontainer.io/score, properties={}, created_by={'azureml.git.repository_uri': 'https://github.com/bhadreshpsavani/Breast-Cancer-Prediction-AzureML.git', 'mlflow.source.git.repoURL': 'https://github.com/bhadreshpsavani/Breast-Cancer-Prediction-AzureML.git', 'azureml.git.branch': 'main', 'mlflow.source.git.branch': 'main', 'azureml.git.commit': '65810cf926536ed5caecaa6b7515b0242ec21d38', 'mlflow.source.git.commit': '65810cf926536ed5caecaa6b7515b0242ec21d38', 'azureml.git.dirty': 'True', 'hasInferenceSchema': 'True', 'hasHttps': 'False'})

In the cell below, send a request to the web service you deployed to test it.

In [41]:
import json

# Draw two rows with a fixed seed so the request payload (and the response
# shown below) is the same on every run of the notebook.
test_df = df.sample(2, random_state=42)

# Remove the label from the payload; keep it so predictions can be compared
# against the true classes.
label_df = test_df.pop('target')

# Request body: {"data": [<row as feature-name -> value dict>, ...]}
test_sample = json.dumps({'data': test_df.to_dict(orient='records')})
print(test_sample)

{"data": [{"mean radius": 14.74, "mean texture": 25.42, "mean perimeter": 94.7, "mean area": 668.6, "mean smoothness": 0.08275, "mean compactness": 0.07214, "mean concavity": 0.04105, "mean concave points": 0.03027, "mean symmetry": 0.184, "mean fractal dimension": 0.0568, "radius error": 0.3031, "texture error": 1.385, "perimeter error": 2.177, "area error": 27.41, "smoothness error": 0.004775, "compactness error": 0.01172, "concavity error": 0.01947, "concave points error": 0.01269, "symmetry error": 0.0187, "fractal dimension error": 0.002626, "worst radius": 16.51, "worst texture": 32.29, "worst perimeter": 107.4, "worst area": 826.4, "worst smoothness": 0.106, "worst compactness": 0.1376, "worst concavity": 0.1611, "worst concave points": 0.1095, "worst symmetry": 0.2722, "worst fractal dimension": 0.06956}, {"mean radius": 11.29, "mean texture": 13.04, "mean perimeter": 72.23, "mean area": 388.0, "mean smoothness": 0.09834, "mean compactness": 0.07608, "mean concavity": 0.03265, 

In [42]:
%%time
import requests 

# Set the content type
headers = {'Content-type': 'application/json'}

# `timeout` bounds the call: without it, requests waits indefinitely and a
# stalled endpoint would hang the kernel.
response = requests.post(service.scoring_uri, data=test_sample, headers=headers, timeout=30)
print("response")
print(response.text)

# Fail loudly on a 4xx/5xx reply (after the body has been printed for
# debugging) instead of silently treating an error page as a prediction.
response.raise_for_status()

response
"{\"result\": [1, 1]}"
CPU times: user 1.39 ms, sys: 4.13 ms, total: 5.53 ms
Wall time: 125 ms


In the cells below, print the logs of the web service, then delete the service and the registered model.

In [43]:
# Print the deployed service's logs for troubleshooting.
print(service.get_logs())

2021-02-07T08:20:26,084274449+00:00 - gunicorn/run 
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
2021-02-07T08:20:26,085830974+00:00 - rsyslog/run 
2021-02-07T08:20:26,086069478+00:00 - iot-server/run 
2021-02-07T08:20:26,091234863+00:00 - nginx/run 
rsyslogd

In [94]:
# Clean-up: tear down the deployed web service.
service.delete()

In [95]:
# Clean-up: remove the registered model from the workspace.
model.delete()