# Automated ML

In [1]:
!pip install --upgrade azureml-sdk[explain,automl]

Requirement already up-to-date: azureml-sdk[automl,explain] in /anaconda/envs/azureml_py36/lib/python3.6/site-packages (1.33.0)


In [2]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset
from azureml.data.datapath import DataPath

from azureml.pipeline.steps import AutoMLStep

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.33.0


## Dataset

### Overview
For this Capstone project I will be using a dataset from Kaggle to predict the water potability based on a few technical metrics such as pH, hardness, solids, chloramines, sulfate, trihalomethanes, etc.
This is a classification exercise where the potability (target variable) can take the value of 1 for potable and 0 for not potable.

In [3]:
# Connect to the Azure ML workspace via Workspace.from_config().
ws = Workspace.from_config()

# The experiment and the AutoML project folder share the same name.
experiment_name = "automl_h2o_potability"
project_folder = "automl_h2o_potability"

experiment = Experiment(workspace=ws, name=experiment_name)

In [4]:
# Retrieve the dataset from the AzureML Workspace, or create/register it if it
# is not there yet.
# For the registration path to work, the data must be in a "data" folder
# alongside this Jupyter Notebook.
# The water_potability.csv file was downloaded from Kaggle.
key = "water_potability"
description_text = "Water Potability Dataset from Kaggle"

# Read the workspace's dataset listing once and reuse it for both the membership
# test and the lookup (presumably each access to ws.datasets re-queries the
# workspace, so a single read avoids a redundant round trip).
registered_datasets = ws.datasets
found = key in registered_datasets

if found:
    print("Dataset already registered!")
    dataset = registered_datasets[key]
else:
    # Upload the local "data" folder to the default datastore, build a tabular
    # dataset from the CSV, and register it in the Workspace under `key`.
    print("Creating and registering dataset in Workspace")
    default_ds = ws.get_default_datastore()
    default_ds.upload(src_dir="data", target_path="data")
    dataset = Dataset.Tabular.from_delimited_files(default_ds.path("data/water_potability.csv"))
    dataset = dataset.register(workspace=ws,
                               name=key,
                               description=description_text)

# Materialize as a pandas DataFrame; `df` is reused later to build test requests.
df = dataset.to_pandas_dataframe()
df.describe()

Creating and registering dataset in Workspace
Uploading an estimated of 1 files
Uploading data/water_potability.csv
Uploaded data/water_potability.csv, 1 files out of an estimated total of 1
Uploaded 1 files


Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,2785.0,3276.0,3276.0,3276.0,2495.0,3276.0,3276.0,3114.0,3276.0,3276.0
mean,7.080795,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786,0.39011
std,1.59432,32.879761,8768.570828,1.583085,41.41684,80.824064,3.308162,16.175008,0.780382,0.487849
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45,0.0
25%,6.093092,176.850538,15666.690297,6.127421,307.699498,365.734414,12.065801,55.844536,3.439711,0.0
50%,7.036752,196.967627,20927.833607,7.130299,333.073546,421.884968,14.218338,66.622485,3.955028,0.0
75%,8.062066,216.667456,27332.762127,8.114887,359.95017,481.792304,16.557652,77.337473,4.50032,1.0
max,14.0,323.124,61227.196008,13.127,481.030642,753.34262,28.3,124.0,6.739,1.0


## Compute Configuration

In [5]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# NOTE: update the cluster name to match the existing cluster
cluster_name = "h2o-pred-cluster"

try:
    # Reuse the cluster when one with this name is already in the workspace.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # Lookup failed: provision a new CPU cluster.
    # NOTE(review): min_nodes equals max_nodes, so presumably the cluster stays
    # at 10 nodes even when idle - confirm this is intended given the cost.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2",  # for GPU, use "STANDARD_NC6"
        # vm_priority="lowpriority",  # optional
        min_nodes=10,
        max_nodes=10,
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=cluster_name,
        provisioning_configuration=compute_config,
    )

# Block until at least one node is available, or 10 minutes have passed.
compute_target.wait_for_completion(show_output=True, min_node_count=1, timeout_in_minutes=10)

InProgress......
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded..............................................
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## AutoML Configuration

The next cell defines the configuration for the AutoML experiment.

The primary metric selected is AUC_weighted since it's an imbalanced dataset.

The maximum number of concurrent iterations is set to 9 since I provisioned a cluster with 10 nodes to speed up the process.

This is a classification exercise, therefore the "task" parameter of the AutoMLConfig is set accordingly.
The target column (label_column_name) is set to Potability.

In [6]:
# Run-level settings: time budget, parallelism, and the metric used to rank models.
automl_settings = dict(
    experiment_timeout_minutes=60,
    max_concurrent_iterations=9,
    primary_metric="AUC_weighted",
)

# Classification on the registered dataset, predicting the "Potability" column
# on the remote compute cluster.
automl_config = AutoMLConfig(
    task="classification",
    training_data=dataset,
    label_column_name="Potability",
    compute_target=compute_target,
    path=project_folder,
    featurization="auto",
    enable_early_stopping=True,
    debug_log="automl_errors.log",
    **automl_settings
)

In [7]:
# Submit the AutoML configuration to the experiment. The returned run handle is
# used below to monitor the run and to fetch the best model.
remote_run = experiment.submit(automl_config)

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
automl_h2o_potability,AutoML_8e6a87a0-c720-4cd8-9e8d-c9e0bd592376,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


## Run Details

In [8]:
from azureml.widgets import RunDetails
# Display the run-monitoring widget for the submitted AutoML run.
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

## Best Model



In [9]:
# get_output() is unpacked into the best child run and its fitted model
# (the cells below show the model is a scikit-learn Pipeline).
best_model_output, fitted_model = remote_run.get_output()

In [10]:
best_model_output

Experiment,Id,Type,Status,Details Page,Docs Page
automl_h2o_potability,AutoML_8e6a87a0-c720-4cd8-9e8d-c9e0bd592376_80,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [11]:
fitted_model.steps

[('datatransformer',
  DataTransformer(
      task='classification',
      is_onnx_compatible=False,
      enable_feature_sweeping=True,
      enable_dnn=False,
      force_text_dnn=False,
      feature_sweeping_timeout=86400,
      featurization_config=None,
      is_cross_validation=True,
      feature_sweeping_config={}
  )),
 ('prefittedsoftvotingclassifier',
  PreFittedSoftVotingClassifier(
      estimators=[('51', Pipeline(
          memory=None,
          steps=[('robustscaler', RobustScaler(
              copy=True,
              quantile_range=[25, 75],
              with_centering=False,
              with_scaling=True
          )), ('svcwrapper', SVCWrapper(
              random_state=None,
              C=494.1713361323828,
              break_ties=False,
              cache_size=200,
              class_weight=None,
              coef0=0.0,
              decision_function_shape='ovr',
              degree=3,
              gamma='scale',
              kernel='rbf',
        

In [12]:
type(fitted_model)

sklearn.pipeline.Pipeline

In [13]:
import pickle

# Serialize the fitted pipeline to a local file; this is the file that gets
# registered as the model and loaded by the scoring script further down.
with open("automl_best_model.pkl", "wb") as f:
    pickle.dump(fitted_model, f)

## Model Deployment

Since the AutoML model had a higher AUC_weighted metric when compared to the HyperDrive model, I will deploy it as a web service.

#### This cell will register the model in the AzureML workspace

In [14]:
from azureml.core.model import Model

# Register the pickled pipeline in the workspace under a fixed model name.
model = Model.register(
    workspace=ws,
    model_name="water_potability_prediction",
    model_path="automl_best_model.pkl",
    description="Classification AutoML model to predict water potability",
    tags={"data": "water potatibility kaggle", "model": "classification"},
)

Registering model water_potability_prediction


#### The following cells will get a reference to the AutoML environment that was used to train the best performing model

In [15]:
# Take the environment from the best child run; it is passed to the inference
# configuration when the model is deployed below.
environment = remote_run.get_best_child().get_environment()

In [16]:
environment

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210727.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": true,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "AzureML-AutoML",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
                "conda-forge",

In [17]:
# Save the environment definition to the local "automl_environment" directory,
# overwriting any previous copy.
environment.save_to_directory("automl_environment", overwrite=True)

#### The following cells will prepare the scoring script, the container instance and the inference configuration, which are then used to deploy the model as a web service

In [18]:
%%writefile score.py
import json
import numpy as np
import pandas as pd
import os
import pickle
import joblib

def init():
    """Load the registered model once, when the scoring service starts.

    Reads the model directory from the ``AZUREML_MODEL_DIR`` environment
    variable, unpickles ``automl_best_model.pkl`` from it, and stores the
    result in the module-level ``model`` that ``run`` uses.

    Raises:
        RuntimeError: if ``AZUREML_MODEL_DIR`` is not set.
    """
    global model
    model_dir = os.getenv('AZUREML_MODEL_DIR')
    if model_dir is None:
        # Without this guard os.path.join(None, ...) fails with an opaque TypeError.
        raise RuntimeError(
            "AZUREML_MODEL_DIR is not set; cannot locate automl_best_model.pkl"
        )
    model_path = os.path.join(model_dir, 'automl_best_model.pkl')
    # The artifact is written with pickle.dump, so read it back with pickle and
    # close the file deterministically. Only load model files you trust.
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)

def run(raw_data):
    """Score one request.

    Args:
        raw_data: JSON text whose ``"data"`` entry is converted to a pandas
            DataFrame and passed to the loaded model.

    Returns:
        The model's predictions converted to a plain Python list.
    """
    payload = json.loads(raw_data)
    frame = pd.DataFrame.from_dict(payload['data'])
    predictions = model.predict(frame)
    return predictions.tolist()

Writing score.py


In [19]:
from azureml.core.webservice import AciWebservice

# Resource sizing and metadata for the ACI web service deployment.
aciconfig = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
    description='Predict water potability',
    tags={"data": "water potatibility kaggle",  "method" : "sklearn"},
)

In [20]:
%%time
import uuid
from azureml.core.webservice import Webservice
from azureml.core.model import InferenceConfig
from azureml.core.environment import Environment
from azureml.core import Workspace
from azureml.core.model import Model

# Look up the registered model by name, and pair the scoring script with the
# environment taken from the best AutoML child run.
model = Model(ws, name='water_potability_prediction')
inference_config = InferenceConfig(environment=environment, entry_script="score.py")

# A short random suffix on the service name (presumably to avoid clashes with
# services left over from earlier runs).
service_name = 'water-potability-svc-' + uuid.uuid4().hex[:4]

service = Model.deploy(
    ws,
    service_name,
    [model],
    inference_config=inference_config,
    deployment_config=aciconfig,
)

# Block until the deployment finishes, streaming progress to the cell output.
service.wait_for_deployment(show_output=True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-08-24 22:38:00+00:00 Creating Container Registry if not exists..
2021-08-24 22:48:01+00:00 Registering the environment.
2021-08-24 22:48:01+00:00 Use the existing image.
2021-08-24 22:48:01+00:00 Generating deployment configuration.
2021-08-24 22:48:02+00:00 Submitting deployment to compute..
2021-08-24 22:48:06+00:00 Checking the status of deployment water-potability-svc-6e38..
2021-08-24 22:52:47+00:00 Checking the status of inference endpoint water-potability-svc-6e38.
Succeeded
ACI service creation operation finished, operation "Succeeded"
CPU times: user 19.1 s, sys: 768 ms, total: 19.8 s
Wall time: 14min 58s


#### The following cell will enable Azure Application Insights

In [21]:
# Turn on Application Insights for the service, then wait for the update to
# finish so the scoring request further down is not sent while the service is
# still being reconfigured.
service.update(enable_app_insights=True)
service.wait_for_deployment(show_output=True)

#### The following cells will call the web service passing 10 sample records from the dataframe loaded at the beginning of this notebook (which was also used to train the model)

In [22]:
import json
# Draw a seeded sample so the request payload, and the predictions compared
# against y_test below, are the same on every run.
SAMPLE_SEED = 42
X_test = df.sample(10, random_state=SAMPLE_SEED)
# pop() removes the label column from the features and keeps it for comparison.
y_test = X_test.pop("Potability")
# One JSON object per row, under the "data" key that score.py reads.
data = json.dumps({"data": X_test.to_dict(orient="records")})

In [23]:
json.loads(data)['data']

[{'ph': 7.264069096752565,
  'Hardness': 216.62239216240556,
  'Solids': 19412.11272261369,
  'Chloramines': 6.93245793887396,
  'Sulfate': nan,
  'Conductivity': 301.95355082979233,
  'Organic_carbon': 19.210990803311574,
  'Trihalomethanes': 67.71258467249298,
  'Turbidity': 3.9602821137582938},
 {'ph': 5.5419790193089264,
  'Hardness': 185.67187362857936,
  'Solids': 20143.06526175514,
  'Chloramines': 10.236175974570024,
  'Sulfate': 209.47105843525284,
  'Conductivity': 517.9357855247945,
  'Organic_carbon': 18.46401541280278,
  'Trihalomethanes': 49.37131866346893,
  'Turbidity': 3.6729759831771327},
 {'ph': 6.511618075241813,
  'Hardness': 181.87342172896433,
  'Solids': 29136.814623869737,
  'Chloramines': 7.685775096243834,
  'Sulfate': 345.75163845059745,
  'Conductivity': 475.3413506694927,
  'Organic_carbon': 12.321232157466568,
  'Trihalomethanes': 52.43761714005619,
  'Turbidity': 4.850433707754168},
 {'ph': 8.35761299314161,
  'Hardness': 163.09825445217052,
  'Solids': 

In [24]:
rest_endpoint = service.scoring_uri
rest_endpoint

'http://7f27e047-79eb-428d-8697-2ab6ccb36ec6.southcentralus.azurecontainer.io/score'

In [25]:
import requests

# `data` is already a JSON string, so declare the body type explicitly instead
# of sending it without a Content-Type header.
response = requests.post(rest_endpoint,
                         data=data,
                         headers={"Content-Type": "application/json"})

In [26]:
# Fail loudly, with the full response details, if the endpoint returned an error status.
try:
    response.raise_for_status()
except Exception:
    error_report = "\n".join([
        "Received bad response from the endpoint: {}".format(rest_endpoint),
        "Response Code: {}".format(response.status_code),
        "Headers: {}".format(response.headers),
        "Content: {}".format(response.content),
    ])
    raise Exception(error_report)

In [27]:
response.json()

[0, 1, 0, 1, 0, 0, 0, 0, 1, 0]

In [28]:
y_test

1997    1
385     1
181     0
3139    1
1386    0
2488    0
1253    0
2842    1
2832    1
445     0
Name: Potability, dtype: int64

#### The following cell will retrieve and display the logs from the web service

In [29]:
# Fetch the web service logs and print them one line at a time.
logs = service.get_logs()
print(*logs.split('\n'), sep='\n')

2021-08-24T23:07:15,489184700+00:00 - iot-server/run 
2021-08-24T23:07:15,495402500+00:00 - gunicorn/run 
Dynamic Python package installation is disabled.
Starting HTTP server
2021-08-24T23:07:15,499393300+00:00 - rsyslog/run 
2021-08-24T23:07:15,535897200+00:00 - nginx/run 
rsyslogd: /azureml-envs/azureml_3489174eb648a475780c9959ff366072/lib/libuuid.so.1: no version information available (required by rsyslogd)
EdgeHubConnectionString and IOTEDGE_IOTHUBHOSTNAME are not set. Exiting...
2021-08-24T23:07:15,955117200+00:00 - iot-server/finish 1 0
2021-08-24T23:07:15,966838400+00:00 - Exit code 1 is normal. Not restarting iot-server.
Starting gunicorn 20.1.0
Listening at: http://127.0.0.1:31311 (64)
Using worker: sync
worker timeout is set to 300
Booting worker with pid: 91
SPARK_HOME not set. Skipping PySpark Initialization.
Generating new fontManager, this may take some time...
Initializing logger
2021-08-24 23:07:18,642 | root | INFO | Starting up app insights client
logging socket was 

In [30]:
# Delete the deployed web service now that it has been tested.
service.delete()