# Automated ML

## Dependencies 

All the dependencies needed to complete the project appear here.

In [30]:
import sys !conda install — yes — prefix {sys.prefix} py-xgboost !conda install -y anaconda py-xgboost
anaconda/envs/azureml_py36/lib/libxgboost.so: undefined symbol: XGBoosterUnserializeFromBuffer

SyntaxError: invalid syntax (<ipython-input-30-7c77293c9f1c>, line 1)

In [None]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset
from azureml.data.dataset_factory import TabularDatasetFactory

from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

from azureml.pipeline.steps import AutoMLStep

from azureml.widgets import RunDetails

import joblib

from azureml.core.environment import Environment 
from azureml.core.model import InferenceConfig 
from azureml.core.webservice import AciWebservice, Webservice
from azureml.core.model import Model


# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

## Workspace

The `config.json` file is downloaded from Azure environment and has to be in the project folder in order for this cell to run.

In [None]:
# Build the workspace handle from the local configuration file.
ws = Workspace.from_config()

# Show the identifying fields of the workspace, one per line.
print('\n'.join(str(field) for field in (ws.name, ws.resource_group, ws.location, ws.subscription_id)))

## Create an Azure ML experiment
I am creating an experiment named `heart-failure-prediction` and a folder to hold the training scripts. The script runs will be recorded under the experiment in Azure.

The best practice is to use a separate folder for each step's scripts and their dependent files, and to specify that folder as the `source_directory` for the step. This reduces the size of the snapshot created for the step (only that folder is snapshotted). Since a change to any file in the `source_directory` triggers a re-upload of the snapshot, keeping the folders separate also preserves step reuse when nothing in a step's `source_directory` has changed.


In [None]:
# Choose a name for the run history container in the workspace.

# Name of the experiment and the local folder later passed to AutoMLConfig as `path`.
experiment_name = 'heart-failure-prediction'
project_folder = './capstone-project'

# Experiment handle bound to the workspace loaded earlier.
experiment = Experiment(ws, experiment_name)
# NOTE(review): a bare expression that is not the last statement of a cell is
# not displayed by Jupyter, so this line has no visible effect.
experiment

# Start a logging run under the experiment.
# NOTE(review): `run` is not referenced again in the visible part of this
# notebook - confirm whether it is needed or should be completed explicitly.
run = experiment.start_logging()

## Create or Attach a cluster

We will need to create a [compute target](https://docs.microsoft.com/en-us/azure/machine-learning/concept-azure-machine-learning-architecture#compute-target) for the AutoML run. If the compute target (named `compute-cluster2` in this script) is not found, a new one is created using the default AmlCompute as the training compute resource.

In [None]:
# Reuse the AmlCompute cluster with the given name, or provision it when the
# lookup fails. max_nodes should be no greater than 4.

cpu_cluster_name = "compute-cluster2"

try:
    # Looking the cluster up by name raises ComputeTargetException when absent.
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    print('Creating a new compute cluster...')
    # min_nodes=1 is the node count polled for below; without it the
    # cluster's own scale settings would be used.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_DS3_v2',
        min_nodes=1,
        max_nodes=4
    )
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Block until provisioning has finished, streaming progress to the cell.
compute_target.wait_for_completion(show_output=True)

# Detailed status of the cluster as it is now.
print(compute_target.get_status().serialize())


## Dataset

### Overview

The dataset used is taken from [Kaggle](https://www.kaggle.com/andrewmvd/heart-failure-clinical-data) and the data comes from 299 patients with heart failure collected at the Faisalabad Institute of Cardiology and at the Allied Hospital in Faisalabad (Punjab, Pakistan), during April–December 2015. The patients consisted of both women (105) and men (194), and the main task of the project is to classify the patients based on their odds of survival.

Dataset features:

| Feature | Explanation |
| :---: | :---: |
| *age* | Age of patient |
| *anaemia* | Decrease of red blood cells or hemoglobin |
| *creatinine_phosphokinase* | Level of the CPK enzyme in the blood |
| *diabetes* | Whether the patient has diabetes or not |
| *ejection_fraction* | Percentage of blood leaving the heart at each contraction |
| *high_blood_pressure* | Whether the patient has hypertension or not |
| *platelets* | Platelets in the blood |
| *serum_creatinine* | Level of creatinine in the blood |
| *serum_sodium* | Level of sodium in the blood |
| *sex* | Female (F) or Male (M) |
| *smoking* | Whether the patient smokes or not |
| *time* | Follow-up period |
| *DEATH_EVENT* | Whether the patient died during the follow-up period |


In [None]:
# Local copy of the CSV; used only for the quick preview printed below.
data = pd.read_csv('./heart_failure_clinical_records_dataset.csv')

key = "heart-failure-prediction"
description_text = "Prediction of survival of patients with heart failure"

# Reuse the dataset registered under `key` when the workspace already has it;
# otherwise build a tabular dataset from the hosted CSV and register it.
found = key in ws.datasets
if found:
    dataset = ws.datasets[key]
else:
    my_dataset = 'https://raw.githubusercontent.com/ddgope/Udacity-Capstone-Heart-Failure-Prediction/master/heart_failure_clinical_records_dataset.csv'
    dataset = Dataset.Tabular.from_delimited_files(my_dataset)
    dataset = dataset.register(workspace=ws,
                               name=key,
                               description=description_text)

# Preview of the first five rows
print(data.head())

# Explore data
print(data.describe())

df = dataset.to_pandas_dataframe()
# Jupyter only displays the last expression of a cell, so the summary has to
# be printed explicitly to be visible here.
print(df.describe())

# Data columns
df.columns = ['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes', 'ejection_fraction', 'high_blood_pressure', 'platelets', 'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time', 'DEATH_EVENT']

# Features are every column except the label; deriving them from `df` avoids
# keeping a second copy of the column list in sync with the one above.
x = df.drop(columns=['DEATH_EVENT'])
y = df[['DEATH_EVENT']]


## AutoML Configuration

Here is an overview of the `automl` settings and configuration I used for the AutoML run:

`"n_cross_validations": 2`

This parameter sets the number of cross-validation folds (the number of subsets the data is split into). Because a single train/validation split could result in overfitting, I chose 2 folds; the reported metrics are therefore the average of the 2 validation metrics.

`"primary_metric": 'accuracy'`

I chose accuracy as the primary metric as it is the default metric used for classification tasks.

`"enable_early_stopping": True`

This enables early termination when the score is not improving in the short term. In this experiment it could also be omitted, because _experiment_timeout_minutes_ is already defined below.

`"max_concurrent_iterations": 4`

It represents the maximum number of iterations that would be executed in parallel.

`"experiment_timeout_minutes": 25`

This is an exit criterion that defines how long, in minutes, the experiment may continue to run. To help avoid experiment time-out failures, I used a value of 25 minutes.

`"verbosity": logging.INFO`

The verbosity level for writing to the log file.

`compute_target = compute_target`

The Azure Machine Learning compute target to run the Automated Machine Learning experiment on.

`task = 'classification'`

This defines the experiment type which in this case is classification. Other options are _regression_ and _forecasting_.

`training_data = dataset`

The training data to be used within the experiment. It should contain both training features and a label column - see next parameter.

`label_column_name = 'DEATH_EVENT'` 

The name of the label column i.e. the target column based on which the prediction is done.

`path = project_folder`

The full path to the Azure Machine Learning project folder.

`featurization = 'auto'`

This parameter defines whether featurization step should be done automatically as in this case (_auto_) or not (_off_).

`debug_log = 'automl_errors.log'`

The log file to write debug information to.

`enable_onnx_compatible_models = False`

I chose not to enable enforcing the ONNX-compatible models at this stage. However, I will try it in the future. For more info on Open Neural Network Exchange (ONNX), please see [here](https://docs.microsoft.com/en-us/azure/machine-learning/concept-onnx).


In [None]:
# AutoML settings shared with AutoMLConfig through keyword expansion.
automl_settings = {
    "n_cross_validations": 2,
    "primary_metric": 'accuracy',
    "enable_early_stopping": True,
    "max_concurrent_iterations": 4,
    "experiment_timeout_minutes": 25,
    "verbosity": logging.INFO
}

# Parameters for AutoMLConfig.
# `blocked_models` takes a list of model names, so the single blocked model is
# wrapped in a list rather than passed as a bare string.
automl_config = AutoMLConfig(
    compute_target=compute_target,
    task='classification',
    training_data=dataset,
    label_column_name='DEATH_EVENT',
    path=project_folder,
    featurization='auto',
    debug_log="automl_errors.log",
    enable_onnx_compatible_models=False,
    blocked_models=['XGBoostClassifier'],
    **automl_settings
)

In [None]:
# Submit the experiment
# The AutoML configuration is submitted under the experiment with output
# streaming enabled; the returned run handle is reused by the cells below.
remote_run = experiment.submit(automl_config, show_output = True)
# Wait for the submitted run to complete before moving on.
remote_run.wait_for_completion()

Running on remote.
Running on remote compute: compute-cluster2
Parent Run ID: AutoML_afc8f16a-8a6b-41e9-ac0d-958c70470a91



In [None]:
# get_status()
# Fetch the latest status of the AutoML parent run and print it.
# Once the run has finished successfully it should show 'Completed'.
print("Run Status: ",remote_run.get_status())

## Run Details

In the cell below, I use the `RunDetails` widget and show the children runs of the experiment.

In [None]:

# Render the run-details widget for the AutoML parent run.
RunDetails(remote_run).show()

# Print every child run, each one preceded by a separator line.
for child_run in remote_run.get_children():
    print('===================================================', child_run, sep='\n')


## Best Model
In the cell below, I get the best model from the automl experiment and display all the properties of the model.

In [None]:

# Best child run of the AutoML experiment together with its fitted model.
best_run, fitted_model = remote_run.get_output()

# Print the run's metrics, details and properties in that order. Each getter
# is only called when its turn comes, and a separator follows every report.
for report_label, fetch_report in (
    ("Best run metrics :", best_run.get_metrics),
    ("Best run details :", best_run.get_details),
    ("Best run properties :", best_run.get_properties),
):
    print(report_label, fetch_report())
    print('===================================================')

In [None]:
# get_output() returns a (run, fitted model) pair. Unpack it so that
# `best_run` is the run object itself; binding the whole tuple to `best_run`
# would break later calls such as best_run.get_file_names().
best_run, fitted_model = remote_run.get_output()


In [None]:
# Display what get_output() returns for the AutoML run (elsewhere in this
# notebook the result is unpacked as a best-run / fitted-model pair).
remote_run.get_output()

In [None]:
# List the files recorded for the best run. The call is not the last
# expression of the cell, so it is printed; otherwise Jupyter shows nothing.
print(best_run.get_file_names())

# Download the yaml file that includes the environment dependencies
best_run.download_file('outputs/conda_env_v_1_0_0.yml', 'env.yml')

In [None]:
# Download the model file
# Copies 'outputs/model.pkl' from the best run to the local file
# 'heart_disease_model.pkl'.

best_run.download_file('outputs/model.pkl', 'heart_disease_model.pkl')

In [None]:
# Print the fitted model object returned by get_output().
print(fitted_model)

In [None]:
# Display the best run as the cell's output.
best_run

In [None]:
# Save the best model
# Registers the best run's './outputs/' path under the model name
# "heart_disease_model.pkl".
# NOTE(review): the deployment cell registers a model under the same name
# again via remote_run.register_model - confirm both registrations are wanted.
# NOTE(review): confirm that './outputs/' (the whole folder, with a leading
# './') is the intended model_path rather than 'outputs/model.pkl'.
best_run.register_model(model_name = "heart_disease_model.pkl", model_path = './outputs/')

print(best_run)

## Best Model Based on Another Metric

Show the run and model with the highest **AUC_weighted** value, and the run and model with the highest **average_precision_score_weighted** value:

In [None]:
# For each metric in turn, fetch the run and model that get_output() selects
# for it and print both. After the loop, `best_run` and `fitted_model` refer
# to the result for the last metric listed.
for lookup_metric in ("AUC_weighted", "average_precision_score_weighted"):
    best_run, fitted_model = remote_run.get_output(metric=lookup_metric)
    print('========================================================')
    print(f"Based on {lookup_metric}: ", best_run)
    print(fitted_model)

## Deploy the Model

As the best model from the AutoML run has better accuracy than the one from the HyperDrive run, I deploy it: in the cell below I register the model, create an inference config, and deploy the model as a web service.

In [None]:
# Register the AutoML run's model and report the resulting model id.
model = remote_run.register_model(model_name='heart_disease_model.pkl')
print(remote_run.model_id)

# https://knowledge.udacity.com/questions/463620

# The scoring script is downloaded from the best run's outputs, and the run's
# own environment is reused for inference.
entry_script = 'inference/scoring.py'
best_run.download_file('outputs/scoring_file_v_1_0_0.py', entry_script)
environment = best_run.get_environment()

inference_config = InferenceConfig(
    entry_script=entry_script,
    environment=environment
)

# Deploying the model via ACI WebService
# https://github.com/MicrosoftDocs/azure-docs/blob/master/articles/machine-learning/how-to-deploy-azure-container-instance.md

# One CPU core and 1 GB of memory, with key authentication and
# Application Insights switched on.
deployment_config = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
    auth_enabled=True,
    enable_app_insights=True
)

service = Model.deploy(
    workspace=ws,
    name="aciservice",
    models=[model],
    inference_config=inference_config,
    deployment_config=deployment_config
)
service.wait_for_deployment(show_output=True)


In [None]:
# Getting the service state
# The scoring URI & the primary authentication key are copied to the endpoint.py file in order to test the deployed service.
# The Swagger URI can be used in Swagger UI: https://petstore.swagger.io/ For more info, please see the relevant part in the README file.

# Authentication is enabled, so I use the get_keys method to retrieve the primary and secondary authentication keys:
primary, secondary = service.get_keys()

# Each value is concatenated onto its label, so every attribute must be a string.
print('Service state: ' + service.state)
print('Service scoring URI: ' + service.scoring_uri)
print('Service Swagger URI: ' + service.swagger_uri)
# NOTE(review): this writes the authentication key into the notebook output;
# clear the output before sharing or committing the notebook.
print('Service primary authentication key: ' + primary)


# Consume the Endpoint

In [None]:
# Sending a request to the deployed web service to test it: consuming model endpoint
# Uses the IPython %run magic to execute the endpoint.py script.
%run endpoint.py

In [None]:
# Store the scoring URI and the authentication key in variables.
# Both are read from the deployed `service` object instead of being pasted in
# as literals: a key written into the notebook is a leaked credential, and a
# pasted URI goes stale whenever the service is redeployed.
scoring_uri = service.scoring_uri

# get_keys() returns the (primary, secondary) pair; the primary key is used.
key = service.get_keys()[0]

In [None]:
import requests
import json

# `scoring_uri` and `key` are set by the previous cell. The URI should look
# similar to:
# 'http://8530a665-66f3-49c8-a953-b82a2d312917.eastus.azurecontainer.io/score'

# Two sets of data to score, so we get two results back
data = {"data":
        [
          {
           "age": 75,
           "anaemia": 0,
           "creatinine_phosphokinase": 582,
           "diabetes": 0,
           "ejection_fraction": 20,
           "high_blood_pressure": 1,
           "platelets": 265000,
           "serum_creatinine": 1.9,
           "serum_sodium": 130,
           "sex": 1,
           "smoking": 0,
           "time": 4
          },
          {
           "age": 30,
           "anaemia": 0,
           "creatinine_phosphokinase": 2656,
           "diabetes": 1,
           "ejection_fraction": 30,
           "high_blood_pressure": 0,
           "platelets": 305000,
           "serum_creatinine": 2.3,
           "serum_sodium": 130,
           "sex": 1,
           "smoking": 0,
           "time": 30
          },
      ]
    }
# Convert to JSON string and keep a copy on disk as data.json
input_data = json.dumps(data)
with open("data.json", "w") as _f:
    _f.write(input_data)

# JSON content type plus the bearer key, since authentication is enabled on
# the service.
headers = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {key}',
}

# Make the request. The timeout keeps the cell from hanging forever on an
# unreachable endpoint, and raise_for_status() surfaces HTTP errors (for
# example a rejected key) instead of failing later while decoding the body.
resp = requests.post(scoring_uri, data=input_data, headers=headers, timeout=60)
resp.raise_for_status()

# Display the response
print(resp.json())
print("++++++++++++++++++++++++++++++")
print("Expected result: [true, true], where 'true' means '1' as result in the 'DEATH_EVENT' column")

In [None]:
# Printing the logs
# Fetches the logs of the deployed web service and prints them.
print(service.get_logs())

## Deleting the service
Putting the deletion of the service in a separate cell to avoid accidentally running the cell before finishing the tasks

In [None]:
# Uncomment to delete the deployed web service. The call goes through the
# lowercase `service` handle returned by Model.deploy; no name `Service` is
# defined in this notebook.
# service.delete()
