# Automated ML


In [1]:
import pandas as pd
import numpy as np
import json
import joblib
import json
import requests
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import OneHotEncoder



In [2]:
from azureml.train.automl import AutoMLConfig
from azureml.core.run import Run
from azureml.widgets import RunDetails
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core import Dataset
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.webservice import Webservice
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.model import InferenceConfig

In [3]:
ws = Workspace.from_config()
experiment_name = 'heart_failure_automl'
experiment = Experiment(ws, experiment_name)

run = experiment.start_logging()

In [4]:
amlcompute_cluster_name = "cpu-cluster" 

try:
    compute_target = ComputeTarget(workspace = ws, name = amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_confisguration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

#compute_target.wait_for_completion(show_output=True, min_node_count = 1, timeout_in_minutes = 10)

Found existing cluster, use it.


## Dataset

### Overview
* Cardiovascular diseases are the number 1 cause of death globally, taking an estimated 17.9 million lives each year,  Heart failure is a common event caused by CVDs. This dataset contains 12 features that may contribute to the cause of heart failure. Such as age, diabetes, high_blood_pressure and anaemia. 

* The dataset consists of 299 training examples and 13 features, we aim to predict the feature DEATH_EVENT which may have the value 1 in case of death due to heart faulure and 0 in case of survival.
* The task we are performing here is **Classification** and I will use Azure's AutoML to find the model that best fits the data and has the highest accuracy. 

In [5]:
from azureml.core import Dataset
ds = Dataset.get_by_name(ws, name = "heart failure")
df = ds.to_pandas_dataframe()
df.head(3)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1



## AutoML Configuration

AutoML is a powerful tool that enables us to find the best model quickly. For my automl run I've used the following settings & configurations to find best combination of algorithms & hyperparameters:
* The primary metric is set to *accuracy*.
* The task is set to *classification* because we aim to get a binary result either 1 or 0, death or no death.
* We use the traing data we got from the dataset & we define the target column.
* Logs have been generated for debugging reasons. 
* Auto featurization is enabled, featurization includes automated feature engineering and scaling and normalization, which then impacts the selected algorithm and its hyperparameter values.
* Early stopping is enabled to save computational power. 
* Number of cross validation is set to 3.



In [6]:
# automl settings
automl_settings = {
    "experiment_timeout_minutes": 30,
    "max_concurrent_iterations": 5,
    "primary_metric" : 'accuracy'}

# automl config
automl_config = AutoMLConfig(compute_target = compute_target,
                             task = "classification",
                             training_data=ds,
                             label_column_name="DEATH_EVENT",   
                             featurization= 'auto',
                             n_cross_validations = 3,
                            enable_early_stopping= True,
                             debug_log = "automl_logs.log",
                             **automl_settings)

In [7]:
# Submitting the experiment
remote_run = experiment.submit(automl_config, show_output=True)

Running on remote.
No run_configuration provided, running on cpu-cluster with default configuration
Running on remote compute: cpu-cluster
Parent Run ID: AutoML_eb7abf71-5769-4cd0-840c-b1a76a5bcdfa

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values we

## Run Details

In [10]:
RunDetails(remote_run).show()
remote_run.wait_for_completion(show_output=True)

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…



****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

****************************************************************************************************

TYPE:         High cardinality feature detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and no high cardinality features were detected.
              Learn more abo

{'runId': 'AutoML_eb7abf71-5769-4cd0-840c-b1a76a5bcdfa',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-01-30T20:04:54.426521Z',
 'endTimeUtc': '2021-01-30T20:21:39.432415Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '3',
  'target': 'cpu-cluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"b8c3bd7e-f114-41de-a18e-f8bed1b65595\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"UI/01-30-2021_075753_UTC/heart_failure_clinical_records_dataset.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-136642\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"a0a76bad

## Best Model

In [11]:
best_run, model = remote_run.get_output()

#Printing the best run
print(model) 
print('\nBest Run Id: ', best_run.id)

#Printing the metric details of the best run
best_run_metrics = best_run.get_metrics()
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                               random_state=0,
                                                                                               reg_alpha=0,
                                                                                               reg_lambda=0.10416666666666667,
                                              

In [12]:
#Saving the best model
joblib.dump(model,'outputs/automl_model.pkl')

['outputs/automl_model.pkl']

## Model Deployment

* The AutoML model outperformed the model tuned by HyperDrive so we'll start deploying the best automl model. 

In [13]:
from azureml.core.model import Model
model = Model.register(workspace = ws,
                        model_path ="outputs/automl_model.pkl",
                        model_name = "automl_model")

Registering model automl_model


In [34]:
%%writefile score_automl.py
from azureml.core.model import Model
import numpy as np
import pandas as pd
import joblib
import json
import pickle
import os

def init():
    global model
    model_path = Model.get_model_path("automl_model")
    model = joblib.load(model_path)

def run(raw_data):
    try:
        data = json.loads(raw_data)['data']
        data = pd.DataFrame.from_dict(data)
        result = model.predict(data)
        return result.tolist()
    
    except Exception as ex:
        error = str(ex)
        return error

Overwriting score_automl.py


In [35]:
env = Environment.get(workspace=ws, name = "AzureML-AutoML")
best_run.download_file(constants.CONDA_ENV_FILE_PATH, 'myenv.yml')

In [36]:
from azureml.core.model import Model
from azureml.core.webservice import Webservice 
from azureml.core.webservice import  AciWebservice
from azureml.core.conda_dependencies import CondaDependencies
config_aci = AciWebservice.deploy_configuration(cpu_cores=1, 
                                               memory_gb=1, 
                                               enable_app_insights=True, 
                                               auth_enabled=True)



ws = Workspace.from_config()
model = Model(ws, 'automl_model')

inference_config = InferenceConfig(entry_script="score_automl.py", environment=env)

service_name = 'automl-service'
service = Model.deploy(workspace=ws, 
                       name=service_name, 
                       models=[model], 
                       inference_config=inference_config, 
                       deployment_config= config_aci)

service.wait_for_deployment(show_output=True)

print(service.state)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running...................................
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy


In [37]:
print(service.get_logs())

2021-01-30T21:20:29,717839000+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_7ade26eb614f97df8030bc480da59236/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_7ade26eb614f97df8030bc480da59236/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_7ade26eb614f97df8030bc480da59236/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_7ade26eb614f97df8030bc480da59236/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_7ade26eb614f97df8030bc480da59236/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
2021-01-30T21:20:29,732206300+00:00 - gunicorn/run 
2021-01-30T21:20:29,745464600+00:00 - iot-server/run 
2021-01-30T21:20:29,755118600+00:00 - rsyslog/run 
rsyslogd

In [38]:
print("scoring URI: " + service.scoring_uri)

print("Swagger URI: " + service.swagger_uri)

print("Authetication Key: " + service.get_keys()[0])

scoring URI: http://46c05dd0-9ce8-498b-affc-0507c6836f0c.southcentralus.azurecontainer.io/score
Swagger URI: http://46c05dd0-9ce8-498b-affc-0507c6836f0c.southcentralus.azurecontainer.io/swagger.json
Authetication Key: UG8HWWu33jOPOyuKKrdiA3QpZBvSbzue


In [39]:
primary_key, secondary_key = service.get_keys()
print(primary_key,'\n',secondary_key)

UG8HWWu33jOPOyuKKrdiA3QpZBvSbzue 
 8R84R2vBqAAoMWjSOKu4UhjCU7RDaBRk


TODO: In the cell below, send a request to the web service you deployed to test it.

In [40]:
key = primary_key
scoringuri = service.scoring_uri

data= { "data":
       [
           {
               'age': 60,
               'anaemia': 245,
               'creatinine_phosphokinase': 0,
               'diabetes': 0,
               'ejection_fraction': 38,
               'high_blood_pressure': 1,
               'platelets': 163000,
               'serum_creatinine': 50,
               'serum_sodium':100,
               'sex':1,
               'smoking':1,
               'time':7
               
               
           }
       ]
    }
input_data = json.dumps(data)

headers = {'Content-Type': 'application/json'}
headers['Authorization'] = f'Bearer {key}'

response = requests.post(scoringuri, input_data, headers = headers)
print(response.text)

[1]


TODO: In the cell below, print the logs of the web service and delete the service

In [42]:
ws = Workspace.from_config()

service = Webservice(name="automl-service", workspace=ws)
service.update(enable_app_insights=True)
logs = service.get_logs()
print(logs)

2021-01-30T21:20:29,717839000+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_7ade26eb614f97df8030bc480da59236/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_7ade26eb614f97df8030bc480da59236/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_7ade26eb614f97df8030bc480da59236/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_7ade26eb614f97df8030bc480da59236/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_7ade26eb614f97df8030bc480da59236/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
2021-01-30T21:20:29,732206300+00:00 - gunicorn/run 
2021-01-30T21:20:29,745464600+00:00 - iot-server/run 
2021-01-30T21:20:29,755118600+00:00 - rsyslog/run 
rsyslogd

In [46]:
#Deleting the service
service.delete()
print(service)

AciWebservice(workspace=Workspace.create(name='quick-starts-ws-136642', subscription_id='a0a76bad-11a1-4a2d-9887-97a29122c8ed', resource_group='aml-quickstarts-136642'), name=automl-service, image_id=None, compute_type=None, state=ACI, scoring_uri=Deleting, tags=http://46c05dd0-9ce8-498b-affc-0507c6836f0c.southcentralus.azurecontainer.io/score, properties={}, created_by={'hasInferenceSchema': 'False', 'hasHttps': 'False'})


In [47]:
#Deleting the compute cluster once the training process is complete
compute_target.delete()