# Automated ML

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [1]:
import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset
from azureml.pipeline.steps import AutoMLStep

import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources
from matplotlib import pyplot as plt

import logging
import os
import csv

from scipy import stats
from scipy.stats import skew, boxcox_normmax
from scipy.special import boxcox1p

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.18.0


## 1. Dataset

### 1.1 Overview

✅ In this notebook we are going to use the Cardiovascular Disease dataset from Kaggle. Cardiovascular Disease dataset is a Kaggle Dataset the containts history of health status of some persons. A group of them suffered a heart attackt. So using this dataset we can train a model in order to predict if a person could suffer a heart attack.

We can download the data from Kaggle page (https://www.kaggle.com/sulianova/cardiovascular-disease-dataset). In this case, I've download the data in the /data directory. So then we have to register this Dataset.

In [2]:
fileCardioData = 'kaggle/cardio_train.csv'
df = pd.read_csv(fileCardioData, encoding='latin')
df.head(2)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1


In [3]:
import os 
dataDir = 'data'

if not os.path.exists(dataDir):
    os.mkdir(dataDir)

fileData = dataDir + "/initialfile.parquet"

df.to_csv(fileData, index=False)

print("Data written to local folder")

Data written to local folder


### 1.2 Upload to Azure Blob

In [4]:
from azureml.core import Workspace

ws = Workspace.from_config()
print("Workspace: " + ws.name, "Region: " + ws.location, sep = '\n')

# Default datastore
default_store = ws.get_default_datastore() 

default_store.upload_files([fileData], 
                           target_path = 'cardio', 
                           overwrite = True, 
                           show_progress = True)

print("Upload completed")

Workspace: quick-starts-ws-126639
Region: southcentralus
Uploading an estimated of 1 files
Uploading data/initialfile.parquet
Uploaded data/initialfile.parquet, 1 files out of an estimated total of 1
Uploaded 1 files
Upload completed


### 1.3 Create and register datasets

In [5]:
from azureml.core import Dataset
cardio_data = Dataset.Tabular.from_delimited_files(default_store.path('cardio/initialfile.parquet'))

In [6]:
cardio_data = cardio_data.register(ws, 'cardio_data')

## 2. Setup Compute

In [7]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Choose a name for your CPU cluster
amlcompute_cluster_name = "compt-cluster"

# Verify that cluster does not exist already
try:
    aml_compute = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=4)
    aml_compute = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

aml_compute.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [8]:
# Define RunConfig for the compute
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Create a new runconfig object
aml_run_config = RunConfiguration()

# Use the aml_compute you created above. 
aml_run_config.target = aml_compute

# Enable Docker
aml_run_config.environment.docker.enabled = True

# Use conda_dependencies.yml to create a conda environment in the Docker image for execution
aml_run_config.environment.python.user_managed_dependencies = False

# Specify CondaDependencies obj, add necessary packages
aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['pandas','scikit-learn','numpy'], 
    pip_packages=['azureml-sdk[automl,explain]', 'scipy'])

print ("Run configuration created.")

Run configuration created.


# 3. Prepare Data

### 3.1 Cleaning Data

In [9]:
# initial columns to use
cols_touse = str(['age', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']).replace(",", ";")

In [10]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

# python scripts folder
prepare_data_folder = './scripts'

# Define output after cleansing step
cleaned_data = PipelineData("cleaned_data", datastore=default_store).as_dataset()

print('Cleaning script is in {}.'.format(os.path.realpath(prepare_data_folder)))

# Cleaning step creation
cleaningStep = PythonScriptStep(
    name="Clean Data",
    script_name="clean.py", 
    arguments=["--useful_columns", cols_touse,
               "--output_clean", cleaned_data],
    inputs=[cardio_data.as_named_input('raw_data')],
    outputs=[cleaned_data],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    source_directory=prepare_data_folder,
    allow_reuse=True
)

print("Clean Step created")

Cleaning script is in /mnt/batch/tasks/shared/LS_root/mounts/clusters/compt-inst/code/nd00333_AZMLND_Capstone_Project/scripts.
Clean Step created


### 3.2 Filtering Data

In [11]:
# Define output after merging step
filtered_data = PipelineData("filtered_data", datastore=default_store).as_dataset()

print('Filter script is in {}.'.format(os.path.realpath(prepare_data_folder)))

# filter step creation
# See the filter.py for details about input and output
filterStep = PythonScriptStep(
    name="Filter Data",
    script_name="filter.py", 
    arguments=["--output_filter", filtered_data],
    inputs=[cleaned_data.parse_parquet_files()],
    outputs=[filtered_data],
    compute_target=aml_compute,
    runconfig = aml_run_config,
    source_directory=prepare_data_folder,
    allow_reuse=True
)

print("Filter Step created")

Filter script is in /mnt/batch/tasks/shared/LS_root/mounts/clusters/compt-inst/code/nd00333_AZMLND_Capstone_Project/scripts.
Filter Step created


### 3.3 Transform Data

In [12]:
# Define output after transform step
transformed_data = PipelineData("transformed_data", datastore=default_store).as_dataset()

print('Transform script is in {}.'.format(os.path.realpath(prepare_data_folder)))

# transform step creation
# See the transform.py for details about input and output
transformStep = PythonScriptStep(
    name="Transform Data",
    script_name="transform.py", 
    arguments=["--output_transform", transformed_data],
    inputs=[filtered_data.parse_parquet_files()],
    outputs=[transformed_data],
    compute_target=aml_compute,
    runconfig = aml_run_config,
    source_directory=prepare_data_folder,
    allow_reuse=True
)

print("Transform Step created")

Transform script is in /mnt/batch/tasks/shared/LS_root/mounts/clusters/compt-inst/code/nd00333_AZMLND_Capstone_Project/scripts.
Transform Step created


### 3.4 Split Data into train and test sets

In [13]:
# train and test splits output
output_split_train = PipelineData("output_split_train", datastore=default_store).as_dataset()
output_split_test = PipelineData("output_split_test", datastore=default_store).as_dataset()

print('Data spilt script is in {}.'.format(os.path.realpath(prepare_data_folder)))

# test train split step creation
# See the train_test_split.py for details about input and output
testTrainSplitStep = PythonScriptStep(
    name="Train Test Data Split",
    script_name="train_test_split.py", 
    arguments=["--output_split_train", output_split_train,
               "--output_split_test", output_split_test],
    inputs=[transformed_data.parse_parquet_files()],
    outputs=[output_split_train, output_split_test],
    compute_target=aml_compute,
    runconfig = aml_run_config,
    source_directory=prepare_data_folder,
    allow_reuse=True
)

print("TrainTest Split Step created")

Data spilt script is in /mnt/batch/tasks/shared/LS_root/mounts/clusters/compt-inst/code/nd00333_AZMLND_Capstone_Project/scripts.
TrainTest Split Step created


## 4. AutoML 

In [14]:
from azureml.core import Experiment

experiment = Experiment(ws, 'AutoML-Pipeline')

print("Experiment created")

Experiment created


### 4.1 AutmoML Configuration

In [15]:
import logging
from azureml.train.automl import AutoMLConfig

automl_settings = {
    "experiment_timeout_minutes": 30,
    "max_concurrent_iterations": 5,
    "primary_metric" : 'AUC_weighted',
    "n_cross_validations": 5
}

training_dataset = output_split_train.parse_parquet_files()

automl_config = AutoMLConfig(compute_target=aml_compute,
                             model_explainability=True,
                             task = "classification",
                             training_data=training_dataset,
                             label_column_name="cardio",  
                             path = prepare_data_folder,
                             enable_early_stopping= True,
                             featurization= 'auto',#
                             debug_log = "automl_errors.log",
                             **automl_settings
                            )

print("AutoML config created")

AutoML config created


In [16]:
from azureml.pipeline.core import PipelineData, TrainingOutput

ds = ws.get_default_datastore()
metrics_output_name = 'metrics_output'
best_model_output_name = 'best_model_output'

metrics_data = PipelineData(name='metrics_data',
                           datastore=ds,
                           pipeline_output_name=metrics_output_name,
                           training_output=TrainingOutput(type='Metrics'))
model_data = PipelineData(name='model_data',
                           datastore=ds,
                           pipeline_output_name=best_model_output_name,
                           training_output=TrainingOutput(type='Model'))

In [17]:
from azureml.pipeline.steps import AutoMLStep

trainWithAutomlStep = AutoMLStep(name='AutoML_Classification',
                                 outputs=[metrics_data, model_data],
                                 automl_config=automl_config,
                                 allow_reuse=True)
print("trainWithAutomlStep created")

trainWithAutomlStep created


### 4.2 Pipeline

In [18]:
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

pipeline_steps = [trainWithAutomlStep]

pipeline = Pipeline(workspace = ws, steps=pipeline_steps)
print("Pipeline is built.")

Pipeline is built.


### 4.3 Experiment

In [19]:
pipeline_run = experiment.submit(pipeline, regenerate_outputs=False)

print("Pipeline submitted for execution.")

Created step AutoML_Classification [0d66a518][447ecce9-87c6-4add-9864-cd3a767a88b6], (This step will run and generate new outputs)
Created step Train Test Data Split [785b90b7][d5226027-cda1-42f8-8a8a-de8bf3dd71d4], (This step is eligible to reuse a previous run's output)
Created step Transform Data [fdfae2ea][3ccc5a3e-09ad-4bcd-b0cb-f773b3add7fc], (This step is eligible to reuse a previous run's output)
Created step Filter Data [fcbc072c][0a500117-5da0-4cff-957b-ecc10a4ab4d3], (This step is eligible to reuse a previous run's output)
Created step Clean Data [5556bce1][c31cd514-8e03-4788-bede-067c6e47d56a], (This step is eligible to reuse a previous run's output)
Submitted PipelineRun de46368c-746c-4d1c-aeda-48acd9444f22
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/AutoML-Pipeline/runs/de46368c-746c-4d1c-aeda-48acd9444f22?wsid=/subscriptions/2552278b-2817-43a7-820e-5a5a53ff9e19/resourcegroups/aml-quickstarts-126639/workspaces/quick-starts-ws-126639
Pipeline su

### 4.4 RunDetails

In [20]:
from azureml.widgets import RunDetails
RunDetails(pipeline_run).show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

In [21]:
# Before we proceed we need to wait for the run to complete.
pipeline_run.wait_for_completion(show_output=False)

PipelineRunId: de46368c-746c-4d1c-aeda-48acd9444f22
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/AutoML-Pipeline/runs/de46368c-746c-4d1c-aeda-48acd9444f22?wsid=/subscriptions/2552278b-2817-43a7-820e-5a5a53ff9e19/resourcegroups/aml-quickstarts-126639/workspaces/quick-starts-ws-126639
{'runId': 'de46368c-746c-4d1c-aeda-48acd9444f22', 'status': 'Completed', 'startTimeUtc': '2020-11-17T16:21:30.043251Z', 'endTimeUtc': '2020-11-17T17:28:59.863805Z', 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{}'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://mlstrg126639.blob.core.windows.net/azureml/ExperimentRun/dcid.de46368c-746c-4d1c-aeda-48acd9444f22/logs/azureml/executionlogs.txt?sv=2019-02-02&sr=b&sig=YHtuX4jNk2zZXl5bcKuuxb022ftF1dDZDCOgtdEMm%2Bo%3D&st=2020-11-17T17%3A12%3A00Z&se=2020-11-18T01%3A22%3A00Z&sp=r', 'logs/azureml/stderrlogs.txt': 'ht

'Finished'

### 4.5 Explore Results

In [22]:
# functions to download output to local and fetch as dataframe
def get_download_path(download_path, output_name):
    output_folder = os.listdir(download_path + '/azureml')[0]
    path =  download_path + '/azureml/' + output_folder + '/' + output_name
    return path

def fetch_df(step, output_name):
    output_data = step.get_output_data(output_name)    
    download_path = './outputs/' + output_name
    output_data.download(download_path, overwrite=True)
    df_path = get_download_path(download_path, output_name) + '/processed.parquet'
    return pd.read_parquet(df_path)

In [23]:
# Original data
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [24]:
# View cleaned data
result_clean_step = pipeline_run.find_step_run(cleaningStep.name)[0]
result_cleaned_df = fetch_df(result_clean_step, cleaned_data.name)
result_cleaned_df.head()

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,168,62.0,110,80,1,1,0,0,1,0
1,20228,156,85.0,140,90,3,1,0,0,1,1
2,18857,165,64.0,130,70,3,1,0,0,0,1
3,17623,169,82.0,150,100,1,1,0,0,1,1
4,17474,156,56.0,100,60,1,1,0,0,0,0


In [25]:
# View filtered data
result_filter_step = pipeline_run.find_step_run(filterStep.name)[0]
result_filtered_df = fetch_df(result_filter_step, filtered_data.name)
result_filtered_df.head()

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi
0,50.4,168.0,62.0,110.0,80.0,1,1,0,0,1,0.0,22.0
1,55.4,156.0,85.0,140.0,90.0,3,1,0,0,1,1.0,34.9
2,51.6,165.0,64.0,130.0,70.0,3,1,0,0,0,1.0,23.5
3,48.2,169.0,82.0,150.0,100.0,1,1,0,0,1,1.0,28.7
4,47.8,156.0,56.0,100.0,60.0,1,1,0,0,0,0.0,23.0


In [26]:
# View transformed data
result_transform_step = pipeline_run.find_step_run(transformStep.name)[0]
result_transformed_df = fetch_df(result_transform_step, transformed_data.name)
result_transformed_df.head()

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cardio,bmi,cholesterol_above normal,cholesterol_normal,cholesterol_well above normal,gluc_above normal,gluc_normal,gluc_well above normal,smoke_No,smoke_Yes,alco_No,alco_Yes,active_No,active_Yes
0,50.4,168.0,2.154936,1.49967,80.0,0.0,0.987265,0,1,0,0,1,0,1,0,1,0,0,1
1,55.4,156.0,2.221375,1.510934,90.0,1.0,1.004914,0,0,1,0,1,0,1,0,1,0,0,1
2,51.6,165.0,2.161949,1.50765,70.0,1.0,0.990253,0,0,1,0,1,0,1,0,1,0,1,0
3,48.2,169.0,2.214165,1.513858,100.0,1.0,0.998311,0,1,0,0,1,0,1,0,1,0,0,1
4,47.8,156.0,2.131942,1.494733,60.0,0.0,0.989298,0,1,0,0,1,0,1,0,1,0,1,0


In [27]:
# View training data used for AutoML
result_split_step = pipeline_run.find_step_run(testTrainSplitStep.name)[0]
result_training_df = fetch_df(result_split_step, output_split_train.name)
result_training_df.head()

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cardio,bmi,cholesterol_above normal,cholesterol_normal,cholesterol_well above normal,gluc_above normal,gluc_normal,gluc_well above normal,smoke_No,smoke_Yes,alco_No,alco_Yes,active_No,active_Yes
0,50.4,160.0,2.21901,1.503929,70.0,0.0,1.002944,0,1,0,0,1,0,1,0,1,0,0,1
1,57.2,165.0,2.165346,1.503929,80.0,0.0,0.99099,0,1,0,0,1,0,1,0,1,0,0,1
2,52.4,156.0,2.181339,1.510934,80.0,1.0,0.998439,0,1,0,0,1,0,1,0,1,0,0,1
3,43.7,157.0,2.198622,1.49967,80.0,0.0,1.000831,1,0,0,1,0,0,1,0,1,0,0,1
4,55.7,160.0,2.147614,1.510934,70.0,1.0,0.990065,0,1,0,0,1,0,1,0,1,0,1,0


In [28]:
# View the details of the AutoML run
from azureml.train.automl.run import AutoMLRun

for step in pipeline_run.get_steps():
    automl_step_run_id = step.id
    print(step.name)
    print(automl_step_run_id)
    break

automl_run = AutoMLRun(experiment = experiment, run_id=automl_step_run_id)
RunDetails(automl_run).show()

AutoML_Classification
2be8e96d-8c3b-409d-bc36-8c48663e0cf9


_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [29]:
# Retrieve all Child runs
children = list(automl_run.get_children())
metricslist = {}
for run in children:
    properties = run.get_properties()
    metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}
    metricslist[int(properties['iteration'])] = metrics

rundata = pd.DataFrame(metricslist).sort_index(1)
rundata

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
recall_score_macro,0.733675,0.734573,0.723761,0.727375,0.728049,0.728602,0.682087,0.720517,0.721055,0.718376,...,0.707492,0.688184,,0.733046,,,,,0.73487,0.735109
accuracy,0.734012,0.734959,0.724179,0.727821,0.728385,0.729023,0.682334,0.721119,0.721611,0.718479,...,0.707881,0.688852,,0.733411,,,,,0.735232,0.735414
precision_score_micro,0.734012,0.734959,0.724179,0.727821,0.728385,0.729023,0.682334,0.721119,0.721611,0.718479,...,0.707881,0.688852,,0.733411,,,,,0.735232,0.735414
AUC_weighted,0.800529,0.801659,0.789372,0.790539,0.790385,0.795693,0.75391,0.783692,0.783564,0.785359,...,0.773537,0.759714,,0.799472,,,,,0.802032,0.802015
f1_score_macro,0.733437,0.734271,0.72351,0.726925,0.727454,0.728117,0.676892,0.719994,0.720285,0.718245,...,0.707191,0.686273,,0.732815,,,,,0.734592,0.734884
recall_score_micro,0.734012,0.734959,0.724179,0.727821,0.728385,0.729023,0.682334,0.721119,0.721611,0.718479,...,0.707881,0.688852,,0.733411,,,,,0.735232,0.735414
recall_score_weighted,0.734012,0.734959,0.724179,0.727821,0.728385,0.729023,0.682334,0.721119,0.721611,0.718479,...,0.707881,0.688852,,0.733411,,,,,0.735232,0.735414
average_precision_score_macro,0.785507,0.786672,0.771321,0.771845,0.771873,0.779317,0.73636,0.765796,0.765941,0.766729,...,0.757907,0.745038,,0.783565,,,,,0.787117,0.787191
f1_score_weighted,0.733567,0.734414,0.723665,0.727087,0.727587,0.72827,0.67701,0.720208,0.720482,0.718315,...,0.707351,0.686531,,0.732952,,,,,0.734727,0.735005
norm_macro_recall,0.467349,0.469146,0.447521,0.45475,0.456099,0.457205,0.364174,0.441034,0.442109,0.436752,...,0.414984,0.376369,,0.466092,,,,,0.46974,0.470218


### 4.6 Best Model

In [30]:
# Get best model
best_run, fitted_model = automl_run.get_output()
print(best_run)
print(fitted_model)

Run(Experiment: AutoML-Pipeline,
Id: 2be8e96d-8c3b-409d-bc36-8c48663e0cf9_30,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                    min_impurity_decrease=0.0,
                                                                                                    min_impurity_split=None,
                  

In [31]:
# Save best model
import joblib
model_name = '/best_run_automl.pkl'
model_dir = 'outputs/' + 'model'
if not os.path.exists(model_dir):
    os.mkdir(model_dir)

filename = model_dir + model_name
joblib.dump(fitted_model, filename)

['outputs/model/best_run_automl.pkl']

### 4.7 Test Model

In [32]:
split_step = pipeline_run.find_step_run(testTrainSplitStep.name)[0]
x_test = fetch_df(split_step, output_split_test.name)[['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'bmi','cholesterol_above normal', 'cholesterol_normal','cholesterol_well above normal', 'gluc_above normal', 'gluc_normal','gluc_well above normal', 'smoke_No', 'smoke_Yes', 'alco_No','alco_Yes', 'active_No', 'active_Yes']]
y_test = fetch_df(split_step, output_split_test.name)[['cardio']]

In [33]:
y_predict = fitted_model.predict(x_test)
y_actual =  y_test.values.tolist()
pd.DataFrame({'Actual':y_actual, 'Predicted':y_predict}).head(4)

Unnamed: 0,Actual,Predicted
0,[1.0],1.0
1,[0.0],0.0
2,[0.0],0.0
3,[0.0],0.0


In [34]:
from sklearn.metrics import roc_auc_score,accuracy_score
print("AUC test AutoML model: " + str(roc_auc_score(y_test, y_predict)))

AUC test AutoML model: 0.7317006544128257
