# Automated ML

## Dependencies 

All the dependencies needed to complete the project appear here.

In [None]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset
from azureml.data.dataset_factory import TabularDatasetFactory

from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

from azureml.pipeline.steps import AutoMLStep

from azureml.widgets import RunDetails

import joblib

from azureml.core.environment import Environment 
from azureml.core.model import InferenceConfig 
from azureml.core.webservice import AciWebservice, Webservice
from azureml.core.model import Model


# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

## Workspace

The `config.json` file is downloaded from Azure environment and has to be in the project folder in order for this cell to run.

In [None]:
ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')

In [None]:
env = Environment.get(workspace=ws, name='AzureML-AutoML')
#This will create a new folder called environ with a .yml and a .json file inside it
env.save_to_directory('./environ', overwrite=True)

#Chekc environment dependencies
print("packages", env.python.conda_dependencies.serialize_to_string())

#get the environment Details and stored them into a file:
f = open("env.yml", "w")
f.write(env.python.conda_dependencies.serialize_to_string())
f.close()

## Create an Azure ML experiment
I am creating an experiment named `heart-failure-prediction` and a folder to hold the training scripts. The script runs will be recorded under the experiment in Azure.

The best practice is to use separate folders for scripts and its dependent files for each step and specify that folder as the `source_directory` for the step. This helps reduce the size of the snapshot created for the step (only the specific folder is snapshotted). Since changes in any files in the `source_directory` would trigger a re-upload of the snapshot, this helps keep the reuse of the step when there are no changes in the `source_directory` of the step.


In [None]:
# Choose a name for the run history container in the workspace.

experiment_name = 'heart-failure-prediction'
project_folder = './capstone-project'

experiment = Experiment(ws, experiment_name)
experiment

run = experiment.start_logging()

## Create or Attach a cluster

We will need to create a [compute target](https://docs.microsoft.com/en-us/azure/machine-learning/concept-azure-machine-learning-architecture#compute-target) for the AutoML run. In case the compute target (named `compute-cluster` in this script) is not found, a new one is created using the default AmlCompute as the training compute resource.

In [None]:
# max_nodes should be no greater than 4.

# Choose a name for the cluster
cpu_cluster_name = "compute-cluster2"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    print('Creating a new compute cluster...')
    # Poll for a minimum number of nodes (min_nodes = 1). 
    # If no min node count is provided it uses the scale settings for the cluster.
    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_DS3_v2', min_nodes=1, max_nodes=4)
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True)

# use get_status() to get a detailed status for the current cluster. 
print(compute_target.get_status().serialize())


## Dataset

In [None]:
import pandas as pd
data = pd.read_csv('./house-price-train-data.csv')

found = False
key = "house price train data"
description_text = "PHouse price train data"

if key in ws.datasets.keys(): 
        found = True
        dataset = ws.datasets[key] 

if not found:
        # Create AML Dataset and register it into Workspace
        my_dataset = 'https://raw.githubusercontent.com/ddgope/Kaggle-House-Price-Predication-Using-Azure-ML/master/house-price-train-data.csv'
        dataset = Dataset.Tabular.from_delimited_files(my_dataset)        
        # Register Dataset in Workspace
        dataset = dataset.register(workspace=ws,
                                   name=key,
                                   description=description_text)
                                
# Preview of the first five rows
#print(data.head())

# Explore data
#print(data.describe())

df = dataset.to_pandas_dataframe()
df.describe()

## AutoML Configuration


In [None]:
# Automl settings
#'normalized_mean_absolute_error, normalized_root_mean_squared_error, r2_score, spearman_correlation'.",

automl_settings = {"n_cross_validations": 5,
                    "primary_metric": 'normalized_root_mean_squared_error',
                    "enable_early_stopping": True,
                    "max_concurrent_iterations": 4,
                    "experiment_timeout_minutes": 60,           
                    "blocked_models":['XGBoostClassifier','XGBoostRegressor'],
                    "verbosity": logging.INFO                   
                    }


# Parameters for AutoMLConfig

automl_config = AutoMLConfig(compute_target = compute_target,
                            task='regression',
                            training_data=dataset,
                            label_column_name='SalePrice',
                            path = project_folder,
                            featurization= 'auto',
                            debug_log = "automl_errors.log",
                            enable_onnx_compatible_models=False,                            
                            **automl_settings
                            )

In [None]:
# Submit the experiment
remote_run = experiment.submit(automl_config, show_output = True)
remote_run.wait_for_completion()

In [None]:
# get_status()
# Fetch the latest status of the run. It should show 'Completed'
print("Run Status: ",remote_run.get_status())

## Run Details

In the cell below, I use the `RunDetails` widget and show the children runs of the experiment.

In [None]:

RunDetails(remote_run).show()

# Get details from each run
for child_run in remote_run.get_children():
    print('===================================================')
    print(child_run)


## Best Model
In the cell below, I get the best model from the automl experiment and display all the properties of the model.

In [None]:

#Additional Run Details
from azureml.widgets import RunDetails

RunDetails(remote_run).show()

# wait for completion
remote_run.wait_for_completion()

In [None]:
#Get the best model outputs
best_automl_run = remote_run.get_output()

In [None]:
#Get the best model outputs
best_automl_run, best_model = remote_run.get_output()


# Retrieve the best automl run model
print('Best AutoML run: ', best_automl_run)
print('Best AutoML model :', best_model)

# get best model and display properties
model_name = best_automl_run.properties['model_name']
print('Best_model name: ', model_name)

# display all the properties of the best model
best_automl_run.get_properties()

In [None]:
print(best_automl_run.id)

# Save the best AutoML model

In [None]:
import joblib

joblib.dump(best_model, 'house_price_model.pkl')

## Register and Deploy the Model

As the best model coming from AutoML run has better accuracy than the one coming from the HyperDrive run, I deploy it in the cell below, register it, create an inference config and deploy the model as a web service.

In [None]:
model = remote_run.register_model(model_name = 'house_price_model.pkl')
print(remote_run.model_id)

# https://knowledge.udacity.com/questions/463620

environment = best_automl_run.get_environment()
entry_script='inference/scoring.py'
best_automl_run.download_file('outputs/scoring_file_v_1_0_0.py', entry_script)

inference_config = InferenceConfig(entry_script = entry_script, environment = environment)

deployment_config = AciWebservice.deploy_configuration(cpu_cores = 1, 
                                                    memory_gb = 1, 
                                                    auth_enabled= True, 
                                                    enable_app_insights= True)

service = Model.deploy(ws, "aciservice", [model], inference_config, deployment_config)
service.wait_for_deployment(show_output = True)


In [None]:
# Getting the service state
# The scorig URI & the primary authentication key are copied to the endpoint.py file in order to test the deployed service.
# The Swagger URI can be used in Swagger UI: https://petstore.swagger.io/ For more info, please see the relevant part in the README file.

# Authentication is enabled, so I use the get_keys method to retrieve the primary and secondary authentication keys:
primary, secondary = service.get_keys()

print('Service state: ' + service.state)
print('Service scoring URI: ' + service.scoring_uri)
print('Service Swagger URI: ' + service.swagger_uri)
print('Service primary authentication key: ' + primary)


# Consume the Endpoint

In [45]:
#Store the uri's in variables:
scoring_uri = 'http://ad741a18-f19f-4695-88fe-bf01ed81d1ad.southcentralus.azurecontainer.io/score'

key = 'wSG9Jh4NfEODMbQ8g0RiVAyiKPahAl7I'

In [56]:
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core import Workspace, Dataset
import pandas as pd
df_Test=pd.read_csv("./house-price-test-data.csv")
df_Test.head()
#df_Test.shape

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [57]:
# #input data
# input_data = df_Test.to_json(orient="table",index=False)
# #print(input_data)
# input_data=json.dumps({"data": json.loads(input_data)["data"]},indent=4)
# input_data

In [58]:
#let's test requests:
import json
import requests

scoring_uri = scoring_uri
key = key

headers = {'Content-Type':'application/json'}
headers['Authorization'] = f'Bearer {key}'

# Convert to JSON string
input_data = df_Test.to_json(orient="table",index=False)
#print(input_data)
input_data=json.dumps({"data": json.loads(input_data)["data"]},indent=4)

# Make the request and display the response
resp = requests.post(scoring_uri, input_data, headers=headers)
print(resp.text)

#load the returned prediction and read it into a pandas dataframe
pred = json.loads(resp.text)
pred = pd.read_json(pred)


Cannot convert non-finite values (NA or inf) to integer


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
pred

In [None]:
# Sending a request to the deployed web service to test it: consuming model endpoint
#bfeore running endpoint.py , change the scoring_uri and key in the endpoint.py
%run endpoint.py

In [None]:
import requests
import json

# URL for the web service, should be similar to:
# 'http://8530a665-66f3-49c8-a953-b82a2d312917.eastus.azurecontainer.io/score'
scoring_uri = scoring_uri
# If the service is authenticated, set the key or token
key = key

# Two sets of data to score, so we get two results back
data = {"data":
        [
          {
           "age": 75, 
           "anaemia": 0, 
           "creatinine_phosphokinase": 582, 
           "diabetes": 0, 
           "ejection_fraction": 20, 
           "high_blood_pressure": 1, 
           "platelets": 265000, 
           "serum_creatinine": 1.9, 
           "serum_sodium": 130, 
           "sex": 1, 
           "smoking": 0,
           "time": 4
          },
          {
           "age": 30, 
           "anaemia": 0, 
           "creatinine_phosphokinase": 2656, 
           "diabetes": 1, 
           "ejection_fraction": 30, 
           "high_blood_pressure": 0, 
           "platelets": 305000, 
           "serum_creatinine": 2.3, 
           "serum_sodium": 130, 
           "sex": 1, 
           "smoking": 0,
           "time": 30
          },
      ]
    }
# Convert to JSON string
input_data = json.dumps(data)
with open("data.json", "w") as _f:
    _f.write(input_data)

# Set the content type
headers = {'Content-Type': 'application/json'}
# If authentication is enabled, set the authorization header
headers['Authorization'] = f'Bearer {key}'

# Make the request and display the response
resp = requests.post(scoring_uri, input_data, headers=headers)
print(resp.json())
print("++++++++++++++++++++++++++++++")
print("Expected result: [true, true], where 'true' means '1' as result in the 'DEATH_EVENT' column")

In [None]:
# Printing the logs
print(service.get_logs())

## Deleting the service
Putting the deletion of the service in a separate cell to avoid accidentally running the cell before finishing the tasks

In [None]:
#Service.delete()
