# Automated ML


## Azure Machine Learning and Pipeline SDK-specific imports

In [1]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.27.0


## Initialize Workspace

In [2]:
ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')

quick-starts-ws-144966
aml-quickstarts-144966
southcentralus
a0a76bad-11a1-4a2d-9887-97a29122c8ed


## Create an Azure ML experiment

In [3]:
# choose a name for experiment
experiment_name = 'loan-prediction'
project_folder = './loan-prediction-project'

experiment=Experiment(ws, experiment_name)
experiment

Name,Workspace,Report Page,Docs Page
loan-prediction,quick-starts-ws-144966,Link to Azure Machine Learning studio,Link to Documentation


## Create a compute cluster


In [5]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# NOTE: update the cluster name to match the existing cluster
# Choose a name for your CPU cluster
amlcompute_cluster_name = "cpu-cluster"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_DS12_v2',# for GPU, use "STANDARD_NC6"
                                                           #vm_priority = 'lowpriority', # optional
                                                           max_nodes=10)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)
    compute_target.wait_for_completion(show_output=True, min_node_count = 1, timeout_in_minutes = 10)

Found existing cluster, use it.


## Dataset

### Overview
In this project, we use a [loan prediction problem dataset](https://www.kaggle.com/altruistdelhite04/loan-prediction-problem-dataset) from Kaggle.
The dataset contains 11 features and the target column **Loan_Status**.

In [6]:
# Try to load the dataset from the Workspace. Otherwise, create it from the file
# NOTE: update the key to match the dataset name
found = False
key = "raw-loan-prediction-dataset"
description_text = "Loan prediction dataset before cleaning"

if key in ws.datasets.keys(): 
        found = True
        raw_dataset = ws.datasets[key] 

if not found:
        # Create AML Dataset and register it into Workspace
        example_data = 'https://raw.githubusercontent.com/fnakashima/nd00333-capstone/master/starter_file/dataset/train_u6lujuX_CVtuZ9i.csv'
        raw_dataset = Dataset.Tabular.from_delimited_files(example_data)        
        #Register Dataset in Workspace
        raw_dataset = raw_dataset.register(workspace=ws,
                                   name=key,
                                   description=description_text)

df = raw_dataset.to_pandas_dataframe()
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,False,0,Graduate,False,5849,0.0,,360.0,1.0,Urban,True
1,LP001003,Male,True,1,Graduate,False,4583,1508.0,128.0,360.0,1.0,Rural,False
2,LP001005,Male,True,0,Graduate,True,3000,0.0,66.0,360.0,1.0,Urban,True
3,LP001006,Male,True,0,Not Graduate,False,2583,2358.0,120.0,360.0,1.0,Urban,True
4,LP001008,Male,False,0,Graduate,False,6000,0.0,141.0,360.0,1.0,Urban,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,False,0,Graduate,False,2900,0.0,71.0,360.0,1.0,Rural,True
610,LP002979,Male,True,3+,Graduate,False,4106,0.0,40.0,180.0,1.0,Rural,True
611,LP002983,Male,True,1,Graduate,False,8072,240.0,253.0,360.0,1.0,Urban,True
612,LP002984,Male,True,2,Graduate,False,7583,0.0,187.0,360.0,1.0,Urban,True


## Clean dataset

In [7]:
def clean_data(data):
    # Dict for cleaning data
    dependents = {"0":0, "1":1, "2":2, "3+":3}
    property_areas = {"Urban":1, "Semiurban":2, "Rural":3}

    # Clean and one hot encode data
    x_df = data.dropna()
    x_df.drop("Loan_ID", axis=1, inplace=True)

    # Filtering "True", "Yes", "Y" won't work as it will be recoginised as a boolean value automatically by dataset framework
    x_df.loc[:,('Gender')] = x_df.Gender.apply(lambda s: 1 if s == "Male" else 2)
    x_df.loc[:,('Married')] = x_df.Married.apply(lambda s: 1 if s else 0)
    x_df.loc[:,('Dependents')] = x_df.Dependents.map(dependents)
    x_df.loc[:,('Education')] = x_df.Education.apply(lambda s: 1 if s == "Graduate" else 0)
    x_df.loc[:,('Self_Employed')] = x_df.Self_Employed.apply(lambda s: 1 if s else 0)
    x_df.loc[:,('Property_Area')] = x_df.Property_Area.map(property_areas)

    y_df = x_df.pop("Loan_Status").apply(lambda s: 1 if s else 0)
    return x_df, y_df

In [8]:
from sklearn.model_selection import train_test_split
import pandas as pd

df
x, y = clean_data(df)
x

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,3
2,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1
3,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1
4,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1
5,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...
609,2,0,0,1,0,2900,0.0,71.0,360.0,1.0,3
610,1,1,3,1,0,4106,0.0,40.0,180.0,1.0,3
611,1,1,1,1,0,8072,240.0,253.0,360.0,1.0,1
612,1,1,2,1,0,7583,0.0,187.0,360.0,1.0,1


In [9]:
y

1      0
2      1
3      1
4      1
5      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 478, dtype: int64

## Split train and test data

In [10]:
from sklearn.model_selection import train_test_split
import pandas as pd


# Split data into train and test sets.
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
# Default test_size: 0.25
x_train, x_test, y_train, y_test = train_test_split(x, y)

# concatenate train data x and y and test
# https://pandas.pydata.org/docs/user_guide/merging.html
train_data = pd.concat([x_train, y_train], axis=1)
pd.DataFrame(train_data)

test_data = pd.concat([x_test, y_test], axis=1)
pd.DataFrame(test_data)

train_data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
306,2,0,0,1,0,3762,1666.0,135.0,360.0,1.0,3,1
115,1,1,1,1,0,14583,0.0,185.0,180.0,1.0,3,1
261,1,0,0,1,0,2060,2209.0,134.0,360.0,1.0,2,1
594,1,1,0,1,1,16120,0.0,260.0,360.0,1.0,1,1
589,1,1,2,1,1,2726,0.0,106.0,360.0,0.0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
513,1,1,0,1,0,2130,6666.0,70.0,180.0,1.0,2,0
370,2,0,0,1,1,15759,0.0,55.0,360.0,1.0,2,1
300,1,1,0,0,0,1800,2934.0,93.0,360.0,0.0,1,0
173,1,1,0,1,0,5708,5625.0,187.0,360.0,1.0,2,1


### Register training and test data for use in AutoML

In [11]:
# To specify train and test dataset in AutoMLConfig, register the datasets in the cloud.
from azureml.data.dataset_factory import TabularDatasetFactory

ds_name = "loan-prediction-dataset"
ds_description = "Loan prediction dataset after cleaning"

datastore = ws.get_default_datastore()
train_ds = TabularDatasetFactory.register_pandas_dataframe(train_data, datastore, ds_name, description=ds_description)

train_df = train_ds.to_pandas_dataframe()
#train_df

test_ds_name = "loan-prediction-dataset-test"
test_ds_description = "Loan prediction dataset after cleaning for testing"

test_ds = TabularDatasetFactory.register_pandas_dataframe(test_data, datastore, test_ds_name, description=test_ds_description)

test_df = test_ds.to_pandas_dataframe()
#test_df

Method register_pandas_dataframe: This is an experimental method, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Method register_pandas_dataframe: This is an experimental method, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/b540f988-c19e-4dd6-819d-3958060b6610/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/8d92054c-7d77-4016-a5b5-68b037ade038/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


## AutoML Configuration

Since the target problem is to predict customer's loan status (binary prediction: 0 or 1), the task type is **classification** and the target label column is **Loan_Status**.

We use **accuracy** as a primary metric and set **30** as **experiment_timeout_minutes** to limit experiment running duration.

We set **10** as **max_concurrent_iterations** to run iterations in the experiment in parallel.
**featurization** is set to **auto** to enable featurization step to be done automatically.

Reference: [AutoMLConfig Class](https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.automlconfig.automlconfig?view=azure-ml-py)


In [12]:
automl_settings = {
    "primary_metric" : 'accuracy',
    "experiment_timeout_minutes": 30,
    "max_concurrent_iterations": 10,
    "featurization" : 'auto'
}
automl_config = AutoMLConfig(compute_target=compute_target,
                             task = "classification",
                             training_data=train_ds,
                             validation_data=test_ds,
                             label_column_name="Loan_Status",   
                             path = project_folder,
                             debug_log = "automl_errors.log",
                             enable_onnx_compatible_models=True,
                             **automl_settings
                            )

In [13]:
# TODO: Submit your experiment
remote_run = experiment.submit(automl_config, show_output = False)

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
loan-prediction,AutoML_4257511d-b32b-4012-a978-9d7122e751d8,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


## Run Details

In the cell below, use the `RunDetails` widget to show the different experiments.

In [14]:
from azureml.widgets import RunDetails
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [15]:
remote_run.wait_for_completion(show_output=True)

Experiment,Id,Type,Status,Details Page,Docs Page
loan-prediction,AutoML_4257511d-b32b-4012-a978-9d7122e751d8,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

*******************************************************

{'runId': 'AutoML_4257511d-b32b-4012-a978-9d7122e751d8',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-05-16T14:22:45.249277Z',
 'endTimeUtc': '2021-05-16T15:01:27.651886Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': None,
  'target': 'cpu-cluster',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"9c036174-42a3-4b1b-ad18-8bc34cad9736\\"}, \\"validation_data\\": {\\"datasetId\\": \\"4dd91a27-1774-488a-94ef-e98ed1ee1f57\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.27.0", "azureml-train": "1.27.0", "azureml-train-restclients-hyperdrive": "1.27.0", "azureml-train-core": "1.27.0", "azureml-train-automl": "1.27.0", 

## Best Model

In the cell below, get the best model from the automl experiments and display all the properties of the model.



In [16]:
# Retrieve best model from Pipeline Run
best_automl_run, fitted_automl_model = remote_run.get_output()
print(best_automl_run)
best_automl_run_metrics = best_automl_run.get_metrics()
print('Accuracy: ', best_automl_run_metrics['accuracy'])
print('Other metrics:')
# Display all metrics of the best run
for metric_name in best_automl_run_metrics:
    if metric_name == 'accuracy':
        continue

    metric = best_automl_run_metrics[metric_name]
    print(metric_name, ": ", metric)

Run(Experiment: loan-prediction,
Id: AutoML_4257511d-b32b-4012-a978-9d7122e751d8_45,
Type: azureml.scriptrun,
Status: Completed)
Accuracy:  0.8083333333333333
Other metrics:
average_precision_score_macro :  0.8112461124359782
f1_score_macro :  0.7618842205159175
balanced_accuracy :  0.7428269405013591
norm_macro_recall :  0.4856538810027182
f1_score_micro :  0.8083333333333333
precision_score_macro :  0.8489583333333333
recall_score_weighted :  0.8083333333333333
AUC_weighted :  0.7970401691331924
recall_score_macro :  0.7428269405013591
weighted_accuracy :  0.8641038827462074
AUC_micro :  0.8371527777777779
log_loss :  0.5361653960998835
average_precision_score_weighted :  0.8189985386741242
precision_score_weighted :  0.8297743055555554
recall_score_micro :  0.8083333333333333
precision_score_micro :  0.8083333333333333
f1_score_weighted :  0.7916817645874673
matthews_correlation :  0.5821906369765381
AUC_macro :  0.7970401691331924
average_precision_score_micro :  0.813471016153943


In [17]:
fitted_automl_model.steps

[('datatransformer',
  DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                  feature_sweeping_config=None, feature_sweeping_timeout=None,
                  featurization_config=None, force_text_dnn=None,
                  is_cross_validation=None, is_onnx_compatible=None, logger=None,
                  observer=None, task=None, working_dir=None)),
 ('SparseNormalizer',
  <azureml.automl.runtime.shared.model_wrappers.SparseNormalizer at 0x7f045953eb00>),
 ('XGBoostClassifier',
  XGBoostClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                    colsample_bynode=1, colsample_bytree=0.8, eta=0.3, gamma=0,
                    learning_rate=0.1, max_delta_step=0, max_depth=6,
                    max_leaves=31, min_child_weight=1, missing=nan,
                    n_estimators=100, n_jobs=1, nthread=None,
                    objective='reg:logistic', random_state=0,
                    reg_alpha=0.8333333333333334, reg_lambda=0,
           

### Regiser and save the best model

In [20]:
# Retrieve and save your best automl model.
best_automl_run, onnx_model = remote_run.get_output(return_onnx_model=True)
best_automl_run_metrics = best_automl_run.get_metrics()

automl_model = remote_run.register_model(model_name = 'loan-prediction-automl-model',
                                  tags={'Method':'AutoML'},
                                  description='AutoML Model trained on loan prediction data to predict a loan status of customers')

In [21]:
# Save the model in the local project folder
best_automl_run.download_file('outputs/conda_env_v_1_0_0.yml', project_folder + '/service_env.yml')
best_automl_run.download_file('outputs/scoring_file_v_1_0_0.py', project_folder + '/score.py')
best_automl_run.download_file('outputs/model.pkl', project_folder + '/outputs/model.pkl')

### _Standout Suggestions - Convert your model to ONNX format_

In [22]:
from azureml.automl.runtime.onnx_convert import OnnxConverter

onnx_model_path = project_folder + '/outputs/model.onnx'
OnnxConverter.save_onnx_model(onnx_model, onnx_model_path)

## Model Deployment

Remember you have to deploy only one of the two models you trained.. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

#### The following codes are just for testing.

In [None]:
from azureml.core import Environment

service_env = Environment.from_conda_specification(name='service-env', file_path=project_folder + '/service_env.yml')

TODO: In the cell below, send a request to the web service you deployed to test it.

In [None]:
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.core import Model

service_name = 'loan-prediction-automl-service'

inference_config = InferenceConfig(entry_script=project_folder + '/score.py', environment=service_env)
aci_config = AciWebservice.deploy_configuration(cpu_cores=1,
                                                memory_gb=1,
                                                enable_app_insights=True,
                                                description="Loan status prediction service")

service = Model.deploy(workspace=ws,
                       name=service_name,
                       models=[automl_model],
                       inference_config=inference_config,
                       deployment_config=aci_config,
                       overwrite=True)
service.wait_for_deployment(show_output=True)

In [None]:
import json

input_payload = json.dumps({
    'data': x_test[0:3].values.tolist()
})

output = service.run(input_payload)

print(output)

In [None]:
y_test[0:3].values

TODO: In the cell below, print the logs of the web service and delete the service

In [None]:
# Show the logs of the web service
print(service.get_logs())

In [None]:
# Delete the service
service.delete()