#### Azure Machine Learning and Pipeline SDK-specific imports****

In [1]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.57.0


#### Initialize Workspace****

In [2]:
ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')

quick-starts-ws-268434
aml-quickstarts-268434
westeurope
9e65f93e-bdd8-437b-b1e8-0647cd6098f7


#### Create an Azure ML experiment

In [3]:
experiment_name = 'bankmarketing-experiment-pipeline'
project_folder = './pipeline-project'

experiment = Experiment(ws, experiment_name)
experiment

Name,Workspace,Report Page,Docs Page
bankmarketing-experiment-pipeline,quick-starts-ws-268434,Link to Azure Machine Learning studio,Link to Documentation


#### Create or Attach an AmlCompute cluster****

In [4]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# NOTE: update the cluster name to match the existing cluster
# Choose a name for your CPU cluster
amlcompute_cluster_name = "auto-ml"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_DS3_v2',
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True, min_node_count = 1, timeout_in_minutes = 10)

InProgress..
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded........................

#### Data

In [5]:
found = False
key = "bankmarketing-dataset"
description_text = "Bank Marketing DataSet for Udacity Course 3"

if key in ws.datasets.keys(): 
        found = True
        dataset = ws.datasets[key] 

if not found:
        # Create AML Dataset and register it into Workspace
        example_data = 'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'
        dataset = Dataset.Tabular.from_delimited_files(example_data)        
        #Register Dataset in Workspace
        dataset = dataset.register(workspace=ws,
                                   name=key,
                                   description=description_text)


df = dataset.to_pandas_dataframe()
df.describe()

{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe'}
{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe', 'activityApp': 'TabularDataset'}


Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0
mean,40.040212,257.335205,2.56173,962.17478,0.17478,0.076228,93.574243,-40.51868,3.615654,5166.859608
std,10.432313,257.3317,2.763646,187.646785,0.496503,1.572242,0.578636,4.623004,1.735748,72.208448
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,179.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,318.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


In [6]:
dataset.take(5).to_pandas_dataframe()

{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe'}
{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe', 'activityApp': 'TabularDataset'}


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,57,technician,married,high.school,no,no,yes,cellular,may,mon,...,1,999,1,failure,-1.8,92.893,-46.2,1.299,5099.1,no
1,55,unknown,married,unknown,unknown,yes,no,telephone,may,thu,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.86,5191.0,no
2,33,blue-collar,married,basic.9y,no,no,no,cellular,may,fri,...,1,999,1,failure,-1.8,92.893,-46.2,1.313,5099.1,no
3,36,admin.,married,high.school,no,no,no,telephone,jun,fri,...,4,999,0,nonexistent,1.4,94.465,-41.8,4.967,5228.1,no
4,27,housemaid,married,high.school,no,yes,no,cellular,jul,fri,...,2,999,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1,no


#### AutoML Setting

In [7]:
automl_settings = {
    "experiment_timeout_minutes": 20,
    "max_concurrent_iterations": 5,
    "primary_metric" : 'AUC_weighted'
}
automl_config = AutoMLConfig(compute_target=compute_target,
                             task = "classification",
                             training_data=dataset,
                             label_column_name="y",   
                             path = project_folder,
                             enable_early_stopping= True,
                             featurization= 'auto',
                             debug_log = "automl_errors.log",
                             **automl_settings
                            )

#### Create Pipeline and AutoMLStep

In [8]:
from azureml.pipeline.core import PipelineData, TrainingOutput

ds = ws.get_default_datastore()
metrics_output_name = 'metrics_output'
best_model_output_name = 'best_model_output'

metrics_data = PipelineData(name='metrics_data',
                           datastore=ds,
                           pipeline_output_name=metrics_output_name,
                           training_output=TrainingOutput(type='Metrics'))
model_data = PipelineData(name='model_data',
                           datastore=ds,
                           pipeline_output_name=best_model_output_name,
                           training_output=TrainingOutput(type='Model'))

In [9]:
automl_step = AutoMLStep(
    name='automl_module',
    automl_config=automl_config,
    outputs=[metrics_data, model_data],
    allow_reuse=True)

In [10]:
from azureml.pipeline.core import Pipeline
pipeline = Pipeline(
    description="pipeline_with_automlstep",
    workspace=ws,    
    steps=[automl_step])

In [11]:
pipeline_run = experiment.submit(pipeline)

Created step automl_module [b34ebb2e][b174f224-a93a-4234-9a4e-77878ae697ec], (This step will run and generate new outputs)
Submitted PipelineRun e1dc3d64-4f28-4f66-9903-34bde89e4c3c
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/e1dc3d64-4f28-4f66-9903-34bde89e4c3c?wsid=/subscriptions/9e65f93e-bdd8-437b-b1e8-0647cd6098f7/resourcegroups/aml-quickstarts-268434/workspaces/quick-starts-ws-268434&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254


In [14]:
pipeline_run.wait_for_completion()

PipelineRunId: e1dc3d64-4f28-4f66-9903-34bde89e4c3c
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/e1dc3d64-4f28-4f66-9903-34bde89e4c3c?wsid=/subscriptions/9e65f93e-bdd8-437b-b1e8-0647cd6098f7/resourcegroups/aml-quickstarts-268434/workspaces/quick-starts-ws-268434&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254
PipelineRun Status: Running


StepRunId: d6cfbf93-1914-44ce-8104-03eefdd13da8
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/d6cfbf93-1914-44ce-8104-03eefdd13da8?wsid=/subscriptions/9e65f93e-bdd8-437b-b1e8-0647cd6098f7/resourcegroups/aml-quickstarts-268434/workspaces/quick-starts-ws-268434&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254
StepRun( automl_module ) Status: Running

StepRun(automl_module) Execution Summary
StepRun( automl_module ) Status: Finished

No scores improved over last 10 iterations, so experiment stopped early. This early stopping behavior can be disabled by setting enable_early_stopping = False in AutoMLConfig for notebook/python SDK r

'Finished'

#### Examine Results

In [15]:
metrics_output = pipeline_run.get_pipeline_output(metrics_output_name)
num_file_downloaded = metrics_output.download('.', show_progress=True)

Downloading azureml/d6cfbf93-1914-44ce-8104-03eefdd13da8/metrics_data
Downloaded azureml/d6cfbf93-1914-44ce-8104-03eefdd13da8/metrics_data, 1 files out of an estimated total of 1


In [16]:
import json
with open(metrics_output._path_on_datastore) as f:
    metrics_output_result = f.read()
    
deserialized_metrics_output = json.loads(metrics_output_result)
df = pd.DataFrame(deserialized_metrics_output)
df

Unnamed: 0,d6cfbf93-1914-44ce-8104-03eefdd13da8_9,d6cfbf93-1914-44ce-8104-03eefdd13da8_10,d6cfbf93-1914-44ce-8104-03eefdd13da8_8,d6cfbf93-1914-44ce-8104-03eefdd13da8_2,d6cfbf93-1914-44ce-8104-03eefdd13da8_0,d6cfbf93-1914-44ce-8104-03eefdd13da8_7,d6cfbf93-1914-44ce-8104-03eefdd13da8_5,d6cfbf93-1914-44ce-8104-03eefdd13da8_12,d6cfbf93-1914-44ce-8104-03eefdd13da8_13,d6cfbf93-1914-44ce-8104-03eefdd13da8_25,...,d6cfbf93-1914-44ce-8104-03eefdd13da8_51,d6cfbf93-1914-44ce-8104-03eefdd13da8_55,d6cfbf93-1914-44ce-8104-03eefdd13da8_50,d6cfbf93-1914-44ce-8104-03eefdd13da8_48,d6cfbf93-1914-44ce-8104-03eefdd13da8_53,d6cfbf93-1914-44ce-8104-03eefdd13da8_46,d6cfbf93-1914-44ce-8104-03eefdd13da8_57,d6cfbf93-1914-44ce-8104-03eefdd13da8_44,d6cfbf93-1914-44ce-8104-03eefdd13da8_54,d6cfbf93-1914-44ce-8104-03eefdd13da8_60
norm_macro_recall,[0.460712942741184],[0.3307724225567614],[0.44210489268255637],[0.17876546502990665],[0.48912840119515355],[0.37276580216246447],[0.38795251247112605],[0.464810399983699],[0.0],[0.3930308031720098],...,[0.4146387772831932],[0.0],[0.4796553467927023],[0.44408323098952107],[0.3483996391570203],[0.4850754009932443],[0.49015369169412804],[0.5098074083953417],[0.4782882927940695],[0.47321000209318553]
matthews_correlation,[0.31148671117064497],[0.444201567492641],[0.5016093372462171],[0.325487580342956],[0.5216286298277554],[0.47805766735773875],[0.47077965319103915],[0.5231250045786537],[0.0],[0.47422181829067633],...,[0.4704464612630845],[0.0],[0.5173685558610612],[0.4875377170946134],[0.4656181360029442],[0.5217153406413008],[0.5249329436765708],[0.5421675093518514],[0.512951950649998],[0.5096832200958874]
accuracy,[0.7314112291350531],[0.9083459787556905],[0.9104704097116844],[0.8995447647951441],[0.9101669195751139],[0.9119878603945372],[0.908649468892261],[0.9138088012139606],[0.8880121396054628],[0.9089529590288316],...,[0.9050075872534142],[0.8880121396054628],[0.9101669195751139],[0.9059180576631259],[0.9113808801213961],[0.910773899848255],[0.9110773899848255],[0.9138088012139606],[0.9089529590288316],[0.908649468892261]
average_precision_score_weighted,[0.9177127009122391],[0.9437518530646575],[0.9529980385601647],[0.93061444699471],[0.9535822261506017],[0.9480633299225389],[0.9492172134275333],[0.9523114199391449],[0.9055283415469914],[0.94835501196234],...,[0.9411472459769631],[0.9455287364961052],[0.9502904099365788],[0.9514575108916562],[0.9512397173282338],[0.953370300219518],[0.9537828238428194],[0.9523999923387977],[0.9461667405664178],[0.9504774566744769]
recall_score_micro,[0.7314112291350531],[0.9083459787556905],[0.9104704097116844],[0.8995447647951441],[0.9101669195751139],[0.9119878603945372],[0.908649468892261],[0.9138088012139606],[0.8880121396054628],[0.9089529590288316],...,[0.9050075872534142],[0.8880121396054628],[0.9101669195751139],[0.9059180576631259],[0.9113808801213961],[0.910773899848255],[0.9110773899848255],[0.9138088012139606],[0.9089529590288316],[0.908649468892261]
recall_score_macro,[0.730356471370592],[0.6653862112783807],[0.7210524463412782],[0.5893827325149533],[0.7445642005975768],[0.6863829010812322],[0.693976256235563],[0.7324051999918495],[0.5],[0.6965154015860049],...,[0.7073193886415966],[0.5],[0.7398276733963511],[0.7220416154947605],[0.6741998195785102],[0.7425377004966222],[0.745076845847064],[0.7549037041976708],[0.7391441463970347],[0.7366050010465928]
precision_score_micro,[0.7314112291350531],[0.9083459787556905],[0.9104704097116844],[0.8995447647951441],[0.9101669195751139],[0.9119878603945372],[0.908649468892261],[0.9138088012139606],[0.8880121396054628],[0.9089529590288316],...,[0.9050075872534142],[0.8880121396054628],[0.9101669195751139],[0.9059180576631259],[0.9113808801213961],[0.910773899848255],[0.9110773899848255],[0.9138088012139606],[0.9089529590288316],[0.908649468892261]
f1_score_micro,[0.7314112291350531],[0.9083459787556905],[0.9104704097116844],[0.8995447647951441],[0.9101669195751139],[0.9119878603945372],[0.908649468892261],[0.9138088012139606],[0.8880121396054628],[0.9089529590288316],...,[0.9050075872534142],[0.8880121396054628],[0.9101669195751139],[0.9059180576631259],[0.9113808801213961],[0.910773899848255],[0.9110773899848255],[0.9138088012139606],[0.9089529590288316],[0.908649468892261]
AUC_macro,[0.8377679231337767],[0.9237121814143637],[0.941632999720291],[0.9032971378927732],[0.9446537630106308],[0.9290011799639528],[0.9388252597495217],[0.9405160165750666],[0.8146053418839041],[0.9310008206028745],...,[0.9258308372557409],[0.9271673270389573],[0.9364569961489089],[0.9387326409149258],[0.9387071707354121],[0.9462347665171797],[0.9446602463290525],[0.9448649339535091],[0.9266630174845836],[0.9383265073252236]
AUC_micro,[0.8245004501693605],[0.9741933909150988],[0.9790036405000448],[0.9685867905802923],[0.9795361989126856],[0.9758368429657296],[0.9779290367296751],[0.9786896502494928],[0.950584989902851],[0.9762675318514971],...,[0.9741460482959189],[0.9729727987178807],[0.9774612290199203],[0.9781102097489875],[0.9780248272431904],[0.9798827026740751],[0.9796716872255521],[0.9794262240346688],[0.9755631952583697],[0.9778574701633275]


#### Retrieve the Best Model

In [17]:
# Retrieve best model from Pipeline Run
best_model_output = pipeline_run.get_pipeline_output(best_model_output_name)
num_file_downloaded = best_model_output.download('.', show_progress=True)

Downloading azureml/d6cfbf93-1914-44ce-8104-03eefdd13da8/model_data
Downloaded azureml/d6cfbf93-1914-44ce-8104-03eefdd13da8/model_data, 1 files out of an estimated total of 1


In [18]:
import pickle

with open(best_model_output._path_on_datastore, "rb" ) as f:
    best_model = pickle.load(f)
best_model

In [19]:
best_model.steps

[('datatransformer',
  DataTransformer(enable_dnn=False, enable_feature_sweeping=True, feature_sweeping_config={}, feature_sweeping_timeout=86400, featurization_config=None, force_text_dnn=False, is_cross_validation=False, is_onnx_compatible=False, task='classification')),
 ('prefittedsoftvotingclassifier',
  PreFittedSoftVotingClassifier(classification_labels=numpy.array([0, 1]), estimators=[('46', Pipeline(memory=None, steps=[('standardscalerwrapper', StandardScalerWrapper(copy=True, with_mean=False, with_std=True)), ('lightgbmclassifier', LightGBMClassifier(boosting_type='goss', colsample_bytree=0.8911111111111111, learning_rate=0.1, max_bin=180, max_depth=10, min_child_weight=9, min_data_in_leaf=0.013801724137931036, min_split_gain=0.15789473684210525, n_estimators=200, n_jobs=1, num_leaves=53, problem_info=ProblemInfo(gpu_training_param_dict={'processing_unit_type': 'cpu'}), random_state=None, reg_alpha=0.42105263157894735, reg_lambda=0.15789473684210525, subsample=1))], verbose=F

#### Test the Model

##### Load Test Data

In [20]:
dataset_test = Dataset.Tabular.from_delimited_files(path='https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv')
df_test = dataset_test.to_pandas_dataframe()
df_test = df_test[pd.notnull(df_test['y'])]

y_test = df_test['y']
X_test = df_test.drop(['y'], axis=1)

{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe'}
{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe', 'activityApp': 'TabularDataset'}


##### Testing Our Best Fitted Model

In [21]:
from sklearn.metrics import confusion_matrix
ypred = best_model.predict(X_test)
cm = confusion_matrix(y_test, ypred)

In [22]:
# Visualize the confusion matrix
pd.DataFrame(cm).style.background_gradient(cmap='Blues', low=0, high=0.9)

Unnamed: 0,0,1
0,28700,558
1,1294,2398


#### Publish and run from REST endpoint

In [23]:
published_pipeline = pipeline_run.publish_pipeline(
    name="Bankmarketing Train", description="Training bankmarketing pipeline", version="1.0")

published_pipeline

Name,Id,Status,Endpoint
Bankmarketing Train,ff81d248-5f4b-406c-8841-61ce85d03732,Active,REST Endpoint


In [24]:
from azureml.core.authentication import InteractiveLoginAuthentication

interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()

In [25]:
import requests

rest_endpoint = published_pipeline.endpoint
response = requests.post(rest_endpoint, 
                         headers=auth_header, 
                         json={"ExperimentName": "pipeline-rest-endpoint"}
                        )

In [26]:
try:
    response.raise_for_status()
except Exception:    
    raise Exception("Received bad response from the endpoint: {}\n"
                    "Response Code: {}\n"
                    "Headers: {}\n"
                    "Content: {}".format(rest_endpoint, response.status_code, response.headers, response.content))

run_id = response.json().get('Id')
print('Submitted pipeline run: ', run_id)

Submitted pipeline run:  7cd5eb02-3b5b-4650-83c4-36e94920717e
