# Optimizing a Machine Learning Pipeline

### create workspace and experiment

In [1]:
from azureml.core import Workspace, Experiment

# Look up the workspace by name and open the experiment that the
# HyperDrive run in this notebook is submitted to.
ws = Workspace.get(name="quick-starts-ws-132087")
exp = Experiment(workspace=ws, name="bowen-optimizing-ml-pipeline")

# Print the workspace's identifying details, one per line.
print('\n'.join((
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
)))

# Start an interactive logging run on the experiment.
run = exp.start_logging()

Workspace name: quick-starts-ws-132087
Azure region: southcentralus
Subscription id: 81cefad3-d2c9-4f77-a466-99a7f541c7bb
Resource group: aml-quickstarts-132087


### create compute cluster

In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the AmlCompute cluster used by the HyperDrive and AutoML runs below.
amlcompute_cluster_name = "bowen-cluster"

# Reuse the cluster if the workspace already has one under this name;
# ComputeTargetException signals that the lookup failed, in which case a
# new cluster is provisioned.
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Wait until the cluster reports completion, echoing its status.
compute_target.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Hyperparameter tuning with HyperDrive

### Create estimator with the new configuration from the sample notebook

write to an environment file

In [3]:
%%writefile conda_dependencies.yml

# Conda specification for the training environment; a later cell moves this
# file into envs/ and loads it with Environment.from_conda_specification.
dependencies:
- python=3.6.2
- scikit-learn
# Entries nested under "pip" are the pip-installed packages.
- pip:
  - azureml-defaults

Writing conda_dependencies.yml


Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script and any additional files your training script depends on.

In [4]:
# `os` is imported here because this is the first cell that uses it; without
# it the cell raises NameError on a fresh kernel (Restart & Run All).
import os
import shutil

# Folder handed to ScriptRunConfig as the source directory of the training job.
script_folder = "training"

# Idempotent: does nothing when the folder already exists.
os.makedirs(script_folder, exist_ok=True)

# Move the training script into the folder. On a re-run the script has
# normally been moved already, so the two expected failure modes are
# reported explicitly instead of being hidden by a bare `except: pass`;
# anything unexpected (e.g. a permission problem) now surfaces.
try:
    shutil.move('train.py', script_folder)
except FileNotFoundError:
    print(f"train.py is not in the working directory; expecting it to be in '{script_folder}/' already.")
except shutil.Error as move_error:
    # Raised by shutil.move when the destination file already exists.
    print(f"train.py was not moved, keeping the existing copy: {move_error}")

create a folder that contains all the environment yaml files

In [5]:
# Imported locally so this cell does not depend on imports made elsewhere
# in the notebook (`os` is otherwise only imported much further down).
import os
import shutil

# Folder that holds the environment YAML files.
environment_folder = "envs"

# Idempotent: does nothing when the folder already exists.
os.makedirs(environment_folder, exist_ok=True)

# Move the conda specification into the folder. The two expected failure
# modes on a re-run are reported explicitly instead of being hidden by a
# bare `except: pass`; any other error now surfaces.
try:
    shutil.move('conda_dependencies.yml', environment_folder)
except FileNotFoundError:
    print(f"conda_dependencies.yml is not in the working directory; expecting it to be in '{environment_folder}/' already.")
except shutil.Error as move_error:
    # Raised by shutil.move when the destination file already exists; the
    # copy already inside the folder is the one that stays in use.
    print(f"conda_dependencies.yml was not moved, keeping the existing copy: {move_error}")

Create an environment configuration

In [6]:
from azureml.core import Environment

# Build the run environment from the conda specification stored under envs/.
sklearn_env = Environment.from_conda_specification(
    name='sklearn-env',
    file_path='./envs/conda_dependencies.yml',
)

### Configure the training job

Create a ScriptRunConfig object to specify the configuration details of your training job, including your training script, environment to use, and the compute target to run on.

In [7]:
from azureml.core import ScriptRunConfig

# Bundle what to run (train.py inside script_folder), which environment to
# run it in (sklearn_env) and where to run it (compute_target).
src = ScriptRunConfig(
    source_directory=script_folder,
    script='train.py',
    environment=sklearn_env,
    compute_target=compute_target,
)

### Create the Hyperparameter Tuning using HyperDrive


define sampling method and create early termination policy

In [8]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import BayesianParameterSampling, RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice, loguniform
import os

# Bayesian sampler over two hyperparameters: a continuous value for '--C'
# drawn from uniform(0.1, 2.0) and a discrete choice for '--max_iter'.
# The keys look like command-line flags of train.py - confirm against its
# argument parser.
ps = BayesianParameterSampling(
    parameter_space={
        '--C': uniform(0.1, 2.0),
        '--max_iter': choice(10, 20, 50, 100, 150),
    }
)

# Bandit early-termination policy.
# NOTE(review): `policy` is created here but is not passed to the
# HyperDriveConfig built in this notebook. The Azure ML documentation states
# that Bayesian sampling does not support early termination, so leaving it
# out of the config appears intentional - confirm before wiring it in.
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=3)

### Create HyperDrive Configuration

In [9]:
hyperdrive_config = HyperDriveConfig(
    # What to run and how to pick hyperparameters for each child run.
    run_config=src,
    hyperparameter_sampling=ps,
    # Objective: maximise the metric named 'Accuracy'.
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    # Budget: 40 child runs in total, at most 4 running concurrently.
    max_total_runs=40,
    max_concurrent_runs=4,
)

### Submit hyperdrive run

In [10]:
# Submit the HyperDrive configuration to the experiment. The returned run
# object is what the following cells display, wait on and query.
hyperdrive_run = exp.submit(hyperdrive_config)

In [11]:
# Show the run-details widget for the HyperDrive run.
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [12]:
# Block until the HyperDrive run has finished, printing its log output in
# the cell; the returned run summary is displayed as the cell result.
hyperdrive_run.wait_for_completion(show_output=True)

RunId: HD_85d07143-ec18-4757-b81d-1f14b3786357
Web View: https://ml.azure.com/experiments/bowen-optimizing-ml-pipeline/runs/HD_85d07143-ec18-4757-b81d-1f14b3786357?wsid=/subscriptions/81cefad3-d2c9-4f77-a466-99a7f541c7bb/resourcegroups/aml-quickstarts-132087/workspaces/quick-starts-ws-132087

Streaming azureml-logs/hyperdrive.txt

"<START>[2020-12-27T22:58:57.429609][API][INFO]Experiment created<END>\n"<START>[2020-12-27T22:58:58.8270747Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>"<START>[2020-12-27T22:59:00.190118][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2020-12-27T22:59:00.353802][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_85d07143-ec18-4757-b81d-1f14b3786357
Web View: https://ml.azure.com/experiments/bowen-optimizing-ml-pipeline/runs/HD_85d07143-ec18-4757-b81d-1f14b3786357

{'runId': 'HD_85d07143-ec18-4757-b81d-1f14b3786357',
 'target': 'bowen-cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-12-27T22:58:57.208199Z',
 'endTimeUtc': '2020-12-27T23:25:29.859209Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '73f338f7-843e-4784-add9-561b0a23f9f4',
  'score': '0.912451719698363',
  'best_child_run_id': 'HD_85d07143-ec18-4757-b81d-1f14b3786357_39',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg132087.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_85d07143-ec18-4757-b81d-1f14b3786357/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=3xDt8BuxaguqWmi7Fr%2FknqrTnWUrA2M9KQhKGs2BALY%3D&st=2020-12-27T23%3A16%3A02Z&se=2020-12-28T07%3A26%3A02Z&sp=r'}}

In [13]:
# Stop here unless the HyperDrive run reports the "Completed" status.
assert hyperdrive_run.get_status() == "Completed"

### Save the best model from HyperDrive

In [14]:
import joblib

# Select the child run with the best primary metric.
best_run = hyperdrive_run.get_best_run_by_primary_metric()

# Register the model file saved by that run, tagging the registration with
# the run's metrics.
model = best_run.register_model(
    model_name='bank-marketing-predictions',
    model_path='./outputs/model.joblib',
    tags=best_run.get_metrics(),
)

# Display the names of the files stored with the best run.
best_run.get_file_names()

['azureml-logs/55_azureml-execution-tvmps_61f720a1321fd67fafb01123574e6a2970df95ff457d85aab167c983678afbd6_d.txt',
 'azureml-logs/65_job_prep-tvmps_61f720a1321fd67fafb01123574e6a2970df95ff457d85aab167c983678afbd6_d.txt',
 'azureml-logs/70_driver_log.txt',
 'azureml-logs/75_job_post-tvmps_61f720a1321fd67fafb01123574e6a2970df95ff457d85aab167c983678afbd6_d.txt',
 'azureml-logs/process_info.json',
 'azureml-logs/process_status.json',
 'logs/azureml/104_azureml.log',
 'logs/azureml/dataprep/backgroundProcess.log',
 'logs/azureml/dataprep/backgroundProcess_Telemetry.log',
 'logs/azureml/dataprep/engine_spans_l_c020003f-c523-4151-b794-8539738c8245.jsonl',
 'logs/azureml/dataprep/python_span_l_c020003f-c523-4151-b794-8539738c8245.jsonl',
 'logs/azureml/job_prep_azureml.log',
 'logs/azureml/job_release_azureml.log',
 'outputs/model.joblib']

In [15]:
# Display the best child run (queried again from the HyperDrive run) as the cell result.
hyperdrive_run.get_best_run_by_primary_metric()

Experiment,Id,Type,Status,Details Page,Docs Page
bowen-optimizing-ml-pipeline,HD_85d07143-ec18-4757-b81d-1f14b3786357_39,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


## AutoML on the same problem

### create dataset

create dataset using the TabularDatasetFactory

In [16]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create a tabular dataset from the bank-marketing training CSV hosted in
# the AutoML sample-notebook data container.
# from_delimited_files is a static factory method, so it is called on the
# class directly rather than on a throwaway TabularDatasetFactory() instance.
ds = TabularDatasetFactory.from_delimited_files(
    path="https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
)

Clean the dataset using the `clean_data` function from train.py

In [17]:
import sys

# Make the working directory importable so that `training.train` can be
# imported in the next cell. Guarded so that re-running the cell does not
# keep appending duplicate "." entries to sys.path.
if "." not in sys.path:
    sys.path.append(".")

In [18]:
from training.train import clean_data

# clean_data comes from training/train.py and is given the tabular dataset
# created above. Its result is unpacked into two objects: `x` and `y`
# (presumably the feature table and the label values - confirm in train.py).
x, y = clean_data(ds)

AutoML requires the label column `y` to be included in the same table as the features `x`

In [19]:
# Attach the labels to `x` as a column named "y"; the AutoML config further
# down refers to it through label_column_name="y". This modifies `x` in place.
x["y"] = y

train test split

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows as a test set; the fixed random_state keeps the
# split the same from run to run.
x_train, x_test = train_test_split(x, test_size=0.33, random_state=7)

Convert the DataFrame into a TabularDataset object, which is required by AutoML

In [21]:
from azureml.core.dataset import Dataset

# Local staging folder for the CSV that is uploaded to the datastore.
data_folder = "./data"

# exist_ok=True makes this safe to re-run on its own. A membership test
# against os.listdir() adds nothing here, because "./data" never equals a
# bare directory-entry name such as "data".
os.makedirs(data_folder, exist_ok=True)

# Save the training split as CSV. index=False keeps the DataFrame index out
# of the file, so AutoML does not receive it as an extra, meaningless
# feature column (assumes x_train is a pandas DataFrame).
x_train.to_csv(f"{data_folder}/training_data.csv", index=False)

# Upload the staging folder to the workspace's default datastore. The
# datastore gets its own name so that `ds` keeps referring to the tabular
# dataset created earlier and the earlier cells stay re-runnable.
datastore = ws.get_default_datastore()
datastore.upload(src_dir=data_folder, target_path='bankmarketing', overwrite=True, show_progress=True)

# Wrap the uploaded CSV as the tabular dataset that is passed to AutoML.
training_data = Dataset.Tabular.from_delimited_files(
    path=datastore.path('bankmarketing/training_data.csv')
)

Uploading an estimated of 1 files
Uploading ./data/training_data.csv
Uploaded ./data/training_data.csv, 1 files out of an estimated total of 1
Uploaded 1 files


### configure automl settings

configure automl

In [22]:
import logging 

# Keyword settings that are unpacked into AutoMLConfig in the next cell.
automl_settings = dict(
    # Time limits, in minutes: per iteration and for the whole experiment.
    iteration_timeout_minutes=10,
    experiment_timeout_minutes=30,
    enable_early_stopping=True,
    # Metric used to rank the candidate models.
    primary_metric='AUC_weighted',
    featurization='auto',
    verbosity=logging.INFO,
    n_cross_validations=5,
)

Initialize the AutoML config

In [23]:
from azureml.train.automl import AutoMLConfig

# Classification task on the uploaded training table, with "y" as the label
# column, run on the remote cluster. All remaining options come from
# automl_settings.
automl_config = AutoMLConfig(
    task="classification",
    compute_target=compute_target,
    training_data=training_data,
    label_column_name="y",
    **automl_settings
)

### submit autoML run

In [24]:
# Submit the AutoML configuration to its own experiment.
from azureml.core.experiment import Experiment

exp_auto = Experiment(workspace=ws, name="automatic-bank-marketing-model")
auto_run = exp_auto.submit(automl_config, show_output=True)

# Block until the AutoML run has finished; the returned run summary is
# displayed as the cell result.
auto_run.wait_for_completion()

Running on remote.
No run_configuration provided, running on bowen-cluster with default configuration
Running on remote compute: bowen-cluster
Parent Run ID: AutoML_ea7f8b9f-3cd6-4d7b-a8ad-322d11d22089

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------

{'runId': 'AutoML_ea7f8b9f-3cd6-4d7b-a8ad-322d11d22089',
 'target': 'bowen-cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-12-27T23:26:46.840842Z',
 'endTimeUtc': '2020-12-28T00:05:01.873595Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'bowen-cluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"51a141f1-7784-4010-8f71-c3ebafecd903\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"bankmarketing/training_data.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-132087\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"81cefad3-d2c9-4f77-a466-99a7f541c7bb

In [25]:
# Show the run-details widget for the AutoML run.
RunDetails(auto_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

### retrieve and save the best model

In [26]:
# Retrieve the result of the AutoML run. get_output() yields a pair, unpacked
# here as the best child run and its fitted model.
# NOTE: nothing is saved in this cell; the model is registered in a later cell.
best_run_customized, fitted_model_customized = auto_run.get_output()

In [34]:
# Inspect the fitted pipeline: steps[1] is its second (name, object) pair,
# [1] selects the object, and `.estimators` lists the member pipelines it holds.
fitted_model_customized.steps[1][1].estimators

[('21',
  Pipeline(memory=None,
           steps=[('standardscalerwrapper',
                   <azureml.automl.runtime.shared.model_wrappers.StandardScalerWrapper object at 0x7f3162217a58>),
                  ('xgboostclassifier',
                   XGBoostClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, eta=0.5, gamma=0.1,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=6, max_leaves=0,
                                     min_child_weight=1, missing=nan,
                                     n_estimators=50, n_jobs=1, nthread=None,
                                     objective='reg:logistic', random_state=0,
                                     reg_alpha=2.1875, reg_lambda=0,
                                     scale_pos_weight=1, seed=None, silent=None,
                            

In [35]:
# Display the full fitted pipeline returned by AutoML.
fitted_model_customized

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                               objective='reg:logistic',
                                                                                               random_state=0,
                                                                                               reg_alpha=2.291666666666667,
                                    

register the best model

In [27]:
# Display the best AutoML child run as the cell result.
best_run_customized

Experiment,Id,Type,Status,Details Page,Docs Page
automatic-bank-marketing-model,AutoML_ea7f8b9f-3cd6-4d7b-a8ad-322d11d22089_25,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [28]:
# Register the model file stored with the best AutoML child run, tagging
# the registration with that run's metrics.
model = best_run_customized.register_model(
    model_name='bank-marketing-predictions-from-automl',
    model_path='./outputs/model.pkl',
    tags=best_run_customized.get_metrics(),
)