In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name)

run = exp.start_logging()

Workspace name: PHSA-DARE-M-SBOX-ML-001


In [2]:
import azureml.core
# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.19.0


In [3]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###
vm_size = "STANDARD_D2_V2"
max_nodes = 4
cluster_name = 'clusterfrank'
cluster_config = AmlCompute.provisioning_configuration(
    vm_size = vm_size,
    min_nodes = 0, max_nodes = max_nodes,
    vm_priority = 'dedicated'
)
cluster = ComputeTarget.create(
    ws, cluster_name, cluster_config
)
cluster.wait_for_completion()

In [4]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling, BayesianParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
import os

# Specify parameter sampler
ps = RandomParameterSampling({
    '--C': uniform(0.01, 10)
})

# Specify a Policy
policy = BanditPolicy(
    slack_amount = 0.2,
    evaluation_interval = 1,
    delay_evaluation = 5
)

if "training" not in os.listdir():
    os.mkdir("./training")

# Create a SKLearn estimator for use with train.py
est = SKLearn(
    source_directory = './',
    entry_script = 'train.py',
    compute_target = cluster
)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(
    estimator = est,
    hyperparameter_sampling = ps,
    policy = policy,
    primary_metric_name = 'ROC_AUC',
    primary_metric_goal = PrimaryMetricGoal.MAXIMIZE,
    max_total_runs = 40,
    max_concurrent_runs = 4
)

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.


In [5]:
# Submit your hyperdrive run to the experiment and show run details with the widget.
run = exp.submit(config = hyperdrive_config)
RunDetails(run).show()
run.wait_for_completion()



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

{'runId': 'HD_ff8a46f0-b04c-4e2e-ac82-75d56492b334',
 'target': 'clusterfrank',
 'status': 'Completed',
 'startTimeUtc': '2021-01-01T23:35:45.31477Z',
 'endTimeUtc': '2021-01-01T23:57:06.745296Z',
 'properties': {'primary_metric_config': '{"name": "ROC_AUC", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'a2ecc955-e74d-4969-b224-4975a7da889c',
  'score': '0.9294585952254855',
  'best_child_run_id': 'HD_ff8a46f0-b04c-4e2e-ac82-75d56492b334_33',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://phsadaremsboxm6602792567.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_ff8a46f0-b04c-4e2e-ac82-75d56492b334/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=aG%2Frl9bi7mLS2CVDiVbYtC227cXD9fQbrtx03hhSgxc%3D&st=2021-01-01T23%3A47%3A08Z&se=2021-01-02T07%3A57%3A08Z&sp=r'}}

In [6]:
best_run = run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
best_run_arguments = best_run.get_details()['runDefinition']['arguments']

In [7]:
best_run.get_file_names()

['azureml-logs/55_azureml-execution-tvmps_5a6a5fc55ef7af0b4f2531d71929a9519ab3eb4e76daca6fab7588fbfa497b77_d.txt',
 'azureml-logs/65_job_prep-tvmps_5a6a5fc55ef7af0b4f2531d71929a9519ab3eb4e76daca6fab7588fbfa497b77_d.txt',
 'azureml-logs/70_driver_log.txt',
 'azureml-logs/75_job_post-tvmps_5a6a5fc55ef7af0b4f2531d71929a9519ab3eb4e76daca6fab7588fbfa497b77_d.txt',
 'azureml-logs/process_info.json',
 'azureml-logs/process_status.json',
 'logs/azureml/105_azureml.log',
 'logs/azureml/job_prep_azureml.log',
 'logs/azureml/job_release_azureml.log',
 'outputs/model.pkl']

In [8]:
best_run_metrics

{'Regularization Strength:': 7.752574958591534,
 'Max iterations:': 100,
 'Accuracy': 0.9105715730905413,
 'ROC_AUC': 0.9294585952254855}

In [9]:
best_run_arguments

['--C', '7.752574958591534']

In [10]:
import joblib
# Get your best run and save the model from that run.

### YOUR CODE HERE ###
model = best_run.register_model(
    model_name = 'lgt-hyperdrive', 
    model_path = 'outputs',
    tags = {'training context': 'HyperDrive'},
    properties = {'Accuracy': best_run_metrics['Accuracy'], 
                  'ROC_AUC': best_run_metrics['ROC_AUC'],
                  'Arguments': best_run_arguments}
)

In [11]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###
url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
dset = TabularDatasetFactory.from_delimited_files(path = url)
dataset = dset.register(
    workspace = ws,
    name = 'bank-marketing',
    description='training and test dataset',
    create_new_version=True
)

In [12]:
from train import clean_data

# Use the clean_data function to clean your data.
x, y = clean_data(dataset)
train_set = x.join(y)
train_set.to_csv('train.csv')

In [13]:
from azureml.core import Dataset
datastore = ws.get_default_datastore()
datastore.upload_files(files=['./train.csv'], target_path='data')
tab_dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, ('data/train.csv'))])

Uploading an estimated of 1 files
Uploading ./train.csv
Uploaded ./train.csv, 1 files out of an estimated total of 1
Uploaded 1 files


In [14]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    name = 'project-AutoML',
    experiment_timeout_minutes=30,
    task='classification', 
    compute_target=cluster,
    primary_metric='AUC_weighted',
    training_data = tab_dataset,
    label_column_name = 'y',
    n_cross_validations=5, 
    featurization = 'auto')

In [15]:
# Submit your automl run

### YOUR CODE HERE ###
run = exp.submit(automl_config)
RunDetails(run).show()
run.wait_for_completion(show_output=True)

Running on remote.


_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…


Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of 

{'runId': 'AutoML_6e977ee5-8502-4653-bf93-424a3780bdd0',
 'target': 'clusterfrank',
 'status': 'Completed',
 'startTimeUtc': '2021-01-01T23:57:54.12949Z',
 'endTimeUtc': '2021-01-02T00:36:42.402622Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'clusterfrank',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"ca39cfc8-1db3-42e6-8d02-0a65d8d9ffd8\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"data/train.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"PHSA-DARE-M-SBOX-CC-RG-TD-001\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"bd4cbcbc-2f34-4d26-94a7-6bf4b7560285\\\\\\", \\\\

In [16]:
# run.wait_for_completion(show_output=True)

In [17]:
# Retrieve and save your best automl model.

### YOUR CODE HERE ###
best_run, best_model = run.get_output()
print(best_run)
print(best_model)

Run(Experiment: udacity-project,
Id: AutoML_6e977ee5-8502-4653-bf93-424a3780bdd0_29,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                               objective='reg:logistic',
                                                                                               random_state=0,
                               

In [18]:
best_run.download_file('outputs/model.pkl', './automl_best_model.pkl')
best_run_metrics = best_run.get_metrics()
best_run_arguments = best_run.get_details()['runDefinition']['arguments']

In [19]:
best_model[1]

PreFittedSoftVotingClassifier(classification_labels=None,
                              estimators=[('0',
                                           Pipeline(memory=None,
                                                    steps=[('maxabsscaler',
                                                            MaxAbsScaler(copy=True)),
                                                           ('lightgbmclassifier',
                                                            LightGBMClassifier(boosting_type='gbdt',
                                                                               class_weight=None,
                                                                               colsample_bytree=1.0,
                                                                               importance_type='split',
                                                                               learning_rate=0.1,
                                                                               max_

In [20]:
print(f"AUC ROC weighted = {best_run_metrics['AUC_weighted']}, accuracy = {best_run_metrics['accuracy']}")

AUC ROC weighted = 0.9497546999664417, accuracy = 0.9158118361153262


In [21]:
best_run.register_model(
    model_path = 'outputs/model.pkl',
    model_name = 'automl_best_model',
    tags = {'training context': 'AutoML'},
    properties = {
        'AUC': best_run_metrics['AUC_weighted'],
        'accuracy': best_run_metrics['accuracy']
    }
)

Model(workspace=Workspace.create(name='PHSA-DARE-M-SBOX-ML-001', subscription_id='bd4cbcbc-2f34-4d26-94a7-6bf4b7560285', resource_group='PHSA-DARE-M-SBOX-CC-RG-TD-001'), name=automl_best_model, id=automl_best_model:1, version=1, tags={'training context': 'AutoML'}, properties={'AUC': '0.9497546999664417', 'accuracy': '0.9158118361153262'})

In [22]:
run.cancel()

In [23]:
cluster.delete()