## Connect to Azure Workspace

In [12]:
from azureml.core import Workspace, Experiment
from azureml.core.authentication import InteractiveLoginAuthentication

interactive_auth = InteractiveLoginAuthentication(tenant_id="660b3398-b80e-49d2-bc5b-ac1dc93b5254")

ws = Workspace(subscription_id="fb968fd6-afa4-4fb2-8296-1d0120d715b4",
               resource_group="aml-quickstarts-127927",
               workspace_name="quick-starts-ws-127927",
               auth=interactive_auth)

exp = Experiment(workspace=ws, name="udacity-project")
print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()



Performing interactive authentication. Please follow the instructions on the terminal.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.
Workspace name: quick-starts-ws-127927
Azure region: southcentralus
Subscription id: fb968fd6-afa4-4fb2-8296-1d0120d715b4
Resource group: aml-quickstarts-127927


## Create Compute Target

In [13]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

aml_name = "udacity-cluster"
try:
    aml_compute = AmlCompute(ws, aml_name)
    print('Found existing AML compute context.')
except:
    print('Creating new AML compute context.')
    aml_config = AmlCompute.provisioning_configuration(vm_size = "Standard_D2_v2", min_nodes=1, max_nodes=4)
    aml_compute = AmlCompute.create(ws, name = aml_name, provisioning_configuration = aml_config)
    aml_compute.wait_for_completion(show_output = True)

Creating new AML compute context.
Creating
Succeeded........
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Optimizing model using HyperDrive

In [None]:
#!pip install 'azureml-sdk[notebooks]'

In [18]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os

# Specify parameter sampler
ps = RandomParameterSampling({
      '--C': uniform( 0.2, 5),
      '--max_iter': choice(100, 150, 200, 250, 300, 400)
})

# Specify a Policy
policy = BanditPolicy(slack_factor = 0.1, evaluation_interval = 2, delay_evaluation = 5)

if "training" not in os.listdir():
    os.mkdir("./training")

# Create a SKLearn estimator for use with train.py
est = SKLearn(source_directory = './', entry_script = 'train.py', compute_target = aml_name)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(estimator = est,
                                     hyperparameter_sampling = ps,
                                     policy = policy,
                                     primary_metric_name = "Accuracy",
                                     primary_metric_goal = PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs = 40,
                                     max_concurrent_runs = 4)

In [19]:
# Submit your hyperdrive run to the experiment and show run details with the widget.
hyperdrive_run = exp.submit(hyperdrive_config)
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_8e93eda4-1af5-4010-bf97-f6a452e27295
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_8e93eda4-1af5-4010-bf97-f6a452e27295?wsid=/subscriptions/fb968fd6-afa4-4fb2-8296-1d0120d715b4/resourcegroups/aml-quickstarts-127927/workspaces/quick-starts-ws-127927

Streaming azureml-logs/hyperdrive.txt

"<START>[2020-11-25T23:52:49.568995][API][INFO]Experiment created<END>\n""<START>[2020-11-25T23:52:50.199495][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2020-11-25T23:52:50.386896][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2020-11-25T23:52:51.2939385Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>

Execution Summary
RunId: HD_8e93eda4-1af5-4010-bf97-f6a452e27295
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_8e93eda4-1af5-4010-bf97-f6a452e27295?wsid=/subscriptions/fb968

{'runId': 'HD_8e93eda4-1af5-4010-bf97-f6a452e27295',
 'target': 'udacity-cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-11-25T23:52:49.336295Z',
 'endTimeUtc': '2020-11-26T00:19:40.552201Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'f95b6e58-5f0f-430a-a595-76572a254e07',
  'score': '0.9074355083459787',
  'best_child_run_id': 'HD_8e93eda4-1af5-4010-bf97-f6a452e27295_8',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg127927.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_8e93eda4-1af5-4010-bf97-f6a452e27295/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=00SLDLCt%2BA2KU6woznTrS0Zd%2FxcgPxl%2FAzeTegDbe5o%3D&st=2020-11-26T00%3A09%3A59Z&se=2020-11-26T08%3A19%3A59Z&sp=r'}}

In [20]:
# Get your best run.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()['runDefinition']['arguments']
print('Best Run Id: ', best_run.id)
print('Accuracy:', best_run_metrics['Accuracy'])
print('Parameter Values:', parameter_values)

Best Run Id:  HD_8e93eda4-1af5-4010-bf97-f6a452e27295_8
Accuracy: 0.9074355083459787
Parameter Values: ['--C', '2.750678484778166', '--max_iter', '300']


In [21]:
import joblib
# Save the model from the best run.
joblib.dump(value=best_run.get_details()['runDefinition']['arguments'], 
            filename=os.path.join('outputs', 'best_hyperdrive_model.joblib'))

['outputs/best_hyperdrive_model.joblib']

## Optimizing model using Azure AutoML

In [22]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path=url)

In [23]:
pd_data = ds.to_pandas_dataframe()
# dimensions
m, k = pd_data.shape
print("{} x {} table of data:".format(m, k))
display(pd_data.head())
print("...")

32950 x 21 table of data:


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,57,technician,married,high.school,no,no,yes,cellular,may,mon,...,1,999,1,failure,-1.8,92.893,-46.2,1.299,5099.1,no
1,55,unknown,married,unknown,unknown,yes,no,telephone,may,thu,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.86,5191.0,no
2,33,blue-collar,married,basic.9y,no,no,no,cellular,may,fri,...,1,999,1,failure,-1.8,92.893,-46.2,1.313,5099.1,no
3,36,admin.,married,high.school,no,no,no,telephone,jun,fri,...,4,999,0,nonexistent,1.4,94.465,-41.8,4.967,5228.1,no
4,27,housemaid,married,high.school,no,yes,no,cellular,jul,fri,...,2,999,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1,no


...


In [24]:
from azureml.core import Workspace, Dataset
from train import clean_data

# Use the clean_data function to clean your data.
x, y = clean_data(ds)
x['target'] = y
local_path = 'dataset/DT.csv'
x.to_csv(local_path)

# get the datastore to upload prepared data
datastore = ws.get_default_datastore()

# upload the local file from src_dir to the target_path in datastore
datastore.upload(src_dir='dataset', target_path='data')

# create a dataset referencing the cloud location
dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, ('data/DT.csv'))])

Uploading an estimated of 1 files
Uploading dataset/DT.csv
Uploaded dataset/DT.csv, 1 files out of an estimated total of 1
Uploaded 1 files


In [25]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data=dataset,
    compute_target=cpu_cluster_name,
    label_column_name='target',
    n_cross_validations=5)

In [26]:
# Submit your automl run
from azureml.widgets import RunDetails
from azureml.core.experiment import Experiment

expirement = Experiment(workspace=ws, name="automl_remote")  
auto_run = expirement.submit(automl_config, show_output = True)
RunDetails(auto_run).show()
auto_run.wait_for_completion(show_output=True)

Running on remote.
Running on remote compute: udacity-cluster
Parent Run ID: AutoML_38e8b43f-ebef-4092-905f-3450601be4ae

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class 

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…



****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of samples in the training data|
|3692                             |1                                |32950                                 |
+---------------------------------+---------------------------------+--------------------------------------+

********************************************

{'runId': 'AutoML_38e8b43f-ebef-4092-905f-3450601be4ae',
 'target': 'udacity-cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-11-26T00:50:01.228682Z',
 'endTimeUtc': '2020-11-26T01:31:18.560202Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'udacity-cluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"dfb1b7e9-4142-4bad-8b0a-d2e1acc2f1f4\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"data/DT.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-127927\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"fb968fd6-afa4-4fb2-8296-1d0120d715b4\\\\\\", \\\\\\"work

#### !pip install azureml-train-automl-runtime==1.17.0

In [27]:
## Validate our highest performing model
best_run, fitted_model = auto_run.get_output()
print(best_run)
print(fitted_model)



Run(Experiment: automl_remote,
Id: AutoML_38e8b43f-ebef-4092-905f-3450601be4ae_25,
Type: azureml.scriptrun,
Status: Completed)
None


In [None]:
## Delete compute cluster
aml_compute.delete()