## Connect to Azure Workspace

In [1]:
from azureml.core import Workspace, Experiment
from azureml.core.authentication import InteractiveLoginAuthentication

interactive_auth = InteractiveLoginAuthentication(tenant_id="660b3398-b80e-49d2-bc5b-ac1dc93b5254")

ws = Workspace(subscription_id="30d182b7-c8c4-421c-8fa0-d3037ecfe6d2",
               resource_group="aml-quickstarts-127691",
               workspace_name="quick-starts-ws-127691",
               auth=interactive_auth)

exp = Experiment(workspace=ws, name="udacity-project")
print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()



Performing interactive authentication. Please follow the instructions on the terminal.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.
Workspace name: quick-starts-ws-127691
Azure region: southcentralus
Subscription id: 30d182b7-c8c4-421c-8fa0-d3037ecfe6d2
Resource group: aml-quickstarts-127691


## Create Compute Target

In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

cpu_cluster_name = "udacity-cluster"
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Optimizing model using HyperDrive

In [None]:
#!pip install 'azureml-sdk[notebooks]'

In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os

# Specify parameter sampler
ps = RandomParameterSampling({
      '--C': uniform( 0.2, 2),
      '--max_iter': choice(100, 150, 200, 250, 300)
})

# Specify a Policy
policy = BanditPolicy(slack_factor = 0.1, evaluation_interval = 2, delay_evaluation = 5)

if "training" not in os.listdir():
    os.mkdir("./training")

# Create a SKLearn estimator for use with train.py
est = SKLearn(source_directory = './', entry_script = 'train.py', compute_target = cpu_cluster)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(estimator = est,
                                     hyperparameter_sampling = ps,
                                     policy = policy,
                                     primary_metric_name = "Accuracy",
                                     primary_metric_goal = PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs = 40,
                                     max_concurrent_runs = 4)

In [4]:
# Submit your hyperdrive run to the experiment and show run details with the widget.
hyperdrive_run = exp.submit(hyperdrive_config)
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_b4c020a2-973c-4ad3-b383-2c126509595f
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_b4c020a2-973c-4ad3-b383-2c126509595f?wsid=/subscriptions/30d182b7-c8c4-421c-8fa0-d3037ecfe6d2/resourcegroups/aml-quickstarts-127691/workspaces/quick-starts-ws-127691

Streaming azureml-logs/hyperdrive.txt

"<START>[2020-11-24T11:13:51.692763][API][INFO]Experiment created<END>\n""<START>[2020-11-24T11:13:52.496024][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2020-11-24T11:13:52.824322][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2020-11-24T11:13:53.1918433Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>

Execution Summary
RunId: HD_b4c020a2-973c-4ad3-b383-2c126509595f
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_b4c020a2-973c-4ad3-b383-2c126509595f?wsid=/subscriptions/30d18

{'runId': 'HD_b4c020a2-973c-4ad3-b383-2c126509595f',
 'target': 'udacity-cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-11-24T11:13:51.464016Z',
 'endTimeUtc': '2020-11-24T11:37:11.506665Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '84bf0b30-fff1-47fe-a49b-24ba517b4254',
  'score': '0.9148710166919575',
  'best_child_run_id': 'HD_b4c020a2-973c-4ad3-b383-2c126509595f_26',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg127691.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_b4c020a2-973c-4ad3-b383-2c126509595f/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=dPie81Z0bFSX4QR5bUIlsX7B1IWdPlvU9BiXsPXdHcg%3D&st=2020-11-24T11%3A27%3A23Z&se=2020-11-24T19%3A37%3A23Z&sp=r'}}

In [5]:
# Get your best run.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()['runDefinition']['arguments']
print('Best Run Id: ', best_run.id)
print('Accuracy:', best_run_metrics['Accuracy'])
print('Parameter Values:', parameter_values)

Best Run Id:  HD_b4c020a2-973c-4ad3-b383-2c126509595f_26
Accuracy: 0.9148710166919575
Parameter Values: ['--C', '0.8997018907090413', '--max_iter', '100']


In [6]:
import joblib
# Save the model from the best run.
joblib.dump(value=best_run.get_details()['runDefinition']['arguments'], 
            filename=os.path.join('outputs', 'best_hyperdrive_model.joblib'))

['outputs/best_hyperdrive_model.joblib']

## Optimizing model using Azure AutoML

In [7]:
from azureml.data.dataset_factory import TabularDatasetFactory
import pandas as pd

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
dataset = TabularDatasetFactory.from_delimited_files(path=url)
pd_data = dataset.to_pandas_dataframe()

# dimensions
m, k = pd_data.shape

print("{} x {} table of data:".format(m, k))
display(pd_data.head())
print("...")

32950 x 21 table of data:


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,57,technician,married,high.school,no,no,yes,cellular,may,mon,...,1,999,1,failure,-1.8,92.893,-46.2,1.299,5099.1,no
1,55,unknown,married,unknown,unknown,yes,no,telephone,may,thu,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.86,5191.0,no
2,33,blue-collar,married,basic.9y,no,no,no,cellular,may,fri,...,1,999,1,failure,-1.8,92.893,-46.2,1.313,5099.1,no
3,36,admin.,married,high.school,no,no,no,telephone,jun,fri,...,4,999,0,nonexistent,1.4,94.465,-41.8,4.967,5228.1,no
4,27,housemaid,married,high.school,no,yes,no,cellular,jul,fri,...,2,999,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1,no


...


In [7]:
#!pip3 install -U scikit-learn scipy matplotlib

Collecting scikit-learn
  Downloading scikit_learn-0.23.2-cp38-cp38-macosx_10_9_x86_64.whl (7.2 MB)
[K     |████████████████████████████████| 7.2 MB 2.6 MB/s eta 0:00:01
[?25hCollecting scipy
  Downloading scipy-1.5.4-cp38-cp38-macosx_10_9_x86_64.whl (29.0 MB)
[K     |████████████████████████████████| 29.0 MB 1.4 MB/s eta 0:00:01
[?25hCollecting matplotlib
  Downloading matplotlib-3.3.3-cp38-cp38-macosx_10_9_x86_64.whl (8.5 MB)
[K     |████████████████████████████████| 8.5 MB 199 kB/s eta 0:00:01
[?25hCollecting joblib>=0.11
  Downloading joblib-0.17.0-py3-none-any.whl (301 kB)
[K     |████████████████████████████████| 301 kB 637 kB/s eta 0:00:01
[?25hCollecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-2.1.0-py3-none-any.whl (12 kB)
Installing collected packages: joblib, scipy, threadpoolctl, scikit-learn, matplotlib
  Attempting uninstall: scipy
    Found existing installation: scipy 1.5.2
    Uninstalling scipy-1.5.2:
      Successfully uninstalled scipy-1.5.2
  Atte

In [8]:
from train import clean_data
from sklearn.model_selection import train_test_split

# Use the clean_data function to clean your data.
x, y = clean_data(dataset)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
train_data = x_train.join(y_train)

In [17]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data=train_data,
    label_column_name='y',
    n_cross_validations=5)

In [18]:
# Submit your automl run
from azureml.widgets import RunDetails
from azureml.core.experiment import Experiment

expirement = Experiment(workspace=ws, name="automl_remote")  
auto_run = expirement.submit(automl_config, show_output = True)

ValidationException: ValidationException:
	Message: Install the required versions of packages using the requirements file. Requirements file location: /Users/franckess/anaconda3/envs/udacity/lib/python3.8/site-packages/azureml/automl/core/validated_darwin_requirements.txt. Alternatively, use remote target to avoid dependency management. 
Required version/Installed version
azure-mgmt-resource<=10.2.0/azure-mgmt-resource 10.3.0
cryptography<=3.1.1/cryptography 3.2.1
joblib<=0.14.1/joblib 0.17.0
numpy<=1.18.5/numpy 1.19.2
pandas<=0.25.3/pandas 1.1.4
scikit-learn<=0.22.2.post1/scikit-learn 0.23.2
scipy<=1.5.2/scipy 1.5.4
tqdm<=4.50.2/tqdm 4.53.0
zipp<=3.3.1/zipp 3.4.0
	InnerException: None
	ErrorResponse 
{
    "error": {
        "code": "UserError",
        "message": "Install the required versions of packages using the requirements file. Requirements file location: /Users/franckess/anaconda3/envs/udacity/lib/python3.8/site-packages/azureml/automl/core/validated_darwin_requirements.txt. Alternatively, use remote target to avoid dependency management. \nRequired version/Installed version\nazure-mgmt-resource<=10.2.0/azure-mgmt-resource 10.3.0\ncryptography<=3.1.1/cryptography 3.2.1\njoblib<=0.14.1/joblib 0.17.0\nnumpy<=1.18.5/numpy 1.19.2\npandas<=0.25.3/pandas 1.1.4\nscikit-learn<=0.22.2.post1/scikit-learn 0.23.2\nscipy<=1.5.2/scipy 1.5.4\ntqdm<=4.50.2/tqdm 4.53.0\nzipp<=3.3.1/zipp 3.4.0",
        "inner_error": {
            "code": "NotSupported",
            "inner_error": {
                "code": "IncompatibleOrMissingDependency"
            }
        },
        "reference_code": "65e7ad52-ad1f-4915-85ad-440d0a5e221f"
    }
}

In [23]:
expirement.workspace

Workspace.create(name='quick-starts-ws-127691', subscription_id='30d182b7-c8c4-421c-8fa0-d3037ecfe6d2', resource_group='aml-quickstarts-127691')