In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-227206
Azure region: westeurope
Subscription id: a24a24d5-8d87-4c8a-99b6-91ed2d2df51f
Resource group: aml-quickstarts-227206


In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "project-one-cluster"

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###
try:
    aml_compute = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_D2_V2',
                                                           max_nodes=4)
    aml_compute = ComputeTarget.create(ws, cluster_name, compute_config)

aml_compute.wait_for_completion(show_output=True)

Found existing cluster, use it.

Running


In [63]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform, normal
from azureml.core import Environment, ScriptRunConfig
import os

# Specify parameter sampler
### YOUR CODE HERE ###
ps = RandomParameterSampling({"batch_size": choice(16, 32, 64, 128), 
                               "learning_rate": normal(10, 3), 
                               "keep_probability": uniform(0.05, 0.1)})



# Specify a Policy
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

if "training" not in os.listdir():
    os.mkdir("./training")

# Setup environment for your training run
sklearn_env = Environment.from_conda_specification(name='sklearn-env', file_path='conda_dependencies.yml')

# Create a ScriptRunConfig Object to specify the configuration details of your training job

args = ['--max_iter', 100, '--c', 1.0]

src = ScriptRunConfig(source_directory='.',
                      script='train.py',
                      compute_target=aml_compute,
                      arguments=args,
                      environment=sklearn_env)

# Create a HyperDriveConfig using the src object, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(run_config=src,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                     primary_metric_name='Accuracy',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=20,
                                     max_concurrent_runs=4)

In [64]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

### YOUR CODE HERE ###
hyperdrive_run = exp.submit(config=hyperdrive_config)

In [55]:
import joblib
# Get your best run and save the model from that run.

### YOUR CODE HERE ###
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)
assert(hyperdrive_run.get_status() == "Completed")

best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()['runDefinition']['Arguments']

print('Best Run Id: ', best_run.id)
print('\n Accuracy:', best_run_metrics['accuracy'])
print('\n learning rate:',parameter_values[3])
print('\n keep probability:',parameter_values[5])
print('\n batch size:',parameter_values[7])

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_e9c06fa1-bb54-45cc-9aa2-6e08ddd6efda
Web View: https://ml.azure.com/runs/HD_e9c06fa1-bb54-45cc-9aa2-6e08ddd6efda?wsid=/subscriptions/a24a24d5-8d87-4c8a-99b6-91ed2d2df51f/resourcegroups/aml-quickstarts-227206/workspaces/quick-starts-ws-227206&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

Execution Summary
RunId: HD_e9c06fa1-bb54-45cc-9aa2-6e08ddd6efda
Web View: https://ml.azure.com/runs/HD_e9c06fa1-bb54-45cc-9aa2-6e08ddd6efda?wsid=/subscriptions/a24a24d5-8d87-4c8a-99b6-91ed2d2df51f/resourcegroups/aml-quickstarts-227206/workspaces/quick-starts-ws-227206&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

Execution failed. User process '/azureml-envs/azureml_7eec2c8971b9410f92147a7e257297e7/bin/python' exited with status code 2. Please check log file 'user_logs/std_log.txt' for error details. Error: Traceback (most recent call last):
  File "train.py", lin



ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "Execution failed. User process '/azureml-envs/azureml_7eec2c8971b9410f92147a7e257297e7/bin/python' exited with status code 2. Please check log file 'user_logs/std_log.txt' for error details. Error: Traceback (most recent call last):\n  File \"train.py\", line 76, in <module>\n    main()\n  File \"train.py\", line 47, in main\n    args = parser.parse_args()\n  File \"/azureml-envs/azureml_7eec2c8971b9410f92147a7e257297e7/lib/python3.6/argparse.py\", line 1733, in parse_args\n    self.error(msg % ' '.join(argv))\n  File \"/azureml-envs/azureml_7eec2c8971b9410f92147a7e257297e7/lib/python3.6/argparse.py\", line 2389, in error\n    self.exit(2, _('%(prog)s: error: %(message)s\\n') % args)\n  File \"/azureml-envs/azureml_7eec2c8971b9410f92147a7e257297e7/lib/python3.6/argparse.py\", line 2376, in exit\n    _sys.exit(status)\nSystemExit: 2\n\n Marking the experiment as failed because initial child jobs have failed due to user error",
        "messageParameters": {},
        "details": []
    },
    "time": "0001-01-01T00:00:00.000Z"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"Execution failed. User process '/azureml-envs/azureml_7eec2c8971b9410f92147a7e257297e7/bin/python' exited with status code 2. Please check log file 'user_logs/std_log.txt' for error details. Error: Traceback (most recent call last):\\n  File \\\"train.py\\\", line 76, in <module>\\n    main()\\n  File \\\"train.py\\\", line 47, in main\\n    args = parser.parse_args()\\n  File \\\"/azureml-envs/azureml_7eec2c8971b9410f92147a7e257297e7/lib/python3.6/argparse.py\\\", line 1733, in parse_args\\n    self.error(msg % ' '.join(argv))\\n  File \\\"/azureml-envs/azureml_7eec2c8971b9410f92147a7e257297e7/lib/python3.6/argparse.py\\\", line 2389, in error\\n    self.exit(2, _('%(prog)s: error: %(message)s\\\\n') % args)\\n  File \\\"/azureml-envs/azureml_7eec2c8971b9410f92147a7e257297e7/lib/python3.6/argparse.py\\\", line 2376, in exit\\n    _sys.exit(status)\\nSystemExit: 2\\n\\n Marking the experiment as failed because initial child jobs have failed due to user error\",\n        \"messageParameters\": {},\n        \"details\": []\n    },\n    \"time\": \"0001-01-01T00:00:00.000Z\"\n}"
    }
}

# AutoML Part

In [7]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###
df = TabularDatasetFactory().from_delimited_files("https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv")


In [8]:
# from train import clean_data

# # Use the clean_data function to clean your data.
# x, y = clean_data(df)

In [9]:
from azureml.train.automl.utilities import get_primary_metrics
get_primary_metrics('classification')

['average_precision_score_weighted',
 'precision_score_weighted',
 'AUC_weighted',
 'norm_macro_recall',
 'accuracy']

In [26]:
import pandas as pd

all_data = pd.concat([x, y], axis='columns')
all_data

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,y
0,57,1,no,0,1,5,1,371,1,999,...,0,0,0,0,1,0,0,0,0,0
1,55,1,unknown,1,0,5,4,285,2,999,...,1,0,0,0,0,0,0,0,1,0
2,33,1,no,0,0,5,5,52,1,999,...,0,0,0,1,0,0,0,0,0,0
3,36,1,no,0,0,6,5,355,4,999,...,1,0,0,0,1,0,0,0,0,0
4,27,1,no,1,0,7,5,189,2,999,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32945,56,1,no,0,1,7,1,116,1,999,...,0,1,0,0,0,0,0,0,0,0
32946,37,1,no,0,1,7,5,69,7,999,...,0,0,0,0,0,0,0,1,0,0
32947,26,0,no,0,0,5,2,135,4,999,...,0,0,0,0,0,0,0,1,0,0
32948,31,0,no,0,0,4,1,386,1,999,...,0,0,0,1,0,0,0,0,0,0


In [33]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.

# Example Code
# automl_config = AutoMLConfig(
#     experiment_timeout_minutes=30,
#     task='classification',
#     primary_metric='accuracy',
#     X=x,
#     y=y,
#     training_data=all_data,
#     iterations=30,
#     label_column_name='y',
#     n_cross_validations=5)

# What I tried
# automl_config = AutoMLConfig(task="classification",
#                              experiment_timeout_minutes=30,
#                              X=x,
#                              y=y,
#                              iterations=30,
#                              iteration_timeout_minutes=5,
#                              primary_metric="accuracy",
#                              label_column_name='y',
#                              n_cross_validations=5
#                             )


# New code
automl_settings = {
    "experiment_timeout_hours" : 0.5,
    "iteration_timeout_minutes" : 10,
    "iterations" : 30,
    "primary_metric" : 'accuracy',
    "n_cross_validations": 5
}

automl_config = AutoMLConfig(task = 'classification',
                             debug_log = 'automated_ml_errors.log',
                             path = '.',
                             compute_target = aml_compute,
                             featurization = 'auto',
                             training_data = df,
                             label_column_name = 'y',
                             **automl_settings)
                             
print("AutoML config created.")

AutoML config created.


In [34]:
# My old code
# automl_settings = {
#     "iterations": 30,
#     "iteration_timeout_minutes": 10,
#     "experiment_timeout_hours": 0.3,
#     "enable_early_stopping": True,
#     "primary_metric": 'accuracy',
#     "featurization": 'auto',
#     "verbosity": logging.INFO,
#     "n_cross_validations": 5
# }

# automl_config = AutoMLConfig(task='classification',
#                              debug_log='automated_ml_errors.log',
#                              training_data=all_data,
#                              label_column_name="y",
#                              **automl_settings)

In [35]:
# Submit your automl run

### YOUR CODE HERE ###
automl_run = exp.submit(automl_config, show_output=True)

Submitting remote run.
No run_configuration provided, running on project-one-cluster with default configuration
Running on remote compute: project-one-cluster


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_7109ba33-356d-4240-87ae-8d272116f413,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+------------------------------+--------------------------------+-------------------------------------

In [None]:
# Retrieve and save your best automl model.

### YOUR CODE HERE ###
RunDetails(automl_run).show()
automl_run.wait_for_completion(show_output=True)
assert(automl_run.get_status() == "Completed")

best_run = automl_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()['runDefinition']['Arguments']

print('Best Run Id: ', best_run.id)
print('\n Accuracy:', best_run_metrics['accuracy'])
print('\n learning rate:',parameter_values[3])
print('\n keep probability:',parameter_values[5])
print('\n batch size:',parameter_values[7])

# Cleanup

In [None]:
ws.delete(delete_dependent_resources=True, no_wait=False)

In [None]:
aml_compute.delete()