In [1]:
# configure workspace and the experiment
from azureml.core import Workspace, Experiment

#ws = Workspace.get(name="quick-starts-ws-127598")
# Resolve the workspace via Workspace.from_config() and open the experiment in it.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

# Report which workspace this notebook is attached to, one field per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

# Start an interactive logging run on the experiment.
# NOTE(review): `run` is not referenced again in the visible cells and is never
# explicitly completed - confirm whether this run is still needed.
run = exp.start_logging()

Performing interactive authentication. Please follow the instructions on the terminal.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code FAL7AL6VA to authenticate.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.
Workspace name: quick-starts-ws-127790
Azure region: southcentralus
Subscription id: 4910dccd-0348-46c4-a51f-d8c85e078b14
Resource group: aml-quickstarts-127790


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

amlcompute_cluster_name = "cpu-cluster"

# Look the cluster up by name first; only when that lookup raises
# ComputeTargetException is a new cluster provisioned.
try:
    aml_compute = ComputeTarget(name=amlcompute_cluster_name, workspace=ws)
    print('Found existing compute target')
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        max_nodes=4,
        vm_size="Standard_D2_v2",
    )

    # create the cluster
    aml_compute = ComputeTarget.create(
        provisioning_configuration=compute_config,
        name=amlcompute_cluster_name,
        workspace=ws,
    )

# Wait on the compute target (existing or newly created), then print its status.
aml_compute.wait_for_completion(show_output=True)

cluster_status = aml_compute.get_status()
print(cluster_status.serialize())

Creating a new compute target...
Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-11-25T00:31:15.105000+00:00', 'errors': None, 'creationTime': '2020-11-25T00:31:11.351925+00:00', 'modifiedTime': '2020-11-25T00:31:49.415159+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


In [3]:
# import relevant packages and libraries
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os
import shutil
from azureml.core import Environment

# Specify parameter sampler: --C is drawn uniformly from [0.001, 1.0] and
# --max_iter is picked from a fixed set of candidates.
# The candidates start at 10: a value of 0 would presumably give the solver in
# train.py no iterations at all and waste one of the few sampled runs
# (assumes train.py forwards --max_iter to the model - confirm).
ps = RandomParameterSampling({
    '--C': uniform(0.001, 1.0),
    '--max_iter': choice(10, 50, 100, 150, 200)
})

# Specify a Policy
policy = BanditPolicy(evaluation_interval=3, slack_factor=0.1, delay_evaluation=3)

# Directory handed to the estimator as its source directory.
# makedirs(exist_ok=True) creates it when missing and is a no-op when it is
# already there, replacing the manual listdir()/mkdir() check.
train_script = "./outputs"
os.makedirs(train_script, exist_ok=True)

# Create a SKLearn estimator for use with train.py
# Copy the training script into the source directory defined above
shutil.copy('train.py', train_script)

est = SKLearn(
    source_directory=train_script,
    compute_target=aml_compute,
    entry_script='train.py',
    framework_version='0.20.3'
    )


# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=4
)

In [4]:
# Submit the HyperDrive configuration to the experiment, then show the
# run-details widget for the submitted run.
hyperdrive_run = exp.submit(config=hyperdrive_config)

details_widget = RunDetails(hyperdrive_run)
details_widget.show()



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [5]:
# Wait for the HyperDrive run to finish with show_output=True so its log is
# streamed into the cell. The call is left as the bare last expression so the
# notebook displays its return value.
hyperdrive_run.wait_for_completion(show_output=True)

RunId: HD_432e2571-820c-48c0-b7af-94a7c3ecbdca
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_432e2571-820c-48c0-b7af-94a7c3ecbdca?wsid=/subscriptions/4910dccd-0348-46c4-a51f-d8c85e078b14/resourcegroups/aml-quickstarts-127790/workspaces/quick-starts-ws-127790

Streaming azureml-logs/hyperdrive.txt

"<START>[2020-11-25T00:32:22.507431][API][INFO]Experiment created<END>\n""<START>[2020-11-25T00:32:23.724311][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2020-11-25T00:32:24.123822][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2020-11-25T00:32:24.7725038Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>"<START>[2020-11-25T00:32:53.733804][GENERATOR][INFO]Max number of jobs '4' reached for experiment.<END>\n""<START>[2020-11-25T00:32:53.878319][GENERATOR][INFO]All jobs generated.<END>\n"<START>[2

{'runId': 'HD_432e2571-820c-48c0-b7af-94a7c3ecbdca',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-11-25T00:32:22.303065Z',
 'endTimeUtc': '2020-11-25T00:39:39.516929Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '54fcb7b9-5179-4274-a1f3-a3d3ae94a267',
  'score': '0.91442097596504',
  'best_child_run_id': 'HD_432e2571-820c-48c0-b7af-94a7c3ecbdca_0',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg127790.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_432e2571-820c-48c0-b7af-94a7c3ecbdca/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=SoVy1%2FVd3zyHs3thbgVASMdmZay%2BMVhScjSiTj%2FFJHM%3D&st=2020-11-25T00%3A29%3A46Z&se=2020-11-25T08%3A39%3A46Z&sp=r'}}

In [6]:
# Get the best child run by primary metric and print the arguments recorded
# in its run definition.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_details = best_run.get_details()
print(best_run_details['runDefinition']['arguments'])

['--C', '0.8621373201900345', '--max_iter', '150']


In [7]:
# List the file names recorded for the best HyperDrive run.
best_run_files = best_run.get_file_names()
print(best_run_files)

['azureml-logs/55_azureml-execution-tvmps_0365bb8fe31def0a262078dd2d9550777e65cd1ccdb73dccc19de1c54903e673_d.txt', 'azureml-logs/65_job_prep-tvmps_0365bb8fe31def0a262078dd2d9550777e65cd1ccdb73dccc19de1c54903e673_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_0365bb8fe31def0a262078dd2d9550777e65cd1ccdb73dccc19de1c54903e673_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/104_azureml.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'outputs/model.joblib']


----------------------------

--------------------------------------------------------------

In [8]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create a TabularDataset with TabularDatasetFactory from the bank-marketing
# training CSV hosted at the URL below.
bankmarketing_url = (
    "https://automlsamplenotebookdata.blob.core.windows.net/"
    "automl-sample-notebook-data/bankmarketing_train.csv"
)

dataset = TabularDatasetFactory.from_delimited_files(path=bankmarketing_url)

In [10]:
from train import clean_data

# Clean the dataset with clean_data (imported from train.py above).
# The result is unpacked into two values - presumably the feature table and
# the target column; confirm against train.py.
x, y = clean_data(dataset)

In [11]:
# split data into Train and Test Sets

from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as the test set; random_state is pinned to 0.
split_parts = train_test_split(x, y, test_size=0.25, random_state=0)
x_train, x_test, y_train, y_test = split_parts

In [12]:
# combine the training features and the label
import pandas as pd
def _with_label(features, target):
    """Return `features` with `target` appended as a column named "label"."""
    return pd.concat([features, target.to_frame(name="label")], axis=1)


train_df = _with_label(x_train, y_train)
validation_df = _with_label(x_test, y_test)

In [13]:
from azureml.core.dataset import Dataset

# Make sure the local staging folder exists; exist_ok=True makes this a no-op
# when the folder is already there, so the cell can be re-run safely.
os.makedirs('automl_data', exist_ok=True)

# Save the train and validation data to a csv to be uploaded to the datastore
train_df.to_csv("automl_data/train_data.csv", index=False)
validation_df.to_csv("automl_data/validation_data.csv", index=False)


# Upload the staging folder to the workspace's default datastore under
# 'bankmarketing', overwriting any files already at that path.
ds = ws.get_default_datastore()
ds.upload(src_dir='./automl_data', target_path='bankmarketing', overwrite=True, show_progress=True)

# Upload the training data as a tabular dataset for access during training on remote compute
train_data = Dataset.Tabular.from_delimited_files(path=ds.path('bankmarketing/train_data.csv'))
# Left disabled: the validation CSV is uploaded above, but no dataset is built from it here.
# If it is enabled, it must point at validation_data.csv rather than train_data.csv:
# validation_data = Dataset.Tabular.from_delimited_files(path=ds.path('bankmarketing/validation_data.csv'))

# Name of the target column added when the frames were assembled.
label = "label"

Uploading an estimated of 2 files
Uploading ./automl_data/train_data.csv
Uploaded ./automl_data/train_data.csv, 1 files out of an estimated total of 2
Uploading ./automl_data/validation_data.csv
Uploaded ./automl_data/validation_data.csv, 2 files out of an estimated total of 2
Uploaded 2 files


In [14]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.

# Collect the AutoML settings in one mapping, then unpack it into AutoMLConfig.
automl_settings = {
    "experiment_timeout_minutes": 30,
    "task": "classification",
    "primary_metric": "accuracy",
    "training_data": train_data,
    "label_column_name": "label",
    "compute_target": aml_compute,
    "n_cross_validations": 5,
}

automl_config = AutoMLConfig(**automl_settings)

In [15]:
# Submit the AutoML configuration to the experiment; show_output=True prints
# progress into the cell while the run executes.
automl_run = exp.submit(config=automl_config, show_output=True)

Running on remote.
Running on remote compute: cpu-cluster
Parent Run ID: AutoML_5ad19727-f273-489b-934f-b13ff0fcb247

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--

In [16]:
# Retrieve the AutoML output and unpack it into the best run and its fitted model.
# NOTE: this rebinds `best_run`, which earlier held the best HyperDrive child run.
automl_output = automl_run.get_output()
best_run, fitted_model = automl_output

# List the file names recorded for that run.
print(best_run.get_file_names())

['accuracy_table', 'automl_driver.py', 'azureml-logs/55_azureml-execution-tvmps_9e680de46dab4d5740a1a59390b02f73dfe8ecfd1d35cd541e722737cb1e2ad0_d.txt', 'azureml-logs/65_job_prep-tvmps_9e680de46dab4d5740a1a59390b02f73dfe8ecfd1d35cd541e722737cb1e2ad0_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_9e680de46dab4d5740a1a59390b02f73dfe8ecfd1d35cd541e722737cb1e2ad0_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'confusion_matrix', 'explanation/512bb6f6/classes.interpret.json', 'explanation/512bb6f6/expected_values.interpret.json', 'explanation/512bb6f6/features.interpret.json', 'explanation/512bb6f6/global_names/0.interpret.json', 'explanation/512bb6f6/global_rank/0.interpret.json', 'explanation/512bb6f6/global_values/0.interpret.json', 'explanation/512bb6f6/local_importance_values.interpret.json', 'explanation/512bb6f6/per_class_names/0.interpret.json', 'explanation/512bb6f6/per_class_rank/0.interpret.json', 'explanation/512bb6f6/per

In [17]:
from pprint import pprint
def print_model(model, prefix=""):
    """Print every step of a pipeline-like model, recursing into ensembles.

    For each entry in ``model.steps`` the step name is printed (preceded by
    ``prefix``). A step whose object has both ``estimators`` and ``weights``
    attributes is treated as an ensemble: its member names and weights are
    pretty-printed, then each member is printed recursively with
    ``"<member name> - "`` as the prefix. Any other step has the result of its
    ``get_params()`` pretty-printed.

    Args:
        model: Object exposing ``steps``, an iterable of indexable entries
            whose item 0 is the step name and item 1 is the step object.
        prefix: Text prepended to each step name at this level.
    """
    for step in model.steps:
        step_name, step_obj = step[0], step[1]
        print(prefix + step_name)

        is_ensemble = hasattr(step_obj, 'estimators') and hasattr(step_obj, 'weights')
        if not is_ensemble:
            # Plain step: show its parameters and move on.
            pprint(step_obj.get_params())
            print()
            continue

        # Ensemble step: summarise members and weights, then descend into each member.
        member_names = [member[0] for member in step_obj.estimators]
        pprint({'estimators': member_names, 'weights': step_obj.weights})
        print()
        for member in step_obj.estimators:
            print_model(member[1], member[0] + ' - ')
# Print the steps of the fitted AutoML model using print_model defined above.
print_model(fitted_model)

datatransformer
{'enable_dnn': None,
 'enable_feature_sweeping': None,
 'feature_sweeping_config': None,
 'feature_sweeping_timeout': None,
 'featurization_config': None,
 'force_text_dnn': None,
 'is_cross_validation': None,
 'is_onnx_compatible': None,
 'logger': None,
 'observer': None,
 'task': None,
 'working_dir': None}

prefittedsoftvotingclassifier
{'estimators': ['0', '1', '13', '9', '3', '12'],
 'weights': [0.5333333333333333,
             0.2,
             0.06666666666666667,
             0.06666666666666667,
             0.06666666666666667,
             0.06666666666666667]}

0 - maxabsscaler
{'copy': True}

0 - lightgbmclassifier
{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': 1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_l

In [18]:
# Delete the compute target used by the runs above now that they are done.
aml_compute.delete()