In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.get(name="quick-starts-ws-125087")
exp = Experiment(workspace=ws, name="quick-starts-ws-125087")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-125087
Azure region: southcentralus
Subscription id: da775cb9-9ca6-4943-ad21-26dfa99526fc
Resource group: aml-quickstarts-125087


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

#Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

# Choose a name for your CPU cluster
cpu_cluster_name = "auto-ml"
# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True)

Found existing cluster, use it.

Running


In [4]:
import shutil
import os

script_folder = './training'
os.makedirs(script_folder, exist_ok=True)

# the training  logic is in the train.py file.
shutil.copy('./train.py', script_folder)


'./training/train.py'

In [4]:
from azureml.train.sklearn import SKLearn

# Create a SKLearn estimator for use with train.py

script_params = {
    '--C': 1.0,
    '--max_iter': 100,
}

est = SKLearn(source_directory=script_folder, 
                    script_params=script_params,
                    compute_target=compute_target,
                    entry_script='train.py'
                   )

In [5]:
run = exp.submit(est)



In [6]:
from azureml.widgets import RunDetails

RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [7]:
run.cancel()

In [13]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform


# Specify parameter sampler
ps = RandomParameterSampling( {
    "--C": choice([0.001,0.01,0.1,1.0,10.0,100.0,1000.0]),
    "--max_iter": choice([100,110,120,130,140])
    }
)

# Specify a Policy
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(estimator=est,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                     primary_metric_name='accuracy',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=20,
                                     max_concurrent_runs=4)

In [14]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

hyperdrive_run = exp.submit(config=hyperdrive_config)
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)
assert(hyperdrive_run.get_status() == "Completed")



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_5e2fec7a-1945-40ff-8ece-5374d730e305
Web View: https://ml.azure.com/experiments/quick-starts-ws-124858/runs/HD_5e2fec7a-1945-40ff-8ece-5374d730e305?wsid=/subscriptions/e8f628b3-bb5b-4edf-947a-8637ca6ea7c2/resourcegroups/aml-quickstarts-124858/workspaces/quick-starts-ws-124858

Streaming azureml-logs/hyperdrive.txt

"<START>[2020-11-06T13:07:30.474277][API][INFO]Experiment created<END>\n"<START>[2020-11-06T13:07:31.3948417Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>"<START>[2020-11-06T13:07:31.239117][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2020-11-06T13:07:31.604067][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_5e2fec7a-1945-40ff-8ece-5374d730e305
Web View: https://ml.azure.com/experiments/quick-starts-ws-124858/runs/HD_5e2fec7a-1945-40ff-8ece-5374d730e305?wsid=/subsc

In [15]:
#retreive best run
best_hdrive_run = hyperdrive_run.get_best_run_by_primary_metric()
best_hdrive_run_metrics = hyperdrive_run.get_metrics()
best_hdrive_run_children = hyperdrive_run.get_children_sorted_by_primary_metric()

In [19]:
print(best_hdrive_run_metrics)


{'HD_5e2fec7a-1945-40ff-8ece-5374d730e305_0': {'Regularization Strength:': 0.001, 'Max iterations:': 130, 'Accuracy': 0.910065756196257}, 'HD_5e2fec7a-1945-40ff-8ece-5374d730e305_1': {'Regularization Strength:': 10.0, 'Max iterations:': 100, 'Accuracy': 0.9096611026808296}, 'HD_5e2fec7a-1945-40ff-8ece-5374d730e305_10': {'Regularization Strength:': 1.0, 'Max iterations:': 110, 'Accuracy': 0.9096611026808296}, 'HD_5e2fec7a-1945-40ff-8ece-5374d730e305_11': {'Regularization Strength:': 10.0, 'Max iterations:': 130, 'Accuracy': 0.9096611026808296}, 'HD_5e2fec7a-1945-40ff-8ece-5374d730e305_12': {'Regularization Strength:': 0.001, 'Max iterations:': 100, 'Accuracy': 0.910065756196257}, 'HD_5e2fec7a-1945-40ff-8ece-5374d730e305_13': {'Regularization Strength:': 100.0, 'Max iterations:': 100, 'Accuracy': 0.9096611026808296}, 'HD_5e2fec7a-1945-40ff-8ece-5374d730e305_14': {'Regularization Strength:': 100.0, 'Max iterations:': 110, 'Accuracy': 0.9096611026808296}, 'HD_5e2fec7a-1945-40ff-8ece-5374d7

In [None]:
for x, y in best_hdrive_run_metrics.items():
    print("C:", y['Regularization Strength:'], "   ", "Accuracy:",  y['Accuracy'])

In [20]:
print(hyperdrive_run.get_details())


{'runId': 'HD_5e2fec7a-1945-40ff-8ece-5374d730e305', 'target': 'auto-ml', 'status': 'Completed', 'startTimeUtc': '2020-11-06T13:07:30.2798Z', 'endTimeUtc': '2020-11-06T13:18:27.235321Z', 'properties': {'primary_metric_config': '{"name": "accuracy", "goal": "maximize"}', 'resume_from': 'null', 'runTemplate': 'HyperDrive', 'azureml.runsource': 'hyperdrive', 'platform': 'AML', 'ContentSnapshotId': '3cd3eb8f-1217-4184-8f7a-6ac46cd52b4b'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg124858.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_5e2fec7a-1945-40ff-8ece-5374d730e305/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=Nnj9Cut9TaPmrykLdlMWKnZqU0ix3oCceGgFW%2FxicGE%3D&st=2020-11-06T13%3A08%3A46Z&se=2020-11-06T21%3A18%3A46Z&sp=r'}}


In [21]:
print(hyperdrive_run.get_file_names())

['azureml-logs/hyperdrive.txt']


In [22]:
import joblib

#save best run
joblib.dump(best_hdrive_run, 'hdrive_model.joblib')

['hdrive_model.joblib']

In [5]:
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core import Dataset

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
dataset = Dataset.Tabular.from_delimited_files(path=data_path)

In [6]:
from train import clean_data

# Use the clean_data function to clean your data.
x, y = clean_data(dataset)

In [22]:
import pandas as pd
train_data = pd.concat([x, y], axis=1, sort=False)

In [24]:
#upload the cleaned marketing data to the default datastore (blob) of my workspace.

#first convert data to .csv
train_data.to_csv('./training/train_data.csv',header=True)

#Then upload to datastore
datastore = ws.get_default_datastore()
datastore.upload_files(['training/train_data.csv'], target_path='training', overwrite=True)


Uploading an estimated of 1 files
Uploading training/train_data.csv
Uploaded training/train_data.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_7d7e0be680be4844a12213af0cbb9112

In [26]:
#convert back to tabular dataset for running in AutoML
train_data = Dataset.Tabular.from_delimited_files(path = [(datastore, 'training/train_data.csv')])
label = "y"

In [51]:
from azureml.train.automl import AutoMLConfig

automl_settings = {   
    "experiment_timeout_minutes": 30,
    "enable_early_stopping" : True,
    "iteration_timeout_minutes": 5,
    "max_concurrent_iterations": 4,
    "max_cores_per_iteration": -1,
    "n_cross_validations": 7,
    "primary_metric": 'accuracy',
    "featurization": 'auto',
    "verbosity": logging.INFO,
}

automl_config = AutoMLConfig(task = 'classification',
                             debug_log = 'automl_errors.log',
                             compute_target=compute_target,
                             training_data = train_data,
                             label_column_name = label,
                             **automl_settings
                            )

In [52]:
# Submit your automl run

automl_run = exp.submit(automl_config, show_output = False)

Running on remote.


In [32]:
# Retrieve and save your best automl model.

best_automl_run_metrics = automl_run.get_metrics()

In [33]:
print(best_automl_run_metrics)

{'experiment_status': ['DatasetEvaluation', 'FeaturesGeneration', 'DatasetFeaturization', 'DatasetFeaturizationCompleted', 'DatasetBalancing', 'DatasetCrossValidationSplit', 'ModelSelection', 'BestRunExplainModel', 'ModelExplanationDataSetSetup', 'PickSurrogateModel', 'EngineeredFeatureExplanations', 'EngineeredFeatureExplanations', 'RawFeaturesExplanations', 'RawFeaturesExplanations', 'BestRunExplainModel'], 'experiment_status_description': ['Gathering dataset statistics.', 'Generating features for the dataset.', 'Beginning to fit featurizers and featurize the dataset.', 'Completed fit featurizers and featurizing the dataset.', 'Performing class balancing sweeping', 'Generating individually featurized CV splits.', 'Beginning model selection.', 'Best run model explanations started', 'Model explanations data setup completed', 'Choosing LightGBM as the surrogate model for explanations', 'Computation of engineered features started', 'Computation of engineered features completed', 'Computa

In [53]:
print("Best AutoML model Accuracy: ", best_automl_run_metrics['accuracy'])

Best AutoML model Accuracy:  0.916813353566009


In [44]:
best_run, fitted_model = automl_run.get_output()

In [50]:
#save best run
joblib.dump(fitted_model, 'automl_model.joblib')

['automl_model.joblib']

In [None]:
compute_target.delete()