In [2]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-239278
Azure region: westeurope
Subscription id: 6971f5ac-8af1-446e-8034-05acea24681f
Resource group: aml-quickstarts-239278


In [3]:
from azureml.core.compute import ComputeTarget, AmlCompute

cluster_name = "udacity-project"

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
cluster = ComputeTarget.create(ws, cluster_name, config)

cluster

AmlCompute(workspace=Workspace.create(name='quick-starts-ws-239278', subscription_id='6971f5ac-8af1-446e-8034-05acea24681f', resource_group='aml-quickstarts-239278'), name=udacity-project, id=/subscriptions/6971f5ac-8af1-446e-8034-05acea24681f/resourceGroups/aml-quickstarts-239278/providers/Microsoft.MachineLearningServices/workspaces/quick-starts-ws-239278/computes/udacity-project, type=AmlCompute, provisioning_state=Creating, location=westeurope, tags={})

In [4]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os

# Specify parameter sampler
ps = RandomParameterSampling({
        'C': uniform(20., 100.),
        'max_iter': choice(50, 85, 90, 100, 110, 125, 200)
    }

)

# Specify a Policy
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1, delay_evaluation=5)

if "training" not in os.listdir():
    os.mkdir("./training")

# Setup environment for your training run
sklearn_env = Environment.from_conda_specification(name='sklearn-env', file_path='conda_dependencies.yml')

# Create a ScriptRunConfig Object to specify the configuration details of your training job
src = ScriptRunConfig(source_directory='.', script='train.py',
                      arguments=['--C', 1.0, '--max_iter', 100],
                      compute_target=cluster,
                      environment=sklearn_env)

# Create a HyperDriveConfig using the src object, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(run_config=src,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                     primary_metric_name='Accuracy',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=20,
                                     max_concurrent_runs=4
                    )

In [5]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

hyperdrive_run = exp.submit(hyperdrive_config)

In [6]:
hyperdrive_run.wait_for_completion(show_output=True)

RunId: HD_36110e35-93f8-406b-831c-aab87955ce82
Web View: https://ml.azure.com/runs/HD_36110e35-93f8-406b-831c-aab87955ce82?wsid=/subscriptions/6971f5ac-8af1-446e-8034-05acea24681f/resourcegroups/aml-quickstarts-239278/workspaces/quick-starts-ws-239278&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

Streaming azureml-logs/hyperdrive.txt

[2023-07-26T09:51:35.572498][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space
[2023-07-26T09:51:36.1698546Z][SCHEDULER][INFO]Scheduling job, id='HD_36110e35-93f8-406b-831c-aab87955ce82_0' 
[2023-07-26T09:51:36.2861062Z][SCHEDULER][INFO]Scheduling job, id='HD_36110e35-93f8-406b-831c-aab87955ce82_1' 
[2023-07-26T09:51:36.4581311Z][SCHEDULER][INFO]Successfully scheduled a job. Id='HD_36110e35-93f8-406b-831c-aab87955ce82_0' 
[2023-07-26T09:51:36.4601251Z][SCHEDULER][INFO]Scheduling job, id='HD_36110e35-93f8-406b-831c-aab87955ce82_2' 
[2023-07-26T09:51:36.5603755Z][SCHEDULER][INFO]Scheduling job, id='HD_36110e35-93f8-406b-831c-aab87955ce82_3

{'runId': 'HD_36110e35-93f8-406b-831c-aab87955ce82',
 'target': 'udacity-project',
 'status': 'Completed',
 'startTimeUtc': '2023-07-26T09:51:34.791405Z',
 'endTimeUtc': '2023-07-26T10:01:11.622648Z',
 'services': {},
 'properties': {'primary_metric_config': '{"name":"Accuracy","goal":"maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '03c7dca8-5ae0-4660-a893-19a9d7a41033',
  'user_agent': 'python/3.8.5 (Linux-5.15.0-1040-azure-x86_64-with-glibc2.10) msrest/0.7.1 Hyperdrive.Service/1.0.0 Hyperdrive.SDK/core.1.51.0',
  'space_size': 'infinite_space_size',
  'score': '0.907623209516873',
  'best_child_run_id': 'HD_36110e35-93f8-406b-831c-aab87955ce82_13',
  'best_metric_status': 'Succeeded',
  'best_data_container_id': 'dcid.HD_36110e35-93f8-406b-831c-aab87955ce82_13'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'configuration': None,
  'attribution': None,
  'telemetr

In [7]:
import joblib
# Get your best run and save the model from that run.

best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()['runDefinition']['arguments']

print(f'Best Run id: {best_run.id}')
print(f'Accuracy: {best_run_metrics["Accuracy"]}')
print(f'C: {parameter_values[5]}')
print(f'max_iter: {parameter_values[7]}')

model = best_run.register_model(model_name='hyperdrive_best_model', model_path='outputs/model.pkl')

# joblib.dump(model, 'hyperdrive_best_model.pkl') # I wanted to pickle the model, but it contains objects that cannot be serialized.

print(model)

Best Run id: HD_36110e35-93f8-406b-831c-aab87955ce82_13
Accuracy: 0.907623209516873
C: 89.06874378346083
max_iter: 90
Model(workspace=Workspace.create(name='quick-starts-ws-239278', subscription_id='6971f5ac-8af1-446e-8034-05acea24681f', resource_group='aml-quickstarts-239278'), name=hyperdrive_best_model, id=hyperdrive_best_model:11, version=11, tags={}, properties={})


In [8]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

ds = TabularDatasetFactory.from_delimited_files('https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv')

In [9]:
from train import clean_data
from sklearn.model_selection import train_test_split

# Use the clean_data function to clean your data.
x, y = clean_data(ds)

# Split data in train and test subsets
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=33)

In [10]:
import pandas as pd
from azureml.core import Dataset, Datastore

df_train = pd.concat([x_train, y_train], axis=1)
df_test = pd.concat([x_test, y_test], axis=1)

datastore = Datastore.get(ws, 'workspaceblobstore')
train_ds = Dataset.Tabular.register_pandas_dataframe(df_train, datastore, "train set", show_progress=True)
test_ds = Dataset.Tabular.register_pandas_dataframe(df_test, datastore, "test set", show_progress=True)

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/8f5bfc2b-4811-4bea-9739-29a31e1418d6/
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'emp.var.rate' -> 'emp_var_rate'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'cons.price.idx' -> 'cons_price_idx'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'cons.conf.idx' -> 'cons_conf_idx'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'nr.employed' -> 'nr_employed'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'job_admin.' -> 'job_admin_'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'education_basic.4y'

In [13]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.


automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    iterations=30,
    iteration_timeout_minutes=5,
    primary_metric='accuracy',
    training_data=train_ds,
    validation_data=test_ds,
    label_column_name='y',
    compute_target=cluster)

In [14]:
# Submit your automl run

automl_run = exp.submit(config=automl_config, show_output=True)

Submitting remote run.
No run_configuration provided, running on udacity-project with default configuration
Running on remote compute: udacity-project


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_624f3753-a867-4e71-9f79-eebad1b7c94b,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: ModelSelection. Beginning model selection.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+------------------------------+--------------------------------+--------------------------------------+
|Size of the smallest class    |Name/Label of the smallest class|Number of samples in th

In [15]:
# Retrieve and save your best automl model.
print(f'Best Run id: {automl_run.id}')

_, aml_model = automl_run.get_output()
print(aml_model)

joblib.dump(aml_model, 'outputs/automl_model.pkl')

Best Run id: AutoML_624f3753-a867-4e71-9f79-eebad1b7c94b
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=False, enable_feature_sweeping=True, feature_sweeping_config={}, feature_sweeping_timeout=86400, featurization_config=None, force_text_dnn=False, is_cross_validation=False, is_onnx_compatible=False, observer=None, task='classification', working_dir='/mnt/batch/tasks/shared/LS_root/moun...
                 PreFittedSoftVotingClassifier(classification_labels=array([0, 1]), estimators=[('24', Pipeline(memory=None, steps=[('standardscalerwrapper', StandardScalerWrapper(copy=True, with_mean=False, with_std=False)), ('xgboostclassifier', XGBoostClassifier(booster='gbtree', colsample_bytree=1, eta=0.05, gamma=0, max_depth=6, max_leaves=0, n_estimators=200, n_jobs=1, objective='reg:logistic', problem_info=ProblemInfo(gpu_training_param_dict={'processing_unit_type': 'cpu'}), random_state=0, reg_alpha=0.625, reg_lambda=0.8333333333333334, 

Package:azureml-automl-runtime, training version:1.52.0.post1, current version:1.51.0.post1
Package:azureml-core, training version:1.52.0, current version:1.51.0
Package:azureml-dataprep, training version:4.11.4, current version:4.10.8
Package:azureml-dataprep-rslex, training version:2.18.4, current version:2.17.12
Package:azureml-dataset-runtime, training version:1.52.0, current version:1.51.0
Package:azureml-defaults, training version:1.52.0, current version:1.51.0
Package:azureml-interpret, training version:1.52.0, current version:1.51.0
Package:azureml-mlflow, training version:1.52.0, current version:1.51.0
Package:azureml-pipeline-core, training version:1.52.0, current version:1.51.0
Package:azureml-responsibleai, training version:1.52.0, current version:1.51.0
Package:azureml-telemetry, training version:1.52.0, current version:1.51.0
Package:azureml-train-automl-client, training version:1.52.0, current version:1.51.0.post1
Package:azureml-train-automl-runtime, training version:1.

['outputs/automl_model.pkl']

In [17]:
aml_metrics = automl_run.get_metrics()
aml_metrics

{'experiment_status': ['DatasetEvaluation',
  'FeaturesGeneration',
  'DatasetFeaturization',
  'DatasetFeaturizationCompleted',
  'DatasetBalancing',
  'ModelSelection',
  'BestRunExplainModel',
  'ModelExplanationDataSetSetup',
  'PickSurrogateModel',
  'EngineeredFeatureExplanations',
  'EngineeredFeatureExplanations',
  'RawFeaturesExplanations',
  'RawFeaturesExplanations',
  'BestRunExplainModel'],
 'experiment_status_description': ['Gathering dataset statistics.',
  'Generating features for the dataset.',
  'Beginning to fit featurizers and featurize the dataset.',
  'Completed fit featurizers and featurizing the dataset.',
  'Performing class balancing sweeping',
  'Beginning model selection.',
  'Best run model explanations started',
  'Model explanations data setup completed',
  'Choosing LightGBM as the surrogate model for explanations',
  'Computation of engineered features started',
  'Computation of engineered features completed',
  'Computation of raw features started',


In [18]:
cluster.delete()

In [21]:
cluster

AmlCompute(workspace=Workspace.create(name='quick-starts-ws-239278', subscription_id='6971f5ac-8af1-446e-8034-05acea24681f', resource_group='aml-quickstarts-239278'), name=udacity-project, id=/subscriptions/6971f5ac-8af1-446e-8034-05acea24681f/resourceGroups/aml-quickstarts-239278/providers/Microsoft.MachineLearningServices/workspaces/quick-starts-ws-239278/computes/udacity-project, type=AmlCompute, provisioning_state=Deleting, location=westeurope, tags={})