In [3]:
from azureml.core import Workspace, Experiment

# Connect to the workspace described by the local config and open the
# experiment that the runs in this notebook are submitted to.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project-automl")

# Summarise the workspace, one "label: value" entry per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

# Start an interactive logging run on the experiment.
# NOTE(review): no visible cell completes or uses `run` -- confirm it is still needed.
run = exp.start_logging()

Workspace name: mlwsp-morongo
Azure region: eastus
Subscription id: 6d280d78-8be4-469a-a32a-cef764370725
Resource group: rg_id01


In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Settings for the AmlCompute cluster that the remote runs below execute on:
# "Standard_D2_V2" VMs, scaling to at most 4 nodes.
cluster_name = "compute-cpu-cluster"
vm_size = "Standard_D2_V2"
max_nodes = 4

try:
    # Look the cluster up by name; a ComputeTargetException sends us to the create path.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print(f"Creating a new compute cluster: {cluster_name}")
    compute_config = AmlCompute.provisioning_configuration(vm_size=vm_size, max_nodes=max_nodes)
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
else:
    print(f"Found existing compute target and it will be used: {cluster_name}")

# Block until the provisioning operation has finished, streaming its progress.
compute_target.wait_for_completion(show_output=True)

# Print the cluster's detailed status (from get_status) in serialized form.
print(compute_target.get_status().serialize())

Creating a new compute cluster: compute-cpu-cluster
InProgress..
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2024-11-30T21:43:06.114000+00:00', 'errors': None, 'creationTime': '2024-11-30T21:42:57.490992+00:00', 'modifiedTime': '2024-11-30T21:43:07.690280+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT1800S'}, 'vmPriority': 'Dedicated', 'vmSize': 'Standard_D2_V2'}


In [43]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os

# Parameter space for the sweep: every child run receives one value for --C and
# one for --max_iter, sampled at random from the discrete choices below.
param_space = {
    '--C': choice(0.01, 0.1, 1.0, 10.0),  # Specific choices for C
    '--max_iter': choice(50, 100, 150)    # Specific choices for max iterations
}
ps = RandomParameterSampling(param_space)

# Early-termination policy applied to the child runs.
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

# Make sure ./training exists. exist_ok=True makes this safe to re-run and
# avoids the separate "list the directory, then create" check.
# NOTE(review): source_directory below is './', so this folder is not what gets
# uploaded -- confirm it is still needed.
os.makedirs("./training", exist_ok=True)

# Environment for the training run, built from the conda specification file.
sklearn_env = Environment.from_conda_specification(name='sklearn-env', file_path='conda_dependencies.yml')

# ScriptRunConfig: which script to run, from which directory, on which compute
# target, and with which environment.
src = ScriptRunConfig(
    source_directory='./',
    script='train.py',
    compute_target=compute_target,
    environment=sklearn_env        # Environment for dependencies
)

# HyperDriveConfig ties together the script configuration, the sampler and the
# early-termination policy, and names the metric the sweep maximizes.
hyperdrive_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=ps,
    policy=policy,                # Early stopping policy
    primary_metric_name='AUC_weighted',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,  # Goal: maximize the metric
    max_total_runs=12,
    max_concurrent_runs=4
)


In [44]:
# Launch the HyperDrive sweep on the experiment, tagged and named so it is easy
# to find later.
hyperdrive_run = exp.submit(
    hyperdrive_config,
    tags={"job_name": "hyperdrive-logisticregression"},
    display_name="Logistic Regression HyperDrive",
)

# Show the run-details widget for the sweep.
run_widget = RunDetails(hyperdrive_run)
run_widget.show()

# Block until the sweep finishes, streaming its output; the returned value is
# displayed as the cell result.
hyperdrive_run.wait_for_completion(show_output=True)


_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_3deeb79f-26f3-4898-b743-9136d7b49d93
Web View: https://ml.azure.com/runs/HD_3deeb79f-26f3-4898-b743-9136d7b49d93?wsid=/subscriptions/6d280d78-8be4-469a-a32a-cef764370725/resourcegroups/rg_id01/workspaces/mlwsp-morongo&tid=c9e7aaf6-b4dc-4abe-9589-821b04ec6915

Streaming azureml-logs/hyperdrive.txt

[2024-12-01T00:27:23.7621032Z][GENERATOR][DEBUG]Sampled 4 jobs from search space 
[2024-12-01T00:27:24.0256734Z][SCHEDULER][INFO]Scheduling job, id='HD_3deeb79f-26f3-4898-b743-9136d7b49d93_0' 
[2024-12-01T00:27:24.1179101Z][SCHEDULER][INFO]Scheduling job, id='HD_3deeb79f-26f3-4898-b743-9136d7b49d93_3' 
[2024-12-01T00:27:24.1190755Z][SCHEDULER][INFO]Scheduling job, id='HD_3deeb79f-26f3-4898-b743-9136d7b49d93_2' 
[2024-12-01T00:27:24.1201468Z][SCHEDULER][INFO]Scheduling job, id='HD_3deeb79f-26f3-4898-b743-9136d7b49d93_1' 
[2024-12-01T00:27:24.5444687Z][SCHEDULER][INFO]Successfully scheduled a job. Id='HD_3deeb79f-26f3-4898-b743-9136d7b49d93_2' 
[2024-12-01T00:27:24.5961787Z][SCHEDULER

{'runId': 'HD_3deeb79f-26f3-4898-b743-9136d7b49d93',
 'target': 'compute-cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2024-12-01T00:27:22.070492Z',
 'endTimeUtc': '2024-12-01T00:31:56.639481Z',
 'services': {},
 'properties': {'primary_metric_config': '{"name":"AUC_weighted","goal":"maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '4b840a51-9f85-4321-8f46-9dd540f557f7',
  'user_agent': 'python/3.10.11 (Linux-5.15.0-1073-azure-x86_64-with-glibc2.31) msrest/0.7.1 Hyperdrive.Service/1.0.0 Hyperdrive.SDK/core.1.57.0',
  'best_child_run_id': 'HD_3deeb79f-26f3-4898-b743-9136d7b49d93_9',
  'score': '0.9322341979071764',
  'best_metric_status': 'Succeeded',
  'best_data_container_id': 'dcid.HD_3deeb79f-26f3-4898-b743-9136d7b49d93_9'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'configuration': None,
  'attribution': None,
  'telemetryValues': {'amlClientType': 'a

In [45]:
import joblib
# Pick the HyperDrive child run with the best value of the primary metric.
best_run = hyperdrive_run.get_best_run_by_primary_metric()

# get_best_run_by_primary_metric() returns None when no child run logged the
# primary metric; fail with a clear message instead of an AttributeError on the
# calls below.
if best_run is None:
    raise RuntimeError(
        "HyperDrive found no child run that logged the primary metric; "
        "check that train.py logs the configured primary_metric_name."
    )

# NOTE(review): the model file of the best run is not downloaded or registered
# here -- TODO add once the output path written by train.py is confirmed.
print(best_run)
print("Best run metrics :",best_run.get_metrics())

Run(Experiment: udacity-project-automl,
Id: HD_3deeb79f-26f3-4898-b743-9136d7b49d93_9,
Type: azureml.scriptrun,
Status: Completed)
Best run metrics : {'Regularization Strength:': 1.0, 'Max iterations:': 150, 'Accuracy': 0.917298937784522, 'AUC_weighted': 0.9322341979071764}


In [5]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Bank-marketing training data (CSV) from the AutoML sample-notebook blob container.
bank_marketing_url = 'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'

# Build a TabularDataset from the delimited file at that URL.
ds = TabularDatasetFactory.from_delimited_files([bank_marketing_url])

# Load the dataset into a pandas DataFrame and preview its first rows.
df = ds.to_pandas_dataframe()
print(df.head())


{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe'}
{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe', 'activityApp': 'TabularDataset'}
   age          job  marital    education  default housing loan    contact  \
0   57   technician  married  high.school       no      no  yes   cellular   
1   55      unknown  married      unknown  unknown     yes   no  telephone   
2   33  blue-collar  married     basic.9y       no      no   no   cellular   
3   36       admin.  married  high.school       no      no   no  telephone   
4   27    housemaid  married  high.school       no     yes   no   cellular   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         1      failure         -1.8   
1   may         thu  ...         2    999         0  nonexistent          1.1   
2   may         fri  ...         1    999         1      failure         -1.8   
3   jun         fri  ...         4    99

In [6]:
from train import clean_data

# Run train.py's clean_data on the dataset and unpack its two results.
# NOTE(review): presumably x holds the features and y the labels -- confirm against train.py.
x, y = clean_data(ds)

{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe'}
{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe', 'activityApp': 'TabularDataset'}


In [11]:
from azureml.train.automl import AutoMLConfig

# AutoML configuration.
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.

# Optional switches, kept in one dict so they can be tuned in a single place.
automl_settings = dict(
    enable_early_stopping=True,
    featurization="auto",
    model_explainability=True,
    exclude_nan_labels=True,
    enable_onnx_compatible_models=True,
    enable_voting_ensemble=True,
)

# Classification on the tabular dataset with label column "y", scored by
# weighted AUC with 5-fold cross-validation, on the compute cluster.
automl_config = AutoMLConfig(
    task='classification',
    primary_metric='AUC_weighted',
    training_data=ds,
    label_column_name="y",
    n_cross_validations=5,
    experiment_timeout_minutes=30,
    compute_target=compute_target,
    **automl_settings,
)

In [12]:
# Submit the AutoML configuration to the experiment with show_output turned off,
# then wait for the run to complete; the value returned by
# wait_for_completion() is displayed as the cell output.
automl_run = exp.submit(automl_config, show_output = False)
automl_run.wait_for_completion()

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project-automl,AutoML_079d449a-e6fa-4299-9e4f-78000bd72e14,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


{'runId': 'AutoML_079d449a-e6fa-4299-9e4f-78000bd72e14',
 'target': 'compute-cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2024-11-30T21:49:35.160771Z',
 'endTimeUtc': '2024-11-30T22:28:45.573876Z',
 'services': {},
   'message': 'No scores improved over last 10 iterations, so experiment stopped early. This early stopping behavior can be disabled by setting enable_early_stopping = False in AutoMLConfig for notebook/python SDK runs.'}],
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'compute-cpu-cluster',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"5b1d5925-fac0-4cb0-8411-443fa2f9a37a\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': 

In [14]:
# Retrieve the best AutoML run together with its fitted model.
# NOTE(review): this cell does not persist the model; the ONNX export cell
# further down writes ./best_model.onnx.
best_run, fitted_model = automl_run.get_output()

In [15]:
# Display the current best_run (rendered as a run summary table).
best_run

Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project-automl,AutoML_079d449a-e6fa-4299-9e4f-78000bd72e14_30,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [27]:
# Read the model name recorded in the best run's properties and print it.
model_name = best_run.properties["model_name"]
print(model_name)

AutoML079d449ae30


In [16]:
# Display the fitted model via its rich notebook representation.
fitted_model

In [19]:
# Print the fitted model's plain-text representation (its pipeline steps).
print(fitted_model)

Pipeline(steps=[('datatransformer',
                 DataTransformer(enable_dnn=False, is_cross_validation=True, is_onnx_compatible=True, working_dir='/mnt/batch/tasks/shared/LS_root/mounts/clusters/etoledo1/code/Users/etoledo')),
                ('prefittedsoftvotingclassifier',
                 PreFittedSoftVotingClassifier(classification_labels=array([0, 1]), estimators=[('0', Pipeline(steps=[('maxabssc...=100, objective='reg:logistic', problem_info=ProblemInfo(gpu_training_param_dict={'processing_unit_type': 'cpu'}), reg_alpha=0.4166666666666667, reg_lambda=2.5, subsample=0.5, tree_method='hist'))]))], flatten_transform=False, weights=[0.3333333333333333, 0.13333333333333333, 0.26666666666666666, 0.06666666666666667, 0.06666666666666667, 0.13333333333333333]))])
Y_transformer(['LabelEncoder', LabelEncoder()])


In [23]:
# Display the metrics recorded on the parent AutoML run.
automl_run.get_metrics()

{'experiment_status': ['DatasetEvaluation',
  'FeaturesGeneration',
  'DatasetFeaturization',
  'DatasetFeaturizationCompleted',
  'DatasetBalancing',
  'DatasetCrossValidationSplit',
  'ModelSelection',
  'BestRunExplainModel',
  'ModelExplanationDataSetSetup',
  'PickSurrogateModel',
  'EngineeredFeatureExplanations',
  'EngineeredFeatureExplanations',
  'RawFeaturesExplanations',
  'RawFeaturesExplanations',
  'BestRunExplainModel'],
 'experiment_status_description': ['Gathering dataset statistics.',
  'Generating features for the dataset.',
  'Beginning to fit featurizers and featurize the dataset.',
  'Completed fit featurizers and featurizing the dataset.',
  'Performing class balancing sweeping',
  'Generating individually featurized CV splits.',
  'Beginning model selection.',
  'Best run model explanations started',
  'Model explanations data setup completed',
  'Choosing LightGBM as the surrogate model for explanations',
  'Computation of engineered features started',
  'Comp

In [24]:
# Display the details dictionary of the parent AutoML run (status, timings, properties).
automl_run.get_details()

{'runId': 'AutoML_079d449a-e6fa-4299-9e4f-78000bd72e14',
 'target': 'compute-cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2024-11-30T21:49:35.160771Z',
 'endTimeUtc': '2024-11-30T22:28:45.573876Z',
 'services': {},
   'message': 'No scores improved over last 10 iterations, so experiment stopped early. This early stopping behavior can be disabled by setting enable_early_stopping = False in AutoMLConfig for notebook/python SDK runs.'}],
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'compute-cpu-cluster',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"5b1d5925-fac0-4cb0-8411-443fa2f9a37a\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': 

In [25]:
import os

from azureml.automl.runtime.onnx_convert import OnnxConverter

# Ask for the best model in ONNX form (return_onnx_model=True) instead of the Python model.
best_run, onnx_mdl = automl_run.get_output(return_onnx_model=True)

# Save the ONNX model into the current working directory.
onnx_fl_path = "./best_model.onnx"
OnnxConverter.save_onnx_model(onnx_mdl, onnx_fl_path)

# Gather the names of the .onnx files present in the current directory.
onnx_files = []
for entry in os.listdir("."):
    if entry.endswith(".onnx"):
        onnx_files.append(entry)

# Print a header followed by one file name per line.
print("ONNX files in the current directory:")
for onnx_file in onnx_files:
    print(onnx_file)

ONNX files in the current directory:
best_model.onnx


In [26]:
# Fetch the best child of the AutoML run, then read and print the model name
# stored in that run's properties.
best_run = automl_run.get_best_child()
model_name = best_run.properties["model_name"]
print(model_name)

AutoML079d449ae30


In [49]:
# Cluster clean-up: delete the compute target used by the runs in this notebook.

compute_target.delete()