In [2]:
# Connect to the Azure ML workspace.
# Handle to workspace
from azure.ai.ml import MLClient

# Authentication package
from azure.identity import DefaultAzureCredential

# Build the credential once and reuse it for the client below.
# Make sure to login on terminal with `az login` first.
credential = DefaultAzureCredential()

# Workspace coordinates
subscription_id = "7a985d8d-5197-4c86-88dd-92eaf6eb8332"
resource_group = "Deeptech_Ready_Program"
workspace = "fatiu-us-ml-workspace"

ml_client = MLClient(
    credential=credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace,
)

# Test connection.
# The loop variable is deliberately NOT called `workspace`: reusing that name
# would replace the workspace-name string above with a Workspace object and
# make this cell behave differently when re-run.
for ws in ml_client.workspaces.list():
    print(f"Workspace Name: {ws.name}, Location: {ws.location}")


Workspace Name: fatiu-us-ml-workspace, Location: eastus


In [4]:
# Reuse an existing compute cluster for training.

# Import compute entity
from azure.ai.ml.entities import AmlCompute

# My existing gpu cluster
gpu_compute_target = "fatiu-compute-cluster"

# Does compute target exist?
try:
    gpu_cluster = ml_client.compute.get(gpu_compute_target)
    # Braces (not parentheses) are required for f-string interpolation.
    print(
        f"You already have a cluster named {gpu_compute_target}"
    )

except Exception as exc:
    # This cell does not create a cluster, so report the lookup failure
    # honestly (with the underlying error) instead of claiming one is being created.
    print(
        f"Could not retrieve compute target {gpu_compute_target}: {exc}. "
        "Create the cluster (e.g. with AmlCompute) before submitting jobs."
    )

You already have a cluster named (gpu_compute_target)


In [6]:
# Use a curated environment; this name is passed as `environment=` when the
# command job is built.
# NOTE(review): the "@latest" suffix presumably selects the newest version of
# the curated environment — confirm against the Azure ML documentation.
curated_env_name = "AzureML-tensorflow-2.12-cuda11@latest"

In [8]:
# Location of the MNIST training data in an Azure storage account
# (files are in .ubyte.gz format). Used as the `uri_folder` input of the training job.
web_path = "wasbs://mldatacontainer@fatiustorageaccountus.blob.core.windows.net/MNIST"

In [10]:
# Define the command job that trains the model
from azure.ai.ml import Input, command
from azure.ai.ml.entities  import UserIdentityConfiguration

web_path = "wasbs://mldatacontainer@fatiustorageaccountus.blob.core.windows.net/MNIST"

# Inputs exposed to the training script; the numeric values are the defaults
# for a single (non-sweep) run.
training_inputs = {
    "data_folder": Input(type="uri_folder", path=web_path),
    "batch_size": 64,
    "first_layer_neurons": 256,
    "second_layer_neurons": 128,
    "learning_rate": 0.01,
}

# Training script invocation.
# The script preprocesses the data, splits it, trains the model and returns an output model.
# It builds a simple Deep Neural Network (DNN) with 2 hidden layers: the input layer has
# 28x28 = 784 neurons (one per image pixel) and the output layer has 10 neurons (labels 0 - 9).
training_command = "python deep_learning_training_script.py --data-folder ${{inputs.data_folder}} --batch-size ${{inputs.batch_size}} --first-layer-neurons ${{inputs.first_layer_neurons}} --second-layer-neurons ${{inputs.second_layer_neurons}} --learning-rate ${{inputs.learning_rate}}"

job = command(
    code="./",
    command=training_command,
    inputs=training_inputs,
    environment=curated_env_name,
    compute=gpu_compute_target,
    experiment_name="tf-dnn-image-classify",
    display_name="tensorflow-classify-mnist-digit-images-with-dnn",
)

In [12]:
# Submit the training command job to the workspace; the returned job object is
# the last expression of the cell, so the notebook displays its summary.
ml_client.jobs.create_or_update(job)

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
[32mUploading MNIST handwriting (0.04

Experiment,Name,Type,Status,Details Page
tf-dnn-image-classify,cyan_cherry_07qp56rt80,command,Starting,Link to Azure Machine Learning studio


In [14]:
# Hyperparameter tuning (sweep): declare the search space
from azure.ai.ml.sweep import Choice, LogUniform

# One distribution per job input that should be tuned.
search_space = {
    "batch_size": Choice(values=[32, 64, 128]),
    "first_layer_neurons": Choice(values=[16, 64, 128, 256, 512]),
    "second_layer_neurons": Choice(values=[16, 64, 256, 512]),
    "learning_rate": LogUniform(min_value=-6, max_value=-1),
}

# Calling the command job like a function replaces the fixed inputs
# with the distributions above.
job_for_sweep = job(**search_space)

In [16]:
# Sweep configuration
from azure.ai.ml.sweep import BanditPolicy

# Early-termination policy applied to the sweep's trials.
bandit_policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

sweep_job = job_for_sweep.sweep(
    compute=gpu_compute_target,
    sampling_algorithm="random",
    # Metric to optimise and the direction of optimisation
    primary_metric="validation_acc",
    goal="Maximize",
    # Trial budget
    max_total_trials=8,
    max_concurrent_trials=4,
    early_termination_policy=bandit_policy,
)

In [18]:
# Launch the sweep
returned_sweep_job = ml_client.create_or_update(sweep_job)
sweep_job_name = returned_sweep_job.name

# Follow the logs; this call blocks until the job has finished
ml_client.jobs.stream(sweep_job_name)

# Re-fetch the job so its status reflects the finished run
returned_sweep_job = ml_client.jobs.get(name=sweep_job_name)

RunId: helpful_plum_wny57srfs2
Web View: https://ml.azure.com/runs/helpful_plum_wny57srfs2?wsid=/subscriptions/7a985d8d-5197-4c86-88dd-92eaf6eb8332/resourcegroups/Deeptech_Ready_Program/workspaces/fatiu-us-ml-workspace

Streaming azureml-logs/hyperdrive.txt

[2025-07-01T21:26:34.8699532Z][GENERATOR][DEBUG]Sampled 4 jobs from search space 
[2025-07-01T21:26:35.2456118Z][SCHEDULER][INFO]Scheduling job, id='helpful_plum_wny57srfs2_0' 
[2025-07-01T21:26:35.2998316Z][SCHEDULER][INFO]Scheduling job, id='helpful_plum_wny57srfs2_1' 
[2025-07-01T21:26:35.3016479Z][SCHEDULER][INFO]Scheduling job, id='helpful_plum_wny57srfs2_2' 
[2025-07-01T21:26:35.3980151Z][SCHEDULER][INFO]Scheduling job, id='helpful_plum_wny57srfs2_3' 
[2025-07-01T21:26:36.3039144Z][SCHEDULER][INFO]Successfully scheduled a job. Id='helpful_plum_wny57srfs2_3' 
[2025-07-01T21:26:36.3892543Z][SCHEDULER][INFO]Successfully scheduled a job. Id='helpful_plum_wny57srfs2_1' 
[2025-07-01T21:26:36.5923471Z][SCHEDULER][INFO]Successfully s

In [20]:
# Find the best model of the sweep
from azure.ai.ml.entities import Model

# Reset first so that a `model` left over from an earlier run can never be
# registered by mistake when the current sweep has not completed.
model = None

if returned_sweep_job.status == "Completed":

    # Run id of the trial with the best result
    best_run = returned_sweep_job.properties["best_child_run_id"]

    # Build a Model entity pointing at that run's output artifacts
    model = Model(
        # the script stores the model as "model"
        path="azureml://jobs/{}/outputs/artifacts/paths/outputs/model/".format(
            best_run
        ),
        name="run-model-example",
        description="Model created from run.",
        type="custom_model",
    )

else:
    print(
        "Sweep job status: {}. Please wait until it completes".format(
            returned_sweep_job.status
        )
    )

In [22]:
# Register model
# Only register when the sweep finished successfully; otherwise `model` is
# either undefined or does not belong to this sweep.
if returned_sweep_job.status == "Completed":
    registered_model = ml_client.models.create_or_update(model=model)
else:
    registered_model = None
    print(
        "Sweep job status: {}. No model was registered.".format(
            returned_sweep_job.status
        )
    )