In [42]:
import itertools
import sys, os

import azureml.core as aml
import numpy as np
from azureml.core import Environment, Experiment, ScriptRunConfig, Workspace
from azureml.core.runconfig import DockerConfiguration
from dotenv import load_dotenv

# Run python-dotenv's loader first so that the lookups below can see any
# variables it adds to the process environment.
load_dotenv()

# Azure identifiers used to locate the workspace; each is None when unset.
AZURE_SUBSCRIPTION_ID = os.environ.get("AZURE_SUBSCRIPTION_ID")
AZURE_RESOURCE_GROUP = os.environ.get("AZURE_RESOURCE_GROUP")
AZURE_WORKSPACE_NAME = os.environ.get("AZURE_WORKSPACE_NAME")


In [43]:
# --- Sweep definition: the submission loop iterates over these choice lists ---
experiment_name = "oc-p8-experiment-2"
model_choices = ["unet_xception"]
augment_choices = [False]
resize_choices = [64]

# --- Training job: script location and the name of the compute target ---
source_directory = "./train"
train_script = "train.py"
compute_target = "ocp8-cluster-gpu"


# Connect to the workspace identified by the Azure settings read from the
# environment.
workspace_settings = {
    "subscription_id": AZURE_SUBSCRIPTION_ID,
    "resource_group": AZURE_RESOURCE_GROUP,
    "workspace_name": AZURE_WORKSPACE_NAME,
}
ws = Workspace(**workspace_settings)


# ! The runtime loads cuDNN 8.0.x, so the image MUST use a matching stack: TensorFlow 2.4 on Python 3.6-3.8.
# Error seen otherwise :
#   Loaded runtime CuDNN library: 8.0.5 but source was compiled with: 8.1.0.  CuDNN library needs to have matching major version and equal or higher minor version. If using a binary install, upgrade your CuDNN library.  If building from sources, make sure the library loaded at runtime is compatible with the version specified during compile configuration.
#   OP_REQUIRES failed at conv_ops.cc:1120 : UNKNOWN: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
#   [stderr]Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above. [Op:Conv2D]
# Solution : pick versions from the tested build configurations at https://www.tensorflow.org/install/source#gpu
#   cuDNN 8.0 is only compatible with :
#   - TensorFlow : 2.4.0
#   - Python : 3.6-3.8
#   - CUDA : 11.0

# AzureML-tensorflow-2.4-ubuntu18.04-py37-cuda11-gpu
# AzureML-tensorflow-2.5-ubuntu20.04-py38-cuda11-gpu
# AzureML-tensorflow-2.6-ubuntu20.04-py38-cuda11-gpu
# AzureML-tensorflow-2.7-ubuntu20.04-py38-cuda11-gpu
# env = Environment.get(
#     workspace=ws,
#     name="AzureML-tensorflow-2.4-ubuntu18.04-py37-cuda11-gpu",
# )


# env = Environment.from_conda_specification(
#     name=experiment_name, file_path="train/conda_dependencies.yml"
# )
# env.docker.base_image = "tensorflow/tensorflow:2.4.3-gpu"
# env.python.user_managed_dependencies = True


# Reuse the environment already registered under the experiment name; if the
# lookup fails, define it from the local Dockerfile and register it.
try:
    env = Environment.get(workspace=ws, name=experiment_name)
except Exception:
    # `except Exception` (instead of a bare `except:`) lets KeyboardInterrupt
    # and SystemExit propagate, so the cell can still be interrupted.
    # NOTE(review): any lookup failure (not only "environment not found") still
    # lands here — narrow further if the SDK exposes a dedicated error type.
    env = Environment(name=experiment_name)
    # No base image: the image is described by the Dockerfile instead.
    env.docker.base_image = None
    env.docker.base_dockerfile = "./train/Dockerfile"
    # Presumably the Python packages are provided by the Dockerfile image
    # rather than resolved by Azure ML — confirm against ./train/Dockerfile.
    env.python.user_managed_dependencies = True
    env.register(workspace=ws)


# The experiment handle does not depend on the sweep values, so it is created
# once here instead of once per submitted run.
exp = Experiment(workspace=ws, name=experiment_name)

# Submit one training run per (model, augment, resize) combination.
# itertools.product yields the combinations in the same order as the
# equivalent nested `for` loops (last iterable varies fastest).
for model, augment, resize in itertools.product(
    model_choices, augment_choices, resize_choices
):
    run_config = ScriptRunConfig(
        source_directory=source_directory,
        script=train_script,
        # NOTE(review): `augment` is passed as a bool, so the script presumably
        # receives the text "True"/"False" — confirm train.py parses it that way
        # (argparse `type=bool` would treat any non-empty string as True).
        arguments=[
            "--experiment",
            experiment_name,
            "--resize",
            resize,
            "--augment",
            augment,
            "--model",
            model,
        ],
        compute_target=compute_target,
        environment=env,
        docker_runtime_config=DockerConfiguration(
            use_docker=True,
            # arguments=[
            #     "--privileged"  # required for mounting : https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.file_dataset.filedataset?view=azure-ml-py#azureml-data-file-dataset-filedataset-mount
            # ],
        ),
    )

    # submit the run configuration to start the job
    run = exp.submit(run_config)

    # Convert tag values to strings explicitly: the SDK stringifies them
    # anyway and logs "Converting non-string tag to string" for each one.
    run.tag("model", model)
    run.tag("augment", str(augment))
    run.tag("resize", str(resize))

    print(f"Submitted Run : {run.display_name}")
    print(f"Tags : {run.get_tags()}")


Converting non-string tag to string: (augment: False)
Converting non-string tag to string: (resize: 64)


Submitted Run : sweet_chaconia_nkf9m2g8
Tags : {'model': 'unet_xception', 'augment': 'False', 'resize': '64'}
