In [1]:
import os 

import azureml.core
from azureml.core import Workspace, Dataset, Datastore, Experiment, Environment, ScriptRunConfig
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
#from azureml.core.runconfig import PyTorchConfiguration
from azureml.core.runconfig import PyTorchConfiguration, DockerConfiguration
from azureml.core.conda_dependencies import CondaDependencies

from azureml.data import OutputFileDatasetConfig
from azureml.telemetry import set_diagnostics_collection

from azureml.widgets import RunDetails

set_diagnostics_collection(send_diagnostics=True)

print("SDK version:", azureml.core.VERSION)

Turning diagnostics collection on. 
SDK version: 1.37.0


In [2]:
project_folder = './dino'
os.makedirs(project_folder, exist_ok=True)

In [3]:
ws = Workspace.from_config()
datastore = ws.get_default_datastore()
dataset = Dataset.get_by_name(ws, name='imagenet_2015_premium_west_europe')


In [4]:
# choose a name for your cluster
cluster_name = 'A100-8x2'
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target.')
except ComputeTargetException:
    print('Cannot Find the compute cluster')

# use get_status() to get a detailed status for the current AmlCompute. 
print(compute_target.get_status().serialize())

Found existing compute target.
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2022-02-22T12:10:55.216000+00:00', 'errors': None, 'creationTime': '2022-02-16T07:05:48.277388+00:00', 'modifiedTime': '2022-02-22T03:10:23.859807+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 2, 'nodeIdleTimeBeforeScaleDown': 'PT60S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_ND96AMSR_A100_V4'}


In [5]:
experiment_name = 'exp-A100'
experiment = Experiment(ws, name=experiment_name)

myenv = Environment(name = "myenv")
myenv.docker.enabled = True
dockerfile = r"""
FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.1-cudnn8-ubuntu18.04
RUN apt-get update && apt-get install -y libgl1-mesa-glx 
RUN echo "Hello from custom container!"
"""
myenv.docker.base_image = None
myenv.docker.base_dockerfile = dockerfile

!wget https://github.com/parasailteam/sccl-presynth


In [6]:
#pytorch_env = Environment.get(ws, name='AzureML-PyTorch-1.6-GPU')
#pytorch_env = Environment.from_conda_specification(name='AzureML-PyTorch-1.6-GPU',file_path='distributed-pytorch-with-distributeddataparallel.yml')
# curated_env_name = 'AzureML-PyTorch-1.6-GPU'
curated_env_name = 'AzureML-pytorch-1.10-ubuntu18.04-py38-cuda11-gpu'
pytorch_env = Environment.get(workspace=ws, name=curated_env_name)
# curated_env_name = 'PTE-BIC'
#pytorch_env = Environment(name = "myenv")
pytorch_env.environment_variables = {"AZUREML_DOWNLOAD_CONCURRENCY":384} 

dino_env = pytorch_env.clone("dino_env")

conda = CondaDependencies()

# # add conda packages
#conda.add_conda_package('python=3.6')
#conda.add_conda_package('pytorch=1.7.1')
#conda.add_conda_package('torchvision=0.8.2')

# # add pip packages
# # conda.add_pip_package('submitit')
#conda.add_pip_package(git+https://github.com/microsoft/sccl.git)

# # add pip packages
conda.add_pip_package('timm')
# # create environment
# #env = Environment('curated_clone')
dino_env.python.conda_dependencies = conda
docker_config = DockerConfiguration(use_docker=True,shm_size='256g')




In [8]:
patch_size = 16
batch_size_per_gpu = 128
epochs = 10
node_count = 2
process_count = 16
communication_backend = 'NCCL'

In [9]:
output_folder = 'Output_node'+str(node_count)+'_'+'gpus'+str(process_count)+'_bacthsize'+str(batch_size_per_gpu)

output= OutputFileDatasetConfig(destination=(datastore, output_folder))

In [10]:
# create distributed config
distr_config = PyTorchConfiguration(communication_backend=communication_backend,process_count=process_count, node_count=node_count)
# create args
args = ["--arch", "vit_small",
        "--data_path", dataset.as_download(), 
        "--patch_size", patch_size,
        "--norm_last_layer",False, 
        "--warmup_teacher_temp", 0.04, 
        "--teacher_temp", 0.07, 
        "--use_fp16", False, 
        "--weight_decay", 0.04,
        "--weight_decay_end", 0.4, 
        "--clip_grad", 0, 
        "--batch_size_per_gpu", batch_size_per_gpu, 
        "--epochs", epochs, 
        "--freeze_last_layer", 1, 
        "--lr", 0.0005, 
        "--warmup_epochs", 2, 
        "--min_lr", 1e-05, 
        "--local_crops_number", 10, 
        "--seed", 0, 
        "--num_workers", 10,
        "--optimizer", 'adamw', 
        "--momentum_teacher", 0.996,
        "--use_bn_in_head", False, 
        "--drop_path_rate", 0.1,
        "--global_crops_scale", 0.25 , 1.0,
        "--local_crops_scale",0.05, 0.25,
        "--saveckp_freq",4, 
        "--output_dir", output.as_mount()]

print(args)

['--arch', 'vit_small', '--data_path', <azureml.data.dataset_consumption_config.DatasetConsumptionConfig object at 0x7fa9e16f7f70>, '--patch_size', 16, '--norm_last_layer', False, '--warmup_teacher_temp', 0.04, '--teacher_temp', 0.07, '--use_fp16', False, '--weight_decay', 0.04, '--weight_decay_end', 0.4, '--clip_grad', 0, '--batch_size_per_gpu', 128, '--epochs', 10, '--freeze_last_layer', 1, '--lr', 0.0005, '--warmup_epochs', 2, '--min_lr', 1e-05, '--local_crops_number', 10, '--seed', 0, '--num_workers', 10, '--optimizer', 'adamw', '--momentum_teacher', 0.996, '--use_bn_in_head', False, '--drop_path_rate', 0.1, '--global_crops_scale', 0.25, 1.0, '--local_crops_scale', 0.05, 0.25, '--saveckp_freq', 4, '--output_dir', <azureml.data.output_dataset_config.OutputFileDatasetConfig object at 0x7fa9d683e4f0>]


In [11]:
src = ScriptRunConfig(source_directory=project_folder,                    
                      script='main_dino.py',
                       arguments=args,
                       compute_target=compute_target,
                       environment=dino_env,
                       distributed_job_config=distr_config,
                       docker_runtime_config=docker_config)



run = experiment.submit(src)
run.add_properties({"author":"Afs", 
                    "storage" : "premium",
                    "envoirnment" : curated_env_name,
                    "dataset": "download"
                    })
print(run)
RunDetails(run).show()

In [12]:
run = experiment.submit(src)
run.tag("author","AFS")
run.tag("storage" , "premium")
run.tag("envoirnment" ,'dino_env' )
run.tag("dataset", "download")
run.tag("batch_size_per_gpu" , str(batch_size_per_gpu))
run.tag("patch_size" , str(patch_size))
run.tag("epochs" , str(epochs))
run.tag("ENV" , str(curated_env_name))
run.tag("communication_backend" , str(communication_backend))
run.tag("gpus" , str(process_count))
run.tag("nodes" , str(node_count))
print(run)
RunDetails(run).show()

Submitting /mnt/batch/tasks/shared/LS_root/mounts/clusters/aghasemi1/code/Users/aghasemi/dino directory for run. The size of the directory >= 25 MB, so it can take a few minutes.


Run(Experiment: exp-A100,
Id: exp-A100_1645535544_811a6c9e,
Type: azureml.scriptrun,
Status: Queued)


_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…