In [112]:
import os 

import azureml.core
from azureml.core import Workspace, Dataset, Datastore, Experiment, Environment, ScriptRunConfig
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
#from azureml.core.runconfig import PyTorchConfiguration
from azureml.core.runconfig import PyTorchConfiguration, DockerConfiguration
from azureml.core.conda_dependencies import CondaDependencies

from azureml.data import OutputFileDatasetConfig
from azureml.telemetry import set_diagnostics_collection

from azureml.widgets import RunDetails

set_diagnostics_collection(send_diagnostics=True)

print("SDK version:", azureml.core.VERSION)

Turning diagnostics collection on. 
SDK version: 1.38.0


In [113]:
azureml._restclient.snapshots_client.SNAPSHOT_MAX_SIZE_BYTES = 6000000000

In [114]:
project_folder = '../dino-large-ori'
#os.makedirs(project_folder, exist_ok=True)

In [115]:
ws = Workspace.from_config()
datastore = ws.get_default_datastore()
dataset = Dataset.get_by_name(ws, name='imagenet_2015_premium_west_europe')


In [116]:
experiment_name = 'dino-A100'
experiment = Experiment(ws, name=experiment_name)

myenv = Environment(name = "myenv")
myenv.docker.enabled = True
dockerfile = r"""
FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.1-cudnn8-ubuntu18.04
RUN apt-get update && apt-get install -y libgl1-mesa-glx 
RUN echo "Hello from custom container!"
"""
myenv.docker.base_image = None
myenv.docker.base_dockerfile = dockerfile

!wget https://github.com/parasailteam/sccl-presynth


from azureml.core import Environment

#pytorch_env = Environment.from_conda_specification(name='AzureML-PyTorch-1.6-GPU',file_path='distributed-pytorch-with-distributeddataparallel.yml')
curated_env_name = 'AzureML-pytorch-1.7-ubuntu18.04-py37-cuda11-gpu'
pytorch_env = Environment.get(workspace=ws, name=curated_env_name)
pytorch_env.environment_variables = {"AZUREML_DOWNLOAD_CONCURRENCY":384} 

dino_env = pytorch_env.clone("dino_env")

#conda = CondaDependencies()

# # create environment
#dino_env.python.conda_dependencies = conda
docker_config = DockerConfiguration(use_docker=True, arguments = ['--ipc=host'], shm_size='256g')


curated_env_name = 'AzureML-pytorch-1.10-ubuntu18.04-py38-cuda11-gpu'
pytorch_env = Environment.get(workspace=ws, name=curated_env_name)
pytorch_env.environment_variables = {"AZUREML_DOWNLOAD_CONCURRENCY":384} 

dino_env = pytorch_env.clone("dino_env")

env = Environment("sea-dockerfile")
env.docker.base_image = "ptebic.azurecr.io/internal/azureml/aifx/stable-ubuntu2004-cu113-py38-torch1102:latest"
env.environment_variables = {"AZUREML_DOWNLOAD_CONCURRENCY":384, "NCCL_TOPO_FILE": "/var/run/nvidia-topologyd/A100/virtualTopology.xml", "NCCL_SOCKET_IFNAME": "eth0","NCCL_IB_GID_INDEX":3,"NCCL_IB_TIMEOUT":20} 
env.python.user_managed_dependencies = True

docker_config = DockerConfiguration(use_docker=True, arguments = ['--ipc=host'] , shm_size='256g')



In [117]:
env = Environment("sea-dockerfile")
env.docker.base_image = "ptebic.azurecr.io/test/public/azureml/aifx/stable-ubuntu2004-cu113-py38-torch1110:20220328.v1"
env.environment_variables = {"AZUREML_DOWNLOAD_CONCURRENCY":384, "NCCL_TOPO_FILE": "/var/run/nvidia-topologyd/A100/virtualTopology.xml"} 

env.python.user_managed_dependencies = True

docker_config = DockerConfiguration(use_docker=True, arguments = ['--ipc=host'])




In [118]:
patch_size = 16
batch_size_per_gpu = 20
epochs = 3
node_count = 4
process_count = 8 * node_count
communication_backend = 'NCCL'
optimizer = 'adamw'

In [119]:
import uuid

unique_string = str(uuid.uuid1())

In [120]:
output_folder = 'Output_node3'+str(node_count)+'_'+'ViTL_gpus'+str(process_count)+'_bacthsize'+str(batch_size_per_gpu)+'_epochs'+str(epochs)+'_uuid_'+unique_string

output= OutputFileDatasetConfig(destination=(datastore, output_folder))

In [121]:
output_folder

'Output_node34_ViTL_gpus32_bacthsize20_epochs3_uuid_6eeb5f9c-c6b3-11ec-a34a-b7bf3accdb58'

In [122]:
# create distributed config
distr_config = PyTorchConfiguration(communication_backend=communication_backend,process_count=process_count, node_count=node_count)
# create args
args = ["--arch", "vit_large",
        "--data_path", dataset.as_mount(), 
        "--patch_size", patch_size,
        "--norm_last_layer",True, 
        "--warmup_teacher_temp", 0.04, 
        "--teacher_temp", 0.04, 
        "--use_fp16", False, 
        "--weight_decay", 0.04,
        "--weight_decay_end", 0.08, 
        "--clip_grad", 0, 
        "--batch_size_per_gpu", batch_size_per_gpu, 
        "--epochs", epochs, 
        "--freeze_last_layer", 3, 
        "--lr", 0.0005, 
        "--warmup_epochs", 1, 
        "--warmup_teacher_temp_epochs",1,
        "--min_lr", 0.0001, 
        "--local_crops_number", 10, 
        "--seed", 0, 
        "--num_workers", 10,
        "--optimizer", optimizer, 
        "--momentum_teacher", 0.996,
        "--use_bn_in_head", False, 
        "--out_dim", 65536,
        "--drop_path_rate", 0.3,
        "--global_crops_scale", 0.25 , 1.0,
        "--local_crops_scale",0.05, 0.25,
        "--saveckp_freq",10, 
        "--output_dir", output.as_mount()]

        #ref for 2 nodes from facebook git:{"arch": "vit_small", "patch_size": 16, "out_dim": 65536,
        # "norm_last_layer": false, "warmup_teacher_temp": 0.04, "teacher_temp": 0.07,
        # "warmup_teacher_temp_epochs": 30, "use_fp16": false, "weight_decay": 0.04, "weight_decay_end": 0.4, 
        #"clip_grad": 0, "batch_size_per_gpu": 64, "epochs": 800, "freeze_last_layer": 1, "lr": 0.0005,
        # "warmup_epochs": 10, "min_lr": 1e-05, "global_crops_scale": [0.25, 1.0],
        # "local_crops_scale": [0.05, 0.25], "local_crops_number": 10, "seed": 0, "num_workers": 10,
        # "world_size": 16, "ngpus": 8, "nodes": 2, "optimizer": "adamw", "momentum_teacher": 0.996, "use_bn_in_head": false, "drop_path_rate": 0.1}

        ## ref one node: command: ["/bin/sh", "-c", "cd /code; \ python -m torch.distributed.launch --nproc_per_node=8 main_dino.py \ --data_path /dataset/imagenet-raw/train --output_dir ./exp_dino/ \ --freeze_last_layer 1
        ## --lr 0.0005 --weight_decay 0.04 --batch_size_per_gpu 64 \ --drop_path_rate 0.1 --epochs 100 --warmup_epochs 10 \ 
        ##--out_dim 65536 --norm_last_layer false --use_bn_in_head false \ --teacher_temp 0.04 --warmup_teacher_temp 0.04
        ## --warmup_teacher_temp_epochs 0 \ --use_fp16 false --clip_grad 3.0 \ --momentum_teacher 0.996
        ## \ --global_crops_scale 0.25 1 --local_crops_scale 0.05 0.25 \ --local_crops_number 6 "]

print(args)

['--arch', 'vit_large', '--data_path', <azureml.data.dataset_consumption_config.DatasetConsumptionConfig object at 0x7f60f7bf5df0>, '--patch_size', 16, '--norm_last_layer', True, '--warmup_teacher_temp', 0.04, '--teacher_temp', 0.04, '--use_fp16', False, '--weight_decay', 0.04, '--weight_decay_end', 0.08, '--clip_grad', 0, '--batch_size_per_gpu', 20, '--epochs', 3, '--freeze_last_layer', 3, '--lr', 0.0005, '--warmup_epochs', 1, '--warmup_teacher_temp_epochs', 1, '--min_lr', 0.0001, '--local_crops_number', 10, '--seed', 0, '--num_workers', 10, '--optimizer', 'adamw', '--momentum_teacher', 0.996, '--use_bn_in_head', False, '--out_dim', 65536, '--drop_path_rate', 0.3, '--global_crops_scale', 0.25, 1.0, '--local_crops_scale', 0.05, 0.25, '--saveckp_freq', 10, '--output_dir', <azureml.data.output_dataset_config.OutputFileDatasetConfig object at 0x7f60f7b96d30>]


In [123]:
src = ScriptRunConfig(source_directory=project_folder,                    
                      script='main_dino.py',
                       arguments=args,
                       compute_target='A100-2',
                       environment=env ,
                       distributed_job_config=distr_config,
                       docker_runtime_config=docker_config
                    )


In [124]:
run = experiment.submit(src)

Submitting /mnt/batch/tasks/shared/LS_root/mounts/clusters/frdong2/code/Users/frdong/dino-large-ori directory for run. The size of the directory >= 25 MB, so it can take a few minutes.


In [125]:

run.tag("author","frdong")
run.tag("storage" , "premium")
run.tag("envoirnment" ,'ENV with Adamw Optimizer' )
run.tag("dataset", "download")
run.tag("batch_size_per_gpu" , str(batch_size_per_gpu))
run.tag("patch_size" , str(patch_size))
run.tag("epochs" , str(epochs))
run.tag("communication_backend" , str(communication_backend))
run.tag("gpus" , str(process_count))
run.tag("nodes" , str(node_count))
run.tag("comments","4 nodes 2 epochs, 20 per gpus with fused-adam")


In [126]:
print(run)
#RunDetails(run).show()

Run(Experiment: dino-A100,
Id: dino-A100_1651123450_1a0f76af,
Type: azureml.scriptrun,
Status: Queued)
