In [1]:
import os 

import azureml.core
from azureml.core import Workspace, Dataset, Datastore, Experiment, Environment, ScriptRunConfig

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

from azureml.core.runconfig import PyTorchConfiguration, DockerConfiguration
from azureml.core.conda_dependencies import CondaDependencies

from azureml.data import OutputFileDatasetConfig
from azureml.telemetry import set_diagnostics_collection
from azureml.widgets import RunDetails

set_diagnostics_collection(send_diagnostics=True)

print("SDK version:", azureml.core.VERSION)

Turning diagnostics collection on. 
SDK version: 1.38.0


In [2]:
azureml._restclient.snapshots_client.SNAPSHOT_MAX_SIZE_BYTES = 2000000000

In [3]:
project_folder = '../dino'
ws = Workspace.from_config()
datastore = ws.get_default_datastore()
image_net_dataset = Dataset.get_by_name(ws, 'imagenet_2015_premium_west_full')

In [4]:
# choose a name for your cluster
cluster_name = 'A100-2'
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target.')
except ComputeTargetException:
    print('Cannot Find the compute cluster')

# use get_status() to get a detailed status for the current AmlCompute. 
print(compute_target.get_status().serialize())

Found existing compute target.
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2022-03-08T01:33:47.270000+00:00', 'errors': None, 'creationTime': '2022-03-06T10:48:17.444629+00:00', 'modifiedTime': '2022-03-08T01:13:47.335722+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 2, 'nodeIdleTimeBeforeScaleDown': 'PT30S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_ND96AMSR_A100_V4'}


In [13]:
# curated_env_name = 'AzureML-pytorch-1.10-ubuntu18.04-py38-cuda11-gpu'
# pytorch_env = Environment.get(workspace=ws, name=curated_env_name)
# pytorch_env.environment_variables = {"AZUREML_DOWNLOAD_CONCURRENCY":384} 

# dino_env = pytorch_env.clone("dino_env")

# conda = CondaDependencies()

# # # add pip packages
# conda.add_pip_package('timm')
# # # create environment
# dino_env.python.conda_dependencies = conda
# docker_config = DockerConfiguration(use_docker=True, arguments = ['--ipc=host'], shm_size='1.2T')




In [16]:
# curated_env_name = 'PTEBIC-pytorch-1.10-ubuntu18.04-py38-cuda11-gpu'
# pytorch_env = Environment.get(workspace=ws, name=curated_env_name)
# pytorch_env.environment_variables = {"AZUREML_DOWNLOAD_CONCURRENCY":384, "NCCL_DEBUG":"INFO", "NCCL_DEBUG_SUBSYS":"ALL", "NCCL_ALGO":"Tree,Collnet"} 

# dino_env = pytorch_env.clone("dino_env")
# conda = CondaDependencies()
# dino_env.python.conda_dependencies = conda

env = Environment("sea-dockerfile")
env.docker.base_image = "ptebic.azurecr.io/internal/azureml/aifx/stable-ubuntu2004-cu113-py38-torch1101:latest"
env.environment_variables = {"AZUREML_DOWNLOAD_CONCURRENCY":384, "NCCL_DEBUG":"INFO", "NCCL_DEBUG_SUBSYS":"ALL", "NCCL_TOPO_FILE": "/var/run/nvidia-topologyd/A100/virtualTopology.xml"} 
env.python.user_managed_dependencies = True

docker_config = DockerConfiguration(use_docker=True, arguments = ['--ipc=host'], shm_size='256g')



In [15]:
patch_size = 16
batch_size_per_gpu = 64
node_count = 1
process_count = 8
communication_backend = 'NCCL'

In [16]:
#datastore = ws.get_default_datastore()
#out_dataset = Dataset.get_by_name(ws,name='Output_node1_ViTs_gpus8_bacthsize64')


# Inference - Linear

In [17]:
from azureml.core import Experiment
experiment_name = 'exp_Inference_ViTS16'
Inference_experiment = Experiment(ws, name=experiment_name)
# create distributed config
distr_Inference_config = PyTorchConfiguration(node_count=node_count)
#launch_cmd = ["python -m torch.distributed.launch --nproc_per_node=8 eval_linear.py --pretrained_weights",'./checkpoints' , "--checkpoint_key teacher --data_path", image_net_dataset.as_download(),"--patch_size 16 --batch_size_per_gpu 64"]
launch_cmd = ["torchrun --nproc_per_node=8 eval_linear.py --pretrained_weights",'./checkpoints' , "--checkpoint_key teacher --data_path", image_net_dataset.as_download(),"--patch_size 16 --batch_size_per_gpu 64"]

src_config_linear = ScriptRunConfig(
  source_directory=project_folder,
  command=launch_cmd,
  compute_target=compute_target,
  environment=env,
  distributed_job_config=distr_Inference_config
)
runInferenceLinear = Inference_experiment.submit(src_config_linear)

Submitting /mnt/batch/tasks/shared/LS_root/mounts/clusters/ds3-4cores/code/Users/aghasemi/dino directory for run. The size of the directory >= 25 MB, so it can take a few minutes.


In [18]:
runInferenceLinear.tag("author","AFS")
runInferenceLinear.tag("storage" , "premium")
runInferenceLinear.tag("envoirnment" ,'dino_env' )
runInferenceLinear.tag("dataset", "download")
runInferenceLinear.tag("batch_size_per_gpu" , str(batch_size_per_gpu))
runInferenceLinear.tag("patch_size" , str(patch_size))
runInferenceLinear.tag("epochs" , "100")
runInferenceLinear.tag("ENV" , str(curated_env_name))
runInferenceLinear.tag("gpus" , str(process_count))
runInferenceLinear.tag("nodes" , str(node_count))


NameError: name 'curated_env_name' is not defined

In [None]:
RunDetails(runInferenceLinear).show()

# Visualise attention

In [None]:
# %run visualize_attention --pretrained_weights 'checkpoint.pth' --checkpoint_key 'teacher' --image_path 'img.png' --patch_size 16

In [None]:
# import matplotlib.pyplot as plt
# fig = plt.figure(figsize=(15, 5))
# fig.add_subplot(1, 1, 1)
# img = plt.imread('img.png')
# plt.imshow(img)
# fig.add_subplot(1, 2, 1)
# attn = plt.imread('attn-head0.png')
# plt.imshow(attn)
# plt.show()