Run nvidia-smi directly on worker host to respect GPU isolation (#2415)
nelson-liu committed Jul 1, 2020
1 parent 4017f05 commit 0746961
Showing 1 changed file with 16 additions and 2 deletions.
18 changes: 16 additions & 2 deletions codalab/worker/main.py
@@ -10,6 +10,7 @@
 import signal
 import socket
 import stat
+import subprocess
 import sys
 import psutil

@@ -306,8 +307,21 @@ def parse_gpuset_args(arg):
         return set()
 
     try:
-        all_gpus = docker_utils.get_nvidia_devices()  # Dict[GPU index: GPU UUID]
-    except docker_utils.DockerException:
+        # We run nvidia-smi on the host directly, in order to respect
+        # environment variables like CUDA_VISIBLE_DEVICES or other restrictions
+        # that, for instance, might be placed by Slurm or a similar resource
+        # allocation system. Running nvidia-smi in Docker ignores these
+        # restrictions, which is why we don't simply use
+        # docker_utils.get_nvidia_devices().
+        nvidia_command = ['nvidia-smi', '--query-gpu=index,uuid', '--format=csv,noheader']
+        output = subprocess.run(
+            nvidia_command, stdout=subprocess.PIPE, check=True, universal_newlines=True
+        ).stdout
+        print(output.split('\n')[:-1])
+        all_gpus = {
+            gpu.split(',')[0].strip(): gpu.split(',')[1].strip() for gpu in output.split('\n')[:-1]
+        }
+    except (subprocess.CalledProcessError, FileNotFoundError):
         all_gpus = {}
 
     if arg == 'ALL':
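
For readers who want to see how the new host-side query behaves in isolation, here is a minimal sketch that mirrors the logic added above. It is not part of the commit: the helper name get_host_nvidia_devices and the sample outputs in the comments are illustrative assumptions.

import subprocess

def get_host_nvidia_devices():
    # Hypothetical standalone version of the logic added in this commit:
    # ask the host's nvidia-smi for "index, uuid" rows in headerless CSV and
    # parse them into a dict mapping GPU index -> GPU UUID. Because nvidia-smi
    # runs on the host (not inside Docker), it is subject to whatever GPU
    # restrictions the host environment imposes, per the commit's rationale.
    nvidia_command = ['nvidia-smi', '--query-gpu=index,uuid', '--format=csv,noheader']
    try:
        output = subprocess.run(
            nvidia_command, stdout=subprocess.PIPE, check=True, universal_newlines=True
        ).stdout
    except (subprocess.CalledProcessError, FileNotFoundError):
        # nvidia-smi is missing or failed: report no visible GPUs, matching
        # the fallback branch in the diff.
        return {}
    # Each output line looks like "0, GPU-xxxxxxxx-..." (illustrative UUID);
    # the trailing empty string from the final newline is dropped by [:-1].
    return {
        line.split(',')[0].strip(): line.split(',')[1].strip()
        for line in output.split('\n')[:-1]
    }

if __name__ == '__main__':
    # On a machine with one visible GPU this prints something like
    # {'0': 'GPU-xxxxxxxx-...'}; on a machine without nvidia-smi it prints {}.
    print(get_host_nvidia_devices())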
