Merge pull request #1599 from grondo/cuda_visible_devices
wreck: set CUDA_VISIBLE_DEVICES when gpus are in R_lite
garlick committed Jul 25, 2018
2 parents 55d47d5 + 6eda6db commit c91ac60
Showing 5 changed files with 113 additions and 3 deletions.
1 change: 1 addition & 0 deletions src/bindings/lua/wreck.lua
@@ -38,6 +38,7 @@ local lwj_options = {
['no-pmi-server'] = "Do not start simple-pmi server",
['trace-pmi-server'] = "Log simple-pmi server protocol exchange",
['cpu-affinity'] = "Set default cpu-affinity to assigned cores",
['gpubind'] = "Control CUDA_VISIBLE_DEVICES [=per-task,off]",
['mpi'] = "Set hint for type of MPI, e.g. -o mpi=spectrum "
}
7 changes: 6 additions & 1 deletion src/cmd/flux-wreckrun
@@ -91,12 +91,17 @@ local function alloc_tasks (f, wreck, lwj)
end
end
for i, ntasks in ipairs (counts) do
local gpus_per_task = tonumber (wreck.opts.g or 0)
local rank = i - 1
local corelist = "0"
local gpulist = nil
if gpus_per_task > 0 then
gpulist = "0-".. (ntasks * gpus_per_task) - 1
end
if ntasks > 1 then
corelist = corelist .. "-" .. ntasks - 1
end
-table.insert (Rlite, { rank = rank, children = { core = corelist } })
+table.insert (Rlite, { rank = rank, children = { core = corelist, gpu = gpulist } })
if not r[ntasks] then r[ntasks] = {} end
table.insert (r[ntasks], rank)
end
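
For illustration only (not part of the commit), a minimal sketch of the R_lite entry the loop above builds for one rank, assuming a request of two tasks with two GPUs per task (example values):

-- Illustrative sketch: the kind of R_lite entry alloc_tasks produces.
local ntasks, gpus_per_task = 2, 2                    -- assumed example values
local corelist = "0-" .. (ntasks - 1)                 -- "0-1"
local gpulist = "0-" .. (ntasks * gpus_per_task - 1)  -- "0-3"
local entry = { rank = 0, children = { core = corelist, gpu = gpulist } }
print (string.format ("rank=%d core=%s gpu=%s",
                      entry.rank, entry.children.core, entry.children.gpu))
-- rank=0 core=0-1 gpu=0-3

The cuda_devices.lua plugin below reads the gpu idset back out of R_lite and turns it into CUDA_VISIBLE_DEVICES.
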
5 changes: 3 additions & 2 deletions src/modules/wreck/Makefile.am
@@ -84,8 +84,9 @@ dist_wreckscripts_SCRIPTS = \
lua.d/mvapich.lua \
lua.d/pmi-mapping.lua \
lua.d/intel_mpi.lua \
-lua.d/openmpi.lua \
-lua.d/spectrum.lua
+lua.d/openmpi.lua \
+lua.d/spectrum.lua \
+lua.d/cuda_devices.lua

# XXX: Hack below to force rebuild of unbuilt wrexecd dependencies
#
75 changes: 75 additions & 0 deletions src/modules/wreck/lua.d/cuda_devices.lua
@@ -0,0 +1,75 @@
local gpubind = wreck:getopt ("gpubind")
if gpubind == "no" or gpubind == "off" then
    return
end

-- Set CUDA_VISIBLE_DEVICES for all tasks on any rank with one or
-- more "gpu" resources

local gpuinfo = {}
function gpuinfo_create (wreck, gpus)
    local g = {}
    -- Use affinity.cpuset as a convenience to parse the GPU list, which
    -- is in nodeset form (e.g. "0-1" or "0,2-5", etc.)
    --
    local gset, err = require 'flux.affinity'.cpuset.new (gpus)
    if not gset then
        wreck:log_error ("Unable to parse GPU list [%s]: %s", gpus, err)
        return nil
    end
    local g = {
        gpuids = gset:expand (),
        ngpus = gset:count (),
        ntasks = wreck.tasks_per_node [wreck.nodeid]
    }

    -- If per-task binding is requested, ensure ngpus is evenly divisible
    -- into ntasks:
    if gpubind == "per-task" and g.ngpus % g.ntasks == 0 then
        g.ngpus_per_task = g.ngpus/g.ntasks
    end
    return g
end

function rexecd_init ()
    -- NB: Lua arrays are indexed starting at 1, so this rank's index
    -- into R_lite rank array is nodeid + 1:
    --
    local index = wreck.nodeid + 1

    -- Grab local resources structure from kvs for this nodeid:
    --
    local Rlocal = wreck.kvsdir.R_lite[index].children

    -- If a gpu resource list is set for this rank, then expand it and
    -- set CUDA_VISIBLE_DEVICES to the result:
    --
    local gpus = Rlocal.gpu
    if not gpus then return end

    gpuinfo = gpuinfo_create (wreck, gpus)
    -- If ngpus_per_task is not set, then set CUDA_VISIBLE_DEVICES the same
    -- for all tasks:
    if not gpuinfo.ngpus_per_task then
        local ids = table.concat (gpuinfo.gpuids, ",")
        wreck.environ ["CUDA_VISIBLE_DEVICES"] = ids
    end
    -- Always set CUDA_DEVICE_ORDER=PCI_BUS_ID to ensure CUDA ids match
    -- IDs known to flux scheduler.
    wreck.environ ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
end

function rexecd_task_init ()
    -- If ngpus_per_task is set, then select that many GPUs from the gpuids
    -- list assigned to this rank for the current task:
    if not gpuinfo.ngpus_per_task then return end

    local basis = gpuinfo.ngpus_per_task * wreck.taskid
    local t = {}
    for i = 1,gpuinfo.ngpus_per_task do
        table.insert (t, gpuinfo.gpuids [basis + i])
    end
    wreck.environ ["CUDA_VISIBLE_DEVICES"] = table.concat (t, ",")
end

-- vi: ts=4 sw=4 expandtab
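
For illustration only (not part of the commit), a standalone sketch of the arithmetic rexecd_task_init applies when -o gpubind=per-task is in effect, with the rank's gpu idset already expanded to a plain list (the plugin does this with flux.affinity.cpuset); the GPU ids and task count below are assumed example values:

-- Illustrative sketch: how gpubind=per-task slices a rank's GPU list.
-- Assume this rank was assigned gpu idset "0-3" and runs 2 tasks.
local gpuids = { 0, 1, 2, 3 }            -- what cpuset.new ("0-3"):expand () yields
local ntasks = 2
local ngpus_per_task = #gpuids / ntasks  -- in the plugin, set only when evenly divisible

for taskid = 0, ntasks - 1 do
    local basis = ngpus_per_task * taskid
    local t = {}
    for i = 1, ngpus_per_task do
        table.insert (t, gpuids[basis + i])
    end
    print (string.format ("task %d: CUDA_VISIBLE_DEVICES=%s",
                          taskid, table.concat (t, ",")))
end
-- task 0: CUDA_VISIBLE_DEVICES=0,1
-- task 1: CUDA_VISIBLE_DEVICES=2,3

This matches the per-task expectations in the t2000-wreck.t changes below; without per-task binding, every task on the rank instead sees the full list "0,1,2,3".
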
28 changes: 28 additions & 0 deletions t/t2000-wreck.t
@@ -265,6 +265,34 @@ test_expect_success MULTICORE 'wreckrun: local cpu-affinity option overriedes gl
test_cmp no-affinity.expected no-affinity.out
'

test_expect_success 'wreckrun: CUDA_VISIBLE_DEVICES is set for gpus' '
output=$(flux wreckrun -n1 -g 1 printenv CUDA_VISIBLE_DEVICES) &&
test_debug "echo CUDA_VISIBLE_DEVICES = $output" &&
test "$output" = "0" &&
output=$(flux wreckrun -n1 -g 2 printenv CUDA_VISIBLE_DEVICES) &&
test_debug "echo CUDA_VISIBLE_DEVICES = $output" &&
test "$output" = "0,1"
'
test_expect_success 'wreckrun: CUDA_VISIBLE_DEVICES not set with gpubind=off' '
output=$(flux wreckrun -n1 -g1 -o gpubind=off printenv CUDA_VISIBLE_DEVICES ||:) &&
test_debug "echo CUDA_VISIBLE_DEVICES=$output" &&
test "$output" = ""
'
test_expect_success 'wreckrun: -o gpubind=per-task works' '
flux wreckrun -l -n2 -N1 -g2 printenv CUDA_VISIBLE_DEVICES |sort >cuda_visible.out1 &&
cat <<-EOF >cuda_visible.expected1 &&
0: 0,1,2,3
1: 0,1,2,3
EOF
test_cmp cuda_visible.expected1 cuda_visible.out1 &&
flux wreckrun -l -n2 -N1 -g2 -o gpubind=per-task \
printenv CUDA_VISIBLE_DEVICES |sort >cuda_visible.out2 &&
cat <<-EOF >cuda_visible.expected2 &&
0: 0,1
1: 2,3
EOF
test_cmp cuda_visible.expected2 cuda_visible.out2
'
test_expect_success 'wreckrun: top level environment' '
flux kvs put --json lwj.environ="{ \"TEST_ENV_VAR\": \"foo\" }" &&
run_timeout 5 flux wreckrun -n2 printenv TEST_ENV_VAR > output_top_env &&
