Skip to content

Commit

Permalink
chore: dispatcher RM supports slot type ROCM (determined-ai#329)
Browse files Browse the repository at this point in the history
* chore: dispatcher RM supports slot type ROCM
  • Loading branch information
phillip-gaisford authored and eecsliu committed Sep 2, 2022
1 parent cdfdaba commit b8a088a
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 15 deletions.
38 changes: 23 additions & 15 deletions master/pkg/tasks/dispatcher_task.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ import (
)

const (
trueValue = "true"
falseValue = "false"
trueValue = "true"
// dispatcherEntrypointScriptResource is the script to handle container initialization
// before transferring to the defined entrypoint script.
dispatcherEntrypointScriptResource = "dispatcher-wrapper.sh"
Expand Down Expand Up @@ -130,19 +129,8 @@ func (t *TaskSpec) ToDispatcherManifest(
workDir = "/var/tmp"
}

enableNvidia := falseValue
if slotType == device.CUDA {
enableNvidia = trueValue
}

launchParameters.SetConfiguration(map[string]string{
"workingDir": workDir,
"enableNvidia": enableNvidia, // triggers 'singularity run --nv ...'
"enableWritableTmpFs": trueValue, // Make container filesystem writable (for links in /)
})
if slurmPartition != "" {
launchParameters.GetConfiguration()["partition"] = slurmPartition
}
launchConfig := t.computeLaunchConfig(slotType, workDir, slurmPartition)
launchParameters.SetConfiguration(*launchConfig)

// Determined generates tar archives including initialization, garbage collection,
// and security configuration and then maps them into generic containers when
Expand Down Expand Up @@ -237,6 +225,26 @@ func getAllArchives(t *TaskSpec) *[]cproto.RunArchive {
return &allArchives
}

// computeLaunchConfig builds the launch configuration map for the Slurm job
// manifest.
//
// The result always carries "workingDir" (set to workDir) and
// "enableWritableTmpFs". "partition" is added only when slurmPartition is
// non-empty. "enableNvidia" is added only for CUDA slots and "enableROCM"
// only for ROCM slots; any other slot type adds neither key.
//
// A pointer to the freshly built map is returned.
func (t *TaskSpec) computeLaunchConfig(
	slotType device.Type,
	workDir string,
	slurmPartition string,
) *map[string]string {
	cfg := make(map[string]string)
	cfg["workingDir"] = workDir
	// Make container filesystem writable (for links in /).
	cfg["enableWritableTmpFs"] = trueValue

	// Accelerator flags are absent (not "false") unless the slot type asks for them.
	switch slotType {
	case device.CUDA:
		// Triggers 'singularity run --nv ...'.
		cfg["enableNvidia"] = trueValue
	case device.ROCM:
		// NOTE(review): presumably the ROCm counterpart of enableNvidia on the
		// dispatcher side — confirm against the launcher's handling of this key.
		cfg["enableROCM"] = trueValue
	}

	if slurmPartition != "" {
		cfg["partition"] = slurmPartition
	}
	return &cfg
}

// Return true if the archive specified should be treated
// as per-process and not a shared volume for all processes.
// Unless configured in this list, all items are shared. It
Expand Down
76 changes: 76 additions & 0 deletions master/pkg/tasks/dispatcher_task_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
package tasks

import (
"reflect"
"testing"

"github.com/determined-ai/determined/master/pkg/device"
)

const workDir = "/workdir"

// TestTaskSpec_computeLaunchConfig is a table-driven test of
// TaskSpec.computeLaunchConfig. Each case supplies a slot type, working
// directory and Slurm partition, and states the exact map expected back;
// results are compared with reflect.DeepEqual.
func TestTaskSpec_computeLaunchConfig(t *testing.T) {
	cases := []struct {
		name           string
		slotType       device.Type
		workDir        string
		slurmPartition string
		want           *map[string]string
	}{
		{
			name:           "Dispatcher is notified that CUDA support required",
			slotType:       device.CUDA,
			workDir:        workDir,
			slurmPartition: "partitionName",
			want: &map[string]string{
				"workingDir":          workDir,
				"enableNvidia":        trueValue,
				"enableWritableTmpFs": trueValue,
				"partition":           "partitionName",
			},
		},
		{
			name:           "Dispatcher is notified that ROCM support required",
			slotType:       device.ROCM,
			workDir:        workDir,
			slurmPartition: "partitionName",
			want: &map[string]string{
				"workingDir":          workDir,
				"enableROCM":          trueValue,
				"enableWritableTmpFs": trueValue,
				"partition":           "partitionName",
			},
		},
		{
			// An empty partition must leave the "partition" key out entirely.
			name:           "Verify behavior when no partition specified",
			slotType:       device.CUDA,
			workDir:        workDir,
			slurmPartition: "",
			want: &map[string]string{
				"workingDir":          workDir,
				"enableNvidia":        trueValue,
				"enableWritableTmpFs": trueValue,
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			spec := &TaskSpec{}
			got := spec.computeLaunchConfig(tc.slotType, tc.workDir, tc.slurmPartition)
			// DeepEqual follows the pointers, so the pointed-to maps are compared.
			if reflect.DeepEqual(got, tc.want) {
				return
			}
			t.Errorf("TaskSpec.computeLaunchConfig() = %v, want %v", got, tc.want)
		})
	}
}

0 comments on commit b8a088a

Please sign in to comment.