Skip to content

Commit

Permalink
fix: owner pod and slave pod must be in the same namespace, in issue pokerface…
Browse files Browse the repository at this point in the history
…Sad#19 (comment)

* add environment variable `GPU_POOL_NAMESPACE` (no default value; this env var must be set) to specify the namespace in which the worker creates slave pods
  • Loading branch information
cool9203 committed Feb 21, 2022
1 parent 163ef7b commit 5ca4e5c
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 16 deletions.
4 changes: 3 additions & 1 deletion deploy/gpu-mounter-workers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ spec:
- name: CGROUP_DRIVER
value: "cgroupfs"
# value: "systemd"
- name: GPU_POOL_NAMESPACE
value: "default"
volumeMounts:
- name: cgroup
mountPath: /sys/fs/cgroup
Expand All @@ -49,4 +51,4 @@ spec:
- name: log-dir
hostPath:
type: DirectoryOrCreate
path: /etc/GPUMounter/log
path: /etc/GPUMounter/log
3 changes: 2 additions & 1 deletion pkg/server/gpu-mount/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
. "GPUMounter/pkg/util/log"
"context"
"errors"
"os"

k8s_error "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -84,7 +85,7 @@ func (gpuMountImpl GPUMountImpl) AddGPU(_ context.Context, request *gpu_mount.Ad
Logger.Error("Mount GPU: " + targetGPU.String() + " to Pod: " + request.PodName + " in Namespace: " + request.Namespace + " failed")
Logger.Error(err)
for _, freeGPU := range gpuResources {
err = clientset.CoreV1().Pods(gpu.GPUPoolNamespace).Delete(context.TODO(), freeGPU.PodName, *metav1.NewDeleteOptions(0))
err = clientset.CoreV1().Pods(os.Getenv("GPU_POOL_NAMESPACE")).Delete(context.TODO(), freeGPU.PodName, *metav1.NewDeleteOptions(0))
if err != nil {
Logger.Error("Failed to release GPU: ", freeGPU.String())
}
Expand Down
21 changes: 11 additions & 10 deletions pkg/util/gpu/allocator/allocator.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"fmt"
"strconv"
"strings"
"os"

corev1 "k8s.io/api/core/v1"
k8s_errors "k8s.io/apimachinery/pkg/api/errors"
Expand Down Expand Up @@ -64,30 +65,30 @@ func (gpuAllocator *GPUAllocator) GetAvailableGPU(ownerPod *corev1.Pod, totalGpu
switch <-ch {
case gpu.InsufficientGPU:
for _, slavePodName := range slavePodNames {
err = clientset.CoreV1().Pods(gpu.GPUPoolNamespace).Delete(context.TODO(), slavePodName, *metav1.NewDeleteOptions(0))
err = clientset.CoreV1().Pods(os.Getenv("GPU_POOL_NAMESPACE")).Delete(context.TODO(), slavePodName, *metav1.NewDeleteOptions(0))
if err != nil {
Logger.Error(err)
Logger.Error("Failed to recycle slave pod: ", slavePodName, " Namespace: ", gpu.GPUPoolNamespace)
Logger.Error("Failed to recycle slave pod: ", slavePodName, " Namespace: ", os.Getenv("GPU_POOL_NAMESPACE"))
}
}
return nil, errors.New(gpu.InsufficientGPU)
case gpu.FailedCreated:
for _, slavePodName := range slavePodNames {
err = clientset.CoreV1().Pods(gpu.GPUPoolNamespace).Delete(context.TODO(), slavePodName, *metav1.NewDeleteOptions(0))
err = clientset.CoreV1().Pods(os.Getenv("GPU_POOL_NAMESPACE")).Delete(context.TODO(), slavePodName, *metav1.NewDeleteOptions(0))
if err != nil {
Logger.Error(err)
Logger.Error("Failed to recycle slave pod: ", slavePodName, " Namespace: ", gpu.GPUPoolNamespace)
Logger.Error("Failed to recycle slave pod: ", slavePodName, " Namespace: ", os.Getenv("GPU_POOL_NAMESPACE"))
}
}
return nil, errors.New(gpu.FailedCreated)
case gpu.SuccessfullyCreated:
Logger.Infof("Successfully create Slave Pod: %s, for Owner Pod: %s ", strings.Join(slavePodNames, ", "), ownerPod.Name)
var availableGPUResource []*device.NvidiaGPU
for _, slavePodName := range slavePodNames {
gpuResources, err := gpuAllocator.GetPodGPUResources(slavePodName, gpu.GPUPoolNamespace)
gpuResources, err := gpuAllocator.GetPodGPUResources(slavePodName, os.Getenv("GPU_POOL_NAMESPACE"))
if err != nil {
Logger.Error(err)
Logger.Error("Failed to get gpu resource for Slave Pod: ", slavePodName, " in Namespace: ", gpu.GPUPoolNamespace)
Logger.Error("Failed to get gpu resource for Slave Pod: ", slavePodName, " in Namespace: ", os.Getenv("GPU_POOL_NAMESPACE"))
return nil, errors.New(gpu.FailedCreated)
}
availableGPUResource = append(availableGPUResource, gpuResources...)
Expand Down Expand Up @@ -133,7 +134,7 @@ func (gpuAllocator *GPUAllocator) DeleteSlavePods(slavePodNames []string) error
return err
}
for _, slavePodName := range slavePodNames {
err = clientset.CoreV1().Pods(gpu.GPUPoolNamespace).Delete(context.TODO(), slavePodName, metav1.DeleteOptions{})
err = clientset.CoreV1().Pods(os.Getenv("GPU_POOL_NAMESPACE")).Delete(context.TODO(), slavePodName, metav1.DeleteOptions{})
if err != nil {
Logger.Error("Failed to delete Slave Pod: ", slavePodName)
return err
Expand Down Expand Up @@ -195,7 +196,7 @@ func newGPUSlavePod(ownerPod *corev1.Pod, gpuNum int) *corev1.Pod {
return &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: ownerPod.Name + "-slave-pod-" + randID,
Namespace: gpu.GPUPoolNamespace,
Namespace: os.Getenv("GPU_POOL_NAMESPACE"),
Labels: map[string]string{
"app": "gpu-pool",
},
Expand Down Expand Up @@ -246,7 +247,7 @@ func checkCreateState(podNames []string, ch chan string) {
for {
flag := true
for _, slavePodName := range podNames {
pod, err := clientset.CoreV1().Pods(gpu.GPUPoolNamespace).Get(context.TODO(), slavePodName, metav1.GetOptions{})
pod, err := clientset.CoreV1().Pods(os.Getenv("GPU_POOL_NAMESPACE")).Get(context.TODO(), slavePodName, metav1.GetOptions{})
if err != nil {
if k8s_errors.IsNotFound(err) {
Logger.Info("Not Found....")
Expand Down Expand Up @@ -295,7 +296,7 @@ func checkDeleteState(podNames []string, ch chan string) {
for {
flag := true
for _, slavePodName := range podNames {
_, err := clientset.CoreV1().Pods(gpu.GPUPoolNamespace).Get(context.TODO(), slavePodName, metav1.GetOptions{})
_, err := clientset.CoreV1().Pods(os.Getenv("GPU_POOL_NAMESPACE")).Get(context.TODO(), slavePodName, metav1.GetOptions{})
if err != nil {
if k8s_errors.IsNotFound(err) {
// this slavePod has been deleted
Expand Down
2 changes: 1 addition & 1 deletion pkg/util/gpu/collector/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ func (gpuCollector *GPUCollector) GetPodGPUResources(podName string, namespace s
var gpuResources []*device.NvidiaGPU
for _, gpuDev := range gpuCollector.GPUList {
if (gpuDev.PodName == podName && gpuDev.Namespace == namespace) ||
(strings.Contains(gpuDev.PodName, podName+"-slave-pod-") && gpuDev.Namespace == gpu.GPUPoolNamespace) {
(strings.Contains(gpuDev.PodName, podName+"-slave-pod-") && gpuDev.Namespace == os.Getenv("GPU_POOL_NAMESPACE")) {
gpuResources = append(gpuResources, gpuDev)
}
}
Expand Down
4 changes: 1 addition & 3 deletions pkg/util/gpu/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ const (
FailedCreated = "FailedCreated"
SuccessfullyDeleted = "SuccessfullyDeleted"
FailedDeleted = "FailedDeleted"

GPUPoolNamespace = "gpu-pool"
)

type MountType string
Expand All @@ -25,4 +23,4 @@ const (
SingleMount MountType = "single-mount"
NoMount MountType = "no-mount"
UnknownMount MountType = "unknown-mount"
)
)

0 comments on commit 5ca4e5c

Please sign in to comment.