job_info.go
// Copyright 2018 The Kubeflow Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package commands

import (
    log "github.com/sirupsen/logrus"
    batchv1 "k8s.io/api/batch/v1"
    "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "time"
)

type JobInfo struct {
    job          batchv1.Job
    name         string
    pods         []v1.Pod // all the pods, including those of the statefulset and the job
    jobPod       v1.Pod   // the pod created by the job
    gpuCount     int64
    requestedGPU int64
    allocatedGPU int64
    trainerType  string // trainer type: MPI, STANDALONE, TENSORFLOW
}

// Name returns the name of the Job.
func (ji *JobInfo) Name() string {
    return ji.name
}

// Trainer returns the trainer type of the Job.
func (ji *JobInfo) Trainer() string {
    return ji.trainerType
}

// ChiefPod returns the chief Pod of the Job.
func (ji *JobInfo) ChiefPod() v1.Pod {
    return ji.jobPod
}

// AllPods returns all the pods of the training Job.
func (ji *JobInfo) AllPods() []v1.Pod {
    return ji.pods
}

// HostIPOfChief returns the hostIP of the chief Pod.
func (ji *JobInfo) HostIPOfChief() (hostIP string) {
    hostIP = "N/A"
    if ji.GetStatus() == "RUNNING" {
        hostIP = ji.jobPod.Status.HostIP
    }
    return hostIP
}

// RequestedGPU returns the requested GPU count of the Job.
func (ji *JobInfo) RequestedGPU() int64 {
    if ji.requestedGPU > 0 {
        return ji.requestedGPU
    }
    for _, pod := range ji.pods {
        ji.requestedGPU += gpuInPod(pod)
    }
    return ji.requestedGPU
}

// AllocatedGPU returns the allocated GPU count of the Job.
func (ji *JobInfo) AllocatedGPU() int64 {
    if ji.allocatedGPU > 0 {
        return ji.allocatedGPU
    }
    for _, pod := range ji.pods {
        ji.allocatedGPU += gpuInActivePod(pod)
    }
    return ji.allocatedGPU
}
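
// Note: gpuInPod and gpuInActivePod are helpers defined elsewhere in this
// package, not in this file. As a rough illustration of the counting logic,
// a minimal sketch follows, assuming GPUs are requested through the
// "nvidia.com/gpu" extended resource; the name gpuInPodSketch is hypothetical
// and the real helpers may differ.
func gpuInPodSketch(pod v1.Pod) (count int64) {
    for _, container := range pod.Spec.Containers {
        // Resources.Limits is a v1.ResourceList (map of resource name to quantity).
        if qty, ok := container.Resources.Limits["nvidia.com/gpu"]; ok {
            count += qty.Value()
        }
    }
    return count
}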

// Age returns how long the Job has existed since its start time.
func (ji *JobInfo) Age() time.Duration {
    job := ji.job
    if job.Status.StartTime == nil ||
        job.Status.StartTime.IsZero() {
        return 0
    }
    return metav1.Now().Sub(job.Status.StartTime.Time)
}

// Duration returns the training duration of the Job.
func (ji *JobInfo) Duration() time.Duration {
    job := ji.job
    if job.Status.StartTime == nil ||
        job.Status.StartTime.IsZero() {
        return 0
    }
    if job.Status.CompletionTime != nil {
        return job.Status.CompletionTime.Time.Sub(job.Status.StartTime.Time)
    }
    if ji.GetStatus() == "FAILED" {
        cond := getPodLatestCondition(ji.ChiefPod())
        if !cond.LastTransitionTime.IsZero() {
            return cond.LastTransitionTime.Time.Sub(job.Status.StartTime.Time)
        }
        log.Debugf("the latest condition's transition time of pod %s is zero", ji.ChiefPod().Name)
    }
    return metav1.Now().Sub(job.Status.StartTime.Time)
}
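
// Note: getPodLatestCondition is also defined elsewhere in this package. A
// minimal sketch of picking the pod condition with the most recent transition
// time is shown below; the name latestPodConditionSketch is hypothetical and
// the real helper may differ.
func latestPodConditionSketch(pod v1.Pod) v1.PodCondition {
    var latest v1.PodCondition
    for _, cond := range pod.Status.Conditions {
        if latest.LastTransitionTime.IsZero() ||
            cond.LastTransitionTime.After(latest.LastTransitionTime.Time) {
            latest = cond
        }
    }
    return latest
}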

// StartTime returns the start time of the Job.
func (ji *JobInfo) StartTime() *metav1.Time {
    return ji.job.Status.StartTime
}

// GetStatus returns the status of the Job: RUNNING, PENDING, SUCCEEDED or FAILED.
func (ji *JobInfo) GetStatus() (status string) {
    job := ji.job
    pod := ji.jobPod
    if job.Status.Active > 0 {
        status = "RUNNING"
    } else if job.Status.Succeeded > 0 {
        status = "SUCCEEDED"
    } else if job.Status.Failed > 0 {
        status = "FAILED"
    }
    if status == "RUNNING" {
        hostIP := pod.Status.HostIP
        if hostIP == "" {
            status = "PENDING"
        } else if pod.Status.Phase == v1.PodPending {
            status = "PENDING"
        }
    }
    return status
}

// Namespace returns the namespace of the Job.
func (ji *JobInfo) Namespace() string {
    return ji.job.Namespace
}
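
// A minimal usage sketch: rendering one job as a summary line, assuming a
// populated *JobInfo named ji. The function name printJobSummary is
// hypothetical and not part of the original file.
func printJobSummary(ji *JobInfo) {
    log.Infof("%s\t%s\t%s\t%d/%d GPU(s)\t%s",
        ji.Name(),
        ji.GetStatus(),
        ji.Trainer(),
        ji.AllocatedGPU(),
        ji.RequestedGPU(),
        ji.Age().Round(time.Second))
}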