-
Notifications
You must be signed in to change notification settings - Fork 348
/
task.go
400 lines (350 loc) · 12.6 KB
/
task.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
package model
import (
"fmt"
"strconv"
"strings"
"time"
"github.com/google/uuid"
"github.com/uptrace/bun"
"google.golang.org/protobuf/types/known/timestamppb"
"github.com/determined-ai/determined/proto/pkg/apiv1"
"github.com/determined-ai/determined/proto/pkg/logv1"
"github.com/determined-ai/determined/proto/pkg/taskv1"
)
// TaskID is the unique ID of a task among all tasks. It is a plain string
// type; NewTaskID builds one from the string form of a UUID.
type TaskID string
// NewTaskID creates a new random task ID. The ID is the canonical string
// form of a freshly generated UUID.
func NewTaskID() TaskID {
	id := uuid.New()
	return TaskID(id.String())
}
// TaskType is the type of a task. The known values are the TaskType*
// constants declared in this file.
type TaskType string
// String returns the task ID as a plain string.
func (id TaskID) String() string {
	return string(id)
}
// Known task types. Each constant's value is the upper-case name of the type.
//
// NOTE(review): the comments below name two different Postgres enums
// (public.job_type and enum.task_type); confirm which one these values belong to.
const (
	// TaskTypeTrial is the "TRIAL" job type for the enum public.job_type in Postgres.
	TaskTypeTrial TaskType = "TRIAL"
	// TaskTypeNotebook is the "NOTEBOOK" job type for the enum public.job_type in Postgres.
	TaskTypeNotebook TaskType = "NOTEBOOK"
	// TaskTypeShell is the "SHELL" job type for the enum public.job_type in Postgres.
	TaskTypeShell TaskType = "SHELL"
	// TaskTypeCommand is the "COMMAND" job type for the enum public.job_type in Postgres.
	TaskTypeCommand TaskType = "COMMAND"
	// TaskTypeTensorboard is the "TENSORBOARD" task type for the enum.task_type in Postgres.
	TaskTypeTensorboard TaskType = "TENSORBOARD"
	// TaskTypeCheckpointGC is the "CHECKPOINT_GC" job type for the enum public.job_type in Postgres.
	TaskTypeCheckpointGC TaskType = "CHECKPOINT_GC"
)
// TaskLogVersion is the version for our log-storing scheme. Changing the storage design
// would require either a really costly migration or a versioning scheme; we pick the latter,
// so each task records the version its logs were written with.
type TaskLogVersion int32
// Task log versions. To avoid a migration that in some cases would be extremely
// costly, we record the log version so that we can just read old logs
// the old way and do the new however we please.
const (
	// TaskLogVersion0 is log storage scheme version 0.
	TaskLogVersion0 TaskLogVersion = 0
	// TaskLogVersion1 is log storage scheme version 1.
	TaskLogVersion1 TaskLogVersion = 1
	// CurrentTaskLogVersion describes the current scheme in which we store task
	// logs; it is currently TaskLogVersion1.
	CurrentTaskLogVersion = TaskLogVersion1
)
// Task is the model for a task in the database; bun maps it to the "tasks" table.
type Task struct {
	bun.BaseModel `bun:"table:tasks"`
	// TaskID identifies the task and is the primary key.
	TaskID TaskID `db:"task_id" bun:"task_id,pk"`
	// JobID is a pointer and so may be nil — presumably for tasks without a job; confirm.
	JobID *JobID `db:"job_id"`
	// TaskType is the kind of task (see the TaskType* constants).
	TaskType TaskType `db:"task_type"`
	// StartTime is the task's start time.
	StartTime time.Time `db:"start_time"`
	// EndTime is a pointer and so may be nil — presumably while the task has not ended; confirm.
	EndTime *time.Time `db:"end_time"`
	// LogVersion indicates how the logs were stored.
	LogVersion TaskLogVersion `db:"log_version"`
	// Relations.
	// Job is the job this task belongs to, joined by bun on job_id.
	Job *Job `bun:"rel:belongs-to,join:job_id=job_id"`
}
// AllocationID is the ID of an allocation of a task. It is usually of the form
// TaskID.allocation_number, maybe with some other metadata if different types of
// allocations run. ToTaskID relies on this form: it treats everything before the
// last "." as the task ID.
type AllocationID string
// String returns the allocation ID as a plain string.
func (a AllocationID) String() string {
	return string(a)
}
// ToTaskID converts an AllocationID to its taskID by dropping the last
// "."-separated component (usually the allocation number).
//
// An ID that contains no "." has nothing to strip and is returned unchanged
// as a TaskID; previously this case panicked with an out-of-range slice.
func (a AllocationID) ToTaskID() TaskID {
	idx := strings.LastIndex(string(a), ".")
	if idx < 0 {
		// No separator: the whole ID is the best available task ID.
		return TaskID(a)
	}
	return TaskID(a[:idx])
}
// Allocation is the model for an allocation in the database; bun maps it to the
// "allocations" table. Pointer fields may be nil.
type Allocation struct {
	bun.BaseModel `bun:"table:allocations"`
	// AllocationID identifies the allocation and is the primary key.
	AllocationID AllocationID `db:"allocation_id" bun:"allocation_id,pk"`
	// TaskID is the ID of the task this allocation belongs to.
	TaskID TaskID `db:"task_id" bun:"task_id,notnull"`
	// Slots is the slot count of the allocation.
	Slots int `db:"slots" bun:"slots,notnull"`
	// ResourcePool is the name of the resource pool.
	ResourcePool string `db:"resource_pool" bun:"resource_pool,notnull"`
	// StartTime may be nil — presumably until the allocation starts; confirm.
	StartTime *time.Time `db:"start_time" bun:"start_time"`
	// EndTime may be nil — presumably until the allocation ends; confirm.
	EndTime *time.Time `db:"end_time" bun:"end_time"`
	// State is the allocation's state (see the AllocationState* constants); may be nil.
	State *AllocationState `db:"state" bun:"state"`
	// IsReady is the readiness flag; may be nil.
	IsReady *bool `db:"is_ready" bun:"is_ready"`
	// Ports maps string keys to port numbers — presumably port names; confirm.
	Ports map[string]int `db:"ports" bun:"ports,notnull"`
}
// AllocationState represents the current state of the task. Value indicates a partial ordering.
// NOTE(review): the string values themselves carry no order; the relative progress of
// states is defined by MostProgressedAllocationState.
type AllocationState string
// TaskStats is the model for task stats in the database. Each value describes one
// event of a given type for an allocation, with optional start and end times.
type TaskStats struct {
	// AllocationID is the allocation the stat belongs to.
	AllocationID AllocationID
	// EventType names the kind of event being recorded.
	EventType string
	// StartTime may be nil when no start time is recorded.
	StartTime *time.Time
	// EndTime may be nil when no end time is recorded.
	EndTime *time.Time
}
// ResourceAggregates is the model for resource_aggregates in the database.
type ResourceAggregates struct {
	// Date is the date of the aggregate; may be nil.
	Date *time.Time
	// AggregationType names the kind of aggregation.
	AggregationType string
	// AggregationKey is the key within AggregationType — presumably the value grouped on; confirm.
	AggregationKey string
	// Seconds is the aggregated duration in seconds.
	Seconds float32
}
// Allocation states. Their relative progress order is defined by
// MostProgressedAllocationState, not by the declaration order here.
const (
	// AllocationStatePending state denotes that the command is awaiting allocation.
	AllocationStatePending AllocationState = "PENDING"
	// AllocationStateWaiting state denotes that the command is waiting on data.
	AllocationStateWaiting AllocationState = "WAITING"
	// AllocationStateAssigned state denotes that the command has been assigned to an agent but has
	// not started yet.
	AllocationStateAssigned AllocationState = "ASSIGNED"
	// AllocationStatePulling state denotes that the command's base image is being pulled from the
	// Docker registry.
	AllocationStatePulling AllocationState = "PULLING"
	// AllocationStateStarting state denotes that the image has been pulled and the task is being
	// started, but the task is not ready yet.
	AllocationStateStarting AllocationState = "STARTING"
	// AllocationStateRunning state denotes that the service in the command is running.
	AllocationStateRunning AllocationState = "RUNNING"
	// AllocationStateTerminated state denotes that the command has exited or has been aborted.
	AllocationStateTerminated AllocationState = "TERMINATED"
	// AllocationStateTerminating state denotes that the command is terminating.
	AllocationStateTerminating AllocationState = "TERMINATING"
)
// MostProgressedAllocationState returns the furthest progressed of the given states.
// E.g. a call with PENDING, PULLING and STARTING returns STARTING.
//
// Progress order, lowest to highest: PENDING, ASSIGNED, PULLING, STARTING, RUNNING,
// WAITING, TERMINATING, TERMINATED. With no arguments it returns PENDING. When several
// states share the highest rank, the first of them in argument order is returned.
func MostProgressedAllocationState(states ...AllocationState) AllocationState {
	if len(states) == 0 {
		return AllocationStatePending
	}

	// Can't use taskv1.State_value[state] since in proto
	// "STATE_TERMINATING" > "STATE_TERMINATED"
	// while our model used to have
	// "STATE_TERMINATED" > "STATE_TERMINATING".
	//
	// A switch is used instead of a map so no table is allocated on every call.
	rank := func(s AllocationState) int {
		switch s {
		case AllocationStatePending:
			return 0
		case AllocationStateAssigned:
			return 1
		case AllocationStatePulling:
			return 2
		case AllocationStateStarting:
			return 3
		case AllocationStateRunning:
			return 4
		case AllocationStateWaiting:
			return 5
		case AllocationStateTerminating:
			return 6
		case AllocationStateTerminated:
			return 7
		default:
			// Unrecognized states rank alongside PENDING.
			return 0
		}
	}

	best, bestRank := states[0], rank(states[0])
	for _, s := range states[1:] {
		// Strictly greater, so the earliest state wins a tie.
		if r := rank(s); r > bestRank {
			best, bestRank = s, r
		}
	}
	return best
}
// Proto returns the proto representation of the task state. States without a
// case below (including PENDING and ASSIGNED) map to STATE_UNSPECIFIED.
func (s AllocationState) Proto() taskv1.State {
	state := taskv1.State_STATE_UNSPECIFIED
	switch s {
	case AllocationStatePulling:
		state = taskv1.State_STATE_PULLING
	case AllocationStateStarting:
		state = taskv1.State_STATE_STARTING
	case AllocationStateRunning:
		state = taskv1.State_STATE_RUNNING
	case AllocationStateWaiting:
		state = taskv1.State_STATE_WAITING
	case AllocationStateTerminating:
		state = taskv1.State_STATE_TERMINATING
	case AllocationStateTerminated:
		state = taskv1.State_STATE_TERMINATED
	}
	return state
}
// Placeholders and level names used for task logs.
const (
	// defaultTaskLogContainer is a placeholder container name.
	// NOTE(review): no code visible in this file references it; confirm it is still needed.
	defaultTaskLogContainer = "UNKNOWN CONTAINER"
	// defaultTaskLogTime is the placeholder TaskLog.Message prints when a log has no timestamp.
	defaultTaskLogTime = "UNKNOWN TIME"
	// LogLevelTrace is the trace task log level.
	LogLevelTrace = "TRACE"
	// LogLevelDebug is the debug task log level.
	LogLevelDebug = "DEBUG"
	// LogLevelInfo is the info task log level.
	LogLevelInfo = "INFO"
	// LogLevelWarning is the warn task log level.
	LogLevelWarning = "WARNING"
	// LogLevelError is the error task log level.
	LogLevelError = "ERROR"
	// LogLevelCritical is the critical task log level.
	LogLevelCritical = "CRITICAL"
	// LogLevelUnspecified is the unspecified task log level.
	LogLevelUnspecified = "UNSPECIFIED"
)
// TaskLogLevelFromProto returns a task log level from its protobuf repr.
// Unspecified and unrecognized protobuf levels both yield LogLevelUnspecified.
func TaskLogLevelFromProto(l logv1.LogLevel) string {
	level := LogLevelUnspecified
	switch l {
	case logv1.LogLevel_LOG_LEVEL_UNSPECIFIED:
		// Already the fallback value; listed so every enum member is named.
	case logv1.LogLevel_LOG_LEVEL_TRACE:
		level = LogLevelTrace
	case logv1.LogLevel_LOG_LEVEL_DEBUG:
		level = LogLevelDebug
	case logv1.LogLevel_LOG_LEVEL_INFO:
		level = LogLevelInfo
	case logv1.LogLevel_LOG_LEVEL_WARNING:
		level = LogLevelWarning
	case logv1.LogLevel_LOG_LEVEL_ERROR:
		level = LogLevelError
	case logv1.LogLevel_LOG_LEVEL_CRITICAL:
		level = LogLevelCritical
	}
	return level
}
// TaskLogLevelToProto returns a protobuf task log level from its string repr.
// Any string other than the six named levels, including "UNSPECIFIED", yields
// LOG_LEVEL_UNSPECIFIED.
func TaskLogLevelToProto(l string) logv1.LogLevel {
	level := logv1.LogLevel_LOG_LEVEL_UNSPECIFIED
	switch l {
	case LogLevelTrace:
		level = logv1.LogLevel_LOG_LEVEL_TRACE
	case LogLevelDebug:
		level = logv1.LogLevel_LOG_LEVEL_DEBUG
	case LogLevelInfo:
		level = logv1.LogLevel_LOG_LEVEL_INFO
	case LogLevelWarning:
		level = logv1.LogLevel_LOG_LEVEL_WARNING
	case LogLevelError:
		level = logv1.LogLevel_LOG_LEVEL_ERROR
	case LogLevelCritical:
		level = logv1.LogLevel_LOG_LEVEL_CRITICAL
	}
	return level
}
// TaskLog represents a structured log emitted by an allocation. Pointer fields may be
// nil; Message and Proto substitute placeholders or skip them when they are.
type TaskLog struct {
	// A task log should have one of these IDs after being persisted. All should be unique.
	// Proto prefers ID over StringID when both are set.
	ID *int `db:"id" json:"id,omitempty"`
	// The body of an Elasticsearch log response will look something like
	// { _id: ..., _source: { ... }} where _source is the rest of this struct.
	// StringID doesn't have serialization tags because it is not part of
	// _source and populated from _id.
	StringID *string `json:"-"`
	// TaskID is the ID of the task that emitted the log.
	TaskID string `db:"task_id" json:"task_id"`
	// AllocationID is the ID of the allocation that emitted the log, when known.
	AllocationID *string `db:"allocation_id" json:"allocation_id"`
	// AgentID is the ID of the agent, when known.
	AgentID *string `db:"agent_id" json:"agent_id,omitempty"`
	// In the case of k8s, container_id is a pod name instead.
	ContainerID *string `db:"container_id" json:"container_id,omitempty"`
	// RankID is shown by Message as "[rank=N]" when set.
	RankID *int `db:"rank_id" json:"rank_id,omitempty"`
	// Timestamp is the time of the log line, when known.
	Timestamp *time.Time `db:"timestamp" json:"timestamp"`
	// Level is one of the LogLevel* strings; TaskLogLevelToProto treats any other value as unspecified.
	Level *string `db:"level" json:"level"`
	// Log is the log text itself.
	Log string `db:"log" json:"log"`
	// Source is the origin of the log, when known.
	Source *string `db:"source" json:"source,omitempty"`
	// StdType is presumably the output stream (stdout/stderr) — confirm against producers.
	StdType *string `db:"stdtype" json:"stdtype,omitempty"`
}
// Formatting constants used by TaskLog.Message.
const (
	// RFC3339MicroTrailingZeroes unlike time.RFC3339Nano is a time format specifier that preserves
	// trailing zeroes: the fractional part is always written as exactly six digits.
	RFC3339MicroTrailingZeroes = "2006-01-02T15:04:05.000000Z07:00"
	// containerIDMaxLength is the max display length for a container ID in logs. Longer IDs
	// are truncated to this many bytes, and a missing ID is replaced by this many spaces.
	containerIDMaxLength = 8
)
// Message resolves the flat version of the log that UIs have shown historically.
// The result is the following fields joined by single spaces:
//
//	[timestamp] container [rank=N] || LEVEL: log
//
// The rank and level fields are omitted when unset. A missing timestamp is
// shown as "UNKNOWN TIME", and a missing or empty container ID is replaced by
// spaces so the logs visually line up.
// TODO(task-unif): Should we just.. stop doing this? And send the log as is and let the
// UIs handle display (yes, IMO).
func (t *TaskLog) Message() string {
	// At most six fields: timestamp, container, rank, separator, level, log.
	fields := make([]string, 0, 6)

	// e.g., "[2022-03-02T02:15:18.299569Z]"
	stamp := defaultTaskLogTime
	if t.Timestamp != nil {
		stamp = t.Timestamp.Format(RFC3339MicroTrailingZeroes)
	}
	fields = append(fields, "["+stamp+"]")

	// e.g., "f6114bb3"; blank padding of the same width when there is no ID.
	container := strings.Repeat(" ", containerIDMaxLength)
	if t.ContainerID != nil && *t.ContainerID != "" {
		container = *t.ContainerID
		if len(container) > containerIDMaxLength {
			container = container[:containerIDMaxLength]
		}
	}
	fields = append(fields, container)

	// e.g., "[rank=1]"
	if t.RankID != nil {
		fields = append(fields, fmt.Sprintf("[rank=%d]", *t.RankID))
	}

	fields = append(fields, "||")

	// e.g., "INFO:"
	if t.Level != nil {
		fields = append(fields, *t.Level+":")
	}

	fields = append(fields, t.Log)
	return strings.Join(fields, " ")
}
// Proto converts a task log to its protobuf representation.
//
// The response ID is the numeric ID when set, otherwise StringID. A log with
// neither is reported through the returned error (with a nil response) rather
// than a panic, so callers can handle it like any other failure. A nil
// Timestamp stays nil in the response, and a nil Level becomes
// LOG_LEVEL_UNSPECIFIED.
func (t TaskLog) Proto() (*apiv1.TaskLogsResponse, error) {
	var id string
	switch {
	case t.ID != nil:
		id = strconv.Itoa(*t.ID)
	case t.StringID != nil:
		id = *t.StringID
	default:
		return nil, fmt.Errorf("log for task %q had no valid ID", t.TaskID)
	}

	var ts *timestamppb.Timestamp
	if t.Timestamp != nil {
		ts = timestamppb.New(*t.Timestamp)
	}

	level := logv1.LogLevel_LOG_LEVEL_UNSPECIFIED
	if t.Level != nil {
		level = TaskLogLevelToProto(*t.Level)
	}

	resp := &apiv1.TaskLogsResponse{
		Id:           id,
		TaskId:       t.TaskID,
		Timestamp:    ts,
		Level:        level,
		Message:      t.Message(),
		Log:          t.Log,
		AllocationId: t.AllocationID,
		AgentId:      t.AgentID,
		ContainerId:  t.ContainerID,
		Source:       t.Source,
		Stdtype:      t.StdType,
	}

	if t.RankID != nil {
		// The proto field is an int32 pointer, so convert into a fresh local.
		rank := int32(*t.RankID)
		resp.RankId = &rank
	}

	return resp, nil
}
// TaskLogBatch represents a batch of model.TaskLog.
type TaskLogBatch []*TaskLog

// Size implements logs.Batch. It reports the number of logs in the batch.
func (t TaskLogBatch) Size() int {
	return len(t)
}

// ForEach implements logs.Batch. It passes each log to f in order, stopping at
// the first error and returning it; it returns nil when every call succeeds.
func (t TaskLogBatch) ForEach(f func(interface{}) error) error {
	for i := range t {
		err := f(t[i])
		if err != nil {
			return err
		}
	}
	return nil
}
// AccessScopeID is an identifier for an access scope. It is the key type of AccessScopeSet.
type AccessScopeID int
// AccessScopeSet is a set of access scopes. It is a type alias (not a distinct type) for a
// map keyed by AccessScopeID; presumably a true value marks membership — confirm with callers.
type AccessScopeSet = map[AccessScopeID]bool