/* ghost_uapi.h */
/*
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _SCHED_GHOST_H_
#define _SCHED_GHOST_H_
#include <linux/ioctl.h>
#ifndef __KERNEL__
#include <limits.h>
#include <stdint.h>
#endif
// NOLINTBEGIN
// clang-format off
/*
 * The version of ghOSt spoken by this header.
 *
 * The kernel and the userspace process must be at the same version as each
 * other: each successive version changes values in this header file,
 * assumptions about operations in the kernel, etc.
 */
#define GHOST_VERSION 42
/*
 * Scheduling policy number for ghOSt.
 *
 * Defined here via the ghost uapi unless it has already been defined via the
 * proper channels (i.e. the official <sched.h> uapi header file); the #ifndef
 * keeps an existing definition authoritative.
 */
#ifndef SCHED_GHOST
#define SCHED_GHOST 18
#endif
/*
 * Alignment attributes for structures shared between kernel and userspace.
 *
 * This is userspace API so the usual suspects like __cacheline_aligned
 * will not work.
 *
 * The values below hard-code a 64-byte cache line and a 4096-byte page.
 *
 * TODO: these are architecture-specific.
 */
#define __ghost_cacheline_aligned __attribute__((__aligned__(64)))
#define __ghost_page_aligned __attribute__((__aligned__(4096)))
/*
 * Discriminator for a <type,arg> tuple (see struct ghost_msg_src).
 * Values are spelled out because they are part of the user/kernel ABI.
 */
enum ghost_type {
	GHOST_AGENT	= 0,
	GHOST_TASK	= 1,
};
/* ghOSt scheduling priority levels. */
enum {
	GHOST_SCHED_TASK_PRIO	= 0,
	GHOST_SCHED_AGENT_PRIO	= 1,
	GHOST_SCHED_MAX_PRIO,		/* number of levels above; keep last */
};
/*
 * Collateral for GHOST_CONFIG_QUEUE_WAKEUP op:
 * - kernel supports up to 'GHOST_MAX_WINFO' wakeup targets for a queue.
 * - 'prio' must be zero (intended to be a tiebreaker but not defined yet).
 *
 * Each struct ghost_agent_wakeup describes one such wakeup target.
 */
#define GHOST_MAX_WINFO 2
struct ghost_agent_wakeup {
int cpu; /* NOTE(review): presumably the cpu whose agent is woken - confirm */
int prio; /* must be zero for now */
};
/*
 * Identifies a message source as a <type,arg> tuple.
 *
 * <type,arg> tuple decoding:
 * 'type' is GHOST_TASK -> 'arg' is <gtid>
 * 'type' is GHOST_AGENT -> 'arg' is <cpu>
 */
struct ghost_msg_src {
enum ghost_type type; /* selects how 'arg' is interpreted */
uint64_t arg; /* gtid or cpu, depending on 'type' */
};
/*
 * Special negative cpu value.
 * NOTE(review): presumably stands for "the calling cpu" wherever a cpu number
 * is expected (e.g. ghost_msg_src.arg for GHOST_AGENT) - confirm.
 *
 * Parenthesized, like GHOST_AGENT_GTID and GHOST_IDLE_GTID, so the negative
 * literal cannot combine with neighbouring tokens at the expansion site.
 */
#define GHOST_THIS_CPU (-1)
/*
 * Locates a single status word: the id of the status_word region plus the
 * index of the word inside that region's status_word array.
 */
struct ghost_sw_info {
uint32_t id; /* status_word region id */
uint32_t index; /* index into the status_word array */
};
/*
 * Argument block named by the GHOST_IOC_SW_GET_INFO ioctl definition.
 * Pairs a <type,arg> source with the status word location for it.
 * NOTE(review): presumably 'request' is filled in by the caller and 'response'
 * by the kernel - confirm against the ioctl handler.
 */
struct ghost_ioc_sw_get_info {
struct ghost_msg_src request;
struct ghost_sw_info response;
};
/*
 * ioctl commands, all in the 'g' ioctl group.
 *
 * NOTE(review): the size argument of GHOST_IOC_SW_GET_INFO and
 * GHOST_IOC_SW_FREE is a pointer type, so the size encoded into the command
 * number is presumably sizeof(pointer) rather than the size of the struct.
 * The resulting numbers are ABI shared with the kernel; do not "fix" the type
 * here without changing both sides together.
 */
#define GHOST_IOC_NULL _IO('g', 0)
#define GHOST_IOC_SW_GET_INFO _IOWR('g', 1, struct ghost_ioc_sw_get_info *)
#define GHOST_IOC_SW_FREE _IOW('g', 2, struct ghost_sw_info *)
/*
 * Status word region APIs.
 *
 * Header describing one status word region. It is aligned to a cache line
 * (see __ghost_cacheline_aligned).
 */
struct ghost_sw_region_header {
uint32_t version; /* ABI version */
uint32_t id; /* region id */
uint32_t numa_node;
uint32_t start; /* offset of the first status word */
uint32_t capacity; /* total status words in this region */
uint32_t available; /* free status words in this region */
} __ghost_cacheline_aligned;
/* NOTE(review): presumably the value expected in 'version' above - confirm. */
#define GHOST_SW_REGION_VERSION 0
/*
 * Align status words up to the next power of two so we do not cross cache lines
 * unnecessarily. At our current size (32 bytes of fields, 32-byte alignment)
 * that is half a 64-byte cache line per status word, i.e. two status words
 * per cache line.
 */
struct ghost_status_word {
uint32_t barrier;
uint32_t flags; /* GHOST_SW_* bits */
uint64_t gtid;
int64_t switch_time; /* time at which task was context-switched onto CPU */
uint64_t runtime; /* total time spent on the CPU in nsecs */
} __attribute__((__aligned__(32)));
/*
 * Bits of status_word.flags.
 * Generic flags start at bit 0, agent-specific flags at bit 8 and
 * task-specific flags at bit 16.
 */
#define GHOST_SW_F_INUSE (1U << 0) /* status_word in use */
#define GHOST_SW_F_CANFREE (1U << 1) /* status_word can be freed */
/* agent-specific status_word.flags */
#define GHOST_SW_CPU_AVAIL (1U << 8) /* CPU available hint */
#define GHOST_SW_BOOST_PRIO (1U << 9) /* agent with boosted prio */
/* task-specific status_word.flags */
#define GHOST_SW_TASK_ONCPU (1U << 16) /* task is oncpu */
#define GHOST_SW_TASK_RUNNABLE (1U << 17) /* task is runnable */
#define GHOST_SW_TASK_IS_AGENT (1U << 18)
/*
 * Queue APIs.
 *
 * Header describing a message queue. The ring itself (struct ghost_ring)
 * lives 'start' bytes after this header and holds 'nelems' message slots.
 */
struct ghost_queue_header {
uint32_t version; /* ABI version */
uint32_t start; /* offset from the header to start of ring */
uint32_t nelems; /* power-of-2 size of ghost_ring.msgs[] */
} __ghost_cacheline_aligned;
/* NOTE(review): presumably the value expected in 'version' above - confirm. */
#define GHOST_QUEUE_VERSION 0
/*
 * Fixed message header followed by a variable-length payload.
 *
 * 'payload' is declared as a zero-length array (a compiler extension) so it
 * adds nothing to sizeof(struct ghost_msg); 'length' covers the header plus
 * the payload that follows it.
 */
struct ghost_msg {
uint16_t type; /* message type */
uint16_t length; /* length of this message including payload */
uint32_t seqnum; /* sequence number for this msg source */
uint32_t payload[0]; /* variable length payload */
};
/*
 * Message type numbering.
 *
 * Types are grouped by kind and every group owns a window of 64 values
 * (the window size of 64 is arbitrary).
 */
#define _MSG_TASK_FIRST 64
#define _MSG_TASK_LAST (_MSG_TASK_FIRST + 64 - 1)
#define _MSG_CPU_FIRST 128
#define _MSG_CPU_LAST (_MSG_CPU_FIRST + 64 - 1)

/*
 * ghost message types.
 *
 * Every enumerator is written as an explicit offset into its window so the
 * wire value of a type is visible at a glance.
 */
enum {
	/* misc msgs */
	MSG_NOP				= 0,

	/* task messages */
	MSG_TASK_DEAD			= _MSG_TASK_FIRST + 0,
	MSG_TASK_BLOCKED		= _MSG_TASK_FIRST + 1,
	MSG_TASK_WAKEUP			= _MSG_TASK_FIRST + 2,
	MSG_TASK_NEW			= _MSG_TASK_FIRST + 3,
	MSG_TASK_PREEMPT		= _MSG_TASK_FIRST + 4,
	MSG_TASK_YIELD			= _MSG_TASK_FIRST + 5,
	MSG_TASK_DEPARTED		= _MSG_TASK_FIRST + 6,
	MSG_TASK_SWITCHTO		= _MSG_TASK_FIRST + 7,
	MSG_TASK_AFFINITY_CHANGED	= _MSG_TASK_FIRST + 8,
	MSG_TASK_LATCHED		= _MSG_TASK_FIRST + 9,

	/* cpu messages */
	MSG_CPU_TICK			= _MSG_CPU_FIRST + 0,
	MSG_CPU_TIMER_EXPIRED		= _MSG_CPU_FIRST + 1,
	/* requested via run_flags: NEED_CPU_NOT_IDLE */
	MSG_CPU_NOT_IDLE		= _MSG_CPU_FIRST + 2,
};
/* TODO: Move payload to header once all clients updated. */
/*
 * Payload for MSG_TASK_NEW (association by naming convention - confirm
 * against the kernel producer).
 * NOTE(review): on typical ABIs the compiler inserts padding between
 * 'runnable' and 'sw_info'; that padding is part of the layout.
 */
struct ghost_msg_payload_task_new {
uint64_t gtid;
uint64_t runtime; /* cumulative runtime in ns */
uint16_t runnable;
struct ghost_sw_info sw_info; /* status word location (region id + index) */
};
/* Payload for MSG_TASK_PREEMPT (association by naming convention). */
struct ghost_msg_payload_task_preempt {
uint64_t gtid;
uint64_t runtime; /* cumulative runtime in ns */
uint64_t cpu_seqnum; /* cpu sequence number */
int cpu;
char from_switchto; /* NOTE(review): presumably a 0/1 boolean - confirm */
char was_latched; /* NOTE(review): presumably a 0/1 boolean - confirm */
};
/* Payload for MSG_TASK_YIELD (association by naming convention). */
struct ghost_msg_payload_task_yield {
uint64_t gtid;
uint64_t runtime; /* cumulative runtime in ns */
uint64_t cpu_seqnum; /* cpu sequence number */
int cpu;
char from_switchto; /* NOTE(review): presumably a 0/1 boolean - confirm */
};
/* Payload for MSG_TASK_BLOCKED (association by naming convention). */
struct ghost_msg_payload_task_blocked {
uint64_t gtid;
uint64_t runtime; /* cumulative runtime in ns */
uint64_t cpu_seqnum; /* cpu sequence number */
int cpu;
char from_switchto; /* NOTE(review): presumably a 0/1 boolean - confirm */
};
/*
 * Payload for MSG_TASK_DEAD (association by naming convention).
 * Carries only the gtid.
 */
struct ghost_msg_payload_task_dead {
uint64_t gtid;
};
/*
 * Payload for MSG_TASK_DEPARTED (association by naming convention).
 * Unlike most other task payloads it has no 'runtime' field.
 */
struct ghost_msg_payload_task_departed {
uint64_t gtid;
uint64_t cpu_seqnum; /* cpu sequence number */
int cpu;
char from_switchto; /* NOTE(review): presumably a 0/1 boolean - confirm */
char was_current; /* NOTE(review): presumably a 0/1 boolean - confirm */
};
/*
 * Payload for MSG_TASK_AFFINITY_CHANGED (association by naming convention).
 * Carries only the gtid; the new affinity itself is not part of the payload.
 */
struct ghost_msg_payload_task_affinity_changed {
uint64_t gtid;
};
/*
 * Payload for MSG_TASK_WAKEUP (association by naming convention).
 * Reports up to three distinct cpus: where the task last ran, where it was
 * woken up, and where the waker was running.
 */
struct ghost_msg_payload_task_wakeup {
uint64_t gtid;
char deferrable; /* bool: 0 or 1 */
int last_ran_cpu; /*
* CPU that task last ran on (may be different
* than where it was last scheduled by the
* agent due to switchto).
*/
int wake_up_cpu; /*
* CPU where the task was woken up (this is
* typically where the task last ran but it
* may also be the waker's cpu).
*/
int waker_cpu; /* CPU of the waker task */
};
/* Payload for MSG_TASK_SWITCHTO (association by naming convention). */
struct ghost_msg_payload_task_switchto {
uint64_t gtid;
uint64_t runtime; /* cumulative runtime in ns */
uint64_t cpu_seqnum; /* cpu sequence number */
int cpu;
};
/*
 * Payload for MSG_TASK_LATCHED (association by naming convention).
 * The SEND_TASK_LATCHED run flag is documented as "Send TASK_LATCHED at
 * commit time".
 */
struct ghost_msg_payload_task_latched {
uint64_t gtid;
uint64_t commit_time;
uint64_t cpu_seqnum; /* cpu sequence number */
int cpu;
char latched_preempt; /* NOTE(review): presumably a 0/1 boolean - confirm */
};
/*
 * Payload for MSG_CPU_NOT_IDLE, which is requested via run_flags:
 * NEED_CPU_NOT_IDLE.
 * NOTE(review): on typical 64-bit ABIs there are 4 bytes of padding between
 * 'cpu' and 'next_gtid'.
 */
struct ghost_msg_payload_cpu_not_idle {
int cpu;
uint64_t next_gtid;
};
/*
 * Payload for MSG_CPU_TICK (association by naming convention).
 * Carries only the cpu number.
 */
struct ghost_msg_payload_cpu_tick {
int cpu;
};
/*
 * Timer message payload.
 * NOTE(review): presumably carried by MSG_CPU_TIMER_EXPIRED, with 'cookie'
 * echoing timerfd_ghost.cookie - confirm against the kernel producer.
 */
struct ghost_msg_payload_timer {
int cpu;
uint64_t cookie;
};
/*
 * Type of the ghost_ring head/tail/overflow counters.
 *
 * C++ translation units see a std::atomic<uint32_t>; C translation units see
 * a volatile uint32_t. Both are 32-bit counters over the same memory.
 */
#ifdef __cplusplus
#include <atomic>
typedef std::atomic<uint32_t> _ghost_ring_index_t;
#else
typedef volatile uint32_t _ghost_ring_index_t;
#endif
/*
 * Message ring shared between the kernel (producer) and the agent
 * (consumer). The number of slots comes from ghost_queue_header.nelems.
 */
struct ghost_ring {
/*
* kernel produces at 'head & (nelems-1)' and
* agent consumes from 'tail & (nelems-1)'.
*
* kernel increments 'overflow' any time there aren't enough
* free slots to produce a message.
*/
_ghost_ring_index_t head;
_ghost_ring_index_t tail;
_ghost_ring_index_t overflow;
struct ghost_msg msgs[0]; /* array of size 'header->nelems' */
};
/* Upper bound on the number of ring slots; the value itself is arbitrary. */
#define GHOST_MAX_QUEUE_ELEMS 65536 /* arbitrary */
/*
 * Define ghOSt syscall numbers here until they can be discovered via
 * <unistd.h>.
 *
 * Each number is only defined when the including environment has not already
 * provided it, so a definition from the system headers takes precedence.
 */
#ifndef __NR_ghost_run
#define __NR_ghost_run 450
#endif
#ifndef __NR_ghost
#define __NR_ghost 451
#endif
/*
 * 'ops' supported by gsys_ghost().
 *
 * The op numbers are written out explicitly so that the value of each op is
 * visible at a glance.
 */
enum ghost_ops {
	GHOST_NULL			= 0,
	GHOST_CREATE_QUEUE		= 1,
	GHOST_ASSOCIATE_QUEUE		= 2,
	GHOST_CONFIG_QUEUE_WAKEUP	= 3,
	GHOST_SET_OPTION		= 4,
	GHOST_GET_CPU_TIME		= 5,
	GHOST_COMMIT_TXN		= 6,
	GHOST_SYNC_GROUP_TXN		= 7,
	GHOST_TIMERFD_SETTIME		= 8,
	GHOST_GTID_LOOKUP		= 9,
	GHOST_GET_GTID_10		= 10,	/* TODO: deprecate */
	GHOST_GET_GTID_11		= 11,	/* TODO: deprecate */
	GHOST_SET_DEFAULT_QUEUE		= 12,
};
/*
 * gsys_ghost() 'ops' reserved for the userspace 'base' library.
 *
 * Arbitrary applications may link the 'base' library, so these ops are
 * immutable: neither their entry-point nor their functionality may change.
 *
 * How the 'base' library detects support for an op:
 *   - return value 0:  the 'op' succeeded.
 *   - return value -1: the 'op' failed, and errno tells why:
 *       - ENOSYS:      kernel does not support ghost.
 *       - EOPNOTSUPP:  kernel supports ghost but doesn't recognize this op
 *                      (for e.g. newer binary running on an older kernel).
 *       - anything else is an op-specific error.
 */
enum ghost_base_ops {
	/* 0x10000 == 65536: avoid overlap with 'ghost_ops' */
	_GHOST_BASE_OP_FIRST	= 0x10000,
	GHOST_BASE_GET_GTID	= _GHOST_BASE_OP_FIRST,
	/*
	 * New ops must be added to the end of this enumeration.
	 */
};
/*
 * status flags for GHOST_ASSOCIATE_QUEUE
 * (a bitmask; the two flags occupy bits 0 and 1)
 */
#define GHOST_ASSOC_SF_ALREADY (1 << 0) /* Queue already set */
#define GHOST_ASSOC_SF_BRAND_NEW (1 << 1) /* TASK_NEW not sent yet */
/*
 * flags accepted by ghost_run()
 *
 * Bits 3, 4 and 8 are not assigned in this header. The highest assigned bit
 * is 10, so every flag fits in the 16-bit ghost_txn.run_flags field.
 */
#define RTLA_ON_PREEMPT (1 << 0) /* Return To Local Agent on preemption */
#define RTLA_ON_BLOCKED (1 << 1) /* Return To Local Agent on block */
#define RTLA_ON_YIELD (1 << 2) /* Return To Local Agent on yield */
#define RTLA_ON_IDLE (1 << 5) /* Return To Local Agent on idle */
#define NEED_L1D_FLUSH (1 << 6) /* Flush L1 dcache before entering guest */
#define NEED_CPU_NOT_IDLE (1 << 7) /* Notify agent when a non-idle task is
* scheduled on the cpu.
*/
#define ELIDE_PREEMPT (1 << 9) /* Do not send TASK_PREEMPT if we preempt
* a previous ghost task on this cpu
*/
#define SEND_TASK_LATCHED (1 << 10) /* Send TASK_LATCHED at commit time */
/*
 * txn->commit_flags
 *
 * All flags fit in the 8-bit ghost_txn.commit_flags field.
 */
#define COMMIT_AT_SCHEDULE (1 << 0) /* commit when oncpu task schedules */
#define COMMIT_AT_TXN_COMMIT (1 << 1) /* commit in GHOST_COMMIT_TXN op */
#define ALLOW_TASK_ONCPU (1 << 2) /* If task is running on a remote cpu
* then let it continue running there.
*/
/* Union of all COMMIT_AT_XYZ flags (ALLOW_TASK_ONCPU is not one of them) */
#define COMMIT_AT_FLAGS (COMMIT_AT_SCHEDULE | COMMIT_AT_TXN_COMMIT)
/*
 * special 'gtid' encodings that can be passed to ghost_run()
 *
 * None of them is positive, so they cannot collide with the gtid of a normal
 * task, which always has a positive value.
 */
#define GHOST_NULL_GTID (0)
#define GHOST_AGENT_GTID (-1)
#define GHOST_IDLE_GTID (-2)
/* ghost transaction */
/*
 * Decoding txn->state:
 *   state == GHOST_TXN_READY             transaction is ready
 *   state >= 0 && state < nr_cpu_ids     transaction is claimed
 *   state < 0                            transaction is committed
 *
 * The committed outcomes count upwards from INT_MIN; each value is written
 * out as an explicit offset so it can be read off directly.
 */
enum ghost_txn_state {
	GHOST_TXN_COMPLETE		= INT_MIN,
	GHOST_TXN_ABORTED		= INT_MIN + 1,
	GHOST_TXN_TARGET_ONCPU		= INT_MIN + 2,
	GHOST_TXN_TARGET_STALE		= INT_MIN + 3,
	GHOST_TXN_TARGET_NOT_FOUND	= INT_MIN + 4,
	GHOST_TXN_TARGET_NOT_RUNNABLE	= INT_MIN + 5,
	GHOST_TXN_AGENT_STALE		= INT_MIN + 6,
	GHOST_TXN_CPU_OFFLINE		= INT_MIN + 7,
	GHOST_TXN_CPU_UNAVAIL		= INT_MIN + 8,
	GHOST_TXN_INVALID_FLAGS		= INT_MIN + 9,
	GHOST_TXN_INVALID_TARGET	= INT_MIN + 10,
	GHOST_TXN_NOT_PERMITTED		= INT_MIN + 11,
	GHOST_TXN_INVALID_CPU		= INT_MIN + 12,
	GHOST_TXN_NO_AGENT		= INT_MIN + 13,
	GHOST_TXN_UNSUPPORTED_VERSION	= INT_MIN + 14,
	GHOST_TXN_POISONED		= INT_MIN + 15,

	/*
	 * Values [0-nr_cpu_ids) indicate that the transaction is claimed
	 * by the specific CPU.
	 */

	GHOST_TXN_READY			= INT_MAX,
};
/*
 * Storage types for ghost_txn::state and ghost_txn::u.sync_group_owner.
 *
 * _ghost_txn_state_t is not expected to be used anywhere except as the type
 * of ghost_txn::state below. See cl/350179823 as an example of unintentional
 * consequences.
 *
 * C++ translation units get std::atomic wrappers; C translation units get
 * plain int32_t. Both views must have the same size.
 */
#ifdef __cplusplus
#include <atomic>
typedef std::atomic<ghost_txn_state> _ghost_txn_state_t;
/*
 * To be safe, check that the size of '_ghost_txn_state_t' is equal to the size
 * of 'int32_t', which is the size that the kernel assumes the state is.
 *
 * The message argument is spelled out because the single-argument form of
 * static_assert is only available from C++17 on.
 */
static_assert(sizeof(_ghost_txn_state_t) == sizeof(int32_t),
	      "ghost_txn::state must have the same size as int32_t");
typedef std::atomic<int32_t> _ghost_txn_owner_t;
#else
typedef int32_t _ghost_txn_state_t;
typedef int32_t _ghost_txn_owner_t;
#endif
/*
 * A ghost transaction. Aligned to a cache line (see
 * __ghost_cacheline_aligned).
 */
struct ghost_txn {
int32_t version; /* NOTE(review): presumably GHOST_TXN_VERSION - confirm */
int32_t cpu; /* readonly-after-init */
_ghost_txn_state_t state; /* see enum ghost_txn_state */
uint32_t agent_barrier;
uint32_t task_barrier;
uint16_t run_flags; /* e.g. NEED_CPU_NOT_IDLE; see flags accepted by ghost_run() */
uint8_t commit_flags; /* COMMIT_AT_* and ALLOW_TASK_ONCPU bits */
uint8_t unused;
int64_t gtid;
int64_t commit_time; /* the time that the txn commit succeeded/failed */
uint64_t cpu_seqnum;
/*
* Context-dependent fields.
*/
union {
/* only used during a sync-group commit */
_ghost_txn_owner_t sync_group_owner;
} u;
} __ghost_cacheline_aligned;
/*
 * Page-aligned container holding one transaction (see
 * __ghost_page_aligned).
 * NOTE(review): presumably one instance exists per cpu - confirm.
 */
struct ghost_cpu_data {
struct ghost_txn txn;
} __ghost_page_aligned;
/* NOTE(review): presumably the value expected in ghost_txn.version - confirm. */
#define GHOST_TXN_VERSION 0
/*
 * GHOST_TIMERFD_SETTIME
 *
 * Collateral for the GHOST_TIMERFD_SETTIME op.
 */
struct timerfd_ghost {
int cpu;
int flags; /* TIMERFD_GHOST_* bits */
uint64_t cookie;
};
#define TIMERFD_GHOST_ENABLED (1 << 0)
/* GHOST_GTID_LOOKUP: selectors for what a lookup returns */
enum {
	/* return group_leader pid */
	GHOST_GTID_LOOKUP_TGID = 0,
};
/*
 * ghost tids referring to normal tasks always have a positive value:
 * (0 | 22 bits of actual pid_t | 41 bit non-zero seqnum)
 *
 * That is 1 + 22 + 41 = 64 bits in total; the leading 0 bit keeps the value
 * positive when it is read as a signed 64-bit integer.
 *
 * The embedded 'pid' following linux terminology is actually referring
 * to the thread id (i.e. what would be returned by syscall(__NR_gettid)).
 */
#define GHOST_TID_SEQNUM_BITS 41
#define GHOST_TID_PID_BITS 22
// clang-format on
// NOLINTEND
#endif /* _SCHED_GHOST_H_ */