Commit ac53db5

Rik van Riel authored and Ingo Molnar committed
sched: Use a buddy to implement yield_task_fair()
Use the buddy mechanism to implement yield_task_fair. This allows us to skip onto the next highest priority se at every level in the CFS tree, unless doing so would introduce gross unfairness in CPU time distribution.

We order the buddy selection in pick_next_entity to check yield first, then last, then next. We need next to be able to override yield, because it is possible for the "next" and "yield" task to be different processes in the same sub-tree of the CFS tree. When they are, we need to go into that sub-tree regardless of the "yield" hint, and pick the correct entity once we get to the right level.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110201095103.3a79e92a@annuminas.surriel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
1 parent 2c13c91 commit ac53db5
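
For readers skimming the diff below, the selection order described in the commit message can be condensed into a small user-space sketch. This is an illustration only, not the kernel code: the entity struct, fair_enough() bound, and pick() helper are invented stand-ins for CFS's sched_entity, wakeup_preempt_entity(), and pick_next_entity().

/* Illustration only: condensed restatement of the buddy ordering.
 * All names and the fairness bound here are invented stand-ins. */
#include <stddef.h>
#include <stdio.h>

struct entity { const char *name; long vruntime; };

/* Stand-in for wakeup_preempt_entity(): a buddy may be chosen only
 * if its vruntime is not too far ahead of the leftmost entity's. */
static int fair_enough(const struct entity *buddy, const struct entity *left)
{
        return buddy->vruntime - left->vruntime < 1000; /* arbitrary bound */
}

/* Same precedence as pick_next_entity() after this commit: the skip
 * (yield) buddy is dodged first, last may override, next overrides all. */
static const struct entity *pick(const struct entity *leftmost,
                                 const struct entity *second,
                                 const struct entity *skip,
                                 const struct entity *last,
                                 const struct entity *next)
{
        const struct entity *se = leftmost;

        if (skip == se && second && fair_enough(second, leftmost))
                se = second;    /* dodge the yielding task */
        if (last && fair_enough(last, leftmost))
                se = last;      /* cache-hot preempted task */
        if (next && fair_enough(next, leftmost))
                se = next;      /* someone really wants this one */
        return se;
}

int main(void)
{
        struct entity a = { "A (yielded)", 100 }, b = { "B", 150 };
        /* A yielded (skip == A), so B runs even though A is leftmost. */
        printf("picked: %s\n", pick(&a, &b, &a, NULL, NULL)->name);
        return 0;
}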

File tree

5 files changed: +90 −71 lines

include/linux/sched.h

Lines changed: 0 additions & 2 deletions
@@ -1942,8 +1942,6 @@ int sched_rt_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 
-extern unsigned int sysctl_sched_compat_yield;
-
 #ifdef CONFIG_SCHED_AUTOGROUP
 extern unsigned int sysctl_sched_autogroup_enabled;

kernel/sched.c

Lines changed: 1 addition & 1 deletion
@@ -324,7 +324,7 @@ struct cfs_rq {
 	 * 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
-	struct sched_entity *curr, *next, *last;
+	struct sched_entity *curr, *next, *last, *skip;
 
 	unsigned int nr_spread_over;

kernel/sched_debug.c

Lines changed: 1 addition & 1 deletion
@@ -179,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 
 	raw_spin_lock_irqsave(&rq->lock, flags);
 	if (cfs_rq->rb_leftmost)
-		MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
+		MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
 	last = __pick_last_entity(cfs_rq);
 	if (last)
 		max_vruntime = last->vruntime;

kernel/sched_fair.c

Lines changed: 88 additions & 60 deletions
@@ -68,14 +68,6 @@ static unsigned int sched_nr_latency = 8;
  */
 unsigned int sysctl_sched_child_runs_first __read_mostly;
 
-/*
- * sys_sched_yield() compat mode
- *
- * This option switches the agressive yield implementation of the
- * old scheduler back on.
- */
-unsigned int __read_mostly sysctl_sched_compat_yield;
-
 /*
  * SCHED_OTHER wake-up granularity.
  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
@@ -419,7 +411,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 }
 
-static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *left = cfs_rq->rb_leftmost;
 
@@ -429,6 +421,17 @@ static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 	return rb_entry(left, struct sched_entity, run_node);
 }
 
+static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+{
+	struct rb_node *next = rb_next(&se->run_node);
+
+	if (!next)
+		return NULL;
+
+	return rb_entry(next, struct sched_entity, run_node);
+}
+
+#ifdef CONFIG_SCHED_DEBUG
 static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -443,7 +446,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
  * Scheduling class statistics methods:
  */
 
-#ifdef CONFIG_SCHED_DEBUG
 int sched_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
@@ -1017,13 +1019,27 @@ static void __clear_buddies_next(struct sched_entity *se)
 	}
 }
 
+static void __clear_buddies_skip(struct sched_entity *se)
+{
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		if (cfs_rq->skip == se)
+			cfs_rq->skip = NULL;
+		else
+			break;
+	}
+}
+
 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	if (cfs_rq->last == se)
 		__clear_buddies_last(se);
 
 	if (cfs_rq->next == se)
 		__clear_buddies_next(se);
+
+	if (cfs_rq->skip == se)
+		__clear_buddies_skip(se);
 }
 
 static void
@@ -1099,7 +1115,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 		return;
 
 	if (cfs_rq->nr_running > 1) {
-		struct sched_entity *se = __pick_next_entity(cfs_rq);
+		struct sched_entity *se = __pick_first_entity(cfs_rq);
 		s64 delta = curr->vruntime - se->vruntime;
 
 		if (delta < 0)
@@ -1143,20 +1159,40 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static int
 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 
+/*
+ * Pick the next process, keeping these things in mind, in this order:
+ * 1) keep things fair between processes/task groups
+ * 2) pick the "next" process, since someone really wants that to run
+ * 3) pick the "last" process, for cache locality
+ * 4) do not run the "skip" process, if something else is available
+ */
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *se = __pick_next_entity(cfs_rq);
+	struct sched_entity *se = __pick_first_entity(cfs_rq);
 	struct sched_entity *left = se;
 
-	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
-		se = cfs_rq->next;
+	/*
+	 * Avoid running the skip buddy, if running something else can
+	 * be done without getting too unfair.
+	 */
+	if (cfs_rq->skip == se) {
+		struct sched_entity *second = __pick_next_entity(se);
+		if (second && wakeup_preempt_entity(second, left) < 1)
+			se = second;
+	}
 
 	/*
 	 * Prefer last buddy, try to return the CPU to a preempted task.
 	 */
 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
 		se = cfs_rq->last;
 
+	/*
+	 * Someone really wants this to run. If it's not unfair, run it.
	 */
+	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+		se = cfs_rq->next;
+
 	clear_buddies(cfs_rq, se);
 
 	return se;
@@ -1333,52 +1369,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	hrtick_update(rq);
 }
 
-/*
- * sched_yield() support is very simple - we dequeue and enqueue.
- *
- * If compat_yield is turned on then we requeue to the end of the tree.
- */
-static void yield_task_fair(struct rq *rq)
-{
-	struct task_struct *curr = rq->curr;
-	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
-	struct sched_entity *rightmost, *se = &curr->se;
-
-	/*
-	 * Are we the only task in the tree?
-	 */
-	if (unlikely(rq->nr_running == 1))
-		return;
-
-	clear_buddies(cfs_rq, se);
-
-	if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
-		update_rq_clock(rq);
-		/*
-		 * Update run-time statistics of the 'current'.
-		 */
-		update_curr(cfs_rq);
-
-		return;
-	}
-	/*
-	 * Find the rightmost entry in the rbtree:
-	 */
-	rightmost = __pick_last_entity(cfs_rq);
-	/*
-	 * Already in the rightmost position?
-	 */
-	if (unlikely(!rightmost || entity_before(rightmost, se)))
-		return;
-
-	/*
-	 * Minimally necessary key value to be last in the tree:
-	 * Upon rescheduling, sched_class::put_prev_task() will place
-	 * 'current' within the tree based on its new key value.
-	 */
-	se->vruntime = rightmost->vruntime + 1;
-}
-
 #ifdef CONFIG_SMP
 
 static void task_waking_fair(struct rq *rq, struct task_struct *p)
@@ -1849,6 +1839,14 @@ static void set_next_buddy(struct sched_entity *se)
 	}
 }
 
+static void set_skip_buddy(struct sched_entity *se)
+{
+	if (likely(task_of(se)->policy != SCHED_IDLE)) {
+		for_each_sched_entity(se)
+			cfs_rq_of(se)->skip = se;
+	}
+}
+
 /*
 * Preempt the current task with a newly woken task if needed:
 */
@@ -1947,6 +1945,36 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
 	}
 }
 
+/*
+ * sched_yield() is very simple
+ *
+ * The magic of dealing with the ->skip buddy is in pick_next_entity.
+ */
+static void yield_task_fair(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+	struct sched_entity *se = &curr->se;
+
+	/*
+	 * Are we the only task in the tree?
+	 */
+	if (unlikely(rq->nr_running == 1))
+		return;
+
+	clear_buddies(cfs_rq, se);
+
+	if (curr->policy != SCHED_BATCH) {
+		update_rq_clock(rq);
+		/*
+		 * Update run-time statistics of the 'current'.
+		 */
+		update_curr(cfs_rq);
+	}
+
+	set_skip_buddy(se);
+}
+
 #ifdef CONFIG_SMP
 /**************************************************
  * Fair scheduling class load-balancing methods:

kernel/sysctl.c

Lines changed: 0 additions & 7 deletions
@@ -360,13 +360,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= sched_rt_handler,
 	},
-	{
-		.procname	= "sched_compat_yield",
-		.data		= &sysctl_sched_compat_yield,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 #ifdef CONFIG_SCHED_AUTOGROUP
 	{
 		.procname	= "sched_autogroup_enabled",
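
For context on the user-visible behavior this commit tunes: a task enters yield_task_fair() via the sched_yield() system call. A trivial user-space exerciser, an illustrative assumption rather than anything from the commit, pins two processes to one CPU so that each yield hands the CPU over:

/* Trivial sched_yield() exerciser: two processes pinned to CPU 0
 * ping-pong via yield. Illustration only; not part of the commit. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>

int main(void)
{
        cpu_set_t set;
        CPU_ZERO(&set);
        CPU_SET(0, &set);
        sched_setaffinity(0, sizeof(set), &set); /* force contention on CPU 0 */

        pid_t pid = fork(); /* affinity is inherited by the child */
        for (int i = 0; i < 5; i++) {
                printf("%s: iteration %d\n", pid ? "parent" : "child", i);
                sched_yield(); /* with this commit: marks the skip buddy */
        }
        if (pid)
                wait(NULL);
        return 0;
}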
