Commit 133e89e

Davidlohr Bueso authored and Ingo Molnar committed
locking/rwsem: Enable lockless waiter wakeup(s)
As wake_qs gain users, we can teach rwsems about them such that waiters can be
awoken without the wait_lock. This is for both readers and writers, the former
being the most ideal candidate as we can batch the wakeups, shortening the
critical region that much more -- i.e. a writer task blocking a bunch of tasks
waiting to service page-faults (mmap_sem readers).

In general, applying wake_qs to rwsem (xadd) is not difficult as the wait_lock
is intended to be released soon _anyways_, with the exception of when a writer
slowpath will proactively wakeup any queued readers if it sees that the lock
is owned by a reader, in which case we simply do the wakeups with the lock
held (see the comment in __rwsem_down_write_failed_common()).

Similar to other locking primitives, delaying the moment a waiter is awoken
does allow, at least in theory, the lock to be stolen in the case of writers;
however, no harm was seen in this (in fact lock stealing tends to be a _good_
thing in most workloads), and this is a tiny window anyways.

Some page-fault (pft) and mmap_sem intensive benchmarks show a fairly
consistent reduction in systime (by up to ~8% and ~10%) on a 2-socket,
12-core AMD box. In addition, on an 8-core Westmere doing page allocations
(page_test), aim9 shows:

                         4.6-rc6              4.6-rc6
                                              rwsemv2
Min      page_test   378167.89 (  0.00%)   382613.33 (  1.18%)
Min      exec_test      499.00 (  0.00%)      502.67 (  0.74%)
Min      fork_test     3395.47 (  0.00%)     3537.64 (  4.19%)
Hmean    page_test   395433.06 (  0.00%)   414693.68 (  4.87%)
Hmean    exec_test      499.67 (  0.00%)      505.30 (  1.13%)
Hmean    fork_test     3504.22 (  0.00%)     3594.95 (  2.59%)
Stddev   page_test    17426.57 (  0.00%)    26649.92 (-52.93%)
Stddev   exec_test        0.47 (  0.00%)        1.41 (-199.05%)
Stddev   fork_test       63.74 (  0.00%)       32.59 ( 48.86%)
Max      page_test   429873.33 (  0.00%)   456960.00 (  6.30%)
Max      exec_test      500.33 (  0.00%)      507.66 (  1.47%)
Max      fork_test     3653.33 (  0.00%)     3650.90 ( -0.07%)

             4.6-rc6     4.6-rc6
                         rwsemv2
User            1.12        0.04
System          0.23        0.04
Elapsed       727.27      721.98

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman.Long@hpe.com
Cc: dave@stgolabs.net
Cc: jason.low2@hp.com
Cc: peter@hurleysoftware.com
Link: http://lkml.kernel.org/r/1463165787-25937-2-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
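The core pattern the patch introduces can be distilled as follows. This is a
condensed sketch of the change below, not standalone code; all identifiers
(WAKE_Q, wake_q_add, wake_up_q, __rwsem_mark_wake, RWSEM_WAKE_ANY) are the
4.6-era kernel APIs that appear in the diff:

        WAKE_Q(wake_q);         /* on-stack, initially empty wake queue */

        raw_spin_lock_irq(&sem->wait_lock);
        /*
         * Under wait_lock, only *mark* the tasks to be woken:
         * wake_q_add() queues each task locally and takes a task
         * reference, instead of calling wake_up_process() with
         * the lock still held.
         */
        sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
        raw_spin_unlock_irq(&sem->wait_lock);

        /* The actual (batched) wakeups happen without wait_lock. */
        wake_up_q(&wake_q);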
1 parent 0422e83 commit 133e89e

1 file changed: 42 additions (+), 16 deletions (-)

kernel/locking/rwsem-xadd.c

@@ -114,12 +114,16 @@ enum rwsem_wake_type {
  * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
  * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
  * - there must be someone on the queue
- * - the spinlock must be held by the caller
+ * - the wait_lock must be held by the caller
+ * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
+ *   to actually wakeup the blocked task(s) and drop the reference count,
+ *   preferably when the wait_lock is released
  * - woken process blocks are discarded from the list after having task zeroed
- * - writers are only woken if downgrading is false
+ * - writers are only marked woken if downgrading is false
  */
 static struct rw_semaphore *
-__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
+__rwsem_mark_wake(struct rw_semaphore *sem,
+                  enum rwsem_wake_type wake_type, struct wake_q_head *wake_q)
 {
         struct rwsem_waiter *waiter;
         struct task_struct *tsk;
@@ -128,13 +132,16 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
 
         waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
         if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
-                if (wake_type == RWSEM_WAKE_ANY)
-                        /* Wake writer at the front of the queue, but do not
-                         * grant it the lock yet as we want other writers
-                         * to be able to steal it. Readers, on the other hand,
-                         * will block as they will notice the queued writer.
+                if (wake_type == RWSEM_WAKE_ANY) {
+                        /*
+                         * Mark writer at the front of the queue for wakeup.
+                         * Until the task is actually later awoken later by
+                         * the caller, other writers are able to steal it.
+                         * Readers, on the other hand, will block as they
+                         * will notice the queued writer.
                          */
-                        wake_up_process(waiter->task);
+                        wake_q_add(wake_q, waiter->task);
+                }
                 goto out;
         }
 
@@ -196,7 +203,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
                  */
                 smp_mb();
                 waiter->task = NULL;
-                wake_up_process(tsk);
+                wake_q_add(wake_q, tsk);
                 put_task_struct(tsk);
         } while (--loop);
 
@@ -216,6 +223,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
         long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
         struct rwsem_waiter waiter;
         struct task_struct *tsk = current;
+        WAKE_Q(wake_q);
 
         /* set up my own style of waitqueue */
         waiter.task = tsk;
@@ -238,9 +246,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
         if (count == RWSEM_WAITING_BIAS ||
             (count > RWSEM_WAITING_BIAS &&
              adjustment != -RWSEM_ACTIVE_READ_BIAS))
-                sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+                sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 
         raw_spin_unlock_irq(&sem->wait_lock);
+        wake_up_q(&wake_q);
 
         /* wait to be given the lock */
         while (true) {
@@ -440,6 +449,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
         bool waiting = true; /* any queued threads before us */
         struct rwsem_waiter waiter;
         struct rw_semaphore *ret = sem;
+        WAKE_Q(wake_q);
 
         /* undo write bias from down_write operation, stop active locking */
         count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
@@ -472,8 +482,19 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
                  * no active writers, the lock must be read owned; so we try to
                  * wake any read locks that were queued ahead of us.
                  */
-                if (count > RWSEM_WAITING_BIAS)
-                        sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
+                if (count > RWSEM_WAITING_BIAS) {
+                        WAKE_Q(wake_q);
+
+                        sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
+                        /*
+                         * The wakeup is normally called _after_ the wait_lock
+                         * is released, but given that we are proactively waking
+                         * readers we can deal with the wake_q overhead as it is
+                         * similar to releasing and taking the wait_lock again
+                         * for attempting rwsem_try_write_lock().
+                         */
+                        wake_up_q(&wake_q);
+                }
 
         } else
                 count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
@@ -509,8 +530,9 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
         if (list_empty(&sem->wait_list))
                 rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem);
         else
-                __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+                __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
         raw_spin_unlock_irq(&sem->wait_lock);
+        wake_up_q(&wake_q);
 
         return ERR_PTR(-EINTR);
 }
@@ -537,6 +559,7 @@ __visible
 struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
 {
         unsigned long flags;
+        WAKE_Q(wake_q);
 
         /*
          * If a spinner is present, it is not necessary to do the wakeup.
@@ -573,9 +596,10 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
 
         /* do nothing if list empty */
         if (!list_empty(&sem->wait_list))
-                sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+                sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 
         raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+        wake_up_q(&wake_q);
 
         return sem;
 }
@@ -590,14 +614,16 @@ __visible
 struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
 {
         unsigned long flags;
+        WAKE_Q(wake_q);
 
         raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
         /* do nothing if list empty */
         if (!list_empty(&sem->wait_list))
-                sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED);
+                sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
 
         raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+        wake_up_q(&wake_q);
 
         return sem;
 }
