
Commit c5a282e

Davidlohr Bueso authored and torvalds committed
fs/epoll: reduce the scope of wq lock in epoll_wait()
This patch aims at reducing ep wq.lock hold times in epoll_wait(2). For the blocking case, there is no need to constantly take and drop the spinlock, which is only needed to manipulate the waitqueue.

The call to ep_events_available() is now lockless and only exposed to benign races. If we get a false positive (it reports available events but does not see another thread deleting an epi from the list), we call into send_events and the list's state is then seen correctly. On the other hand, if we get a false negative and miss a list_add_tail(), for example from the irq callback, the condition is rechecked again before blocking, which will see the correct state.

For better accuracy in seeing a concurrent list_del_init(), use the list_empty_careful() variant -- of course, this is not safe against insertions from the wakeup path.

For the overflow list we obviously need to prevent load/store tearing, as we do not want to see partial values while the ready list is disabled.

[dave@stgolabs.net: forgotten fixlets]
Link: http://lkml.kernel.org/r/20181109155258.jxcr4t2pnz6zqct3@linux-r8p5
Link: http://lkml.kernel.org/r/20181108051006.18751-6-dave@stgolabs.net
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Suggested-by: Jason Baron <jbaron@akamai.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
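The pattern at the heart of the change is the usual lockless check, queue, then re-check scheme: a reader may peek at the ready state without holding wq.lock because any stale answer is re-validated before sleeping, while READ_ONCE()/WRITE_ONCE() keep the compiler from tearing or fusing the shared accesses. The userspace sketch below only illustrates that idea under those assumptions; the macro definitions, the ready flag, and the producer/events_available names are simplified stand-ins, not epoll internals.

#include <pthread.h>
#include <stdio.h>

/* Rough userspace stand-ins for the kernel's READ_ONCE()/WRITE_ONCE(). */
#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, v)	(*(volatile __typeof__(x) *)&(x) = (v))

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int ready;	/* stand-in for the ep->rdllist/ep->ovflist state */

/* Lockless peek: a stale answer is benign because it is re-checked later. */
static int events_available(void)
{
	return READ_ONCE(ready);
}

static void *producer(void *arg)
{
	(void)arg;
	/* Writers still serialize the real update under the lock. */
	pthread_mutex_lock(&lock);
	WRITE_ONCE(ready, 1);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, producer, NULL);

	/*
	 * A false negative here is fine: a real waiter (like ep_poll())
	 * queues itself and re-checks the condition before sleeping.
	 */
	if (!events_available())
		fprintf(stderr, "not ready yet, would queue and re-check\n");

	pthread_join(t, NULL);
	printf("ready=%d\n", events_available());
	return 0;
}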
1 parent 21877e1 commit c5a282e


fs/eventpoll.c

Lines changed: 60 additions & 54 deletions
@@ -381,7 +381,8 @@ static void ep_nested_calls_init(struct nested_calls *ncalls)
  */
 static inline int ep_events_available(struct eventpoll *ep)
 {
-	return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
+	return !list_empty_careful(&ep->rdllist) ||
+		READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
 }
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
@@ -698,7 +699,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
 	 */
 	spin_lock_irq(&ep->wq.lock);
 	list_splice_init(&ep->rdllist, &txlist);
-	ep->ovflist = NULL;
+	WRITE_ONCE(ep->ovflist, NULL);
 	spin_unlock_irq(&ep->wq.lock);
 
 	/*
@@ -712,7 +713,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
 	 * other events might have been queued by the poll callback.
 	 * We re-insert them inside the main ready-list here.
 	 */
-	for (nepi = ep->ovflist; (epi = nepi) != NULL;
+	for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
 	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
 		/*
 		 * We need to check if the item is already in the list.
@@ -730,7 +731,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
 	 * releasing the lock, events will be queued in the normal way inside
 	 * ep->rdllist.
 	 */
-	ep->ovflist = EP_UNACTIVE_PTR;
+	WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
 
 	/*
 	 * Quickly re-inject items left on "txlist".
@@ -1153,10 +1154,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 	 * semantics). All the events that happen during that period of time are
 	 * chained in ep->ovflist and requeued later on.
 	 */
-	if (ep->ovflist != EP_UNACTIVE_PTR) {
+	if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
 		if (epi->next == EP_UNACTIVE_PTR) {
-			epi->next = ep->ovflist;
-			ep->ovflist = epi;
+			epi->next = READ_ONCE(ep->ovflist);
+			WRITE_ONCE(ep->ovflist, epi);
 			if (epi->ws) {
 				/*
 				 * Activate ep->ws since epi->ws may get
@@ -1762,10 +1763,17 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	} else if (timeout == 0) {
 		/*
 		 * Avoid the unnecessary trip to the wait queue loop, if the
-		 * caller specified a non blocking operation.
+		 * caller specified a non blocking operation. We still need
+		 * lock because we could race and not see an epi being added
+		 * to the ready list while in irq callback. Thus incorrectly
+		 * returning 0 back to userspace.
 		 */
 		timed_out = 1;
+
 		spin_lock_irq(&ep->wq.lock);
+		eavail = ep_events_available(ep);
+		spin_unlock_irq(&ep->wq.lock);
+
 		goto check_events;
 	}
 
@@ -1774,64 +1782,62 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	if (!ep_events_available(ep))
 		ep_busy_loop(ep, timed_out);
 
+	eavail = ep_events_available(ep);
+	if (eavail)
+		goto check_events;
+
+	/*
+	 * Busy poll timed out. Drop NAPI ID for now, we can add
+	 * it back in when we have moved a socket with a valid NAPI
+	 * ID onto the ready list.
+	 */
+	ep_reset_busy_poll_napi_id(ep);
+
+	/*
+	 * We don't have any available event to return to the caller.
+	 * We need to sleep here, and we will be wake up by
+	 * ep_poll_callback() when events will become available.
+	 */
+	init_waitqueue_entry(&wait, current);
 	spin_lock_irq(&ep->wq.lock);
+	__add_wait_queue_exclusive(&ep->wq, &wait);
+	spin_unlock_irq(&ep->wq.lock);
 
-	if (!ep_events_available(ep)) {
+	for (;;) {
 		/*
-		 * Busy poll timed out. Drop NAPI ID for now, we can add
-		 * it back in when we have moved a socket with a valid NAPI
-		 * ID onto the ready list.
+		 * We don't want to sleep if the ep_poll_callback() sends us
+		 * a wakeup in between. That's why we set the task state
+		 * to TASK_INTERRUPTIBLE before doing the checks.
 		 */
-		ep_reset_busy_poll_napi_id(ep);
-
+		set_current_state(TASK_INTERRUPTIBLE);
 		/*
-		 * We don't have any available event to return to the caller.
-		 * We need to sleep here, and we will be wake up by
-		 * ep_poll_callback() when events will become available.
+		 * Always short-circuit for fatal signals to allow
+		 * threads to make a timely exit without the chance of
+		 * finding more events available and fetching
+		 * repeatedly.
 		 */
-		init_waitqueue_entry(&wait, current);
-		__add_wait_queue_exclusive(&ep->wq, &wait);
-
-		for (;;) {
-			/*
-			 * We don't want to sleep if the ep_poll_callback() sends us
-			 * a wakeup in between. That's why we set the task state
-			 * to TASK_INTERRUPTIBLE before doing the checks.
-			 */
-			set_current_state(TASK_INTERRUPTIBLE);
-			/*
-			 * Always short-circuit for fatal signals to allow
-			 * threads to make a timely exit without the chance of
-			 * finding more events available and fetching
-			 * repeatedly.
-			 */
-			if (fatal_signal_pending(current)) {
-				res = -EINTR;
-				break;
-			}
-			if (ep_events_available(ep) || timed_out)
-				break;
-			if (signal_pending(current)) {
-				res = -EINTR;
-				break;
-			}
-
-			spin_unlock_irq(&ep->wq.lock);
-			if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
-				timed_out = 1;
-
-			spin_lock_irq(&ep->wq.lock);
+		if (fatal_signal_pending(current)) {
+			res = -EINTR;
+			break;
+		}
+		if (ep_events_available(ep) || timed_out)
+			break;
+		if (signal_pending(current)) {
+			res = -EINTR;
+			break;
 		}
 
-		__remove_wait_queue(&ep->wq, &wait);
-		__set_current_state(TASK_RUNNING);
+		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+			timed_out = 1;
 	}
-check_events:
-	/* Is it worth to try to dig for events ? */
-	eavail = ep_events_available(ep);
 
+	__set_current_state(TASK_RUNNING);
+
+	spin_lock_irq(&ep->wq.lock);
+	__remove_wait_queue(&ep->wq, &wait);
 	spin_unlock_irq(&ep->wq.lock);
 
+check_events:
 	/*
 	 * Try to transfer events to user space. In case we get 0 events and
 	 * there's still timeout left over, we go trying again in search of
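As a usage note, the timeout == 0 fast path touched in the ep_poll() hunk above is what a plain non-blocking poll from userspace hits. A minimal caller might look like the sketch below; the pipe-based setup and variable names are illustrative assumptions, only the standard epoll calls matter.

#include <sys/epoll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct epoll_event ev = { .events = EPOLLIN }, out[8];
	int fds[2];
	int epfd = epoll_create1(0);

	if (epfd < 0 || pipe(fds) < 0)
		return 1;

	ev.data.fd = fds[0];
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, fds[0], &ev) < 0)
		return 1;

	/* timeout == 0: never enter the wait-queue loop, just report. */
	int n = epoll_wait(epfd, out, 8, 0);
	printf("ready events: %d\n", n);	/* 0: nothing written yet */

	write(fds[1], "x", 1);
	n = epoll_wait(epfd, out, 8, 0);
	printf("ready events: %d\n", n);	/* 1: the read end is readable */

	close(fds[0]);
	close(fds[1]);
	close(epfd);
	return 0;
}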