Commit a0c5393
Merge branch 'lockless-qdisc-packet-stuck'
Yunsheng Lin says:

====================
Fix packet stuck problem for lockless qdisc

This patchset fixes the packet stuck problem mentioned in [1].

Patch 1: Add STATE_MISSED flag to fix packet stuck problem.
Patch 2: Fix a tx_action rescheduling problem after STATE_MISSED
         flag is added in patch 1.
Patch 3: Fix the significantly higher CPU consumption problem when
         multiple threads are competing on a saturated outgoing
         device.

V8: Change function name as suggested by Jakub, fix some typos in
    patch 3, adjust commit log in patch 2, and add Acked-by from
    Jakub.
V7: Fix netif_tx_wake_queue() data race noted by Jakub.
V6: Some performance optimization in patch 1 suggested by Jakub and
    drop NET_XMIT_DROP checking in patch 3.
V5: Add patch 3 to fix the problem reported by Michal Kubecek.
V4: Change STATE_NEED_RESCHEDULE to STATE_MISSED and add patch 2.

[1]: https://lkml.org/lkml/2019/10/9/42
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
2 parents 974271e + dcad9ee commit a0c5393
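
For readers new to the lockless qdisc path, the pre-patch race can be replayed in miniature outside the kernel. The sketch below is a hypothetical userspace analogue (a pthread mutex stands in for qdisc->seqlock, an integer for the queue; none of these names are kernel API), with the problematic interleaving forced sequentially rather than by real timing. Patch 1 closes this window with the STATE_MISSED handshake shown in the include/net/sch_generic.h diff below.

	/* Hypothetical userspace analogue of the pre-patch race: a
	 * lockless "qdisc" whose contenders give up after one failed
	 * trylock.  The interleaving is forced with sequential steps
	 * rather than real timing; nothing here is kernel API.
	 */
	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t seqlock = PTHREAD_MUTEX_INITIALIZER;
	static int queued;		/* packets waiting in the qdisc */

	int main(void)
	{
		/* CPU 0: qdisc_run_begin() succeeds, drains the queue. */
		pthread_mutex_lock(&seqlock);
		queued = 1;
		while (queued)
			queued--;	/* "transmit" */

		/* CPU 1: enqueues a packet, but its qdisc_run_begin()
		 * fails because CPU 0 still holds the lock, so it just
		 * returns and relies on CPU 0 to flush the queue.
		 */
		queued++;
		if (pthread_mutex_trylock(&seqlock) == 0) {
			while (queued)
				queued--;
			pthread_mutex_unlock(&seqlock);
		}

		/* CPU 0: qdisc_run_end() merely unlocks; nothing ever
		 * looks at the queue again.
		 */
		pthread_mutex_unlock(&seqlock);

		printf("stuck packets: %d\n", queued);	/* prints 1 */
		return 0;
	}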

File tree: 4 files changed, +107 -14 lines

include/net/pkt_sched.h

Lines changed: 1 addition & 6 deletions
@@ -128,12 +128,7 @@ void __qdisc_run(struct Qdisc *q);
 static inline void qdisc_run(struct Qdisc *q)
 {
 	if (qdisc_run_begin(q)) {
-		/* NOLOCK qdisc must check 'state' under the qdisc seqlock
-		 * to avoid racing with dev_qdisc_reset()
-		 */
-		if (!(q->flags & TCQ_F_NOLOCK) ||
-		    likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
-			__qdisc_run(q);
+		__qdisc_run(q);
 		qdisc_run_end(q);
 	}
 }

include/net/sch_generic.h

Lines changed: 34 additions & 1 deletion
@@ -36,6 +36,7 @@ struct qdisc_rate_table {
 enum qdisc_state_t {
 	__QDISC_STATE_SCHED,
 	__QDISC_STATE_DEACTIVATED,
+	__QDISC_STATE_MISSED,
 };
 
 struct qdisc_size_table {
@@ -159,8 +160,33 @@ static inline bool qdisc_is_empty(const struct Qdisc *qdisc)
 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 {
 	if (qdisc->flags & TCQ_F_NOLOCK) {
+		if (spin_trylock(&qdisc->seqlock))
+			goto nolock_empty;
+
+		/* If the MISSED flag is set, it means other thread has
+		 * set the MISSED flag before second spin_trylock(), so
+		 * we can return false here to avoid multi cpus doing
+		 * the set_bit() and second spin_trylock() concurrently.
+		 */
+		if (test_bit(__QDISC_STATE_MISSED, &qdisc->state))
+			return false;
+
+		/* Set the MISSED flag before the second spin_trylock(),
+		 * if the second spin_trylock() return false, it means
+		 * other cpu holding the lock will do dequeuing for us
+		 * or it will see the MISSED flag set after releasing
+		 * lock and reschedule the net_tx_action() to do the
+		 * dequeuing.
+		 */
+		set_bit(__QDISC_STATE_MISSED, &qdisc->state);
+
+		/* Retry again in case other CPU may not see the new flag
+		 * after it releases the lock at the end of qdisc_run_end().
+		 */
 		if (!spin_trylock(&qdisc->seqlock))
 			return false;
+
+nolock_empty:
 		WRITE_ONCE(qdisc->empty, false);
 	} else if (qdisc_is_running(qdisc)) {
 		return false;
@@ -176,8 +202,15 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 static inline void qdisc_run_end(struct Qdisc *qdisc)
 {
 	write_seqcount_end(&qdisc->running);
-	if (qdisc->flags & TCQ_F_NOLOCK)
+	if (qdisc->flags & TCQ_F_NOLOCK) {
 		spin_unlock(&qdisc->seqlock);
+
+		if (unlikely(test_bit(__QDISC_STATE_MISSED,
+				      &qdisc->state))) {
+			clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
+			__netif_schedule(qdisc);
+		}
+	}
 }
 
 static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
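
A minimal userspace sketch of the handshake above, assuming a pthread mutex in place of qdisc->seqlock, a C11 atomic_bool in place of the __QDISC_STATE_MISSED bit, and a printf in place of __netif_schedule(); the names are illustrative, not kernel code. The invariant it demonstrates: a contender either takes the lock, or leaves the flag where the holder must see it in run_end().

	/* Minimal sketch of the MISSED handshake; pthread mutex and
	 * C11 atomics stand in for the kernel's seqlock and bit ops,
	 * and printf stands in for __netif_schedule().
	 */
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t seqlock = PTHREAD_MUTEX_INITIALIZER;
	static atomic_bool missed;	/* ~__QDISC_STATE_MISSED */

	static bool run_begin(void)
	{
		if (pthread_mutex_trylock(&seqlock) == 0)
			return true;

		/* Someone else already recorded a miss; the holder
		 * will reschedule for both of us. */
		if (atomic_load(&missed))
			return false;

		/* Record the miss before the second trylock, so the
		 * holder cannot unlock without either us taking the
		 * lock below or it seeing the flag in run_end(). */
		atomic_store(&missed, true);

		/* Retry in case the holder unlocked before the flag
		 * became visible to it. */
		return pthread_mutex_trylock(&seqlock) == 0;
	}

	static void run_end(void)
	{
		pthread_mutex_unlock(&seqlock);

		/* A contender failed while we held the lock: hand the
		 * work to a fresh run (the kernel would call
		 * __netif_schedule() here). */
		if (atomic_exchange(&missed, false))
			printf("reschedule dequeue\n");
	}

	int main(void)
	{
		pthread_mutex_lock(&seqlock);	/* CPU 0 inside qdisc_run() */

		if (!run_begin())		/* CPU 1 misses, flags it */
			fprintf(stderr, "miss recorded\n");

		run_end();			/* CPU 0 exits: sees the flag */
		return 0;
	}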

net/core/dev.c

Lines changed: 24 additions & 5 deletions
@@ -3853,7 +3853,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 
 	if (q->flags & TCQ_F_NOLOCK) {
 		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
-		qdisc_run(q);
+		if (likely(!netif_xmit_frozen_or_stopped(txq)))
+			qdisc_run(q);
 
 		if (unlikely(to_free))
 			kfree_skb_list(to_free);
@@ -5025,25 +5026,43 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
 		sd->output_queue_tailp = &sd->output_queue;
 		local_irq_enable();
 
+		rcu_read_lock();
+
 		while (head) {
 			struct Qdisc *q = head;
 			spinlock_t *root_lock = NULL;
 
 			head = head->next_sched;
 
-			if (!(q->flags & TCQ_F_NOLOCK)) {
-				root_lock = qdisc_lock(q);
-				spin_lock(root_lock);
-			}
 			/* We need to make sure head->next_sched is read
 			 * before clearing __QDISC_STATE_SCHED
 			 */
 			smp_mb__before_atomic();
+
+			if (!(q->flags & TCQ_F_NOLOCK)) {
+				root_lock = qdisc_lock(q);
+				spin_lock(root_lock);
+			} else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
+						     &q->state))) {
+				/* There is a synchronize_net() between
+				 * STATE_DEACTIVATED flag being set and
+				 * qdisc_reset()/some_qdisc_is_busy() in
+				 * dev_deactivate(), so we can safely bail out
+				 * early here to avoid data race between
+				 * qdisc_deactivate() and some_qdisc_is_busy()
+				 * for lockless qdisc.
+				 */
+				clear_bit(__QDISC_STATE_SCHED, &q->state);
+				continue;
+			}
+
 			clear_bit(__QDISC_STATE_SCHED, &q->state);
 			qdisc_run(q);
 			if (root_lock)
 				spin_unlock(root_lock);
 		}
+
+		rcu_read_unlock();
 	}
 
 	xfrm_dev_backlog(sd);

net/sched/sch_generic.c

Lines changed: 48 additions & 2 deletions
@@ -35,6 +35,25 @@
 const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
 EXPORT_SYMBOL(default_qdisc_ops);
 
+static void qdisc_maybe_clear_missed(struct Qdisc *q,
+				     const struct netdev_queue *txq)
+{
+	clear_bit(__QDISC_STATE_MISSED, &q->state);
+
+	/* Make sure the below netif_xmit_frozen_or_stopped()
+	 * checking happens after clearing STATE_MISSED.
+	 */
+	smp_mb__after_atomic();
+
+	/* Checking netif_xmit_frozen_or_stopped() again to
+	 * make sure STATE_MISSED is set if the STATE_MISSED
+	 * set by netif_tx_wake_queue()'s rescheduling of
+	 * net_tx_action() is cleared by the above clear_bit().
+	 */
+	if (!netif_xmit_frozen_or_stopped(txq))
+		set_bit(__QDISC_STATE_MISSED, &q->state);
+}
+
 /* Main transmission queue. */
 
 /* Modifications to data participating in scheduling must be protected with
@@ -74,6 +93,7 @@ static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
 		}
 	} else {
 		skb = SKB_XOFF_MAGIC;
+		qdisc_maybe_clear_missed(q, txq);
 	}
 }
 
@@ -242,6 +262,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
 		}
 	} else {
 		skb = NULL;
+		qdisc_maybe_clear_missed(q, txq);
 	}
 	if (lock)
 		spin_unlock(lock);
@@ -251,8 +272,10 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
 	*validate = true;
 
 	if ((q->flags & TCQ_F_ONETXQUEUE) &&
-	    netif_xmit_frozen_or_stopped(txq))
+	    netif_xmit_frozen_or_stopped(txq)) {
+		qdisc_maybe_clear_missed(q, txq);
 		return skb;
+	}
 
 	skb = qdisc_dequeue_skb_bad_txq(q);
 	if (unlikely(skb)) {
@@ -311,6 +334,8 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 		HARD_TX_LOCK(dev, txq, smp_processor_id());
 		if (!netif_xmit_frozen_or_stopped(txq))
 			skb = dev_hard_start_xmit(skb, dev, txq, &ret);
+		else
+			qdisc_maybe_clear_missed(q, txq);
 
 		HARD_TX_UNLOCK(dev, txq);
 	} else {
@@ -640,8 +665,10 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
 {
 	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
 	struct sk_buff *skb = NULL;
+	bool need_retry = true;
 	int band;
 
+retry:
 	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
 		struct skb_array *q = band2list(priv, band);
 
@@ -652,6 +679,23 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
 	}
 	if (likely(skb)) {
 		qdisc_update_stats_at_dequeue(qdisc, skb);
+	} else if (need_retry &&
+		   test_bit(__QDISC_STATE_MISSED, &qdisc->state)) {
+		/* Delay clearing the STATE_MISSED here to reduce
+		 * the overhead of the second spin_trylock() in
+		 * qdisc_run_begin() and __netif_schedule() calling
+		 * in qdisc_run_end().
+		 */
+		clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
+
+		/* Make sure dequeuing happens after clearing
+		 * STATE_MISSED.
+		 */
+		smp_mb__after_atomic();
+
+		need_retry = false;
+
+		goto retry;
 	} else {
 		WRITE_ONCE(qdisc->empty, true);
 	}
@@ -1158,8 +1202,10 @@ static void dev_reset_queue(struct net_device *dev,
 	qdisc_reset(qdisc);
 
 	spin_unlock_bh(qdisc_lock(qdisc));
-	if (nolock)
+	if (nolock) {
+		clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
 		spin_unlock_bh(&qdisc->seqlock);
+	}
 }
 
 static bool some_qdisc_is_busy(struct net_device *dev)
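
qdisc_maybe_clear_missed() above is an instance of a generic lost-wakeup guard: clear the flag, order the clear before a recheck of the waker's condition, and restore the flag if a wake may have raced with the clear. Below is a hedged C11 sketch of the same shape with invented names: wake_queue() loosely plays the role of the driver wake path whose rescheduling sets STATE_MISSED, per the comment in the diff; seq_cst atomics provide the ordering the kernel gets from smp_mb__after_atomic().

	/* Sketch of the clear-then-recheck pattern from
	 * qdisc_maybe_clear_missed(); all names are invented for
	 * illustration, none of this is kernel API.
	 */
	#include <stdatomic.h>
	#include <stdbool.h>

	static atomic_bool flag;	/* ~__QDISC_STATE_MISSED */
	static atomic_bool stopped;	/* ~frozen/stopped txq */

	/* Waker side: un-stop the queue, then request a reschedule. */
	static void wake_queue(void)
	{
		atomic_store(&stopped, false);
		atomic_store(&flag, true);
	}

	/* Clearing side, entered while the queue looked stopped. */
	static void maybe_clear_flag(void)
	{
		atomic_store(&flag, false);

		/* seq_cst orders the clear before the recheck; if a
		 * wake raced with the clear, restore the flag so the
		 * reschedule is not lost. */
		if (!atomic_load(&stopped))
			atomic_store(&flag, true);
	}

	int main(void)
	{
		atomic_store(&stopped, true);	/* txq looked stopped */
		atomic_store(&flag, true);	/* a miss was recorded */

		wake_queue();		/* driver wake races in... */
		maybe_clear_flag();	/* ...with the clearing side */

		/* Flag survived the race, so a reschedule still runs. */
		return atomic_load(&flag) ? 0 : 1;
	}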
