Skip to content

Commit 9af774b

Browse files
Tuong Lien authored and gregkh committed
tipc: fix unlimited bundling of small messages
[ Upstream commit e95584a ] We have identified a problem with the "oversubscription" policy in the link transmission code. When small messages are transmitted, and the sending link has reached the transmit window limit, those messages will be bundled and put into the link backlog queue. However, bundles of data messages are counted at the 'CRITICAL' level, so that the counter for that level, instead of the counter for the real, bundled message's level, is the one being increased. Subsequent to-be-bundled data messages at non-CRITICAL levels continue to be tested against the unchanged counter for their own level, while contributing to an unrestrained increase at the CRITICAL backlog level. This leaves a gap in the congestion control algorithm for small messages that can result in starvation for other users or a "real" CRITICAL user. Eventually, that can even lead to buffer exhaustion & link reset. We fix this by keeping a 'target_bskb' buffer pointer at each level; then, when bundling, we bundle messages at the same importance level only. This way, we know exactly how many slots a certain level has occupied in the queue, so we can manage level congestion accurately. By bundling messages at the same level, we gain even more benefits. Let's consider this: - One socket sends 64-byte messages at the 'CRITICAL' level; - Another sends 4096-byte messages at the 'LOW' level; When a 64-byte message comes and is bundled for the first time, we add the overhead of the message bundle to it (+ 40-byte header, data copy, etc.) for later use, but the next message can be a 4096-byte one that cannot be bundled with the previous one. This means the last bundle carries only one payload message, which is totally inefficient — for the receiver as well! Later on, another 64-byte message comes; now we make a new bundle and the same story repeats... With the new bundling algorithm, this will not happen: the 64-byte messages will be bundled together even when the 4096-byte message(s) come in between.
However, if the 4096-byte messages are sent at the same level, i.e. 'CRITICAL', the bundling algorithm will again cause the same overhead. Also, the same will happen even with only one socket sending small messages at a rate close to the link transmit rate, so that as soon as one message is bundled, it is transmitted. Then another message comes, a new bundle is created, and so on... We will solve this issue radically in another patch. Fixes: 365ad35 ("tipc: reduce risk of user starvation during link congestion") Reported-by: Hoang Le <hoang.h.le@dektech.com.au> Acked-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au> Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
1 parent c01fc24 commit 9af774b

File tree

2 files changed

+19
-15
lines changed

2 files changed

+19
-15
lines changed

net/tipc/link.c

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ struct tipc_link {
163163
struct {
164164
u16 len;
165165
u16 limit;
166+
struct sk_buff *target_bskb;
166167
} backlog[5];
167168
u16 snd_nxt;
168169
u16 prev_from;
@@ -872,6 +873,7 @@ static void link_prepare_wakeup(struct tipc_link *l)
872873
void tipc_link_reset(struct tipc_link *l)
873874
{
874875
struct sk_buff_head list;
876+
u32 imp;
875877

876878
__skb_queue_head_init(&list);
877879

@@ -893,11 +895,10 @@ void tipc_link_reset(struct tipc_link *l)
893895
__skb_queue_purge(&l->deferdq);
894896
__skb_queue_purge(&l->backlogq);
895897
__skb_queue_purge(&l->failover_deferdq);
896-
l->backlog[TIPC_LOW_IMPORTANCE].len = 0;
897-
l->backlog[TIPC_MEDIUM_IMPORTANCE].len = 0;
898-
l->backlog[TIPC_HIGH_IMPORTANCE].len = 0;
899-
l->backlog[TIPC_CRITICAL_IMPORTANCE].len = 0;
900-
l->backlog[TIPC_SYSTEM_IMPORTANCE].len = 0;
898+
for (imp = 0; imp <= TIPC_SYSTEM_IMPORTANCE; imp++) {
899+
l->backlog[imp].len = 0;
900+
l->backlog[imp].target_bskb = NULL;
901+
}
901902
kfree_skb(l->reasm_buf);
902903
kfree_skb(l->failover_reasm_skb);
903904
l->reasm_buf = NULL;
@@ -938,7 +939,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
938939
u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
939940
struct sk_buff_head *transmq = &l->transmq;
940941
struct sk_buff_head *backlogq = &l->backlogq;
941-
struct sk_buff *skb, *_skb, *bskb;
942+
struct sk_buff *skb, *_skb, **tskb;
942943
int pkt_cnt = skb_queue_len(list);
943944
int rc = 0;
944945

@@ -988,19 +989,21 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
988989
seqno++;
989990
continue;
990991
}
991-
if (tipc_msg_bundle(skb_peek_tail(backlogq), hdr, mtu)) {
992+
tskb = &l->backlog[imp].target_bskb;
993+
if (tipc_msg_bundle(*tskb, hdr, mtu)) {
992994
kfree_skb(__skb_dequeue(list));
993995
l->stats.sent_bundled++;
994996
continue;
995997
}
996-
if (tipc_msg_make_bundle(&bskb, hdr, mtu, l->addr)) {
998+
if (tipc_msg_make_bundle(tskb, hdr, mtu, l->addr)) {
997999
kfree_skb(__skb_dequeue(list));
998-
__skb_queue_tail(backlogq, bskb);
999-
l->backlog[msg_importance(buf_msg(bskb))].len++;
1000+
__skb_queue_tail(backlogq, *tskb);
1001+
l->backlog[imp].len++;
10001002
l->stats.sent_bundled++;
10011003
l->stats.sent_bundles++;
10021004
continue;
10031005
}
1006+
l->backlog[imp].target_bskb = NULL;
10041007
l->backlog[imp].len += skb_queue_len(list);
10051008
skb_queue_splice_tail_init(list, backlogq);
10061009
}
@@ -1016,6 +1019,7 @@ static void tipc_link_advance_backlog(struct tipc_link *l,
10161019
u16 seqno = l->snd_nxt;
10171020
u16 ack = l->rcv_nxt - 1;
10181021
u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
1022+
u32 imp;
10191023

10201024
while (skb_queue_len(&l->transmq) < l->window) {
10211025
skb = skb_peek(&l->backlogq);
@@ -1026,7 +1030,10 @@ static void tipc_link_advance_backlog(struct tipc_link *l,
10261030
break;
10271031
__skb_dequeue(&l->backlogq);
10281032
hdr = buf_msg(skb);
1029-
l->backlog[msg_importance(hdr)].len--;
1033+
imp = msg_importance(hdr);
1034+
l->backlog[imp].len--;
1035+
if (unlikely(skb == l->backlog[imp].target_bskb))
1036+
l->backlog[imp].target_bskb = NULL;
10301037
__skb_queue_tail(&l->transmq, skb);
10311038
/* next retransmit attempt */
10321039
if (link_is_bc_sndlink(l))

net/tipc/msg.c

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -484,10 +484,7 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg,
484484
bmsg = buf_msg(_skb);
485485
tipc_msg_init(msg_prevnode(msg), bmsg, MSG_BUNDLER, 0,
486486
INT_H_SIZE, dnode);
487-
if (msg_isdata(msg))
488-
msg_set_importance(bmsg, TIPC_CRITICAL_IMPORTANCE);
489-
else
490-
msg_set_importance(bmsg, TIPC_SYSTEM_IMPORTANCE);
487+
msg_set_importance(bmsg, msg_importance(msg));
491488
msg_set_seqno(bmsg, msg_seqno(msg));
492489
msg_set_ack(bmsg, msg_ack(msg));
493490
msg_set_bcast_ack(bmsg, msg_bcast_ack(msg));

0 commit comments

Comments
 (0)