Skip to content

Commit 5ee35ab

Browse files
author
Alexei Starovoitov
committed
Merge branch 'bpf: Remove recursion check for struct_ops prog'
Martin KaFai Lau says: ==================== From: Martin KaFai Lau <martin.lau@kernel.org> The struct_ops is sharing the tracing-trampoline's enter/exit function which tracks prog->active to avoid recursion. It turns out the struct_ops bpf prog will hit this prog->active check and the struct_ops prog is then unnecessarily skipped. e.g. The '.ssthresh' may run in_task() and then be interrupted by a softirq that runs the same '.ssthresh'. The kernel does not call the tcp-cc's ops in a recursive way, so this set is to remove the recursion check for struct_ops progs. v3: - Clear the bpf_chg_cc_inprogress bit from the newly cloned tcp_sock in tcp_create_openreq_child() because the listen sk can be cloned without the lock being held. (Eric Dumazet) v2: - v1 [0] turned into a long discussion on a few cases and also whether it needs to follow the bpf_run_ctx chain if there is a tracing bpf_run_ctx (kprobe/trace/trampoline) running in between. It is a good signal that it is not obvious enough to reason about and needs a tradeoff for a more straightforward approach. This revision uses one bit out of an existing 1-byte hole in the tcp_sock. It is in Patch 4. [0]: https://lore.kernel.org/bpf/20220922225616.3054840-1-kafai@fb.com/T/#md98d40ac5ec295fdadef476c227a3401b2b6b911 ==================== Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2 parents 8526f0d + 3411c5b commit 5ee35ab

File tree

8 files changed

+112
-24
lines changed

8 files changed

+112
-24
lines changed

arch/x86/net/bpf_jit_comp.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1836,6 +1836,9 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
18361836
if (p->aux->sleepable) {
18371837
enter = __bpf_prog_enter_sleepable;
18381838
exit = __bpf_prog_exit_sleepable;
1839+
} else if (p->type == BPF_PROG_TYPE_STRUCT_OPS) {
1840+
enter = __bpf_prog_enter_struct_ops;
1841+
exit = __bpf_prog_exit_struct_ops;
18391842
} else if (p->expected_attach_type == BPF_LSM_CGROUP) {
18401843
enter = __bpf_prog_enter_lsm_cgroup;
18411844
exit = __bpf_prog_exit_lsm_cgroup;

include/linux/bpf.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -864,6 +864,10 @@ u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
864864
struct bpf_tramp_run_ctx *run_ctx);
865865
void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
866866
struct bpf_tramp_run_ctx *run_ctx);
867+
u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog,
868+
struct bpf_tramp_run_ctx *run_ctx);
869+
void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start,
870+
struct bpf_tramp_run_ctx *run_ctx);
867871
void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr);
868872
void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr);
869873

include/linux/tcp.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,12 @@ struct tcp_sock {
388388
u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs
389389
* values defined in uapi/linux/tcp.h
390390
*/
391+
u8 bpf_chg_cc_inprogress:1; /* In the middle of
392+
* bpf_setsockopt(TCP_CONGESTION),
393+
* it is to avoid the bpf_tcp_cc->init()
394+
* to recur itself by calling
395+
* bpf_setsockopt(TCP_CONGESTION, "itself").
396+
*/
391397
#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG)
392398
#else
393399
#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0

kernel/bpf/trampoline.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -964,6 +964,29 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
964964
rcu_read_unlock_trace();
965965
}
966966

967+
u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog,
968+
struct bpf_tramp_run_ctx *run_ctx)
969+
__acquires(RCU)
970+
{
971+
rcu_read_lock();
972+
migrate_disable();
973+
974+
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
975+
976+
return bpf_prog_start_time();
977+
}
978+
979+
void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start,
980+
struct bpf_tramp_run_ctx *run_ctx)
981+
__releases(RCU)
982+
{
983+
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
984+
985+
update_prog_stats(prog, start);
986+
migrate_enable();
987+
rcu_read_unlock();
988+
}
989+
967990
void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
968991
{
969992
percpu_ref_get(&tr->pcref);

net/core/filter.c

Lines changed: 54 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5102,6 +5102,59 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
51025102
return 0;
51035103
}
51045104

5105+
static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval,
5106+
int *optlen, bool getopt)
5107+
{
5108+
struct tcp_sock *tp;
5109+
int ret;
5110+
5111+
if (*optlen < 2)
5112+
return -EINVAL;
5113+
5114+
if (getopt) {
5115+
if (!inet_csk(sk)->icsk_ca_ops)
5116+
return -EINVAL;
5117+
/* BPF expects NULL-terminated tcp-cc string */
5118+
optval[--(*optlen)] = '\0';
5119+
return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION,
5120+
KERNEL_SOCKPTR(optval),
5121+
KERNEL_SOCKPTR(optlen));
5122+
}
5123+
5124+
/* "cdg" is the only cc that alloc a ptr
5125+
* in inet_csk_ca area. The bpf-tcp-cc may
5126+
* overwrite this ptr after switching to cdg.
5127+
*/
5128+
if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen))
5129+
return -ENOTSUPP;
5130+
5131+
/* It stops this looping
5132+
*
5133+
* .init => bpf_setsockopt(tcp_cc) => .init =>
5134+
* bpf_setsockopt(tcp_cc)" => .init => ....
5135+
*
5136+
* The second bpf_setsockopt(tcp_cc) is not allowed
5137+
* in order to break the loop when both .init
5138+
* are the same bpf prog.
5139+
*
5140+
* This applies even the second bpf_setsockopt(tcp_cc)
5141+
* does not cause a loop. This limits only the first
5142+
* '.init' can call bpf_setsockopt(TCP_CONGESTION) to
5143+
* pick a fallback cc (eg. peer does not support ECN)
5144+
* and the second '.init' cannot fallback to
5145+
* another.
5146+
*/
5147+
tp = tcp_sk(sk);
5148+
if (tp->bpf_chg_cc_inprogress)
5149+
return -EBUSY;
5150+
5151+
tp->bpf_chg_cc_inprogress = 1;
5152+
ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
5153+
KERNEL_SOCKPTR(optval), *optlen);
5154+
tp->bpf_chg_cc_inprogress = 0;
5155+
return ret;
5156+
}
5157+
51055158
static int sol_tcp_sockopt(struct sock *sk, int optname,
51065159
char *optval, int *optlen,
51075160
bool getopt)
@@ -5125,9 +5178,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
51255178
return -EINVAL;
51265179
break;
51275180
case TCP_CONGESTION:
5128-
if (*optlen < 2)
5129-
return -EINVAL;
5130-
break;
5181+
return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt);
51315182
case TCP_SAVED_SYN:
51325183
if (*optlen < 1)
51335184
return -EINVAL;
@@ -5152,13 +5203,6 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
51525203
return 0;
51535204
}
51545205

5155-
if (optname == TCP_CONGESTION) {
5156-
if (!inet_csk(sk)->icsk_ca_ops)
5157-
return -EINVAL;
5158-
/* BPF expects NULL-terminated tcp-cc string */
5159-
optval[--(*optlen)] = '\0';
5160-
}
5161-
51625206
return do_tcp_getsockopt(sk, SOL_TCP, optname,
51635207
KERNEL_SOCKPTR(optval),
51645208
KERNEL_SOCKPTR(optlen));
@@ -5285,12 +5329,6 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname,
52855329
BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
52865330
int, optname, char *, optval, int, optlen)
52875331
{
5288-
if (level == SOL_TCP && optname == TCP_CONGESTION) {
5289-
if (optlen >= sizeof("cdg") - 1 &&
5290-
!strncmp("cdg", optval, optlen))
5291-
return -ENOTSUPP;
5292-
}
5293-
52945332
return _bpf_setsockopt(sk, level, optname, optval, optlen);
52955333
}
52965334

net/ipv4/tcp_minisocks.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
541541
newtp->fastopen_req = NULL;
542542
RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
543543

544+
newtp->bpf_chg_cc_inprogress = 0;
544545
tcp_bpf_clone(sk, newsk);
545546

546547
__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);

tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,10 @@ static void test_dctcp_fallback(void)
290290
goto done;
291291
ASSERT_STREQ(dctcp_skel->bss->cc_res, "cubic", "cc_res");
292292
ASSERT_EQ(dctcp_skel->bss->tcp_cdg_res, -ENOTSUPP, "tcp_cdg_res");
293+
/* All setsockopt(TCP_CONGESTION) in the recurred
294+
* bpf_dctcp->init() should fail with -EBUSY.
295+
*/
296+
ASSERT_EQ(dctcp_skel->bss->ebusy_cnt, 3, "ebusy_cnt");
293297

294298
err = getsockopt(srv_fd, SOL_TCP, TCP_CONGESTION, srv_cc, &cc_len);
295299
if (!ASSERT_OK(err, "getsockopt(srv_fd, TCP_CONGESTION)"))

tools/testing/selftests/bpf/progs/bpf_dctcp.c

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <linux/types.h>
1212
#include <linux/stddef.h>
1313
#include <linux/tcp.h>
14+
#include <errno.h>
1415
#include <bpf/bpf_helpers.h>
1516
#include <bpf/bpf_tracing.h>
1617
#include "bpf_tcp_helpers.h"
@@ -23,6 +24,7 @@ const char tcp_cdg[] = "cdg";
2324
char cc_res[TCP_CA_NAME_MAX];
2425
int tcp_cdg_res = 0;
2526
int stg_result = 0;
27+
int ebusy_cnt = 0;
2628

2729
struct {
2830
__uint(type, BPF_MAP_TYPE_SK_STORAGE);
@@ -64,16 +66,23 @@ void BPF_PROG(dctcp_init, struct sock *sk)
6466

6567
if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) {
6668
/* Switch to fallback */
67-
bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
68-
(void *)fallback, sizeof(fallback));
69-
/* Switch back to myself which the bpf trampoline
70-
* stopped calling dctcp_init recursively.
69+
if (bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
70+
(void *)fallback, sizeof(fallback)) == -EBUSY)
71+
ebusy_cnt++;
72+
73+
/* Switch back to myself and the recurred dctcp_init()
74+
* will get -EBUSY for all bpf_setsockopt(TCP_CONGESTION),
75+
* except the last "cdg" one.
7176
*/
72-
bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
73-
(void *)bpf_dctcp, sizeof(bpf_dctcp));
77+
if (bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
78+
(void *)bpf_dctcp, sizeof(bpf_dctcp)) == -EBUSY)
79+
ebusy_cnt++;
80+
7481
/* Switch back to fallback */
75-
bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
76-
(void *)fallback, sizeof(fallback));
82+
if (bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
83+
(void *)fallback, sizeof(fallback)) == -EBUSY)
84+
ebusy_cnt++;
85+
7786
/* Expecting -ENOTSUPP for tcp_cdg_res */
7887
tcp_cdg_res = bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
7988
(void *)tcp_cdg, sizeof(tcp_cdg));

0 commit comments

Comments
 (0)