From 9ea2e010db791024e710783270a7fdd265a51c67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 19:33:05 +0100 Subject: [PATCH 01/93] ipvs: fix WARNING in __ip_vs_cleanup_batch() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-66488 cve CVE-2022-49918 commit-author Zhengchao Shao commit 3d00c6a0da8ddcf75213e004765e4a42acc71d5d During the initialization of ip_vs_conn_net_init(), if file ip_vs_conn or ip_vs_conn_sync fails to be created, the initialization is successful by default. Therefore, the ip_vs_conn or ip_vs_conn_sync file doesn't be found during the remove. The following is the stack information: name 'ip_vs_conn_sync' WARNING: CPU: 3 PID: 9 at fs/proc/generic.c:712 remove_proc_entry+0x389/0x460 Modules linked in: Workqueue: netns cleanup_net RIP: 0010:remove_proc_entry+0x389/0x460 Call Trace: __ip_vs_cleanup_batch+0x7d/0x120 ops_exit_list+0x125/0x170 cleanup_net+0x4ea/0xb00 process_one_work+0x9bf/0x1710 worker_thread+0x665/0x1080 kthread+0x2e4/0x3a0 ret_from_fork+0x1f/0x30 Fixes: 61b1ab4583e2 ("IPVS: netns, add basic init per netns.") Signed-off-by: Zhengchao Shao Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 3d00c6a0da8ddcf75213e004765e4a42acc71d5d) Signed-off-by: Marcin Wcisło --- net/netfilter/ipvs/ip_vs_conn.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index db13288fddfad..cb6d68220c265 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1447,20 +1447,36 @@ int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs) { atomic_set(&ipvs->conn_count, 0); - proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net, - &ip_vs_conn_seq_ops, sizeof(struct ip_vs_iter_state)); - proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net, - &ip_vs_conn_sync_seq_ops, - sizeof(struct ip_vs_iter_state)); +#ifdef CONFIG_PROC_FS + if (!proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net, + &ip_vs_conn_seq_ops, + sizeof(struct ip_vs_iter_state))) + goto err_conn; + + if (!proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net, + &ip_vs_conn_sync_seq_ops, + sizeof(struct ip_vs_iter_state))) + goto err_conn_sync; +#endif + return 0; + +#ifdef CONFIG_PROC_FS +err_conn_sync: + remove_proc_entry("ip_vs_conn", ipvs->net->proc_net); +err_conn: + return -ENOMEM; +#endif } void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs) { /* flush all the connection entries first */ ip_vs_conn_flush(ipvs); +#ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_conn", ipvs->net->proc_net); remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net); +#endif } int __init ip_vs_conn_init(void) From df071a18b5bee8f34573d14deba046b530454f1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 19:34:35 +0100 Subject: [PATCH 02/93] ipvs: fix WARNING in ip_vs_app_net_cleanup() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-66461 cve CVE-2022-49917 commit-author Zhengchao Shao commit 5663ed63adb9619c98ab7479aa4606fa9b7a548c During the initialization of ip_vs_app_net_init(), if file ip_vs_app fails to be created, the initialization is successful by default. Therefore, the ip_vs_app file doesn't be found during the remove in ip_vs_app_net_cleanup(). It will cause WRNING. 
The following is the stack information: name 'ip_vs_app' WARNING: CPU: 1 PID: 9 at fs/proc/generic.c:712 remove_proc_entry+0x389/0x460 Modules linked in: Workqueue: netns cleanup_net RIP: 0010:remove_proc_entry+0x389/0x460 Call Trace: ops_exit_list+0x125/0x170 cleanup_net+0x4ea/0xb00 process_one_work+0x9bf/0x1710 worker_thread+0x665/0x1080 kthread+0x2e4/0x3a0 ret_from_fork+0x1f/0x30 Fixes: 457c4cbc5a3d ("[NET]: Make /proc/net per network namespace") Signed-off-by: Zhengchao Shao Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 5663ed63adb9619c98ab7479aa4606fa9b7a548c) Signed-off-by: Marcin Wcisło --- net/netfilter/ipvs/ip_vs_app.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index f9b16f2b22191..fdacbc3c15bef 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -599,13 +599,19 @@ static const struct seq_operations ip_vs_app_seq_ops = { int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs) { INIT_LIST_HEAD(&ipvs->app_list); - proc_create_net("ip_vs_app", 0, ipvs->net->proc_net, &ip_vs_app_seq_ops, - sizeof(struct seq_net_private)); +#ifdef CONFIG_PROC_FS + if (!proc_create_net("ip_vs_app", 0, ipvs->net->proc_net, + &ip_vs_app_seq_ops, + sizeof(struct seq_net_private))) + return -ENOMEM; +#endif return 0; } void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs) { unregister_ip_vs_app(ipvs, NULL /* all */); +#ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_app", ipvs->net->proc_net); +#endif } From 83586e9c02040e0f7a2a86e0ad25b831a9657265 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 19:34:49 +0100 Subject: [PATCH 03/93] netfilter: ipset: Rework long task execution when adding/deleting entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-157585 cve CVE-2023-53549 commit-author Jozsef Kadlecsik commit 5e29dc36bd5e2166b834ceb19990d9e68a734d7d When adding/deleting large number of elements in one step in ipset, it can take a reasonable amount of time and can result in soft lockup errors. The patch 5f7b51bf09ba ("netfilter: ipset: Limit the maximal range of consecutive elements to add/delete") tried to fix it by limiting the max elements to process at all. However it was not enough, it is still possible that we get hung tasks. Lowering the limit is not reasonable, so the approach in this patch is as follows: rely on the method used at resizing sets and save the state when we reach a smaller internal batch limit, unlock/lock and proceed from the saved state. Thus we can avoid long continuous tasks and at the same time removed the limit to add/delete large number of elements in one step. The nfnl mutex is held during the whole operation which prevents one to issue other ipset commands in parallel. 
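To make the save-and-resume approach above concrete, here is a minimal userspace C sketch of the same pattern, assuming a toy cursor-based set (toy_set, toy_add_range and TOY_BATCH_LIMIT are illustrative names, not the kernel API): the add routine gives up with -ERANGE once one batch has been processed, remembering where to continue, and the caller simply retries until the whole range is covered, mirroring the -ERANGE handling added to call_ad() in the diff below.

#include <errno.h>
#include <stdio.h>

#define TOY_BATCH_LIMIT 4    /* stands in for IPSET_MAX_RANGE */

struct toy_set {
    unsigned int next;       /* saved state, like h->next in the patch */
    unsigned int count;      /* elements stored so far */
};

/* Add [from, to]; give up with -ERANGE after one batch of elements. */
static int toy_add_range(struct toy_set *set, unsigned int from,
                         unsigned int to, int retried)
{
    unsigned int ip = retried ? set->next : from;
    unsigned int i = 0;

    for (; ip <= to; ip++, i++) {
        if (i > TOY_BATCH_LIMIT) {
            set->next = ip;  /* remember where to resume */
            return -ERANGE;
        }
        set->count++;        /* "insert" the element */
    }
    return 0;
}

int main(void)
{
    struct toy_set set = { 0 };
    int retried = 0, ret;

    /* Caller keeps retrying, mirroring the -ERANGE loop in call_ad(). */
    do {
        ret = toy_add_range(&set, 10, 25, retried);
        retried = 1;
    } while (ret == -ERANGE);

    printf("added %u elements\n", set.count);
    return 0;
}

In the real patch the saved cursor lives in h->next (via the *_data_next() helpers) and the retry loop in call_ad() additionally handles set resizing via -EAGAIN.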
Fixes: 5f7b51bf09ba ("netfilter: ipset: Limit the maximal range of consecutive elements to add/delete") Reported-by: syzbot+9204e7399656300bf271@syzkaller.appspotmail.com Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 5e29dc36bd5e2166b834ceb19990d9e68a734d7d) Signed-off-by: Marcin Wcisło --- include/linux/netfilter/ipset/ip_set.h | 2 +- net/netfilter/ipset/ip_set_core.c | 7 ++++--- net/netfilter/ipset/ip_set_hash_ip.c | 14 ++++++------- net/netfilter/ipset/ip_set_hash_ipmark.c | 13 ++++++------ net/netfilter/ipset/ip_set_hash_ipport.c | 13 ++++++------ net/netfilter/ipset/ip_set_hash_ipportip.c | 13 ++++++------ net/netfilter/ipset/ip_set_hash_ipportnet.c | 13 +++++++----- net/netfilter/ipset/ip_set_hash_net.c | 17 +++++++-------- net/netfilter/ipset/ip_set_hash_netiface.c | 15 ++++++-------- net/netfilter/ipset/ip_set_hash_netnet.c | 23 +++++++-------------- net/netfilter/ipset/ip_set_hash_netport.c | 19 +++++++---------- 11 files changed, 68 insertions(+), 81 deletions(-) diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index ada1296c87d50..72f5ebc5c97a9 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -197,7 +197,7 @@ struct ip_set_region { }; /* Max range where every element is added/deleted in one step */ -#define IPSET_MAX_RANGE (1<<20) +#define IPSET_MAX_RANGE (1<<14) /* The max revision number supported by any set type + 1 */ #define IPSET_REVISION_MAX 9 diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 9a28c0a88aae9..2aad29dcd396f 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1702,9 +1702,10 @@ call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); ip_set_unlock(set); retried = true; - } while (ret == -EAGAIN && - set->variant->resize && - (ret = set->variant->resize(set, retried)) == 0); + } while (ret == -ERANGE || + (ret == -EAGAIN && + set->variant->resize && + (ret = set->variant->resize(set, retried)) == 0)); if (!ret || (ret == -IPSET_ERR_EXIST && eexist)) return 0; diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 75d556d71652d..24adcdd7a0b16 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -98,11 +98,11 @@ static int hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ip4 *h = set->data; + struct hash_ip4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ip4_elem e = { 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0, hosts; + u32 ip = 0, ip_to = 0, hosts, i = 0; int ret = 0; if (tb[IPSET_ATTR_LINENO]) @@ -147,14 +147,14 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], hosts = h->netmask == 32 ? 
1 : 2 << (32 - h->netmask - 1); - /* 64bit division is not allowed on 32bit */ - if (((u64)ip_to - ip + 1) >> (32 - h->netmask) > IPSET_MAX_RANGE) - return -ERANGE; - if (retried) ip = ntohl(h->next.ip); - for (; ip <= ip_to;) { + for (; ip <= ip_to; i++) { e.ip = htonl(ip); + if (i > IPSET_MAX_RANGE) { + hash_ip4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 153de3457423e..a22ec1a6f6ec8 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -97,11 +97,11 @@ static int hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ipmark4 *h = set->data; + struct hash_ipmark4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmark4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip, ip_to = 0; + u32 ip, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -148,13 +148,14 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], ip_set_mask_from_to(ip, ip_to, cidr); } - if (((u64)ip_to - ip + 1) > IPSET_MAX_RANGE) - return -ERANGE; - if (retried) ip = ntohl(h->next.ip); - for (; ip <= ip_to; ip++) { + for (; ip <= ip_to; ip++, i++) { e.ip = htonl(ip); + if (i > IPSET_MAX_RANGE) { + hash_ipmark4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 7303138e46be1..10481760a9b25 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -105,11 +105,11 @@ static int hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ipport4 *h = set->data; + struct hash_ipport4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip, ip_to = 0, p = 0, port, port_to; + u32 ip, ip_to = 0, p = 0, port, port_to, i = 0; bool with_ports = false; int ret; @@ -173,17 +173,18 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } - if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) - return -ERANGE; - if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? 
ntohs(h->next.port) : port; - for (; p <= port_to; p++) { + for (; p <= port_to; p++, i++) { e.ip = htonl(ip); e.port = htons(p); + if (i > IPSET_MAX_RANGE) { + hash_ipport4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 334fb1ad0e86c..39a01934b1536 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -108,11 +108,11 @@ static int hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ipportip4 *h = set->data; + struct hash_ipportip4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip, ip_to = 0, p = 0, port, port_to; + u32 ip, ip_to = 0, p = 0, port, port_to, i = 0; bool with_ports = false; int ret; @@ -180,17 +180,18 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } - if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) - return -ERANGE; - if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; - for (; p <= port_to; p++) { + for (; p <= port_to; p++, i++) { e.ip = htonl(ip); e.port = htons(p); + if (i > IPSET_MAX_RANGE) { + hash_ipportip4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 7df94f437f600..5c6de605a9fb7 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -160,12 +160,12 @@ static int hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ipportnet4 *h = set->data; + struct hash_ipportnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, p = 0, port, port_to; - u32 ip2_from = 0, ip2_to = 0, ip2; + u32 ip2_from = 0, ip2_to = 0, ip2, i = 0; bool with_ports = false; u8 cidr; int ret; @@ -253,9 +253,6 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } - if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) - return -ERANGE; - ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); @@ -282,9 +279,15 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], for (; p <= port_to; p++) { e.port = htons(p); do { + i++; e.ip2 = htonl(ip2); ip2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr); e.cidr = cidr - 1; + if (i > IPSET_MAX_RANGE) { + hash_ipportnet4_data_next(&h->next, + &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 1422739d9aa25..ce0a9ce5a91f1 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -136,11 +136,11 @@ static int hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct 
hash_net4 *h = set->data; + struct hash_net4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net4_elem e = { .cidr = HOST_MASK }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0, ipn, n = 0; + u32 ip = 0, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -188,19 +188,16 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; } - ipn = ip; - do { - ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr); - n++; - } while (ipn++ < ip_to); - - if (n > IPSET_MAX_RANGE) - return -ERANGE; if (retried) ip = ntohl(h->next.ip); do { + i++; e.ip = htonl(ip); + if (i > IPSET_MAX_RANGE) { + hash_net4_data_next(&h->next, &e); + return -ERANGE; + } ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 9810f5bf63f5e..0310732862362 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -202,7 +202,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0, ipn, n = 0; + u32 ip = 0, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -256,19 +256,16 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip, ip_to, e.cidr); } - ipn = ip; - do { - ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr); - n++; - } while (ipn++ < ip_to); - - if (n > IPSET_MAX_RANGE) - return -ERANGE; if (retried) ip = ntohl(h->next.ip); do { + i++; e.ip = htonl(ip); + if (i > IPSET_MAX_RANGE) { + hash_netiface4_data_next(&h->next, &e); + return -ERANGE; + } ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 3d09eefe998a7..c07b70bf32db4 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -163,13 +163,12 @@ static int hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_netnet4 *h = set->data; + struct hash_netnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0; - u32 ip2 = 0, ip2_from = 0, ip2_to = 0, ipn; - u64 n = 0, m = 0; + u32 ip2 = 0, ip2_from = 0, ip2_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -245,19 +244,6 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); } - ipn = ip; - do { - ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr[0]); - n++; - } while (ipn++ < ip_to); - ipn = ip2_from; - do { - ipn = ip_set_range_to_cidr(ipn, ip2_to, &e.cidr[1]); - m++; - } while (ipn++ < ip2_to); - - if (n*m > IPSET_MAX_RANGE) - return -ERANGE; if (retried) { ip = ntohl(h->next.ip[0]); @@ -270,7 +256,12 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], e.ip[0] = htonl(ip); ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); do { + i++; e.ip[1] = htonl(ip2); + if (i > IPSET_MAX_RANGE) { + hash_netnet4_data_next(&h->next, &e); + return -ERANGE; + } ip2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]); ret = 
adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 09cf72eb37f8d..d1a0628df4ef3 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -154,12 +154,11 @@ static int hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_netport4 *h = set->data; + struct hash_netport4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 port, port_to, p = 0, ip = 0, ip_to = 0, ipn; - u64 n = 0; + u32 port, port_to, p = 0, ip = 0, ip_to = 0, i = 0; bool with_ports = false; u8 cidr; int ret; @@ -236,14 +235,6 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip, ip_to, e.cidr + 1); } - ipn = ip; - do { - ipn = ip_set_range_to_cidr(ipn, ip_to, &cidr); - n++; - } while (ipn++ < ip_to); - - if (n*(port_to - port + 1) > IPSET_MAX_RANGE) - return -ERANGE; if (retried) { ip = ntohl(h->next.ip); @@ -255,8 +246,12 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], e.ip = htonl(ip); ip = ip_set_range_to_cidr(ip, ip_to, &cidr); e.cidr = cidr - 1; - for (; p <= port_to; p++) { + for (; p <= port_to; p++, i++) { e.port = htons(p); + if (i > IPSET_MAX_RANGE) { + hash_netport4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; From 586f749c79e60886b6df9c2d7846899360178cb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 19:49:15 +0100 Subject: [PATCH 04/93] netfilter: br_netfilter: disable sabotage_in hook after first suppression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Florian Westphal commit 2b272bb558f1d3a5aa95ed8a82253786fd1a48ba When using a xfrm interface in a bridged setup (the outgoing device is bridged), the incoming packets in the xfrm interface are only tracked in the outgoing direction. $ brctl show bridge name interfaces br_eth1 eth1 $ conntrack -L tcp 115 SYN_SENT src=192... dst=192... [UNREPLIED] ... If br_netfilter is enabled, the first (encrypted) packet is received onR eth1, conntrack hooks are called from br_netfilter emulation which allocates nf_bridge info for this skb. If the packet is for local machine, skb gets passed up the ip stack. The skb passes through ip prerouting a second time. br_netfilter ip_sabotage_in supresses the re-invocation of the hooks. After this, skb gets decrypted in xfrm layer and appears in network stack a second time (after decryption). Then, ip_sabotage_in is called again and suppresses netfilter hook invocation, even though the bridge layer never called them for the plaintext incarnation of the packet. Free the bridge info after the first suppression to avoid this. I was unable to figure out where the regression comes from, as far as i can see br_netfilter always had this problem; i did not expect that skb is looped again with different headers. 
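As a rough illustration of the traversal described above, the following standalone C toy models one skb hitting ip prerouting several times (toy_skb and prerouting_pass are made-up names; the real decision in ip_sabotage_in() also checks in_prerouting and the l3 master/slave flags, as the diff below shows).

#include <stdbool.h>
#include <stdio.h>

/* Toy model of one skb as it hits ip prerouting several times. */
struct toy_skb {
    bool has_bridge_info;    /* stands in for skb->nf_bridge */
    int hook_runs;           /* how many times conntrack hooks ran */
};

/* Models ip_sabotage_in() with the fix: suppress the duplicate run,
 * but drop the bridge state so later traversals are hooked again. */
static void prerouting_pass(struct toy_skb *skb)
{
    if (skb->has_bridge_info) {
        skb->has_bridge_info = false;  /* nf_bridge_info_free() in the patch */
        return;                        /* suppressed: bridge already ran the hooks */
    }
    skb->hook_runs++;                  /* normal netfilter hook invocation */
}

int main(void)
{
    /* hook_runs starts at 1: br_netfilter already called the hooks
     * for the encrypted packet seen on the bridge port. */
    struct toy_skb skb = { .has_bridge_info = true, .hook_runs = 1 };

    prerouting_pass(&skb);   /* same encrypted skb looped into the ip stack: suppressed */
    prerouting_pass(&skb);   /* decrypted skb after xfrm: hooks must run again */

    printf("hooks ran %d times\n", skb.hook_runs);  /* 2 with the fix, 1 without */
    return 0;
}

The follow-up patch in this series ("fix recent physdev match breakage") keeps the nf_bridge info around for the physdev match and replaces the free with a sabotage_in_done flag, but the suppress-only-once idea stays the same.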
Fixes: c4b0e771f906 ("netfilter: avoid using skb->nf_bridge directly") Reported-and-tested-by: Wolfgang Nothdurft Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 2b272bb558f1d3a5aa95ed8a82253786fd1a48ba) Signed-off-by: Marcin Wcisło --- net/bridge/br_netfilter_hooks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index f20f4373ff408..9554abcfd5b4e 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -871,6 +871,7 @@ static unsigned int ip_sabotage_in(void *priv, if (nf_bridge && !nf_bridge->in_prerouting && !netif_is_l3_master(skb->dev) && !netif_is_l3_slave(skb->dev)) { + nf_bridge_info_free(skb); state->okfn(state->net, state->sk, skb); return NF_STOLEN; } From db982d7aabb293ce36f98eea5719bb3249dbfcba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 19:53:19 +0100 Subject: [PATCH 05/93] netfilter: br_netfilter: fix recent physdev match breakage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Florian Westphal commit 94623f579ce338b5fa61b5acaa5beb8aa657fb9e Recent attempt to ensure PREROUTING hook is executed again when a decrypted ipsec packet received on a bridge passes through the network stack a second time broke the physdev match in INPUT hook. We can't discard the nf_bridge info strct from sabotage_in hook, as this is needed by the physdev match. Keep the struct around and handle this with another conditional instead. Fixes: 2b272bb558f1 ("netfilter: br_netfilter: disable sabotage_in hook after first suppression") Reported-and-tested-by: Farid BENAMROUCHE Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 94623f579ce338b5fa61b5acaa5beb8aa657fb9e) Signed-off-by: Marcin Wcisło --- include/linux/skbuff.h | 1 + net/bridge/br_netfilter_hooks.c | 17 +++++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e9b9c0f30e558..343ce45bc35c7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -261,6 +261,7 @@ struct nf_bridge_info { u8 pkt_otherhost:1; u8 in_prerouting:1; u8 bridged_dnat:1; + u8 sabotage_in_done:1; __u16 frag_max_size; struct net_device *physindev; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 9554abcfd5b4e..812bd7e1750b6 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -868,12 +868,17 @@ static unsigned int ip_sabotage_in(void *priv, { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - if (nf_bridge && !nf_bridge->in_prerouting && - !netif_is_l3_master(skb->dev) && - !netif_is_l3_slave(skb->dev)) { - nf_bridge_info_free(skb); - state->okfn(state->net, state->sk, skb); - return NF_STOLEN; + if (nf_bridge) { + if (nf_bridge->sabotage_in_done) + return NF_ACCEPT; + + if (!nf_bridge->in_prerouting && + !netif_is_l3_master(skb->dev) && + !netif_is_l3_slave(skb->dev)) { + nf_bridge->sabotage_in_done = 1; + state->okfn(state->net, state->sk, skb); + return NF_STOLEN; + } } return NF_ACCEPT; From 2d193209df668ee8b386067f4b09df53fdb134a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 19:53:31 +0100 Subject: [PATCH 06/93] netfilter: ebtables: fix table blob use-after-free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
jira VULN-430 cve-pre CVE-2023-4244 commit-author Florian Westphal commit e58a171d35e32e6e8c37cfe0e8a94406732a331f We are not allowed to return an error at this point. Looking at the code it looks like ret is always 0 at this point, but its not. t = find_table_lock(net, repl->name, &ret, &ebt_mutex); ... this can return a valid table, with ret != 0. This bug causes update of table->private with the new blob, but then frees the blob right away in the caller. Syzbot report: BUG: KASAN: vmalloc-out-of-bounds in __ebt_unregister_table+0xc00/0xcd0 net/bridge/netfilter/ebtables.c:1168 Read of size 4 at addr ffffc90005425000 by task kworker/u4:4/74 Workqueue: netns cleanup_net Call Trace: kasan_report+0xbf/0x1f0 mm/kasan/report.c:517 __ebt_unregister_table+0xc00/0xcd0 net/bridge/netfilter/ebtables.c:1168 ebt_unregister_table+0x35/0x40 net/bridge/netfilter/ebtables.c:1372 ops_exit_list+0xb0/0x170 net/core/net_namespace.c:169 cleanup_net+0x4ee/0xb10 net/core/net_namespace.c:613 ... ip(6)tables appears to be ok (ret should be 0 at this point) but make this more obvious. Fixes: c58dd2dd443c ("netfilter: Can't fail and free after table replacement") Reported-by: syzbot+f61594de72d6705aea03@syzkaller.appspotmail.com Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso (cherry picked from commit e58a171d35e32e6e8c37cfe0e8a94406732a331f) Signed-off-by: Marcin Wcisło --- net/bridge/netfilter/ebtables.c | 2 +- net/ipv4/netfilter/ip_tables.c | 3 +-- net/ipv6/netfilter/ip6_tables.c | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index efacd565a4e35..7231a1796f772 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1053,7 +1053,7 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries, AUDIT_XT_OP_REPLACE, GFP_KERNEL); - return ret; + return 0; free_unlock: mutex_unlock(&ebt_mutex); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index dd2cbe74a810a..56973719e07fc 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1044,7 +1044,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_counters *counters; struct ipt_entry *iter; - ret = 0; counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; @@ -1090,7 +1089,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n"); } vfree(counters); - return ret; + return 0; put_module: module_put(t->me); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 728b995561615..33b086d1c578d 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1061,7 +1061,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_counters *counters; struct ip6t_entry *iter; - ret = 0; counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; @@ -1107,7 +1106,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n"); } vfree(counters); - return ret; + return 0; put_module: module_put(t->me); From 5d2a3ac95a987e09d50946bb3f50cc949e9f2a87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 19:53:52 +0100 Subject: [PATCH 
07/93] netfilter: ebtables: fix memory leak when blob is malformed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-34024 cve CVE-2022-48641 commit-author Florian Westphal commit 62ce44c4fff947eebdf10bb582267e686e6835c9 The bug fix was incomplete, it "replaced" crash with a memory leak. The old code had an assignment to "ret" embedded into the conditional, restore this. Fixes: 7997eff82828 ("netfilter: ebtables: reject blobs that don't provide all entry points") Reported-and-tested-by: syzbot+a24c5252f3e3ab733464@syzkaller.appspotmail.com Signed-off-by: Florian Westphal (cherry picked from commit 62ce44c4fff947eebdf10bb582267e686e6835c9) Signed-off-by: Marcin Wcisło --- net/bridge/netfilter/ebtables.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 7231a1796f772..0f863762131c5 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1005,8 +1005,10 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, goto free_iterate; } - if (repl->valid_hooks != t->valid_hooks) + if (repl->valid_hooks != t->valid_hooks) { + ret = -EINVAL; goto free_unlock; + } if (repl->num_counters && repl->num_counters != t->private->nentries) { ret = -EINVAL; From ff339464b846fa22d2fc88a7222cbde454e59bc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 19:55:00 +0100 Subject: [PATCH 08/93] netfilter: tproxy: fix deadlock due to missing BH disable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Florian Westphal commit 4a02426787bf024dafdb79b362285ee325de3f5e The xtables packet traverser performs an unconditional local_bh_disable(), but the nf_tables evaluation loop does not. Functions that are called from either xtables or nftables must assume that they can be called in process context. inet_twsk_deschedule_put() assumes that no softirq interrupt can occur. If tproxy is used from nf_tables its possible that we'll deadlock trying to aquire a lock already held in process context. Add a small helper that takes care of this and use it. 
Link: https://lore.kernel.org/netfilter-devel/401bd6ed-314a-a196-1cdc-e13c720cc8f2@balasys.hu/ Fixes: 4ed8eb6570a4 ("netfilter: nf_tables: Add native tproxy support") Reported-and-tested-by: Major Dávid Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 4a02426787bf024dafdb79b362285ee325de3f5e) Signed-off-by: Marcin Wcisło --- include/net/netfilter/nf_tproxy.h | 7 +++++++ net/ipv4/netfilter/nf_tproxy_ipv4.c | 2 +- net/ipv6/netfilter/nf_tproxy_ipv6.c | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_tproxy.h b/include/net/netfilter/nf_tproxy.h index 82d0e41b76f22..faa108b1ba675 100644 --- a/include/net/netfilter/nf_tproxy.h +++ b/include/net/netfilter/nf_tproxy.h @@ -17,6 +17,13 @@ static inline bool nf_tproxy_sk_is_transparent(struct sock *sk) return false; } +static inline void nf_tproxy_twsk_deschedule_put(struct inet_timewait_sock *tw) +{ + local_bh_disable(); + inet_twsk_deschedule_put(tw); + local_bh_enable(); +} + /* assign a socket to the skb -- consumes sk */ static inline void nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) { diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index b22b2c745c76c..69e3317996043 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -38,7 +38,7 @@ nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, hp->source, lport ? lport : hp->dest, skb->dev, NF_TPROXY_LOOKUP_LISTENER); if (sk2) { - inet_twsk_deschedule_put(inet_twsk(sk)); + nf_tproxy_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; } } diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c index 929502e51203b..52f828bb5a83d 100644 --- a/net/ipv6/netfilter/nf_tproxy_ipv6.c +++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c @@ -63,7 +63,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, lport ? lport : hp->dest, skb->dev, NF_TPROXY_LOOKUP_LISTENER); if (sk2) { - inet_twsk_deschedule_put(inet_twsk(sk)); + nf_tproxy_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; } } From c01bdb1492293366db7b5859909749e65bad7545 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:02:46 +0100 Subject: [PATCH 09/93] netfilter: nf_tables: do not set up extensions for end interval MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit 33c7aba0b4ffd6d7cdab862a034eb582a5120a38 upstream-diff Actually picked the 9.4 backport c08553147be944db3bbb48d89aaca5cff675cd8e which applies cleanly without context conflicts Elements with an end interval flag set on do not store extensions. The global set definition is currently setting on the timeout and stateful expression for end interval elements. This leads to skipping end interval elements from the set->ops->walk() path as the expired check bogusly reports true. Moreover, do not set up stateful expressions for elements with end interval flag set on since this is never used. 
Fixes: 65038428b2c6 ("netfilter: nf_tables: allow to specify stateful expression in set definition") Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 33c7aba0b4ffd6d7cdab862a034eb582a5120a38) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5264437d3b89b..cc6d3d3b7d2f0 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6075,7 +6075,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, &timeout); if (err) return err; - } else if (set->flags & NFT_SET_TIMEOUT) { + } else if (set->flags & NFT_SET_TIMEOUT && + !(flags & NFT_SET_ELEM_INTERVAL_END)) { timeout = READ_ONCE(set->timeout); } @@ -6141,7 +6142,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -EOPNOTSUPP; goto err_set_elem_expr; } - } else if (set->num_exprs > 0) { + } else if (set->num_exprs > 0 && + !(flags & NFT_SET_ELEM_INTERVAL_END)) { err = nft_set_elem_expr_clone(ctx, set, expr_array); if (err < 0) goto err_set_elem_expr_clone; From 2b3e14a014eeb9774b9dd0938df118e3bcd3621f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:07:22 +0100 Subject: [PATCH 10/93] netfilter: nft_set_rbtree: Switch to node list walk for overlap detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit c9e6978e2725a7d4b6cd23b2facd3f11422c0643 ...instead of a tree descent, which became overly complicated in an attempt to cover cases where expired or inactive elements would affect comparisons with the new element being inserted. Further, it turned out that it's probably impossible to cover all those cases, as inactive nodes might entirely hide subtrees consisting of a complete interval plus a node that makes the current insertion not overlap. To speed up the overlap check, descent the tree to find a greater element that is closer to the key value to insert. Then walk down the node list for overlap detection. Starting the overlap check from rb_first() unconditionally is slow, it takes 10 times longer due to the full linear traversal of the list. Moreover, perform garbage collection of expired elements when walking down the node list to avoid bogus overlap reports. For the insertion operation itself, this essentially reverts back to the implementation before commit 7c84d41416d8 ("netfilter: nft_set_rbtree: Detect partial overlaps on insertion"), except that cases of complete overlap are already handled in the overlap detection phase itself, which slightly simplifies the loop to find the insertion point. Based on initial patch from Stefano Brivio, including text from the original patch description too. 
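The two-phase idea can be sketched outside the kernel as well; the toy below uses a sorted array and a binary search in place of the rbtree descent, and it deliberately ignores the start/end interval flags, the reversed ordering the rbtree actually uses, and the garbage collection the real patch performs during the walk.

#include <stdio.h>

/* Phase 1: narrow the search by "descending" (here: binary search on a
 * sorted array) to the first entry >= the key being inserted. */
static int find_first_ge(const unsigned int *keys, int n, unsigned int key)
{
    int lo = 0, hi = n;

    while (lo < hi) {
        int mid = (lo + hi) / 2;

        if (keys[mid] < key)
            lo = mid + 1;
        else
            hi = mid;
    }
    return lo;
}

int main(void)
{
    /* existing interval endpoints, sorted (toy data) */
    unsigned int keys[] = { 10, 20, 30, 40, 50 };
    int n = sizeof(keys) / sizeof(keys[0]);
    unsigned int new_key = 27;
    int i = find_first_ge(keys, n, new_key);

    /* Phase 2: walk from the closest entry, the way the patch walks
     * rb_next(), instead of from the very first node; starting at
     * rb_first() unconditionally is what the changelog calls slow. */
    for (; i < n; i++) {
        printf("compare new key %u against %u\n", new_key, keys[i]);
        if (keys[i] > new_key)
            break;   /* the real code records rbe_ge/rbe_le and stops */
    }
    return 0;
}

The only point of the sketch is that phase one narrows the walk to the closest candidate, so phase two does not have to scan the whole ordered list for every insertion.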
Fixes: 7c84d41416d8 ("netfilter: nft_set_rbtree: Detect partial overlaps on insertion") Reviewed-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso (cherry picked from commit c9e6978e2725a7d4b6cd23b2facd3f11422c0643) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_set_rbtree.c | 316 ++++++++++++++++++++------------- 1 file changed, 189 insertions(+), 127 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 7325bee7d1442..217225e13faf7 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -38,10 +38,12 @@ static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe) return !nft_rbtree_interval_end(rbe); } -static bool nft_rbtree_equal(const struct nft_set *set, const void *this, - const struct nft_rbtree_elem *interval) +static int nft_rbtree_cmp(const struct nft_set *set, + const struct nft_rbtree_elem *e1, + const struct nft_rbtree_elem *e2) { - return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0; + return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext), + set->klen); } static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, @@ -52,7 +54,6 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set const struct nft_rbtree_elem *rbe, *interval = NULL; u8 genmask = nft_genmask_cur(net); const struct rb_node *parent; - const void *this; int d; parent = rcu_dereference_raw(priv->root.rb_node); @@ -62,12 +63,11 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set rbe = rb_entry(parent, struct nft_rbtree_elem, node); - this = nft_set_ext_key(&rbe->ext); - d = memcmp(this, key, set->klen); + d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen); if (d < 0) { parent = rcu_dereference_raw(parent->rb_left); if (interval && - nft_rbtree_equal(set, this, interval) && + !nft_rbtree_cmp(set, rbe, interval) && nft_rbtree_interval_end(rbe) && nft_rbtree_interval_start(interval)) continue; @@ -215,154 +215,216 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, return rbe; } +static int nft_rbtree_gc_elem(const struct nft_set *__set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + struct nft_set *set = (struct nft_set *)__set; + struct rb_node *prev = rb_prev(&rbe->node); + struct nft_rbtree_elem *rbe_prev; + struct nft_set_gc_batch *gcb; + + gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC); + if (!gcb) + return -ENOMEM; + + /* search for expired end interval coming before this element. */ + do { + rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); + if (nft_rbtree_interval_end(rbe_prev)) + break; + + prev = rb_prev(prev); + } while (prev != NULL); + + rb_erase(&rbe_prev->node, &priv->root); + rb_erase(&rbe->node, &priv->root); + atomic_sub(2, &set->nelems); + + nft_set_gc_batch_add(gcb, rbe); + nft_set_gc_batch_complete(gcb); + + return 0; +} + +static bool nft_rbtree_update_first(const struct nft_set *set, + struct nft_rbtree_elem *rbe, + struct rb_node *first) +{ + struct nft_rbtree_elem *first_elem; + + first_elem = rb_entry(first, struct nft_rbtree_elem, node); + /* this element is closest to where the new element is to be inserted: + * update the first element for the node list path. 
+ */ + if (nft_rbtree_cmp(set, rbe, first_elem) < 0) + return true; + + return false; +} + static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new, struct nft_set_ext **ext) { - bool overlap = false, dup_end_left = false, dup_end_right = false; + struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; + struct rb_node *node, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); - struct nft_rbtree_elem *rbe; - struct rb_node *parent, **p; - int d; + int d, err; - /* Detect overlaps as we descend the tree. Set the flag in these cases: - * - * a1. _ _ __>| ?_ _ __| (insert end before existing end) - * a2. _ _ ___| ?_ _ _>| (insert end after existing end) - * a3. _ _ ___? >|_ _ __| (insert start before existing end) - * - * and clear it later on, as we eventually reach the points indicated by - * '?' above, in the cases described below. We'll always meet these - * later, locally, due to tree ordering, and overlaps for the intervals - * that are the closest together are always evaluated last. - * - * b1. _ _ __>| !_ _ __| (insert end before existing start) - * b2. _ _ ___| !_ _ _>| (insert end after existing start) - * b3. _ _ ___! >|_ _ __| (insert start after existing end, as a leaf) - * '--' no nodes falling in this range - * b4. >|_ _ ! (insert start before existing start) - * - * Case a3. resolves to b3.: - * - if the inserted start element is the leftmost, because the '0' - * element in the tree serves as end element - * - otherwise, if an existing end is found immediately to the left. If - * there are existing nodes in between, we need to further descend the - * tree before we can conclude the new start isn't causing an overlap - * - * or to b4., which, preceded by a3., means we already traversed one or - * more existing intervals entirely, from the right. - * - * For a new, rightmost pair of elements, we'll hit cases b3. and b2., - * in that order. - * - * The flag is also cleared in two special cases: - * - * b5. |__ _ _!|<_ _ _ (insert start right before existing end) - * b6. |__ _ >|!__ _ _ (insert end right after existing start) - * - * which always happen as last step and imply that no further - * overlapping is possible. - * - * Another special case comes from the fact that start elements matching - * an already existing start element are allowed: insertion is not - * performed but we return -EEXIST in that case, and the error will be - * cleared by the caller if NLM_F_EXCL is not present in the request. - * This way, request for insertion of an exact overlap isn't reported as - * error to userspace if not desired. - * - * However, if the existing start matches a pre-existing start, but the - * end element doesn't match the corresponding pre-existing end element, - * we need to report a partial overlap. This is a local condition that - * can be noticed without need for a tracking flag, by checking for a - * local duplicated end for a corresponding start, from left and right, - * separately. + /* Descend the tree to search for an existing element greater than the + * key value to insert that is greater than the new element. This is the + * first element to walk the ordered elements to find possible overlap. 
*/ - parent = NULL; p = &priv->root.rb_node; while (*p != NULL) { parent = *p; rbe = rb_entry(parent, struct nft_rbtree_elem, node); - d = memcmp(nft_set_ext_key(&rbe->ext), - nft_set_ext_key(&new->ext), - set->klen); + d = nft_rbtree_cmp(set, rbe, new); + if (d < 0) { p = &parent->rb_left; - - if (nft_rbtree_interval_start(new)) { - if (nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext) && !*p) - overlap = false; - } else { - if (dup_end_left && !*p) - return -ENOTEMPTY; - - overlap = nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, - genmask) && - !nft_set_elem_expired(&rbe->ext); - - if (overlap) { - dup_end_right = true; - continue; - } - } } else if (d > 0) { - p = &parent->rb_right; + if (!first || + nft_rbtree_update_first(set, rbe, first)) + first = &rbe->node; - if (nft_rbtree_interval_end(new)) { - if (dup_end_right && !*p) - return -ENOTEMPTY; - - overlap = nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, - genmask) && - !nft_set_elem_expired(&rbe->ext); - - if (overlap) { - dup_end_left = true; - continue; - } - } else if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) { - overlap = nft_rbtree_interval_end(rbe); - } + p = &parent->rb_right; } else { - if (nft_rbtree_interval_end(rbe) && - nft_rbtree_interval_start(new)) { + if (nft_rbtree_interval_end(rbe)) p = &parent->rb_left; - - if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) - overlap = false; - } else if (nft_rbtree_interval_start(rbe) && - nft_rbtree_interval_end(new)) { + else p = &parent->rb_right; + } + } + + if (!first) + first = rb_first(&priv->root); + + /* Detect overlap by going through the list of valid tree nodes. + * Values stored in the tree are in reversed order, starting from + * highest to lowest value. + */ + for (node = first; node != NULL; node = rb_next(node)) { + rbe = rb_entry(node, struct nft_rbtree_elem, node); + + if (!nft_set_elem_active(&rbe->ext, genmask)) + continue; - if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) - overlap = false; - } else if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) { - *ext = &rbe->ext; - return -EEXIST; - } else { - overlap = false; - if (nft_rbtree_interval_end(rbe)) - p = &parent->rb_left; - else - p = &parent->rb_right; + /* perform garbage collection to avoid bogus overlap reports. */ + if (nft_set_elem_expired(&rbe->ext)) { + err = nft_rbtree_gc_elem(set, priv, rbe); + if (err < 0) + return err; + + continue; + } + + d = nft_rbtree_cmp(set, rbe, new); + if (d == 0) { + /* Matching end element: no need to look for an + * overlapping greater or equal element. + */ + if (nft_rbtree_interval_end(rbe)) { + rbe_le = rbe; + break; + } + + /* first element that is greater or equal to key value. */ + if (!rbe_ge) { + rbe_ge = rbe; + continue; + } + + /* this is a closer more or equal element, update it. */ + if (nft_rbtree_cmp(set, rbe_ge, new) != 0) { + rbe_ge = rbe; + continue; } + + /* element is equal to key value, make sure flags are + * the same, an existing more or equal start element + * must not be replaced by more or equal end element. + */ + if ((nft_rbtree_interval_start(new) && + nft_rbtree_interval_start(rbe_ge)) || + (nft_rbtree_interval_end(new) && + nft_rbtree_interval_end(rbe_ge))) { + rbe_ge = rbe; + continue; + } + } else if (d > 0) { + /* annotate element greater than the new element. 
*/ + rbe_ge = rbe; + continue; + } else if (d < 0) { + /* annotate element less than the new element. */ + rbe_le = rbe; + break; } + } - dup_end_left = dup_end_right = false; + /* - new start element matching existing start element: full overlap + * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. + */ + if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) && + nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) { + *ext = &rbe_ge->ext; + return -EEXIST; + } + + /* - new end element matching existing end element: full overlap + * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. + */ + if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) && + nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) { + *ext = &rbe_le->ext; + return -EEXIST; } - if (overlap) + /* - new start element with existing closest, less or equal key value + * being a start element: partial overlap, reported as -ENOTEMPTY. + * Anonymous sets allow for two consecutive start element since they + * are constant, skip them to avoid bogus overlap reports. + */ + if (!nft_set_is_anonymous(set) && rbe_le && + nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new)) + return -ENOTEMPTY; + + /* - new end element with existing closest, less or equal key value + * being a end element: partial overlap, reported as -ENOTEMPTY. + */ + if (rbe_le && + nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new)) return -ENOTEMPTY; + /* - new end element with existing closest, greater or equal key value + * being an end element: partial overlap, reported as -ENOTEMPTY + */ + if (rbe_ge && + nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new)) + return -ENOTEMPTY; + + /* Accepted element: pick insertion point depending on key value */ + parent = NULL; + p = &priv->root.rb_node; + while (*p != NULL) { + parent = *p; + rbe = rb_entry(parent, struct nft_rbtree_elem, node); + d = nft_rbtree_cmp(set, rbe, new); + + if (d < 0) + p = &parent->rb_left; + else if (d > 0) + p = &parent->rb_right; + else if (nft_rbtree_interval_end(rbe)) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node_rcu(&new->node, parent, p); rb_insert_color(&new->node, &priv->root); return 0; From 530707eb52b91efddb311c05181d796d121cc5ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:25:25 +0100 Subject: [PATCH 11/93] netfilter: nft_set_rbtree: skip elements in transaction from garbage collection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit 5d235d6ce75c12a7fdee375eb211e4116f7ab01b Skip interference with an ongoing transaction, do not perform garbage collection on inactive elements. Reset annotated previous end interval if the expired element is marked as busy (control plane removed the element right before expiration). 
Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") Reviewed-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 5d235d6ce75c12a7fdee375eb211e4116f7ab01b) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_set_rbtree.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 217225e13faf7..19ea4d3c35535 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -563,23 +563,37 @@ static void nft_rbtree_gc(struct work_struct *work) struct nft_rbtree *priv; struct rb_node *node; struct nft_set *set; + struct net *net; + u8 genmask; priv = container_of(work, struct nft_rbtree, gc_work.work); set = nft_set_container_of(priv); + net = read_pnet(&set->net); + genmask = nft_genmask_cur(net); write_lock_bh(&priv->lock); write_seqcount_begin(&priv->count); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { rbe = rb_entry(node, struct nft_rbtree_elem, node); + if (!nft_set_elem_active(&rbe->ext, genmask)) + continue; + + /* elements are reversed in the rbtree for historical reasons, + * from highest to lowest value, that is why end element is + * always visited before the start element. + */ if (nft_rbtree_interval_end(rbe)) { rbe_end = rbe; continue; } if (!nft_set_elem_expired(&rbe->ext)) continue; - if (nft_set_elem_mark_busy(&rbe->ext)) + + if (nft_set_elem_mark_busy(&rbe->ext)) { + rbe_end = NULL; continue; + } if (rbe_prev) { rb_erase(&rbe_prev->node, &priv->root); From 75e19ce51c72d42ddc223935bcb09adaa3c6b190 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:25:49 +0100 Subject: [PATCH 12/93] netfilter: nf_tables: allow to fetch set elements when table has an owner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit 92f3e96d642f5e05b9dc710c06fedc669f1b4f00 NFT_MSG_GETSETELEM returns -EPERM when fetching set elements that belong to table that has an owner. This results in empty set/map listing from userspace. 
Fixes: 6001a930ce03 ("netfilter: nftables: introduce table ownership") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 92f3e96d642f5e05b9dc710c06fedc669f1b4f00) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index cc6d3d3b7d2f0..ca2ff3687b559 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5525,7 +5525,7 @@ static int nf_tables_getsetelem(struct sk_buff *skb, int rem, err = 0; table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, - genmask, NETLINK_CB(skb).portid); + genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); return PTR_ERR(table); From 709f6c7b4acaac23cd9f11c6d3f7cea70eb679ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:26:25 +0100 Subject: [PATCH 13/93] netfilter: ctnetlink: fix possible refcount leak in ctnetlink_create_conntrack() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-6817 cve CVE-2023-7192 commit-author Hangyu Hua commit ac4893980bbe79ce383daf9a0885666a30fe4c83 nf_ct_put() needs to be called to put the refcount got by nf_conntrack_find_get() to avoid refcount leak when nf_conntrack_hash_check_insert() fails. Fixes: 7d367e06688d ("netfilter: ctnetlink: fix soft lockup when netlink adds new entries (v2)") Signed-off-by: Hangyu Hua Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso (cherry picked from commit ac4893980bbe79ce383daf9a0885666a30fe4c83) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_conntrack_netlink.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 04169b54f2a2b..e8688a4b7ce72 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2375,12 +2375,15 @@ ctnetlink_create_conntrack(struct net *net, err = nf_conntrack_hash_check_insert(ct); if (err < 0) - goto err2; + goto err3; rcu_read_unlock(); return ct; +err3: + if (ct->master) + nf_ct_put(ct->master); err2: rcu_read_unlock(); err1: From 47d4b36940aaf652e52377d4f59fa11874a9e25a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:27:11 +0100 Subject: [PATCH 14/93] netfilter: conntrack: fix rmmod double-free race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Florian Westphal commit e6d57e9ff0aec323717ee36fc9ea34ad89217151 nf_conntrack_hash_check_insert() callers free the ct entry directly, via nf_conntrack_free. This isn't safe anymore because nf_conntrack_hash_check_insert() might place the entry into the conntrack table and then delteted the entry again because it found that a conntrack extension has been removed at the same time. In this case, the just-added entry is removed again and an error is returned to the caller. Problem is that another cpu might have picked up this entry and incremented its reference count. This results in a use-after-free/double-free, once by the other cpu and once by the caller of nf_conntrack_hash_check_insert(). Fix this by making nf_conntrack_hash_check_insert() not fail anymore after the insertion, just like before the 'Fixes' commit. This is safe because a racing nf_ct_iterate() has to wait for us to release the conntrack hash spinlocks. 
While at it, make the function return -EAGAIN in the rmmod (genid changed) case, this makes nfnetlink replay the command (suggested by Pablo Neira). Fixes: c56716c69ce1 ("netfilter: extensions: introduce extension genid count") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso (cherry picked from commit e6d57e9ff0aec323717ee36fc9ea34ad89217151) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_conntrack_core.c | 25 +++++++++++++++---------- net/netfilter/nf_conntrack_netlink.c | 3 --- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e8843c397b0b7..32c512378ea92 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -890,10 +890,8 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) zone = nf_ct_zone(ct); - if (!nf_ct_ext_valid_pre(ct->ext)) { - NF_CT_STAT_INC_ATOMIC(net, insert_failed); - return -ETIMEDOUT; - } + if (!nf_ct_ext_valid_pre(ct->ext)) + return -EAGAIN; local_bh_disable(); do { @@ -928,6 +926,19 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) goto chaintoolong; } + /* If genid has changed, we can't insert anymore because ct + * extensions could have stale pointers and nf_ct_iterate_destroy + * might have completed its table scan already. + * + * Increment of the ext genid right after this check is fine: + * nf_ct_iterate_destroy blocks until locks are released. + */ + if (!nf_ct_ext_valid_post(ct->ext)) { + err = -EAGAIN; + goto out; + } + + ct->status |= IPS_CONFIRMED; smp_wmb(); /* The caller holds a reference to this object */ refcount_set(&ct->ct_general.use, 2); @@ -936,12 +947,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) NF_CT_STAT_INC(net, insert); local_bh_enable(); - if (!nf_ct_ext_valid_post(ct->ext)) { - nf_ct_kill(ct); - NF_CT_STAT_INC_ATOMIC(net, drop); - return -ETIMEDOUT; - } - return 0; chaintoolong: NF_CT_STAT_INC(net, chaintoolong); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index e8688a4b7ce72..d36d6786c3bbe 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2316,9 +2316,6 @@ ctnetlink_create_conntrack(struct net *net, nfct_seqadj_ext_add(ct); nfct_synproxy_ext_add(ct); - /* we must add conntrack extensions before confirmation. */ - ct->status |= IPS_CONFIRMED; - if (cda[CTA_STATUS]) { err = ctnetlink_change_status(ct, cda); if (err < 0) From a9e9413dd338eec54cfa99b07184a4bf941893ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:27:30 +0100 Subject: [PATCH 15/93] netfilter: conntrack: restore IPS_CONFIRMED out of nf_conntrack_hash_check_insert() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit 2cdaa3eefed83082923cf219c8b6a314e622da74 e6d57e9ff0ae ("netfilter: conntrack: fix rmmod double-free race") consolidates IPS_CONFIRMED bit set in nf_conntrack_hash_check_insert(). However, this breaks ctnetlink: # conntrack -I -p tcp --timeout 123 --src 1.2.3.4 --dst 5.6.7.8 --state ESTABLISHED --sport 1 --dport 4 -u SEEN_REPLY conntrack v1.4.6 (conntrack-tools): Operation failed: Device or resource busy This is a partial revert of the aforementioned commit to restore IPS_CONFIRMED. 
Fixes: e6d57e9ff0ae ("netfilter: conntrack: fix rmmod double-free race") Reported-by: Stéphane Graber Tested-by: Stéphane Graber Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 2cdaa3eefed83082923cf219c8b6a314e622da74) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_conntrack_bpf.c | 1 + net/netfilter/nf_conntrack_core.c | 1 - net/netfilter/nf_conntrack_netlink.c | 3 +++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 1cd87b28c9b05..2bc6b9bfbf95e 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -339,6 +339,7 @@ struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i) struct nf_conn *nfct = (struct nf_conn *)nfct_i; int err; + nfct->status |= IPS_CONFIRMED; err = nf_conntrack_hash_check_insert(nfct); if (err < 0) { nf_conntrack_free(nfct); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 32c512378ea92..98d3c64380f07 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -938,7 +938,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) goto out; } - ct->status |= IPS_CONFIRMED; smp_wmb(); /* The caller holds a reference to this object */ refcount_set(&ct->ct_general.use, 2); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index d36d6786c3bbe..e8688a4b7ce72 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2316,6 +2316,9 @@ ctnetlink_create_conntrack(struct net *net, nfct_seqadj_ext_add(ct); nfct_synproxy_ext_add(ct); + /* we must add conntrack extensions before confirmation. */ + ct->status |= IPS_CONFIRMED; + if (cda[CTA_STATUS]) { err = ctnetlink_change_status(ct, cda); if (err < 0) From 5a452bf3dac41eaa796564b5beba9d94a4dcfb3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:30:37 +0100 Subject: [PATCH 16/93] netfilter: conntrack: adopt safer max chain length MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Eric Dumazet commit c77737b736ceb50fdf150434347dbd81ec76dbb1 Customers using GKE 1.25 and 1.26 are facing conntrack issues root caused to commit c9c3b6811f74 ("netfilter: conntrack: make max chain length random"). Even if we assume Uniform Hashing, a bucket often reachs 8 chained items while the load factor of the hash table is smaller than 0.5 With a limit of 16, we reach load factors of 3. With a limit of 32, we reach load factors of 11. With a limit of 40, we reach load factors of 15. With a limit of 50, we reach load factors of 24. This patch changes MIN_CHAINLEN to 50, to minimize risks. Ideally, we could in the future add a cushion based on expected load factor (2 * nf_conntrack_max / nf_conntrack_buckets), because some setups might expect unusual values. 
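For a rough sense of scale, the macro change translates into the following chain-length ceilings. The small program below only does the arithmetic and assumes the per-insert limit is drawn as MIN_CHAINLEN plus a random value below MAX_CHAINLEN, as introduced by the commit named in the Fixes tag:

#include <stdio.h>

int main(void)
{
        /* before this patch: MIN 8, MAX (32 - 8) */
        unsigned int old_min = 8, old_span = 32 - 8;
        /* after this patch: MIN 50, MAX (80 - 50) */
        unsigned int new_min = 50, new_span = 80 - 50;

        printf("old limit range: %u..%u\n", old_min, old_min + old_span - 1);  /* 8..31 */
        printf("new limit range: %u..%u\n", new_min, new_min + new_span - 1);  /* 50..79 */
        return 0;
}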
Fixes: c9c3b6811f74 ("netfilter: conntrack: make max chain length random") Signed-off-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso (cherry picked from commit c77737b736ceb50fdf150434347dbd81ec76dbb1) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_conntrack_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 98d3c64380f07..a320b1c77bb11 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -96,8 +96,8 @@ static DEFINE_MUTEX(nf_conntrack_mutex); #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) #define GC_SCAN_EXPIRED_MAX (64000u / HZ) -#define MIN_CHAINLEN 8u -#define MAX_CHAINLEN (32u - MIN_CHAINLEN) +#define MIN_CHAINLEN 50u +#define MAX_CHAINLEN (80u - MIN_CHAINLEN) static struct conntrack_gc_work conntrack_gc_work; From 688e12d015d9b2312387fd853aba4710cd30fe45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:30:39 +0100 Subject: [PATCH 17/93] netfilter: nft_nat: correct length for loading protocol registers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Jeremy Sowden commit 068d82e75d537b444303b8c449a11e51ea659565 The values in the protocol registers are two bytes wide. However, when parsing the register loads, the code currently uses the larger 16-byte size of a `union nf_inet_addr`. Change it to use the (correct) size of a `union nf_conntrack_man_proto` instead. Fixes: d07db9884a5f ("netfilter: nf_tables: introduce nft_validate_register_load()") Signed-off-by: Jeremy Sowden Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 068d82e75d537b444303b8c449a11e51ea659565) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_nat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index db8f9116eeb43..cd4eb4996aff3 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -226,7 +226,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->flags |= NF_NAT_RANGE_MAP_IPS; } - plen = sizeof_field(struct nf_nat_range, min_addr.all); + plen = sizeof_field(struct nf_nat_range, min_proto.all); if (tb[NFTA_NAT_REG_PROTO_MIN]) { err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MIN], &priv->sreg_proto_min, plen); From 95a0cd87df78e7fc43cc480e82f2117240201d56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:30:40 +0100 Subject: [PATCH 18/93] netfilter: nft_masq: correct length for loading protocol registers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Jeremy Sowden commit ec2c5917eb858428b2083d1c74f445aabbe8316b The values in the protocol registers are two bytes wide. However, when parsing the register loads, the code currently uses the larger 16-byte size of a `union nf_inet_addr`. Change it to use the (correct) size of a `union nf_conntrack_man_proto` instead. 
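The size mismatch described above can be reproduced in plain userspace C. The unions below are simplified stand-ins whose only purpose is to have the 16-byte and 2-byte sizes mentioned in the commit message, and sizeof_field is defined locally for the example:

#include <stdio.h>
#include <stdint.h>

#define sizeof_field(TYPE, MEMBER) sizeof(((TYPE *)0)->MEMBER)

/* stand-in for union nf_inet_addr: an IPv6-capable address, 16 bytes */
union inet_addr_example {
        uint32_t ip;
        uint32_t ip6[4];
        uint8_t  all[16];
};

/* stand-in for union nf_conntrack_man_proto: an L4 id such as a port, 2 bytes */
union proto_example {
        uint16_t port;
        uint16_t all;
};

struct range_example {
        union inet_addr_example min_addr;
        union proto_example     min_proto;
};

int main(void)
{
        printf("min_addr.all:  %zu bytes\n",
               sizeof_field(struct range_example, min_addr.all));   /* 16 */
        printf("min_proto.all: %zu bytes\n",
               sizeof_field(struct range_example, min_proto.all));  /* 2 */
        return 0;
}

Loading a register with the 16-byte length when only a 2-byte protocol value is meant is exactly the over-wide load these fixes trim back.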
Fixes: 8a6bf5da1aef ("netfilter: nft_masq: support port range") Signed-off-by: Jeremy Sowden Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso (cherry picked from commit ec2c5917eb858428b2083d1c74f445aabbe8316b) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_masq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 9953e80537536..1818dbf089cad 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -43,7 +43,7 @@ static int nft_masq_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { - u32 plen = sizeof_field(struct nf_nat_range, min_addr.all); + u32 plen = sizeof_field(struct nf_nat_range, min_proto.all); struct nft_masq *priv = nft_expr_priv(expr); int err; From daacca311dd863574bb4f3c11a76961d08bb3e69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:30:42 +0100 Subject: [PATCH 19/93] netfilter: nft_redir: correct length for loading protocol registers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Jeremy Sowden commit 1f617b6b4c7a3d5ea7a56abb83a4c27733b60c2f The values in the protocol registers are two bytes wide. However, when parsing the register loads, the code currently uses the larger 16-byte size of a `union nf_inet_addr`. Change it to use the (correct) size of a `union nf_conntrack_man_proto` instead. Fixes: d07db9884a5f ("netfilter: nf_tables: introduce nft_validate_register_load()") Signed-off-by: Jeremy Sowden Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 1f617b6b4c7a3d5ea7a56abb83a4c27733b60c2f) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_redir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index ba09890dddb50..deb7e65c8d82b 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -48,7 +48,7 @@ static int nft_redir_init(const struct nft_ctx *ctx, unsigned int plen; int err; - plen = sizeof_field(struct nf_nat_range, min_addr.all); + plen = sizeof_field(struct nf_nat_range, min_proto.all); if (tb[NFTA_REDIR_REG_PROTO_MIN]) { err = nft_parse_register_load(tb[NFTA_REDIR_REG_PROTO_MIN], &priv->sreg_proto_min, plen); From 331c4c4551dee186d90d98d0cf4dcbe10c928ce9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:30:44 +0100 Subject: [PATCH 20/93] netfilter: nft_redir: correct value of inet type `.maxattrs` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Jeremy Sowden commit 493924519b1fe3faab13ee621a43b0d0939abab1 `nft_redir_inet_type.maxattrs` was being set, presumably because of a cut-and-paste error, to `NFTA_MASQ_MAX`, instead of `NFTA_REDIR_MAX`. 
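The underlying pitfall is generic: bounding anything with another enum's _MAX constant is a latent bug even if the two values happen to coincide today. A contrived, self-contained example follows; nothing here is nftables code, and the two made-up enums deliberately differ in size:

#include <stdio.h>

enum { FOO_A, FOO_B, __FOO_MAX };
#define FOO_MAX (__FOO_MAX - 1)

enum { BAR_A, BAR_B, BAR_C, __BAR_MAX };
#define BAR_MAX (__BAR_MAX - 1)

static const char *foo_names[FOO_MAX + 1] = { "a", "b" };

int main(void)
{
        /* using BAR_MAX as the bound here would walk past foo_names[] */
        for (int i = 0; i <= FOO_MAX; i++)
                printf("foo %d: %s\n", i, foo_names[i]);
        return 0;
}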
Fixes: 63ce3940f3ab ("netfilter: nft_redir: add inet support") Signed-off-by: Jeremy Sowden Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 493924519b1fe3faab13ee621a43b0d0939abab1) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_redir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index deb7e65c8d82b..e64f531d66cfc 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -232,7 +232,7 @@ static struct nft_expr_type nft_redir_inet_type __read_mostly = { .name = "redir", .ops = &nft_redir_inet_ops, .policy = nft_redir_policy, - .maxattr = NFTA_MASQ_MAX, + .maxattr = NFTA_REDIR_MAX, .owner = THIS_MODULE, }; From c673ff1a51e758058e4c81fe22ae231eee96ea29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:30:50 +0100 Subject: [PATCH 21/93] netfilter: nf_tables: tighten netlink attribute requirements for catch-all elements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit d4eb7e39929a3b1ff30fb751b4859fc2410702a0 If NFT_SET_ELEM_CATCHALL is set on, then userspace provides no set element key. Otherwise, bail out with -EINVAL. Fixes: aaa31047a6d2 ("netfilter: nftables: add catch-all set element support") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit d4eb7e39929a3b1ff30fb751b4859fc2410702a0) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ca2ff3687b559..95506c2ab683a 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6026,7 +6026,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) return err; - if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL)) + if (((flags & NFT_SET_ELEM_CATCHALL) && nla[NFTA_SET_ELEM_KEY]) || + (!(flags & NFT_SET_ELEM_CATCHALL) && !nla[NFTA_SET_ELEM_KEY])) return -EINVAL; if (flags != 0) { From 0787373ebc4dc5630d29725411ddc45c896500ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:31:12 +0100 Subject: [PATCH 22/93] netfilter: conntrack: fix wrong ct->timeout value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-158282 cve CVE-2023-53635 commit-author Tzung-Bi Shih commit 73db1b8f2bb6725b7391e85aab41fdf592b3c0c1 (struct nf_conn)->timeout is an interval before the conntrack confirmed. After confirmed, it becomes a timestamp. It is observed that timeout of an unconfirmed conntrack: - Set by calling ctnetlink_change_timeout(). As a result, `nfct_time_stamp` was wrongly added to `ct->timeout` twice. - Get by calling ctnetlink_dump_timeout(). As a result, `nfct_time_stamp` was wrongly subtracted. Call Trace: dump_stack_lvl ctnetlink_dump_timeout __ctnetlink_glue_build ctnetlink_glue_build __nfqnl_enqueue_packet nf_queue nf_hook_slow ip_mc_output ? __pfx_ip_finish_output ip_send_skb ? __pfx_dst_output udp_send_skb udp_sendmsg ? __pfx_ip_generic_getfrag sock_sendmsg Separate the 2 cases in: - Setting `ct->timeout` in __nf_ct_set_timeout(). - Getting `ct->timeout` in ctnetlink_dump_timeout(). Pablo appends: Update ctnetlink to set up the timeout _after_ the IPS_CONFIRMED flag is set on, otherwise conntrack creation via ctnetlink breaks. 
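The interval-versus-timestamp rule can be shown with a small userspace sketch that mirrors the two helpers touched below; "now" stands in for nfct_time_stamp and the HZ scaling is omitted:

#include <stdio.h>
#include <stdbool.h>

static unsigned int now = 1000;         /* pretend jiffies */

static unsigned int set_timeout(bool confirmed, unsigned int timeout)
{
        /* after the fix: only confirmed entries store an absolute expiry */
        return confirmed ? now + timeout : timeout;
}

static unsigned int dump_timeout(bool confirmed, unsigned int stored)
{
        /* after the fix: only confirmed entries need "now" subtracted */
        return confirmed ? stored - now : stored;
}

int main(void)
{
        unsigned int t = set_timeout(false, 120);       /* unconfirmed entry */

        printf("unconfirmed: stored %u, dumped %u\n", t, dump_timeout(false, t)); /* 120, 120 */

        t = set_timeout(true, 120);                     /* confirmed entry */
        printf("confirmed:   stored %u, dumped %u\n", t, dump_timeout(true, t));  /* 1120, 120 */
        return 0;
}

Before the fix the unconfirmed case took the confirmed branch in both helpers, which is exactly the double addition and spurious subtraction described above.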
Note that the problem described in this patch occurs since the introduction of the nfnetlink_queue conntrack support, select a sufficiently old Fixes: tag for -stable kernel to pick up this fix. Fixes: a4b4766c3ceb ("netfilter: nfnetlink_queue: rename related to nfqueue attaching conntrack info") Signed-off-by: Tzung-Bi Shih Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 73db1b8f2bb6725b7391e85aab41fdf592b3c0c1) Signed-off-by: Marcin Wcisło --- include/net/netfilter/nf_conntrack_core.h | 6 +++++- net/netfilter/nf_conntrack_netlink.c | 13 +++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 3cd3a6e631aa5..ff315e3ae75fc 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -94,7 +94,11 @@ static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout) { if (timeout > INT_MAX) timeout = INT_MAX; - WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout); + + if (nf_ct_is_confirmed(ct)) + WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout); + else + ct->timeout = (u32)timeout; } int __nf_ct_change_timeout(struct nf_conn *ct, u64 cta_timeout); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index e8688a4b7ce72..c5a7668535db2 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -176,7 +176,12 @@ static int ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct) static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct, bool skip_zero) { - long timeout = nf_ct_expires(ct) / HZ; + long timeout; + + if (nf_ct_is_confirmed(ct)) + timeout = nf_ct_expires(ct) / HZ; + else + timeout = ct->timeout / HZ; if (skip_zero && timeout == 0) return 0; @@ -2248,9 +2253,6 @@ ctnetlink_create_conntrack(struct net *net, if (!cda[CTA_TIMEOUT]) goto err1; - timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; - __nf_ct_set_timeout(ct, timeout); - rcu_read_lock(); if (cda[CTA_HELP]) { char *helpname = NULL; @@ -2319,6 +2321,9 @@ ctnetlink_create_conntrack(struct net *net, /* we must add conntrack extensions before confirmation. */ ct->status |= IPS_CONFIRMED; + timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; + __nf_ct_set_timeout(ct, timeout); + if (cda[CTA_STATUS]) { err = ctnetlink_change_status(ct, cda); if (err < 0) From f10e9fb366e1e192f9bec3dedf23c3b69b38f8a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:31:28 +0100 Subject: [PATCH 23/93] netfilter: nf_tables: don't write table validation state without mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Florian Westphal commit 9a32e9850686599ed194ccdceb6cd3dd56b2d9b9 The ->cleanup callback needs to be removed, this doesn't work anymore as the transaction mutex is already released in the ->abort function. Just do it after a successful validation pass, this either happens from commit or abort phases where transaction mutex is held. 
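A minimal userspace sketch of the ordering this change enforces, with a pthread mutex standing in for the transaction mutex and all names invented: the "skip validation" flag is reset inside the locked commit path instead of from a later cleanup callback that runs after the unlock.

#include <pthread.h>
#include <stdio.h>

enum validate_state { VALIDATE_SKIP, VALIDATE_NEED };

static pthread_mutex_t commit_mutex = PTHREAD_MUTEX_INITIALIZER;
static enum validate_state state = VALIDATE_NEED;

static int validate(void)
{
        return 0;       /* pretend the ruleset validated fine */
}

static int commit(void)
{
        int err;

        pthread_mutex_lock(&commit_mutex);
        err = validate();
        if (!err)
                state = VALIDATE_SKIP;  /* written while the mutex is held */
        pthread_mutex_unlock(&commit_mutex);
        return err;
        /* the removed ->cleanup hook reset the state after the unlock,
         * which is the unprotected write this patch eliminates */
}

int main(void)
{
        int err = commit();

        printf("commit: %d, state: %d\n", err, state);
        return 0;
}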
Fixes: f102d66b335a ("netfilter: nf_tables: use dedicated mutex to guard transactions") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 9a32e9850686599ed194ccdceb6cd3dd56b2d9b9) Signed-off-by: Marcin Wcisło --- include/linux/netfilter/nfnetlink.h | 1 - net/netfilter/nf_tables_api.c | 8 ++------ net/netfilter/nfnetlink.c | 2 -- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 241e005f290ad..e9a9ab34a7ccc 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -45,7 +45,6 @@ struct nfnetlink_subsystem { int (*commit)(struct net *net, struct sk_buff *skb); int (*abort)(struct net *net, struct sk_buff *skb, enum nfnl_abort_action action); - void (*cleanup)(struct net *net); bool (*valid_genid)(struct net *net, u32 genid); }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 95506c2ab683a..15ebda32798e7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8479,6 +8479,8 @@ static int nf_tables_validate(struct net *net) if (nft_table_validate(net, table) < 0) return -EAGAIN; } + + nft_validate_state_update(net, NFT_VALIDATE_SKIP); break; } @@ -9354,11 +9356,6 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) return 0; } -static void nf_tables_cleanup(struct net *net) -{ - nft_validate_state_update(net, NFT_VALIDATE_SKIP); -} - static int nf_tables_abort(struct net *net, struct sk_buff *skb, enum nfnl_abort_action action) { @@ -9392,7 +9389,6 @@ static const struct nfnetlink_subsystem nf_tables_subsys = { .cb = nf_tables_cb, .commit = nf_tables_commit, .abort = nf_tables_abort, - .cleanup = nf_tables_cleanup, .valid_genid = nf_tables_valid_genid, .owner = THIS_MODULE, }; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 82976e4df95f9..b65d7f68d168b 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -589,8 +589,6 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, goto replay_abort; } } - if (ss->cleanup) - ss->cleanup(net); nfnl_err_deliver(&err_list, oskb); kfree_skb(skb); From 49328724f7df768e120ed668e33d8e9c5533bf0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:31:32 +0100 Subject: [PATCH 24/93] netfilter: nf_tables: fix nft_trans type confusion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Florian Westphal commit e3c361b8acd636f5fe80c02849ca175201edf10c nft_trans_FOO objects all share a common nft_trans base structure, but trailing fields depend on the real object size. Access is only safe after trans->msg_type check. Check for rule type first. Found by code inspection. 
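The pattern being fixed is the classic tagged-structure rule: variant fields that trail a common header may only be accessed after checking the tag. A simplified, self-contained illustration follows; none of these types are the real nft_trans definitions:

#include <stdio.h>

enum msg_type { MSG_NEWRULE, MSG_NEWSET };

struct trans {                          /* common header */
        enum msg_type msg_type;
};

struct trans_rule {                     /* trailing fields valid only for MSG_NEWRULE */
        struct trans base;
        unsigned int rule_id;
};

static unsigned int trans_rule_id(const struct trans *t)
{
        /* only safe after the caller has checked t->msg_type == MSG_NEWRULE */
        return ((const struct trans_rule *)t)->rule_id;
}

int main(void)
{
        struct trans_rule r = { .base.msg_type = MSG_NEWRULE, .rule_id = 42 };
        struct trans *t = &r.base;

        if (t->msg_type == MSG_NEWRULE)         /* check the tag first, as in the fix */
                printf("rule id %u\n", trans_rule_id(t));
        return 0;
}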
Fixes: 1a94e38d254b ("netfilter: nf_tables: add NFTA_RULE_ID attribute") Signed-off-by: Florian Westphal (cherry picked from commit e3c361b8acd636f5fe80c02849ca175201edf10c) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 15ebda32798e7..1ae9e16055d62 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3689,12 +3689,10 @@ static struct nft_rule *nft_rule_lookup_byid(const struct net *net, struct nft_trans *trans; list_for_each_entry(trans, &nft_net->commit_list, list) { - struct nft_rule *rule = nft_trans_rule(trans); - if (trans->msg_type == NFT_MSG_NEWRULE && trans->ctx.chain == chain && id == nft_trans_rule_id(trans)) - return rule; + return nft_trans_rule(trans); } return ERR_PTR(-ENOENT); } From 15a0b41a5bd6e48e1e8282ca15743811d191bae0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:31:37 +0100 Subject: [PATCH 25/93] netfilter: conntrack: fix NULL pointer dereference in nf_confirm_cthelper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Tijs Van Buggenhout commit e1f543dc660b44618a1bd72ddb4ca0828a95f7ad An nf_conntrack_helper from nf_conn_help may become NULL after DNAT. Observed when TCP port 1720 (Q931_PORT), associated with h323 conntrack helper, is DNAT'ed to another destination port (e.g. 1730), while nfqueue is being used for final acceptance (e.g. snort). This happenned after transition from kernel 4.14 to 5.10.161. Workarounds: * keep the same port (1720) in DNAT * disable nfqueue * disable/unload h323 NAT helper $ linux-5.10/scripts/decode_stacktrace.sh vmlinux < /tmp/kernel.log BUG: kernel NULL pointer dereference, address: 0000000000000084 [..] RIP: 0010:nf_conntrack_update (net/netfilter/nf_conntrack_core.c:2080 net/netfilter/nf_conntrack_core.c:2134) nf_conntrack [..] nfqnl_reinject (net/netfilter/nfnetlink_queue.c:237) nfnetlink_queue nfqnl_recv_verdict (net/netfilter/nfnetlink_queue.c:1230) nfnetlink_queue nfnetlink_rcv_msg (net/netfilter/nfnetlink.c:241) nfnetlink [..] 
Fixes: ee04805ff54a ("netfilter: conntrack: make conntrack userspace helpers work again") Signed-off-by: Tijs Van Buggenhout Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso (cherry picked from commit e1f543dc660b44618a1bd72ddb4ca0828a95f7ad) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_conntrack_core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index a320b1c77bb11..e9f43f320221c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2295,6 +2295,9 @@ static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, return 0; helper = rcu_dereference(help->helper); + if (!helper) + return 0; + if (!(helper->flags & NF_CT_HELPER_F_USERSPACE)) return 0; From 996ccaa06c0a88c99dbda3426af658c0467bc14f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:32:47 +0100 Subject: [PATCH 26/93] netfilter: nf_tables: Add null check for nla_nest_start_noflag() in nft_dump_basechain_hook() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Gavrilov Ilia commit bd058763a624a1fb5c20f3c46e632d623c043676 upstream-diff Used the cleanly applying 9.4 backport 04580de505708123813c20c2046bfffd2fbf7b0d The nla_nest_start_noflag() function may fail and return NULL; the return value needs to be checked. Found by InfoTeCS on behalf of Linux Verification Center (linuxtesting.org) with SVACE. Fixes: d54725cd11a5 ("netfilter: nf_tables: support for multiple devices per netdev hook") Signed-off-by: Gavrilov Ilia Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso (cherry picked from commit bd058763a624a1fb5c20f3c46e632d623c043676) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1ae9e16055d62..dc17f48684fe5 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1600,6 +1600,10 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, int family, if (nft_base_chain_netdev(family, ops->hooknum)) { nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS); + + if (!nest_devs) + goto nla_put_failure; + list_for_each_entry(hook, &basechain->hook_list, list) { if (!first) first = hook; From 4202b1844d1af123317d2fce4dc01ea69e9a8fe3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:37:39 +0100 Subject: [PATCH 27/93] netfilter: nfnetlink: skip error delivery on batch in case of ENOMEM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit a1a64a151dae8ac3581c1cbde44b672045cb658b If caller reports ENOMEM, then stop iterating over the batch and send a single netlink message to userspace to report OOM. 
Fixes: cbb8125eb40b ("netfilter: nfnetlink: deliver netlink errors on batch completion") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit a1a64a151dae8ac3581c1cbde44b672045cb658b) Signed-off-by: Marcin Wcisło --- net/netfilter/nfnetlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index b65d7f68d168b..5c001be6825e3 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -532,7 +532,8 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, * processed, this avoids that the same error is * reported several times when replaying the batch. */ - if (nfnl_err_add(&err_list, nlh, err, &extack) < 0) { + if (err == -ENOMEM || + nfnl_err_add(&err_list, nlh, err, &extack) < 0) { /* We failed to enqueue an error, reset the * list of errors and send OOM to userspace * pointing to the batch header. From 2c3697bcd93e93427093d13e3579b5709b5b10d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:37:43 +0100 Subject: [PATCH 28/93] netfilter: nf_tables: always release netdev hooks from notifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Florian Westphal commit dc1c9fd4a8bbe1e06add9053010b652449bfe411 This reverts "netfilter: nf_tables: skip netdev events generated on netns removal". The problem is that when a veth device is released, the veth release callback will also queue the peer netns device for removal. Its possible that the peer netns is also slated for removal. In this case, the device memory is already released before the pre_exit hook of the peer netns runs: BUG: KASAN: slab-use-after-free in nf_hook_entry_head+0x1b8/0x1d0 Read of size 8 at addr ffff88812c0124f0 by task kworker/u8:1/45 Workqueue: netns cleanup_net Call Trace: nf_hook_entry_head+0x1b8/0x1d0 __nf_unregister_net_hook+0x76/0x510 nft_netdev_unregister_hooks+0xa0/0x220 __nft_release_hook+0x184/0x490 nf_tables_pre_exit_net+0x12f/0x1b0 .. Order is: 1. First netns is released, veth_dellink() queues peer netns device for removal 2. peer netns is queued for removal 3. peer netns device is released, unreg event is triggered 4. unreg event is ignored because netns is going down 5. pre_exit hook calls nft_netdev_unregister_hooks but device memory might be free'd already. Fixes: 68a3765c659f ("netfilter: nf_tables: skip netdev events generated on netns removal") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso (cherry picked from commit dc1c9fd4a8bbe1e06add9053010b652449bfe411) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_chain_filter.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index c3563f0be2692..680fe557686e4 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -344,6 +344,12 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev, return; } + /* UNREGISTER events are also happening on netns exit. + * + * Although nf_tables core releases all tables/chains, only this event + * handler provides guarantee that hook->ops.dev is still accessible, + * so we cannot skip exiting net namespaces. 
+ */ __nft_release_basechain(ctx); } @@ -362,9 +368,6 @@ static int nf_tables_netdev_event(struct notifier_block *this, event != NETDEV_CHANGENAME) return NOTIFY_DONE; - if (!check_net(ctx.net)) - return NOTIFY_DONE; - nft_net = nft_pernet(ctx.net); mutex_lock(&nft_net->commit_mutex); list_for_each_entry(table, &nft_net->tables, list) { From 4d74dd2f9ed30c44783ae6a415c640fa56bc287a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:38:13 +0100 Subject: [PATCH 29/93] netfilter: nft_set_rbtree: fix null deref on element insertion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-157645 cve CVE-2023-53566 commit-author Florian Westphal commit 61ae320a29b0540c16931816299eb86bf2b66c08 There is no guarantee that rb_prev() will not return NULL in nft_rbtree_gc_elem(): general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f] nft_add_set_elem+0x14b0/0x2990 nf_tables_newsetelem+0x528/0xb30 Furthermore, there is a possible use-after-free while iterating, 'node' can be free'd so we need to cache the next value to use. Fixes: c9e6978e2725 ("netfilter: nft_set_rbtree: Switch to node list walk for overlap detection") Signed-off-by: Florian Westphal (cherry picked from commit 61ae320a29b0540c16931816299eb86bf2b66c08) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_set_rbtree.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 19ea4d3c35535..2f114aa10f1a7 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -221,7 +221,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); - struct nft_rbtree_elem *rbe_prev; + struct nft_rbtree_elem *rbe_prev = NULL; struct nft_set_gc_batch *gcb; gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC); @@ -229,17 +229,21 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, return -ENOMEM; /* search for expired end interval coming before this element. */ - do { + while (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); if (nft_rbtree_interval_end(rbe_prev)) break; prev = rb_prev(prev); - } while (prev != NULL); + } + + if (rbe_prev) { + rb_erase(&rbe_prev->node, &priv->root); + atomic_dec(&set->nelems); + } - rb_erase(&rbe_prev->node, &priv->root); rb_erase(&rbe->node, &priv->root); - atomic_sub(2, &set->nelems); + atomic_dec(&set->nelems); nft_set_gc_batch_add(gcb, rbe); nft_set_gc_batch_complete(gcb); @@ -268,7 +272,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_set_ext **ext) { struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; - struct rb_node *node, *parent, **p, *first = NULL; + struct rb_node *node, *next, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); int d, err; @@ -307,7 +311,9 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, * Values stored in the tree are in reversed order, starting from * highest to lowest value. 
*/ - for (node = first; node != NULL; node = rb_next(node)) { + for (node = first; node != NULL; node = next) { + next = rb_next(node); + rbe = rb_entry(node, struct nft_rbtree_elem, node); if (!nft_set_elem_active(&rbe->ext, genmask)) From 6e0ea296898b9a6e930297d9cf9f864ef831753f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:38:31 +0100 Subject: [PATCH 30/93] netfilter: ipset: Add schedule point in call_ad(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Kuniyuki Iwashima commit 24e227896bbf003165e006732dccb3516f87f88e syzkaller found a repro that causes Hung Task [0] with ipset. The repro first creates an ipset and then tries to delete a large number of IPs from the ipset concurrently: IPSET_ATTR_IPADDR_IPV4 : 172.20.20.187 IPSET_ATTR_CIDR : 2 The first deleting thread hogs a CPU with nfnl_lock(NFNL_SUBSYS_IPSET) held, and other threads wait for it to be released. Previously, the same issue existed in set->variant->uadt() that could run so long under ip_set_lock(set). Commit 5e29dc36bd5e ("netfilter: ipset: Rework long task execution when adding/deleting entries") tried to fix it, but the issue still exists in the caller with another mutex. While adding/deleting many IPs, we should release the CPU periodically to prevent someone from abusing ipset to hang the system. Note we need to increment the ipset's refcnt to prevent the ipset from being destroyed while rescheduling. [0]: INFO: task syz-executor174:268 blocked for more than 143 seconds. Not tainted 6.4.0-rc1-00145-gba79e9a73284 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:syz-executor174 state:D stack:0 pid:268 ppid:260 flags:0x0000000d Call trace: __switch_to+0x308/0x714 arch/arm64/kernel/process.c:556 context_switch kernel/sched/core.c:5343 [inline] __schedule+0xd84/0x1648 kernel/sched/core.c:6669 schedule+0xf0/0x214 kernel/sched/core.c:6745 schedule_preempt_disabled+0x58/0xf0 kernel/sched/core.c:6804 __mutex_lock_common kernel/locking/mutex.c:679 [inline] __mutex_lock+0x6fc/0xdb0 kernel/locking/mutex.c:747 __mutex_lock_slowpath+0x14/0x20 kernel/locking/mutex.c:1035 mutex_lock+0x98/0xf0 kernel/locking/mutex.c:286 nfnl_lock net/netfilter/nfnetlink.c:98 [inline] nfnetlink_rcv_msg+0x480/0x70c net/netfilter/nfnetlink.c:295 netlink_rcv_skb+0x1c0/0x350 net/netlink/af_netlink.c:2546 nfnetlink_rcv+0x18c/0x199c net/netfilter/nfnetlink.c:658 netlink_unicast_kernel net/netlink/af_netlink.c:1339 [inline] netlink_unicast+0x664/0x8cc net/netlink/af_netlink.c:1365 netlink_sendmsg+0x6d0/0xa4c net/netlink/af_netlink.c:1913 sock_sendmsg_nosec net/socket.c:724 [inline] sock_sendmsg net/socket.c:747 [inline] ____sys_sendmsg+0x4b8/0x810 net/socket.c:2503 ___sys_sendmsg net/socket.c:2557 [inline] __sys_sendmsg+0x1f8/0x2a4 net/socket.c:2586 __do_sys_sendmsg net/socket.c:2595 [inline] __se_sys_sendmsg net/socket.c:2593 [inline] __arm64_sys_sendmsg+0x80/0x94 net/socket.c:2593 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall+0x84/0x270 arch/arm64/kernel/syscall.c:52 el0_svc_common+0x134/0x24c arch/arm64/kernel/syscall.c:142 do_el0_svc+0x64/0x198 arch/arm64/kernel/syscall.c:193 el0_svc+0x2c/0x7c arch/arm64/kernel/entry-common.c:637 el0t_64_sync_handler+0x84/0xf0 arch/arm64/kernel/entry-common.c:655 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:591 Reported-by: syzkaller Fixes: a7b4f989a629 ("netfilter: ipset: IP set core support") Signed-off-by: Kuniyuki 
Iwashima Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 24e227896bbf003165e006732dccb3516f87f88e) Signed-off-by: Marcin Wcisło --- net/netfilter/ipset/ip_set_core.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 2aad29dcd396f..b225e2752169f 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1698,6 +1698,14 @@ call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, bool eexist = flags & IPSET_FLAG_EXIST, retried = false; do { + if (retried) { + __ip_set_get(set); + nfnl_unlock(NFNL_SUBSYS_IPSET); + cond_resched(); + nfnl_lock(NFNL_SUBSYS_IPSET); + __ip_set_put(set); + } + ip_set_lock(set); ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); ip_set_unlock(set); From fa1abba41f67e1292e330d7766130b9814568e6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:39:33 +0100 Subject: [PATCH 31/93] netfilter: nf_tables: Extend nft_expr_ops::dump callback parameters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Phil Sutter commit 7d34aa3e03b6a56306296bd98b26c6a1710cd57b upstream-diff Used the cleanly applying 9.4 backport 55895d03af8b5ffdd189f0f26c688049439862c6 Add a 'reset' flag just like with nft_object_ops::dump. This will be useful to reset "anonymous stateful objects", e.g. simple rule counters. No functional change intended. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 7d34aa3e03b6a56306296bd98b26c6a1710cd57b) Signed-off-by: Marcin Wcisło --- include/net/netfilter/nf_tables.h | 3 ++- include/net/netfilter/nft_fib.h | 2 +- include/net/netfilter/nft_meta.h | 4 ++-- include/net/netfilter/nft_reject.h | 3 ++- net/ipv4/netfilter/nft_dup_ipv4.c | 3 ++- net/ipv6/netfilter/nft_dup_ipv6.c | 3 ++- net/netfilter/nf_tables_api.c | 2 +- net/netfilter/nft_bitwise.c | 6 ++++-- net/netfilter/nft_byteorder.c | 3 ++- net/netfilter/nft_cmp.c | 6 ++++-- net/netfilter/nft_compat.c | 9 ++++++--- net/netfilter/nft_connlimit.c | 3 ++- net/netfilter/nft_counter.c | 3 ++- net/netfilter/nft_ct.c | 6 ++++-- net/netfilter/nft_dup_netdev.c | 3 ++- net/netfilter/nft_dynset.c | 3 ++- net/netfilter/nft_exthdr.c | 6 ++++-- net/netfilter/nft_fib.c | 2 +- net/netfilter/nft_flow_offload.c | 3 ++- net/netfilter/nft_fwd_netdev.c | 6 ++++-- net/netfilter/nft_hash.c | 4 ++-- net/netfilter/nft_immediate.c | 3 ++- net/netfilter/nft_last.c | 3 ++- net/netfilter/nft_limit.c | 5 +++-- net/netfilter/nft_log.c | 3 ++- net/netfilter/nft_lookup.c | 3 ++- net/netfilter/nft_masq.c | 3 ++- net/netfilter/nft_meta.c | 5 +++-- net/netfilter/nft_nat.c | 3 ++- net/netfilter/nft_numgen.c | 6 ++++-- net/netfilter/nft_objref.c | 6 ++++-- net/netfilter/nft_osf.c | 3 ++- net/netfilter/nft_payload.c | 6 ++++-- net/netfilter/nft_queue.c | 6 ++++-- net/netfilter/nft_quota.c | 3 ++- net/netfilter/nft_range.c | 3 ++- net/netfilter/nft_redir.c | 3 ++- net/netfilter/nft_reject.c | 3 ++- net/netfilter/nft_rt.c | 2 +- net/netfilter/nft_socket.c | 2 +- net/netfilter/nft_synproxy.c | 3 ++- net/netfilter/nft_tproxy.c | 2 +- net/netfilter/nft_tunnel.c | 2 +- net/netfilter/nft_xfrm.c | 2 +- 44 files changed, 104 insertions(+), 59 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 26411f1386227..5589fa4c20236 100644 --- a/include/net/netfilter/nf_tables.h +++ 
b/include/net/netfilter/nf_tables.h @@ -903,7 +903,8 @@ struct nft_expr_ops { void (*destroy_clone)(const struct nft_ctx *ctx, const struct nft_expr *expr); int (*dump)(struct sk_buff *skb, - const struct nft_expr *expr); + const struct nft_expr *expr, + bool reset); int (*validate)(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data); diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index 237f3757637e1..73482be7d61fa 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -18,7 +18,7 @@ nft_fib_is_loopback(const struct sk_buff *skb, const struct net_device *in) return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK; } -int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr); +int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset); int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index 2dce55c736f40..8e923794c91bc 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -23,10 +23,10 @@ int nft_meta_set_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]); int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr); + const struct nft_expr *expr, bool reset); int nft_meta_set_dump(struct sk_buff *skb, - const struct nft_expr *expr); + const struct nft_expr *expr, bool reset); void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, diff --git a/include/net/netfilter/nft_reject.h b/include/net/netfilter/nft_reject.h index 56b123a42220e..6d9ba62efd750 100644 --- a/include/net/netfilter/nft_reject.h +++ b/include/net/netfilter/nft_reject.h @@ -22,7 +22,8 @@ int nft_reject_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); -int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr); +int nft_reject_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset); int nft_reject_icmp_code(u8 code); int nft_reject_icmpv6_code(u8 code); diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index aeb631760eb9e..cae5b38335b3a 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -52,7 +52,8 @@ static int nft_dup_ipv4_init(const struct nft_ctx *ctx, return err; } -static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_dup_ipv4_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_dup_ipv4 *priv = nft_expr_priv(expr); diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c index 3a00d95e964e9..e859beb29bb11 100644 --- a/net/ipv6/netfilter/nft_dup_ipv6.c +++ b/net/ipv6/netfilter/nft_dup_ipv6.c @@ -50,7 +50,8 @@ static int nft_dup_ipv6_init(const struct nft_ctx *ctx, return err; } -static int nft_dup_ipv6_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_dup_ipv6_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_dup_ipv6 *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index dc17f48684fe5..341536f718069 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2782,7 +2782,7 @@ static int nf_tables_fill_expr_info(struct sk_buff 
*skb, NFTA_EXPR_DATA); if (data == NULL) goto nla_put_failure; - if (expr->ops->dump(skb, expr) < 0) + if (expr->ops->dump(skb, expr, false) < 0) goto nla_put_failure; nla_nest_end(skb, data); } diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 47b0dba95054f..6eba4bf42f2c0 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -230,7 +230,8 @@ static int nft_bitwise_dump_shift(struct sk_buff *skb, return 0; } -static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_bitwise_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_bitwise *priv = nft_expr_priv(expr); int err = 0; @@ -344,7 +345,8 @@ static int nft_bitwise_fast_init(const struct nft_ctx *ctx, } static int -nft_bitwise_fast_dump(struct sk_buff *skb, const struct nft_expr *expr) +nft_bitwise_fast_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr); struct nft_data data; diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index 7b0b8fecb2205..73d6bfbdd1d7e 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -147,7 +147,8 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, priv->len); } -static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_byteorder_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_byteorder *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 47b6d05f1ae69..ee29b94149f23 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -96,7 +96,8 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return 0; } -static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_cmp_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_cmp_expr *priv = nft_expr_priv(expr); @@ -242,7 +243,8 @@ static int nft_cmp_fast_offload(struct nft_offload_ctx *ctx, return __nft_cmp_offload(ctx, flow, &cmp); } -static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_cmp_fast_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); enum nft_cmp_ops op = priv->inv ? 
NFT_CMP_NEQ : NFT_CMP_EQ; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index d354cefa783ca..e3c17b5d12d66 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -326,7 +326,8 @@ static int nft_extension_dump_info(struct sk_buff *skb, int attr, return 0; } -static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_target_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct xt_target *target = expr->ops->data; void *info = nft_expr_priv(expr); @@ -574,12 +575,14 @@ static int __nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr, return -1; } -static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_match_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { return __nft_match_dump(skb, expr, nft_expr_priv(expr)); } -static int nft_match_large_dump(struct sk_buff *skb, const struct nft_expr *e) +static int nft_match_large_dump(struct sk_buff *skb, + const struct nft_expr *e, bool reset) { struct nft_xt_match_priv *priv = nft_expr_priv(e); diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index 7d0761fad37ef..2fe5ae339cfa8 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -171,7 +171,8 @@ static void nft_connlimit_eval(const struct nft_expr *expr, nft_connlimit_do_eval(priv, regs, pkt, NULL); } -static int nft_connlimit_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_connlimit_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_connlimit *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 8edd3b3c173d7..066b996cf9c70 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -201,7 +201,8 @@ static void nft_counter_eval(const struct nft_expr *expr, nft_counter_do_eval(priv, regs, pkt); } -static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_counter_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 9c7472af9e4a1..0378691a01d91 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -638,7 +638,8 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx, nf_ct_netns_put(ctx->net, ctx->family); } -static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_ct_get_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_ct *priv = nft_expr_priv(expr); @@ -677,7 +678,8 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) return -1; } -static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_ct_set_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_ct *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c index 5b5c607fbf83f..2007700bef8e6 100644 --- a/net/netfilter/nft_dup_netdev.c +++ b/net/netfilter/nft_dup_netdev.c @@ -44,7 +44,8 @@ static int nft_dup_netdev_init(const struct nft_ctx *ctx, sizeof(int)); } -static int nft_dup_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_dup_netdev_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_dup_netdev *priv = nft_expr_priv(expr); 
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 29c7ae8789e95..aa288e2488d50 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -357,7 +357,8 @@ static void nft_dynset_destroy(const struct nft_ctx *ctx, nf_tables_destroy_set(ctx, priv->set); } -static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_dynset_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_dynset *priv = nft_expr_priv(expr); u32 flags = priv->invert ? NFT_DYNSET_F_INV : 0; diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 3609680831a14..7b432f17ab434 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -503,7 +503,8 @@ static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr * return -1; } -static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_exthdr_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_exthdr *priv = nft_expr_priv(expr); @@ -513,7 +514,8 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) return nft_exthdr_dump_common(skb, priv); } -static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_exthdr_dump_set(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_exthdr *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index b10ce732b337c..b125efa18db8c 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -114,7 +114,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, } EXPORT_SYMBOL_GPL(nft_fib_init); -int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr) +int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_fib *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index aac6db8680d47..6db8c802d5e76 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -410,7 +410,8 @@ static void nft_flow_offload_destroy(const struct nft_ctx *ctx, nf_ct_netns_put(ctx->net, ctx->family); } -static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_flow_offload_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_flow_offload *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 08e7a289738e0..a534d060ce1b6 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -56,7 +56,8 @@ static int nft_fwd_netdev_init(const struct nft_ctx *ctx, sizeof(int)); } -static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_fwd_netdev_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_fwd_netdev *priv = nft_expr_priv(expr); @@ -186,7 +187,8 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx, addr_len); } -static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_fwd_neigh_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_fwd_neigh *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index f829f5289e162..4fc99c80b28e2 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -139,7 +139,7 @@ static int 
nft_symhash_init(const struct nft_ctx *ctx, } static int nft_jhash_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_jhash *priv = nft_expr_priv(expr); @@ -166,7 +166,7 @@ static int nft_jhash_dump(struct sk_buff *skb, } static int nft_symhash_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_symhash *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 3042b32310fae..a927234e16469 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -203,7 +203,8 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx, } } -static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_immediate_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c index 304e33cbed9b4..7935f4849a444 100644 --- a/net/netfilter/nft_last.c +++ b/net/netfilter/nft_last.c @@ -54,7 +54,8 @@ static void nft_last_eval(const struct nft_expr *expr, WRITE_ONCE(priv->last_set, 1); } -static int nft_last_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_last_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_last_priv *priv = nft_expr_priv(expr); unsigned long last_jiffies = READ_ONCE(priv->last_jiffies); diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index c626dc10df78e..6675014fa080d 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -168,7 +168,8 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx, return 0; } -static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_limit_pkts_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_limit_pkts *priv = nft_expr_priv(expr); @@ -205,7 +206,7 @@ static int nft_limit_bytes_init(const struct nft_ctx *ctx, } static int nft_limit_bytes_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_limit *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 54f6c2035e84d..84e3744118773 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -241,7 +241,8 @@ static void nft_log_destroy(const struct nft_ctx *ctx, nf_logger_put(ctx->family, li->type); } -static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_log_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_log *priv = nft_expr_priv(expr); const struct nf_loginfo *li = &priv->loginfo; diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 9d18c5428d53c..2fff95e0740a6 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -178,7 +178,8 @@ static void nft_lookup_destroy(const struct nft_ctx *ctx, nf_tables_destroy_set(ctx, priv->set); } -static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_lookup_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_lookup *priv = nft_expr_priv(expr); u32 flags = priv->invert ? 
NFT_LOOKUP_F_INV : 0; diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 1818dbf089cad..667ab4ec09914 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -73,7 +73,8 @@ static int nft_masq_init(const struct nft_ctx *ctx, return nf_ct_netns_get(ctx->net, ctx->family); } -static int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_masq_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_masq *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 2f8686b711cf4..7fb92f2683b26 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -667,7 +667,7 @@ int nft_meta_set_init(const struct nft_ctx *ctx, EXPORT_SYMBOL_GPL(nft_meta_set_init); int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -682,7 +682,8 @@ int nft_meta_get_dump(struct sk_buff *skb, } EXPORT_SYMBOL_GPL(nft_meta_get_dump); -int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +int nft_meta_set_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_meta *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index cd4eb4996aff3..296667e254201 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -255,7 +255,8 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return nf_ct_netns_get(ctx->net, family); } -static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_nat_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_nat *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 4e43214e88def..f3391583abe44 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -89,7 +89,8 @@ static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, return -1; } -static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_ng_inc_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_ng_inc *priv = nft_expr_priv(expr); @@ -137,7 +138,8 @@ static int nft_ng_random_init(const struct nft_ctx *ctx, NULL, NFT_DATA_VALUE, sizeof(u32)); } -static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_ng_random_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_ng_random *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 3ff91bcaa5f24..e873401182899 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -47,7 +47,8 @@ static int nft_objref_init(const struct nft_ctx *ctx, return 0; } -static int nft_objref_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_objref_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_object *obj = nft_objref_priv(expr); @@ -155,7 +156,8 @@ static int nft_objref_map_init(const struct nft_ctx *ctx, return 0; } -static int nft_objref_map_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_objref_map_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_objref_map *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index 
720dc9fba6d4f..12b5dedd006e8 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -92,7 +92,8 @@ static int nft_osf_init(const struct nft_ctx *ctx, return 0; } -static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_osf_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_osf *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 92422bc720a88..a5a89c238c3ba 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -150,7 +150,8 @@ static int nft_payload_init(const struct nft_ctx *ctx, priv->len); } -static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_payload_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_payload *priv = nft_expr_priv(expr); @@ -708,7 +709,8 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, priv->len); } -static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_payload_set_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_payload_set *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index 9ba1de51ac070..6758d9d72ca05 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -127,7 +127,8 @@ static int nft_queue_sreg_init(const struct nft_ctx *ctx, return 0; } -static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_queue_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_queue *priv = nft_expr_priv(expr); @@ -143,7 +144,8 @@ static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr) } static int -nft_queue_sreg_dump(struct sk_buff *skb, const struct nft_expr *expr) +nft_queue_sreg_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_queue *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index c4d1389f7185a..9fc97ff20f598 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -198,7 +198,8 @@ static int nft_quota_init(const struct nft_ctx *ctx, return nft_quota_do_init(tb, priv); } -static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_quota_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_quota *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index e4a1c44d7f513..8c10fc727b14e 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -114,7 +114,8 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr return err; } -static int nft_range_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_range_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_range_expr *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index e64f531d66cfc..a42f3620dd3e1 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -75,7 +75,8 @@ static int nft_redir_init(const struct nft_ctx *ctx, return nf_ct_netns_get(ctx->net, ctx->family); } -static int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_redir_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_redir *priv = 
nft_expr_priv(expr); diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 927ff8459bd90..f2addc844dd2d 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -69,7 +69,8 @@ int nft_reject_init(const struct nft_ctx *ctx, } EXPORT_SYMBOL_GPL(nft_reject_init); -int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) +int nft_reject_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_reject *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index bcd01a63e38f1..63a6069aa02b9 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -146,7 +146,7 @@ static int nft_rt_get_init(const struct nft_ctx *ctx, } static int nft_rt_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_rt *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 9ad9cc0d1d27c..dac1797e98e85 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -196,7 +196,7 @@ static int nft_socket_init(const struct nft_ctx *ctx, } static int nft_socket_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_socket *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index 1133e06f3c40e..bf7268908154a 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -272,7 +272,8 @@ static void nft_synproxy_destroy(const struct nft_ctx *ctx, nft_synproxy_do_destroy(ctx); } -static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_synproxy_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_synproxy *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index 9fea90ed79d44..4909f940f5cc5 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -294,7 +294,7 @@ static void nft_tproxy_destroy(const struct nft_ctx *ctx, } static int nft_tproxy_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_tproxy *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 2ee50996da8cc..a5b6ba4779d1f 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -106,7 +106,7 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx, } static int nft_tunnel_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_tunnel *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c index cbbbc4ecad3ae..f823007f9f70f 100644 --- a/net/netfilter/nft_xfrm.c +++ b/net/netfilter/nft_xfrm.c @@ -210,7 +210,7 @@ static void nft_xfrm_get_eval(const struct nft_expr *expr, } static int nft_xfrm_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_xfrm *priv = nft_expr_priv(expr); From 6ae8fd31e0a96a17e1f463ad11d363bd2de8767c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:44:29 +0100 Subject: [PATCH 32/93] netfilter: nf_tables: Introduce NFT_MSG_GETRULE_RESET MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Phil Sutter 
commit 8daa8fde3fc3f069ff0b5c87079a5c1df7743113 upstream-diff Used the cleanly applying 9.4 backport ce1ee31e2ba4b463a7eff82bbfc2a0dc3b8dc2df Analogous to NFT_MSG_GETOBJ_RESET, but for rules: Reset stateful expressions like counters or quotas. The latter two are the only consumers, adjust their 'dump' callbacks to respect the parameter introduced earlier. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 8daa8fde3fc3f069ff0b5c87079a5c1df7743113) Signed-off-by: Marcin Wcisło --- include/net/netfilter/nf_tables.h | 2 +- include/uapi/linux/netfilter/nf_tables.h | 2 + net/netfilter/nf_tables_api.c | 49 ++++++++++++++++-------- net/netfilter/nft_counter.c | 2 +- net/netfilter/nft_dynset.c | 4 +- net/netfilter/nft_quota.c | 2 +- 6 files changed, 40 insertions(+), 21 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 5589fa4c20236..20fd5c9c5a213 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -360,7 +360,7 @@ static inline void *nft_expr_priv(const struct nft_expr *expr) int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, - const struct nft_expr *expr); + const struct nft_expr *expr, bool reset); struct nft_set_ext; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 08db4ee06ab6f..9488cab9866d1 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -97,6 +97,7 @@ enum nft_verdicts { * @NFT_MSG_NEWFLOWTABLE: add new flow table (enum nft_flowtable_attributes) * @NFT_MSG_GETFLOWTABLE: get flow table (enum nft_flowtable_attributes) * @NFT_MSG_DELFLOWTABLE: delete flow table (enum nft_flowtable_attributes) + * @NFT_MSG_GETRULE_RESET: get rules and reset stateful expressions (enum nft_obj_attributes) */ enum nf_tables_msg_types { NFT_MSG_NEWTABLE, @@ -124,6 +125,7 @@ enum nf_tables_msg_types { NFT_MSG_NEWFLOWTABLE, NFT_MSG_GETFLOWTABLE, NFT_MSG_DELFLOWTABLE, + NFT_MSG_GETRULE_RESET, NFT_MSG_MAX, }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 341536f718069..092ba48f65742 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2772,7 +2772,7 @@ static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = { }; static int nf_tables_fill_expr_info(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name)) goto nla_put_failure; @@ -2782,7 +2782,7 @@ static int nf_tables_fill_expr_info(struct sk_buff *skb, NFTA_EXPR_DATA); if (data == NULL) goto nla_put_failure; - if (expr->ops->dump(skb, expr, false) < 0) + if (expr->ops->dump(skb, expr, reset) < 0) goto nla_put_failure; nla_nest_end(skb, data); } @@ -2794,14 +2794,14 @@ static int nf_tables_fill_expr_info(struct sk_buff *skb, }; int nft_expr_dump(struct sk_buff *skb, unsigned int attr, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, attr); if (!nest) goto nla_put_failure; - if (nf_tables_fill_expr_info(skb, expr) < 0) + if (nf_tables_fill_expr_info(skb, expr, reset) < 0) goto nla_put_failure; nla_nest_end(skb, nest); return 0; @@ -3010,7 +3010,8 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u32 flags, int 
family, const struct nft_table *table, const struct nft_chain *chain, - const struct nft_rule *rule, u64 handle) + const struct nft_rule *rule, u64 handle, + bool reset) { struct nlmsghdr *nlh; const struct nft_expr *expr, *next; @@ -3043,7 +3044,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, if (list == NULL) goto nla_put_failure; nft_rule_for_each_expr(expr, next, rule) { - if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0) + if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0) goto nla_put_failure; } nla_nest_end(skb, list); @@ -3094,7 +3095,7 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, event, flags, ctx->family, ctx->table, - ctx->chain, rule, handle); + ctx->chain, rule, handle, false); if (err < 0) { kfree_skb(skb); goto err; @@ -3115,7 +3116,8 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, unsigned int *idx, struct netlink_callback *cb, const struct nft_table *table, - const struct nft_chain *chain) + const struct nft_chain *chain, + bool reset) { struct net *net = sock_net(skb->sk); const struct nft_rule *rule, *prule; @@ -3142,7 +3144,7 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, table->family, - table, chain, rule, handle) < 0) + table, chain, rule, handle, reset) < 0) return 1; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -3165,6 +3167,10 @@ static int nf_tables_dump_rules(struct sk_buff *skb, struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nftables_pernet *nft_net; + bool reset = false; + + if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET) + reset = true; rcu_read_lock(); nft_net = nft_pernet(net); @@ -3189,14 +3195,15 @@ static int nf_tables_dump_rules(struct sk_buff *skb, if (!nft_is_active(net, chain)) continue; __nf_tables_dump_rules(skb, &idx, - cb, table, chain); + cb, table, chain, reset); break; } goto done; } list_for_each_entry_rcu(chain, &table->chains, list) { - if (__nf_tables_dump_rules(skb, &idx, cb, table, chain)) + if (__nf_tables_dump_rules(skb, &idx, + cb, table, chain, reset)) goto done; } @@ -3267,6 +3274,7 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, struct net *net = info->net; struct nft_table *table; struct sk_buff *skb2; + bool reset = false; int err; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { @@ -3303,9 +3311,12 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, if (!skb2) return -ENOMEM; + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET) + reset = true; + err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, - family, table, chain, rule, 0); + family, table, chain, rule, 0, reset); if (err < 0) goto err_fill_rule_info; @@ -4139,7 +4150,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, if (set->num_exprs == 1) { nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR); - if (nf_tables_fill_expr_info(skb, set->exprs[0]) < 0) + if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0) goto nla_put_failure; nla_nest_end(skb, nest); @@ -4150,7 +4161,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, for (i = 0; i < set->num_exprs; i++) { if (nft_expr_dump(skb, NFTA_LIST_ELEM, - set->exprs[i]) < 0) + set->exprs[i], false) < 0) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -5062,7 +5073,7 @@ static int 
nft_set_elem_expr_dump(struct sk_buff *skb, if (num_exprs == 1) { expr = nft_setelem_expr_at(elem_expr, 0); - if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr) < 0) + if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, false) < 0) return -1; return 0; @@ -5073,7 +5084,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb, nft_setelem_expr_foreach(expr, elem_expr, size) { expr = nft_setelem_expr_at(elem_expr, size); - if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0) + if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, false) < 0) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -8375,6 +8386,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, + [NFT_MSG_GETRULE_RESET] = { + .call = nf_tables_getrule, + .type = NFNL_CB_RCU, + .attr_count = NFTA_RULE_MAX, + .policy = nft_rule_policy, + }, [NFT_MSG_DELRULE] = { .call = nf_tables_delrule, .type = NFNL_CB_BATCH, diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 066b996cf9c70..fc2b892a48ea1 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -206,7 +206,7 @@ static int nft_counter_dump(struct sk_buff *skb, { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); - return nft_counter_do_dump(skb, priv, false); + return nft_counter_do_dump(skb, priv, reset); } static int nft_counter_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index aa288e2488d50..57ab41a5fb498 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -380,7 +380,7 @@ static int nft_dynset_dump(struct sk_buff *skb, if (priv->set->num_exprs == 0) { if (priv->num_exprs == 1) { if (nft_expr_dump(skb, NFTA_DYNSET_EXPR, - priv->expr_array[0])) + priv->expr_array[0], reset)) goto nla_put_failure; } else if (priv->num_exprs > 1) { struct nlattr *nest; @@ -391,7 +391,7 @@ static int nft_dynset_dump(struct sk_buff *skb, for (i = 0; i < priv->num_exprs; i++) { if (nft_expr_dump(skb, NFTA_LIST_ELEM, - priv->expr_array[i])) + priv->expr_array[i], reset)) goto nla_put_failure; } nla_nest_end(skb, nest); diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 9fc97ff20f598..133807253498b 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -203,7 +203,7 @@ static int nft_quota_dump(struct sk_buff *skb, { struct nft_quota *priv = nft_expr_priv(expr); - return nft_quota_do_dump(skb, priv, false); + return nft_quota_do_dump(skb, priv, reset); } static struct nft_expr_type nft_quota_type; From 8a0661f88958bb686011c0d7682d67d9f4cc1e37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:46:06 +0100 Subject: [PATCH 33/93] netfilter: nf_tables: set element extended ACK reporting support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit b53c116642502b0c85ecef78bff4f826a7dd4145 Report the element that causes problems via netlink extended ACK for set element commands. 
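For orientation (editorial note, not part of the upstream message): the change is the same small pattern repeated in the get, add and delete set-element handlers, condensed below from the hunks that follow; no new identifiers are introduced here, the loop simply records the offending nested attribute in the extended ACK before propagating the error.

    nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
            err = nft_get_set_elem(&ctx, set, attr);
            if (err < 0) {
                    /* tell userspace which element attribute was rejected */
                    NL_SET_BAD_ATTR(extack, attr);
                    break;
            }
    }
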
Signed-off-by: Pablo Neira Ayuso (cherry picked from commit b53c116642502b0c85ecef78bff4f826a7dd4145) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 092ba48f65742..86c7d795efadc 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5571,8 +5571,10 @@ static int nf_tables_getsetelem(struct sk_buff *skb, nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_get_set_elem(&ctx, set, attr); - if (err < 0) + if (err < 0) { + NL_SET_BAD_ATTR(extack, attr); break; + } } return err; @@ -6401,8 +6403,10 @@ static int nf_tables_newsetelem(struct sk_buff *skb, nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags); - if (err < 0) + if (err < 0) { + NL_SET_BAD_ATTR(extack, attr); return err; + } } if (nft_net->validate_state == NFT_VALIDATE_DO) @@ -6674,8 +6678,10 @@ static int nf_tables_delsetelem(struct sk_buff *skb, nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_del_setelem(&ctx, set, attr); - if (err < 0) + if (err < 0) { + NL_SET_BAD_ATTR(extack, attr); break; + } } return err; } From a4771f950250c2e9cc4513392de355d8b944a5b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:46:11 +0100 Subject: [PATCH 34/93] netfilter: nf_tables: add support to destroy operation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Fernando Fernandez Mancera commit f80a612dd77c4585171e44a06b490466bdeec1ae Introduce NFT_MSG_DESTROY* message type. The destroy operation performs a delete operation but ignoring the ENOENT errors. This is useful for the transaction semantics, where failing to delete an object which does not exist results in aborting the transaction. This new command allows the transaction to proceed in case the object does not exist. 
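For orientation (editorial note, not part of the upstream message): every delete handler gains the same guard; the sketch below condenses the deltable hunk from the diff that follows, and the other DESTROY types are handled identically.

    table = nft_table_lookup(net, attr, family, genmask,
                             NETLINK_CB(skb).portid);
    if (IS_ERR(table)) {
            /* A DESTROY request treats a missing object as success
             * instead of aborting the whole batch with ENOENT. */
            if (PTR_ERR(table) == -ENOENT &&
                NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYTABLE)
                    return 0;

            NL_SET_BAD_ATTR(extack, attr);
            return PTR_ERR(table);
    }

In userspace terms this is what lets a destroy request succeed on an object that is already gone, where a plain delete keeps failing with ENOENT and aborts the transaction.
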
Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal (cherry picked from commit f80a612dd77c4585171e44a06b490466bdeec1ae) Signed-off-by: Marcin Wcisło --- include/uapi/linux/netfilter/nf_tables.h | 14 +++ net/netfilter/nf_tables_api.c | 111 +++++++++++++++++++++-- 2 files changed, 117 insertions(+), 8 deletions(-) diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 9488cab9866d1..a6236b5256e94 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -98,6 +98,13 @@ enum nft_verdicts { * @NFT_MSG_GETFLOWTABLE: get flow table (enum nft_flowtable_attributes) * @NFT_MSG_DELFLOWTABLE: delete flow table (enum nft_flowtable_attributes) * @NFT_MSG_GETRULE_RESET: get rules and reset stateful expressions (enum nft_obj_attributes) + * @NFT_MSG_DESTROYTABLE: destroy a table (enum nft_table_attributes) + * @NFT_MSG_DESTROYCHAIN: destroy a chain (enum nft_chain_attributes) + * @NFT_MSG_DESTROYRULE: destroy a rule (enum nft_rule_attributes) + * @NFT_MSG_DESTROYSET: destroy a set (enum nft_set_attributes) + * @NFT_MSG_DESTROYSETELEM: destroy a set element (enum nft_set_elem_attributes) + * @NFT_MSG_DESTROYOBJ: destroy a stateful object (enum nft_object_attributes) + * @NFT_MSG_DESTROYFLOWTABLE: destroy flow table (enum nft_flowtable_attributes) */ enum nf_tables_msg_types { NFT_MSG_NEWTABLE, @@ -126,6 +133,13 @@ enum nf_tables_msg_types { NFT_MSG_GETFLOWTABLE, NFT_MSG_DELFLOWTABLE, NFT_MSG_GETRULE_RESET, + NFT_MSG_DESTROYTABLE, + NFT_MSG_DESTROYCHAIN, + NFT_MSG_DESTROYRULE, + NFT_MSG_DESTROYSET, + NFT_MSG_DESTROYSETELEM, + NFT_MSG_DESTROYOBJ, + NFT_MSG_DESTROYFLOWTABLE, NFT_MSG_MAX, }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 86c7d795efadc..1ea7696c24e61 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1413,6 +1413,10 @@ static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info, } if (IS_ERR(table)) { + if (PTR_ERR(table) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYTABLE) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(table); } @@ -2640,6 +2644,10 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, chain = nft_chain_lookup(net, table, attr, genmask); } if (IS_ERR(chain)) { + if (PTR_ERR(chain) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(chain); } @@ -3736,6 +3744,10 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { + if (PTR_ERR(rule) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE) + return 0; + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } @@ -3749,6 +3761,10 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, if (nla[NFTA_RULE_HANDLE]) { rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) { + if (PTR_ERR(rule) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE) + return 0; + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); } @@ -4824,6 +4840,10 @@ static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, } if (IS_ERR(set)) { + if (PTR_ERR(set) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == 
NFT_MSG_DESTROYSET) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(set); } @@ -6678,6 +6698,10 @@ static int nf_tables_delsetelem(struct sk_buff *skb, nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_del_setelem(&ctx, set, attr); + if (err == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSETELEM) + continue; + if (err < 0) { NL_SET_BAD_ATTR(extack, attr); break; @@ -7322,6 +7346,10 @@ static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info, } if (IS_ERR(obj)) { + if (PTR_ERR(obj) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYOBJ) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(obj); } @@ -7953,6 +7981,10 @@ static int nf_tables_delflowtable(struct sk_buff *skb, } if (IS_ERR(flowtable)) { + if (PTR_ERR(flowtable) == -ENOENT && + NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYFLOWTABLE) + return 0; + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(flowtable); } @@ -8362,6 +8394,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, + [NFT_MSG_DESTROYTABLE] = { + .call = nf_tables_deltable, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_TABLE_MAX, + .policy = nft_table_policy, + }, [NFT_MSG_NEWCHAIN] = { .call = nf_tables_newchain, .type = NFNL_CB_BATCH, @@ -8380,6 +8418,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, + [NFT_MSG_DESTROYCHAIN] = { + .call = nf_tables_delchain, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_CHAIN_MAX, + .policy = nft_chain_policy, + }, [NFT_MSG_NEWRULE] = { .call = nf_tables_newrule, .type = NFNL_CB_BATCH, @@ -8404,6 +8448,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, + [NFT_MSG_DESTROYRULE] = { + .call = nf_tables_delrule, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_RULE_MAX, + .policy = nft_rule_policy, + }, [NFT_MSG_NEWSET] = { .call = nf_tables_newset, .type = NFNL_CB_BATCH, @@ -8422,6 +8472,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, + [NFT_MSG_DESTROYSET] = { + .call = nf_tables_delset, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_SET_MAX, + .policy = nft_set_policy, + }, [NFT_MSG_NEWSETELEM] = { .call = nf_tables_newsetelem, .type = NFNL_CB_BATCH, @@ -8440,6 +8496,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, + [NFT_MSG_DESTROYSETELEM] = { + .call = nf_tables_delsetelem, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_SET_ELEM_LIST_MAX, + .policy = nft_set_elem_list_policy, + }, [NFT_MSG_GETGEN] = { .call = nf_tables_getgen, .type = NFNL_CB_RCU, @@ -8462,6 +8524,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, + [NFT_MSG_DESTROYOBJ] = { + .call = nf_tables_delobj, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_OBJ_MAX, + .policy = nft_obj_policy, + }, [NFT_MSG_GETOBJ_RESET] = { .call = nf_tables_getobj, .type = NFNL_CB_RCU, @@ -8486,6 +8554,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, + [NFT_MSG_DESTROYFLOWTABLE] = { + .call = nf_tables_delflowtable, + .type = NFNL_CB_BATCH, + .attr_count = NFTA_FLOWTABLE_MAX, + .policy = nft_flowtable_policy, 
+ }, }; static int nf_tables_validate(struct net *net) @@ -8581,6 +8655,7 @@ static void nft_commit_release(struct nft_trans *trans) { switch (trans->msg_type) { case NFT_MSG_DELTABLE: + case NFT_MSG_DESTROYTABLE: nf_tables_table_destroy(&trans->ctx); break; case NFT_MSG_NEWCHAIN: @@ -8588,23 +8663,29 @@ static void nft_commit_release(struct nft_trans *trans) kfree(nft_trans_chain_name(trans)); break; case NFT_MSG_DELCHAIN: + case NFT_MSG_DESTROYCHAIN: nf_tables_chain_destroy(&trans->ctx); break; case NFT_MSG_DELRULE: + case NFT_MSG_DESTROYRULE: nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); break; case NFT_MSG_DELSET: + case NFT_MSG_DESTROYSET: nft_set_destroy(&trans->ctx, nft_trans_set(trans)); break; case NFT_MSG_DELSETELEM: + case NFT_MSG_DESTROYSETELEM: nf_tables_set_elem_destroy(&trans->ctx, nft_trans_elem_set(trans), nft_trans_elem(trans).priv); break; case NFT_MSG_DELOBJ: + case NFT_MSG_DESTROYOBJ: nft_obj_destroy(&trans->ctx, nft_trans_obj(trans)); break; case NFT_MSG_DELFLOWTABLE: + case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans)); else @@ -9007,8 +9088,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELTABLE: + case NFT_MSG_DESTROYTABLE: list_del_rcu(&trans->ctx.table->list); - nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE); + nf_tables_table_notify(&trans->ctx, trans->msg_type); break; case NFT_MSG_NEWCHAIN: if (nft_trans_chain_update(trans)) { @@ -9023,8 +9105,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } break; case NFT_MSG_DELCHAIN: + case NFT_MSG_DESTROYCHAIN: nft_chain_del(trans->ctx.chain); - nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN); + nf_tables_chain_notify(&trans->ctx, trans->msg_type); nf_tables_unregister_hook(trans->ctx.net, trans->ctx.table, trans->ctx.chain); @@ -9040,10 +9123,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELRULE: + case NFT_MSG_DESTROYRULE: list_del_rcu(&nft_trans_rule(trans)->list); nf_tables_rule_notify(&trans->ctx, nft_trans_rule(trans), - NFT_MSG_DELRULE); + trans->msg_type); nft_rule_expr_deactivate(&trans->ctx, nft_trans_rule(trans), NFT_TRANS_COMMIT); @@ -9071,9 +9155,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELSET: + case NFT_MSG_DESTROYSET: list_del_rcu(&nft_trans_set(trans)->list); nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), - NFT_MSG_DELSET, GFP_KERNEL); + trans->msg_type, GFP_KERNEL); break; case NFT_MSG_NEWSETELEM: te = (struct nft_trans_elem *)trans->data; @@ -9085,11 +9170,12 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELSETELEM: + case NFT_MSG_DESTROYSETELEM: te = (struct nft_trans_elem *)trans->data; nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, - NFT_MSG_DELSETELEM); + trans->msg_type); nft_setelem_remove(net, te->set, &te->elem); if (!nft_setelem_is_catchall(te->set, &te->elem)) { atomic_dec(&te->set->nelems); @@ -9111,9 +9197,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } break; case NFT_MSG_DELOBJ: + case NFT_MSG_DESTROYOBJ: nft_obj_del(nft_trans_obj(trans)); nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans), - NFT_MSG_DELOBJ); + trans->msg_type); break; case NFT_MSG_NEWFLOWTABLE: if (nft_trans_flowtable_update(trans)) { @@ -9135,11 +9222,12 @@ 
static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELFLOWTABLE: + case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) { nf_tables_flowtable_notify(&trans->ctx, nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans), - NFT_MSG_DELFLOWTABLE); + trans->msg_type); nft_unregister_flowtable_net_hooks(net, &nft_trans_flowtable_hooks(trans)); } else { @@ -9147,7 +9235,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_flowtable_notify(&trans->ctx, nft_trans_flowtable(trans), &nft_trans_flowtable(trans)->hook_list, - NFT_MSG_DELFLOWTABLE); + trans->msg_type); nft_unregister_flowtable_net_hooks(net, &nft_trans_flowtable(trans)->hook_list); } @@ -9243,6 +9331,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } break; case NFT_MSG_DELTABLE: + case NFT_MSG_DESTROYTABLE: nft_clear(trans->ctx.net, trans->ctx.table); nft_trans_destroy(trans); break; @@ -9264,6 +9353,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } break; case NFT_MSG_DELCHAIN: + case NFT_MSG_DESTROYCHAIN: trans->ctx.table->use++; nft_clear(trans->ctx.net, trans->ctx.chain); nft_trans_destroy(trans); @@ -9282,6 +9372,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_DELRULE: + case NFT_MSG_DESTROYRULE: trans->ctx.chain->use++; nft_clear(trans->ctx.net, nft_trans_rule(trans)); nft_rule_expr_activate(&trans->ctx, nft_trans_rule(trans)); @@ -9303,6 +9394,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) list_del_rcu(&nft_trans_set(trans)->list); break; case NFT_MSG_DELSET: + case NFT_MSG_DESTROYSET: trans->ctx.table->use++; nft_clear(trans->ctx.net, nft_trans_set(trans)); nft_trans_destroy(trans); @@ -9318,6 +9410,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) atomic_dec(&te->set->nelems); break; case NFT_MSG_DELSETELEM: + case NFT_MSG_DESTROYSETELEM: te = (struct nft_trans_elem *)trans->data; nft_setelem_data_activate(net, te->set, &te->elem); @@ -9337,6 +9430,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } break; case NFT_MSG_DELOBJ: + case NFT_MSG_DESTROYOBJ: trans->ctx.table->use++; nft_clear(trans->ctx.net, nft_trans_obj(trans)); nft_trans_destroy(trans); @@ -9353,6 +9447,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } break; case NFT_MSG_DELFLOWTABLE: + case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) { list_splice(&nft_trans_flowtable_hooks(trans), &nft_trans_flowtable(trans)->hook_list); From 75be23083dc5b0c02c73b58b42c0e67bdd0b7594 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:46:30 +0100 Subject: [PATCH 35/93] netfilter: nf_tables: fix wrong pointer passed to PTR_ERR() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Yang Yingliang commit 1fb7696ac6c3ac470dd002e639af80e7e170b25f It should be 'chain' passed to PTR_ERR() in the error path after calling nft_chain_lookup() in nf_tables_delrule(). 
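For orientation (editorial note): at this point in nf_tables_delrule() the rule has not been looked up yet, so 'rule' is still uninitialized when the chain lookup fails; the ENOENT test has to read the error out of the same pointer that IS_ERR() was applied to. The corrected error path, condensed from the one-line diff below:

    chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask);
    if (IS_ERR(chain)) {
            if (PTR_ERR(chain) == -ENOENT &&        /* was: PTR_ERR(rule) */
                NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE)
                    return 0;

            NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
            return PTR_ERR(chain);
    }
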
Fixes: f80a612dd77c ("netfilter: nf_tables: add support to destroy operation") Signed-off-by: Yang Yingliang Reviewed-by: Simon Horman Acked-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 1fb7696ac6c3ac470dd002e639af80e7e170b25f) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1ea7696c24e61..f1979cbeccbee 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3744,7 +3744,7 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { - if (PTR_ERR(rule) == -ENOENT && + if (PTR_ERR(chain) == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE) return 0; From b39b3ba76ed50cc3713365e2a5ed8de4b146e334 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:46:32 +0100 Subject: [PATCH 36/93] netfilter: nf_tables: Introduce NFT_MSG_GETSETELEM_RESET MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Phil Sutter commit 079cd633219d7298d087cd115c17682264244c18 Analogous to NFT_MSG_GETOBJ_RESET, but for set elements with a timeout or attached stateful expressions like counters or quotas - reset them all at once. Respect a per element timeout value if present to reset the 'expires' value to. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 079cd633219d7298d087cd115c17682264244c18) Signed-off-by: Marcin Wcisło --- include/uapi/linux/netfilter/nf_tables.h | 2 + net/netfilter/nf_tables_api.c | 68 +++++++++++++++++------- 2 files changed, 50 insertions(+), 20 deletions(-) diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index a6236b5256e94..67b73b5a8d5a5 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -105,6 +105,7 @@ enum nft_verdicts { * @NFT_MSG_DESTROYSETELEM: destroy a set element (enum nft_set_elem_attributes) * @NFT_MSG_DESTROYOBJ: destroy a stateful object (enum nft_object_attributes) * @NFT_MSG_DESTROYFLOWTABLE: destroy flow table (enum nft_flowtable_attributes) + * @NFT_MSG_GETSETELEM_RESET: get set elements and reset attached stateful expressions (enum nft_set_elem_attributes) */ enum nf_tables_msg_types { NFT_MSG_NEWTABLE, @@ -140,6 +141,7 @@ enum nf_tables_msg_types { NFT_MSG_DESTROYSETELEM, NFT_MSG_DESTROYOBJ, NFT_MSG_DESTROYFLOWTABLE, + NFT_MSG_GETSETELEM_RESET, NFT_MSG_MAX, }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f1979cbeccbee..3ea312aca5475 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5080,7 +5080,8 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + static int nft_set_elem_expr_dump(struct sk_buff *skb, const struct nft_set *set, - const struct nft_set_ext *ext) + const struct nft_set_ext *ext, + bool reset) { struct nft_set_elem_expr *elem_expr; u32 size, num_exprs = 0; @@ -5093,7 +5094,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb, if (num_exprs == 1) { expr = nft_setelem_expr_at(elem_expr, 0); - if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, false) < 0) + if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, reset) < 0) return -1; return 0; @@ -5104,7 
+5105,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb, nft_setelem_expr_foreach(expr, elem_expr, size) { expr = nft_setelem_expr_at(elem_expr, size); - if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, false) < 0) + if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -5117,11 +5118,13 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb, static int nf_tables_fill_setelem(struct sk_buff *skb, const struct nft_set *set, - const struct nft_set_elem *elem) + const struct nft_set_elem *elem, + bool reset) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; + u64 timeout = 0; nest = nla_nest_start_noflag(skb, NFTA_LIST_ELEM); if (nest == NULL) @@ -5144,7 +5147,7 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS) && - nft_set_elem_expr_dump(skb, set, ext)) + nft_set_elem_expr_dump(skb, set, ext, reset)) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) && @@ -5157,11 +5160,15 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, htonl(*nft_set_ext_flags(ext)))) goto nla_put_failure; - if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && - nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, - nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)), - NFTA_SET_ELEM_PAD)) - goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) { + timeout = *nft_set_ext_timeout(ext); + if (nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, + nf_jiffies64_to_msecs(timeout), + NFTA_SET_ELEM_PAD)) + goto nla_put_failure; + } else if (set->flags & NFT_SET_TIMEOUT) { + timeout = READ_ONCE(set->timeout); + } if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { u64 expires, now = get_jiffies_64(); @@ -5176,6 +5183,9 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, nf_jiffies64_to_msecs(expires), NFTA_SET_ELEM_PAD)) goto nla_put_failure; + + if (reset) + *nft_set_ext_expiration(ext) = now + timeout; } if (nft_set_ext_exists(ext, NFT_SET_EXT_USERDATA)) { @@ -5199,6 +5209,7 @@ struct nft_set_dump_args { const struct netlink_callback *cb; struct nft_set_iter iter; struct sk_buff *skb; + bool reset; }; static int nf_tables_dump_setelem(const struct nft_ctx *ctx, @@ -5209,7 +5220,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, struct nft_set_dump_args *args; args = container_of(iter, struct nft_set_dump_args, iter); - return nf_tables_fill_setelem(args->skb, set, elem); + return nf_tables_fill_setelem(args->skb, set, elem, args->reset); } struct nft_set_dump_ctx { @@ -5218,7 +5229,7 @@ struct nft_set_dump_ctx { }; static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb, - const struct nft_set *set) + const struct nft_set *set, bool reset) { struct nft_set_elem_catchall *catchall; u8 genmask = nft_genmask_cur(net); @@ -5233,7 +5244,7 @@ static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb, continue; elem.priv = catchall->elem; - ret = nf_tables_fill_setelem(skb, set, &elem); + ret = nf_tables_fill_setelem(skb, set, &elem, reset); break; } @@ -5251,6 +5262,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) bool set_found = false; struct nlmsghdr *nlh; struct nlattr *nest; + bool reset = false; u32 portid, seq; int event; @@ -5298,8 +5310,12 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (nest == NULL) goto nla_put_failure; + if 
(NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET) + reset = true; + args.cb = cb; args.skb = skb; + args.reset = reset; args.iter.genmask = nft_genmask_cur(net); args.iter.skip = cb->args[0]; args.iter.count = 0; @@ -5308,7 +5324,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) set->ops->walk(&dump_ctx->ctx, set, &args.iter); if (!args.iter.err && args.iter.count == cb->args[0]) - args.iter.err = nft_set_catchall_dump(net, skb, set); + args.iter.err = nft_set_catchall_dump(net, skb, set, reset); rcu_read_unlock(); nla_nest_end(skb, nest); @@ -5346,7 +5362,8 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, const struct nft_ctx *ctx, u32 seq, u32 portid, int event, u16 flags, const struct nft_set *set, - const struct nft_set_elem *elem) + const struct nft_set_elem *elem, + bool reset) { struct nlmsghdr *nlh; struct nlattr *nest; @@ -5367,7 +5384,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, if (nest == NULL) goto nla_put_failure; - err = nf_tables_fill_setelem(skb, set, elem); + err = nf_tables_fill_setelem(skb, set, elem, reset); if (err < 0) goto nla_put_failure; @@ -5487,7 +5504,7 @@ static int nft_setelem_get(struct nft_ctx *ctx, struct nft_set *set, } static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, - const struct nlattr *attr) + const struct nlattr *attr, bool reset) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_set_elem elem; @@ -5531,7 +5548,8 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid, - NFT_MSG_NEWSETELEM, 0, set, &elem); + NFT_MSG_NEWSETELEM, 0, set, &elem, + reset); if (err < 0) goto err_fill_setelem; @@ -5555,6 +5573,7 @@ static int nf_tables_getsetelem(struct sk_buff *skb, struct nft_set *set; struct nlattr *attr; struct nft_ctx ctx; + bool reset = false; int rem, err = 0; table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, @@ -5589,8 +5608,11 @@ static int nf_tables_getsetelem(struct sk_buff *skb, if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) return -EINVAL; + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET) + reset = true; + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { - err = nft_get_set_elem(&ctx, set, attr); + err = nft_get_set_elem(&ctx, set, attr, reset); if (err < 0) { NL_SET_BAD_ATTR(extack, attr); break; @@ -5623,7 +5645,7 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx, flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags, - set, elem); + set, elem, false); if (err < 0) { kfree_skb(skb); goto err; @@ -8490,6 +8512,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, + [NFT_MSG_GETSETELEM_RESET] = { + .call = nf_tables_getsetelem, + .type = NFNL_CB_RCU, + .attr_count = NFTA_SET_ELEM_LIST_MAX, + .policy = nft_set_elem_list_policy, + }, [NFT_MSG_DELSETELEM] = { .call = nf_tables_delsetelem, .type = NFNL_CB_BATCH, From fecc5cd39d129b49691562be0bfe146422800e1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:47:13 +0100 Subject: [PATCH 37/93] netfilter: nf_tables: upfront validation of data via nft_data_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit 
341b6941608762d8235f3fd1e45e4d7114ed8c2c upstream-diff Used the cleanly applying 9.4 backport 2313c5293d5c608023f736efdd03b54191d9379b Instead of parsing the data and then validate that type and length are correct, pass a description of the expected data so it can be validated upfront before parsing it to bail out earlier. This patch adds a new .size field to specify the maximum size of the data area. The .len field is optional and it is used as an input/output field, it provides the specific length of the expected data in the input path. If then .len field is not specified, then obtained length from the netlink attribute is stored. This is required by cmp, bitwise, range and immediate, which provide no netlink attribute that describes the data length. The immediate expression uses the destination register type to infer the expected data type. Relying on opencoded validation of the expected data might lead to subtle bugs as described in 7e6bc1f6cabc ("netfilter: nf_tables: stricter validation of element data"). Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 341b6941608762d8235f3fd1e45e4d7114ed8c2c) Signed-off-by: Marcin Wcisło --- include/net/netfilter/nf_tables.h | 4 +- net/netfilter/nf_tables_api.c | 81 ++++++++++++++++--------------- net/netfilter/nft_bitwise.c | 67 ++++++++++++------------- net/netfilter/nft_cmp.c | 36 ++++++-------- net/netfilter/nft_immediate.c | 22 +++++++-- net/netfilter/nft_range.c | 27 +++++------ 6 files changed, 123 insertions(+), 114 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 20fd5c9c5a213..1d8705e9732a5 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -197,12 +197,12 @@ enum nft_data_desc_flags { struct nft_data_desc { enum nft_data_types type; + unsigned int size; unsigned int len; unsigned int flags; }; -int nft_data_init(const struct nft_ctx *ctx, - struct nft_data *data, unsigned int size, +int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla); void nft_data_hold(const struct nft_data *data, enum nft_data_types type); void nft_data_release(const struct nft_data *data, enum nft_data_types type); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3ea312aca5475..930a672b5d8b0 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5420,19 +5420,13 @@ static int nft_setelem_parse_flags(const struct nft_set *set, static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set, struct nft_data *key, struct nlattr *attr) { - struct nft_data_desc desc; - int err; - - err = nft_data_init(ctx, key, NFT_DATA_VALUE_MAXLEN, &desc, attr); - if (err < 0) - return err; - - if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) { - nft_data_release(key, desc.type); - return -EINVAL; - } + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = NFT_DATA_VALUE_MAXLEN, + .len = set->klen, + }; - return 0; + return nft_data_init(ctx, key, &desc, attr); } static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, @@ -5441,26 +5435,18 @@ static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, struct nlattr *attr) { u32 dtype; - int err; - - desc->flags = NFT_DATA_DESC_SETELEM; - - err = nft_data_init(ctx, data, NFT_DATA_VALUE_MAXLEN, desc, attr); - if (err < 0) - return err; if (set->dtype == NFT_DATA_VERDICT) dtype = NFT_DATA_VERDICT; else dtype = NFT_DATA_VALUE; - if (dtype != 
desc->type || - set->dlen != desc->len) { - nft_data_release(data, desc->type); - return -EINVAL; - } + desc->type = dtype; + desc->size = NFT_DATA_VALUE_MAXLEN; + desc->len = set->dlen; + desc->flags = NFT_DATA_DESC_SETELEM; - return 0; + return nft_data_init(ctx, data, desc, attr); } static void *nft_setelem_catchall_get(const struct net *net, @@ -9931,7 +9917,7 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, } desc->len = sizeof(data->verdict); - desc->type = NFT_DATA_VERDICT; + return 0; } @@ -9974,20 +9960,25 @@ int nft_verdict_dump(struct sk_buff *skb, int type, const struct nft_verdict *v) } static int nft_value_init(const struct nft_ctx *ctx, - struct nft_data *data, unsigned int size, - struct nft_data_desc *desc, const struct nlattr *nla) + struct nft_data *data, struct nft_data_desc *desc, + const struct nlattr *nla) { unsigned int len; len = nla_len(nla); if (len == 0) return -EINVAL; - if (len > size) + if (len > desc->size) return -EOVERFLOW; + if (desc->len) { + if (len != desc->len) + return -EINVAL; + } else { + desc->len = len; + } nla_memcpy(data->data, nla, len); - desc->type = NFT_DATA_VALUE; - desc->len = len; + return 0; } @@ -10007,7 +9998,6 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = { * * @ctx: context of the expression using the data * @data: destination struct nft_data - * @size: maximum data length * @desc: data description * @nla: netlink attribute containing data * @@ -10017,24 +10007,35 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = { * The caller can indicate that it only wants to accept data of type * NFT_DATA_VALUE by passing NULL for the ctx argument. */ -int nft_data_init(const struct nft_ctx *ctx, - struct nft_data *data, unsigned int size, +int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla) { struct nlattr *tb[NFTA_DATA_MAX + 1]; int err; + if (WARN_ON_ONCE(!desc->size)) + return -EINVAL; + err = nla_parse_nested_deprecated(tb, NFTA_DATA_MAX, nla, nft_data_policy, NULL); if (err < 0) return err; - if (tb[NFTA_DATA_VALUE]) - return nft_value_init(ctx, data, size, desc, - tb[NFTA_DATA_VALUE]); - if (tb[NFTA_DATA_VERDICT] && ctx != NULL) - return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]); - return -EINVAL; + if (tb[NFTA_DATA_VALUE]) { + if (desc->type != NFT_DATA_VALUE) + return -EINVAL; + + err = nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]); + } else if (tb[NFTA_DATA_VERDICT] && ctx != NULL) { + if (desc->type != NFT_DATA_VERDICT) + return -EINVAL; + + err = nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]); + } else { + err = -EINVAL; + } + + return err; } EXPORT_SYMBOL_GPL(nft_data_init); diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 6eba4bf42f2c0..5bb2c61b6782e 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -93,7 +93,16 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { static int nft_bitwise_init_bool(struct nft_bitwise *priv, const struct nlattr *const tb[]) { - struct nft_data_desc mask, xor; + struct nft_data_desc mask = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->mask), + .len = priv->len, + }; + struct nft_data_desc xor = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->xor), + .len = priv->len, + }; int err; if (tb[NFTA_BITWISE_DATA]) @@ -103,36 +112,30 @@ static int nft_bitwise_init_bool(struct nft_bitwise *priv, !tb[NFTA_BITWISE_XOR]) return -EINVAL; - err = 
nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &mask, - tb[NFTA_BITWISE_MASK]); + err = nft_data_init(NULL, &priv->mask, &mask, tb[NFTA_BITWISE_MASK]); if (err < 0) return err; - if (mask.type != NFT_DATA_VALUE || mask.len != priv->len) { - err = -EINVAL; - goto err1; - } - err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &xor, - tb[NFTA_BITWISE_XOR]); + err = nft_data_init(NULL, &priv->xor, &xor, tb[NFTA_BITWISE_XOR]); if (err < 0) - goto err1; - if (xor.type != NFT_DATA_VALUE || xor.len != priv->len) { - err = -EINVAL; - goto err2; - } + goto err_xor_err; return 0; -err2: - nft_data_release(&priv->xor, xor.type); -err1: + +err_xor_err: nft_data_release(&priv->mask, mask.type); + return err; } static int nft_bitwise_init_shift(struct nft_bitwise *priv, const struct nlattr *const tb[]) { - struct nft_data_desc d; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data), + .len = sizeof(u32), + }; int err; if (tb[NFTA_BITWISE_MASK] || @@ -142,13 +145,12 @@ static int nft_bitwise_init_shift(struct nft_bitwise *priv, if (!tb[NFTA_BITWISE_DATA]) return -EINVAL; - err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &d, - tb[NFTA_BITWISE_DATA]); + err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_BITWISE_DATA]); if (err < 0) return err; - if (d.type != NFT_DATA_VALUE || d.len != sizeof(u32) || - priv->data.data[0] >= BITS_PER_TYPE(u32)) { - nft_data_release(&priv->data, d.type); + + if (priv->data.data[0] >= BITS_PER_TYPE(u32)) { + nft_data_release(&priv->data, desc.type); return -EINVAL; } @@ -291,22 +293,21 @@ static const struct nft_expr_ops nft_bitwise_ops = { static int nft_bitwise_extract_u32_data(const struct nlattr * const tb, u32 *out) { - struct nft_data_desc desc; struct nft_data data; - int err = 0; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(data), + .len = sizeof(u32), + }; + int err; - err = nft_data_init(NULL, &data, sizeof(data), &desc, tb); + err = nft_data_init(NULL, &data, &desc, tb); if (err < 0) return err; - if (desc.type != NFT_DATA_VALUE || desc.len != sizeof(u32)) { - err = -EINVAL; - goto err; - } *out = data.data[0]; -err: - nft_data_release(&data, desc.type); - return err; + + return 0; } static int nft_bitwise_fast_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index ee29b94149f23..5e625f5120bed 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -73,20 +73,16 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_cmp_expr *priv = nft_expr_priv(expr); - struct nft_data_desc desc; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data), + }; int err; - err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &desc, - tb[NFTA_CMP_DATA]); + err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]); if (err < 0) return err; - if (desc.type != NFT_DATA_VALUE) { - err = -EINVAL; - nft_data_release(&priv->data, desc.type); - return err; - } - err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; @@ -202,12 +198,14 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); - struct nft_data_desc desc; struct nft_data data; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(data), + }; int err; - err = nft_data_init(NULL, &data, sizeof(data), &desc, - 
tb[NFTA_CMP_DATA]); + err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]); if (err < 0) return err; @@ -277,8 +275,11 @@ const struct nft_expr_ops nft_cmp_fast_ops = { static const struct nft_expr_ops * nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { - struct nft_data_desc desc; struct nft_data data; + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(data), + }; enum nft_cmp_ops op; int err; @@ -300,21 +301,14 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) return ERR_PTR(-EINVAL); } - err = nft_data_init(NULL, &data, sizeof(data), &desc, - tb[NFTA_CMP_DATA]); + err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]); if (err < 0) return ERR_PTR(err); - if (desc.type != NFT_DATA_VALUE) - goto err1; - if (desc.len <= sizeof(u32) && (op == NFT_CMP_EQ || op == NFT_CMP_NEQ)) return &nft_cmp_fast_ops; return &nft_cmp_ops; -err1: - nft_data_release(&data, desc.type); - return ERR_PTR(-EINVAL); } struct nft_expr_type nft_cmp_type __read_mostly = { diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index a927234e16469..7c810005a1f9f 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -29,20 +29,36 @@ static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = { [NFTA_IMMEDIATE_DATA] = { .type = NLA_NESTED }, }; +static enum nft_data_types nft_reg_to_type(const struct nlattr *nla) +{ + enum nft_data_types type; + u8 reg; + + reg = ntohl(nla_get_be32(nla)); + if (reg == NFT_REG_VERDICT) + type = NFT_DATA_VERDICT; + else + type = NFT_DATA_VALUE; + + return type; +} + static int nft_immediate_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_immediate_expr *priv = nft_expr_priv(expr); - struct nft_data_desc desc; + struct nft_data_desc desc = { + .size = sizeof(priv->data), + }; int err; if (tb[NFTA_IMMEDIATE_DREG] == NULL || tb[NFTA_IMMEDIATE_DATA] == NULL) return -EINVAL; - err = nft_data_init(ctx, &priv->data, sizeof(priv->data), &desc, - tb[NFTA_IMMEDIATE_DATA]); + desc.type = nft_reg_to_type(tb[NFTA_IMMEDIATE_DREG]); + err = nft_data_init(ctx, &priv->data, &desc, tb[NFTA_IMMEDIATE_DATA]); if (err < 0) return err; diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index 8c10fc727b14e..f8258d5202297 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -51,7 +51,14 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr const struct nlattr * const tb[]) { struct nft_range_expr *priv = nft_expr_priv(expr); - struct nft_data_desc desc_from, desc_to; + struct nft_data_desc desc_from = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data_from), + }; + struct nft_data_desc desc_to = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data_to), + }; int err; u32 op; @@ -61,26 +68,16 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr !tb[NFTA_RANGE_TO_DATA]) return -EINVAL; - err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from), - &desc_from, tb[NFTA_RANGE_FROM_DATA]); + err = nft_data_init(NULL, &priv->data_from, &desc_from, + tb[NFTA_RANGE_FROM_DATA]); if (err < 0) return err; - if (desc_from.type != NFT_DATA_VALUE) { - err = -EINVAL; - goto err1; - } - - err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to), - &desc_to, tb[NFTA_RANGE_TO_DATA]); + err = nft_data_init(NULL, &priv->data_to, &desc_to, + tb[NFTA_RANGE_TO_DATA]); if (err < 0) goto err1; - if 
(desc_to.type != NFT_DATA_VALUE) { - err = -EINVAL; - goto err2; - } - if (desc_from.len != desc_to.len) { err = -EINVAL; goto err2; From dfc61266e1a17b38957df503390ede1eaa123d5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:48:08 +0100 Subject: [PATCH 38/93] netfilter: nf_tables: integrate pipapo into commit protocol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit 212ed75dc5fb9d1423b3942c8f872a868cda3466 The pipapo set backend follows copy-on-update approach, maintaining one clone of the existing datastructure that is being updated. The clone and current datastructures are swapped via rcu from the commit step. The existing integration with the commit protocol is flawed because there is no operation to clean up the clone if the transaction is aborted. Moreover, the datastructure swap happens on set element activation. This patch adds two new operations for sets: commit and abort, these new operations are invoked from the commit and abort steps, after the transactions have been digested, and it updates the pipapo set backend to use it. This patch adds a new ->pending_update field to sets to maintain a list of sets that require this new commit and abort operations. Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 212ed75dc5fb9d1423b3942c8f872a868cda3466) Signed-off-by: Marcin Wcisło --- include/net/netfilter/nf_tables.h | 4 ++- net/netfilter/nf_tables_api.c | 56 +++++++++++++++++++++++++++++++ net/netfilter/nft_set_pipapo.c | 55 +++++++++++++++++++++--------- 3 files changed, 99 insertions(+), 16 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 1d8705e9732a5..1187929715274 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -425,7 +425,8 @@ struct nft_set_ops { const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags); - + void (*commit)(const struct nft_set *set); + void (*abort)(const struct nft_set *set); u64 (*privsize)(const struct nlattr * const nla[], const struct nft_set_desc *desc); bool (*estimate)(const struct nft_set_desc *desc, @@ -520,6 +521,7 @@ struct nft_set { u16 policy; u16 udlen; unsigned char *udata; + struct list_head pending_update; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; u16 flags:14, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 930a672b5d8b0..ec4ab381cb337 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4760,6 +4760,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, set->num_exprs = num_exprs; set->handle = nf_tables_alloc_handle(table); + INIT_LIST_HEAD(&set->pending_update); err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); if (err < 0) @@ -9015,10 +9016,25 @@ static void nf_tables_commit_audit_log(struct list_head *adl, u32 generation) } } +static void nft_set_commit_update(struct list_head *set_update_list) +{ + struct nft_set *set, *next; + + list_for_each_entry_safe(set, next, set_update_list, pending_update) { + list_del_init(&set->pending_update); + + if (!set->ops->commit) + continue; + + set->ops->commit(set); + } +} + static int nf_tables_commit(struct net *net, struct sk_buff *skb) { struct nftables_pernet *nft_net = nft_pernet(net); struct 
nft_trans *trans, *next; + LIST_HEAD(set_update_list); struct nft_trans_elem *te; struct nft_chain *chain; struct nft_table *table; @@ -9181,6 +9197,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, NFT_MSG_NEWSETELEM); + if (te->set->ops->commit && + list_empty(&te->set->pending_update)) { + list_add_tail(&te->set->pending_update, + &set_update_list); + } nft_trans_destroy(trans); break; case NFT_MSG_DELSETELEM: @@ -9195,6 +9216,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) atomic_dec(&te->set->nelems); te->set->ndeact--; } + if (te->set->ops->commit && + list_empty(&te->set->pending_update)) { + list_add_tail(&te->set->pending_update, + &set_update_list); + } break; case NFT_MSG_NEWOBJ: if (nft_trans_obj_update(trans)) { @@ -9257,6 +9283,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } } + nft_set_commit_update(&set_update_list); + nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); nf_tables_commit_audit_log(&adl, nft_net->base_seq); @@ -9313,10 +9341,25 @@ static void nf_tables_abort_release(struct nft_trans *trans) kfree(trans); } +static void nft_set_abort_update(struct list_head *set_update_list) +{ + struct nft_set *set, *next; + + list_for_each_entry_safe(set, next, set_update_list, pending_update) { + list_del_init(&set->pending_update); + + if (!set->ops->abort) + continue; + + set->ops->abort(set); + } +} + static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans, *next; + LIST_HEAD(set_update_list); struct nft_trans_elem *te; if (action == NFNL_ABORT_VALIDATE && @@ -9422,6 +9465,12 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_setelem_remove(net, te->set, &te->elem); if (!nft_setelem_is_catchall(te->set, &te->elem)) atomic_dec(&te->set->nelems); + + if (te->set->ops->abort && + list_empty(&te->set->pending_update)) { + list_add_tail(&te->set->pending_update, + &set_update_list); + } break; case NFT_MSG_DELSETELEM: case NFT_MSG_DESTROYSETELEM: @@ -9432,6 +9481,11 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) if (!nft_setelem_is_catchall(te->set, &te->elem)) te->set->ndeact--; + if (te->set->ops->abort && + list_empty(&te->set->pending_update)) { + list_add_tail(&te->set->pending_update, + &set_update_list); + } nft_trans_destroy(trans); break; case NFT_MSG_NEWOBJ: @@ -9474,6 +9528,8 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } } + nft_set_abort_update(&set_update_list); + synchronize_rcu(); list_for_each_entry_safe_reverse(trans, next, diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 11e24f56bf805..899fc8911191c 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1600,17 +1600,10 @@ static void pipapo_free_fields(struct nft_pipapo_match *m) } } -/** - * pipapo_reclaim_match - RCU callback to free fields from old matching data - * @rcu: RCU head - */ -static void pipapo_reclaim_match(struct rcu_head *rcu) +static void pipapo_free_match(struct nft_pipapo_match *m) { - struct nft_pipapo_match *m; int i; - m = container_of(rcu, struct nft_pipapo_match, rcu); - for_each_possible_cpu(i) kfree(*per_cpu_ptr(m->scratch, i)); @@ -1625,7 +1618,19 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) } /** - * pipapo_commit() - Replace lookup data 
with current working copy + * pipapo_reclaim_match - RCU callback to free fields from old matching data + * @rcu: RCU head + */ +static void pipapo_reclaim_match(struct rcu_head *rcu) +{ + struct nft_pipapo_match *m; + + m = container_of(rcu, struct nft_pipapo_match, rcu); + pipapo_free_match(m); +} + +/** + * nft_pipapo_commit() - Replace lookup data with current working copy * @set: nftables API set representation * * While at it, check if we should perform garbage collection on the working @@ -1635,7 +1640,7 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) * We also need to create a new working copy for subsequent insertions and * deletions. */ -static void pipapo_commit(const struct nft_set *set) +static void nft_pipapo_commit(const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *new_clone, *old; @@ -1660,6 +1665,26 @@ static void pipapo_commit(const struct nft_set *set) priv->clone = new_clone; } +static void nft_pipapo_abort(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *new_clone, *m; + + if (!priv->dirty) + return; + + m = rcu_dereference(priv->match); + + new_clone = pipapo_clone(m); + if (IS_ERR(new_clone)) + return; + + priv->dirty = false; + + pipapo_free_match(priv->clone); + priv->clone = new_clone; +} + /** * nft_pipapo_activate() - Mark element reference as active given key, commit * @net: Network namespace @@ -1667,8 +1692,7 @@ static void pipapo_commit(const struct nft_set *set) * @elem: nftables API element representation containing key data * * On insertion, elements are added to a copy of the matching data currently - * in use for lookups, and not directly inserted into current lookup data, so - * we'll take care of that by calling pipapo_commit() here. Both + * in use for lookups, and not directly inserted into current lookup data. Both * nft_pipapo_insert() and nft_pipapo_activate() are called once for each * element, hence we can't purpose either one as a real commit operation. */ @@ -1684,8 +1708,6 @@ static void nft_pipapo_activate(const struct net *net, nft_set_elem_change_active(net, set, &e->ext); nft_set_elem_clear_busy(&e->ext); - - pipapo_commit(set); } /** @@ -1935,7 +1957,6 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, if (i == m->field_count) { priv->dirty = true; pipapo_drop(m, rulemap); - pipapo_commit(set); return; } @@ -2238,6 +2259,8 @@ const struct nft_set_type nft_set_pipapo_type = { .init = nft_pipapo_init, .destroy = nft_pipapo_destroy, .gc_init = nft_pipapo_gc_init, + .commit = nft_pipapo_commit, + .abort = nft_pipapo_abort, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; @@ -2260,6 +2283,8 @@ const struct nft_set_type nft_set_pipapo_avx2_type = { .init = nft_pipapo_init, .destroy = nft_pipapo_destroy, .gc_init = nft_pipapo_gc_init, + .commit = nft_pipapo_commit, + .abort = nft_pipapo_abort, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; From 2efc7fd6d860ea6ef07ed6a9c7502e9d4171832e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:48:12 +0100 Subject: [PATCH 39/93] netfilter: nf_tables: relax set/map validation checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Florian Westphal commit a4878eeae39048e6abe85891c714b49dc13fc08c Its currently not allowed to perform queries on a map, for example: table t { map m { typeof ip saddr : meta mark .. 
chain c { ip saddr @m counter will fail, because kernel requires that userspace provides a destination register when the referenced set is a map. However, internally there is no real distinction between sets and maps, maps are just sets where each key is associated with a value. Relax this so that maps can be used just like sets. This allows to have rules that query if a given key exists without making use of the associated value. This also permits != checks which don't work for map lookups. When no destination reg is given for a map, then permit this for named maps. Data and dump paths need to be updated to consider priv->dreg_set instead of the 'set-is-a-map' check. Checks in reduce and validate callbacks are not changed, this can be relaxed later if a need arises. Signed-off-by: Florian Westphal (cherry picked from commit a4878eeae39048e6abe85891c714b49dc13fc08c) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_lookup.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 2fff95e0740a6..5ba799a7722d2 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -19,6 +19,7 @@ struct nft_lookup { struct nft_set *set; u8 sreg; u8 dreg; + bool dreg_set; bool invert; struct nft_set_binding binding; }; @@ -75,7 +76,7 @@ void nft_lookup_eval(const struct nft_expr *expr, } if (ext) { - if (set->flags & NFT_SET_MAP) + if (priv->dreg_set) nft_data_copy(®s->data[priv->dreg], nft_set_ext_data(ext), set->dlen); @@ -122,11 +123,8 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (flags & ~NFT_LOOKUP_F_INV) return -EINVAL; - if (flags & NFT_LOOKUP_F_INV) { - if (set->flags & NFT_SET_MAP) - return -EINVAL; + if (flags & NFT_LOOKUP_F_INV) priv->invert = true; - } } if (tb[NFTA_LOOKUP_DREG] != NULL) { @@ -140,8 +138,17 @@ static int nft_lookup_init(const struct nft_ctx *ctx, set->dlen); if (err < 0) return err; - } else if (set->flags & NFT_SET_MAP) - return -EINVAL; + priv->dreg_set = true; + } else if (set->flags & NFT_SET_MAP) { + /* Map given, but user asks for lookup only (i.e. to + * ignore value assoicated with key). + * + * This makes no sense for anonymous maps since they are + * scoped to the rule, but for named sets this can be useful. + */ + if (set->flags & NFT_SET_ANONYMOUS) + return -EINVAL; + } priv->binding.flags = set->flags & NFT_SET_MAP; @@ -188,7 +195,7 @@ static int nft_lookup_dump(struct sk_buff *skb, goto nla_put_failure; if (nft_dump_register(skb, NFTA_LOOKUP_SREG, priv->sreg)) goto nla_put_failure; - if (priv->set->flags & NFT_SET_MAP) + if (priv->dreg_set) if (nft_dump_register(skb, NFTA_LOOKUP_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_LOOKUP_FLAGS, htonl(flags))) From ba24727bdd7b13e6ce3028a0837fd52d64be9f02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:48:15 +0100 Subject: [PATCH 40/93] netfilter: nft_set_pipapo: .walk does not deal with generations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit 2b84e215f87443c74ac0aa7f76bb172d43a87033 The .walk callback iterates over the current active set, but it might be useful to iterate over the next generation set. Use the generation mask to determine what set view (either current or next generation) is use for the walk iteration. 
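For illustration only (not part of this patch): a minimal sketch of a caller that asks for the next-generation view, in the style of the nft_map_deactivate() helper introduced later in this series. The example_walk_next_gen() name and the callback parameter are hypothetical; the point is that the pipapo backend picks its working clone instead of the RCU-published match data whenever iter.genmask differs from nft_genmask_cur().

  /* Hypothetical helper: walk the set as it will look after commit. */
  static void example_walk_next_gen(const struct nft_ctx *ctx,
                                    struct nft_set *set,
                                    int (*fn)(const struct nft_ctx *,
                                              struct nft_set *,
                                              const struct nft_set_iter *,
                                              struct nft_set_elem *))
  {
          struct nft_set_iter iter = {
                  .genmask = nft_genmask_next(ctx->net), /* next-gen view */
                  .fn      = fn,
          };

          set->ops->walk(ctx, set, &iter); /* pipapo walks priv->clone here */
          WARN_ON_ONCE(iter.err);
  }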
Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 2b84e215f87443c74ac0aa7f76bb172d43a87033) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_set_pipapo.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 899fc8911191c..91a0689e278a2 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1978,12 +1978,16 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { struct nft_pipapo *priv = nft_set_priv(set); + struct net *net = read_pnet(&set->net); struct nft_pipapo_match *m; struct nft_pipapo_field *f; int i, r; rcu_read_lock(); - m = rcu_dereference(priv->match); + if (iter->genmask == nft_genmask_cur(net)) + m = rcu_dereference(priv->match); + else + m = priv->clone; if (unlikely(!m)) goto out; From cde47464e35e44174dc8d0d689ff3129d36538f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:48:57 +0100 Subject: [PATCH 41/93] netfilter: nf_tables: validate variable length element extension MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit 34aae2c2fb1e3d88a5aeee16715cb6bf0336cdce upstream-diff Used the cleanly applying 9.4 backport f74ccdb808d654e842e3b283cc3d77fc6e629fe8 Update template to validate variable length extensions. This patch adds a new .ext_len[id] field to the template to store the expected extension length. This is used to sanity check the initialization of the variable length extension. Use PTR_ERR() in nft_set_elem_init() to report errors since, after this update, there are two reason why this might fail, either because of ENOMEM or insufficient room in the extension field (EINVAL). Kernels up until 7e6bc1f6cabc ("netfilter: nf_tables: stricter validation of element data") allowed to copy more data to the extension than was allocated. This ext_len field allows to validate if the destination has the correct size as additional check. 
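For illustration only (not part of this patch): a minimal sketch of how the new bookkeeping is meant to be used. nft_set_ext_add_length() records the reserved size in tmpl.ext_len[id] when the template is built, nft_set_elem_init() now returns an ERR_PTR() instead of NULL, and callers propagate it with PTR_ERR(). The function and variable names below (example_elem_alloc, elem_priv, key, key_end, data, timeout, expiration) are placeholders.

  static int example_elem_alloc(const struct nft_set *set,
                                const u32 *key, const u32 *key_end,
                                const u32 *data, u64 timeout, u64 expiration)
  {
          struct nft_set_ext_tmpl tmpl;
          void *elem_priv;

          nft_set_ext_prepare(&tmpl);
          /* Reserve klen bytes for the key; the length recorded in
           * tmpl.ext_len[NFT_SET_EXT_KEY] bounds the later copy. */
          if (nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen) < 0)
                  return -EINVAL;

          elem_priv = nft_set_elem_init(set, &tmpl, key, key_end, data,
                                        timeout, expiration, GFP_KERNEL);
          if (IS_ERR(elem_priv))
                  return PTR_ERR(elem_priv); /* -ENOMEM, or -EINVAL if a copy
                                              * would overrun its extension */

          /* ... hand elem_priv to the set backend ... */
          return 0;
  }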
Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 34aae2c2fb1e3d88a5aeee16715cb6bf0336cdce) Signed-off-by: Marcin Wcisło --- include/net/netfilter/nf_tables.h | 4 +- net/netfilter/nf_tables_api.c | 84 +++++++++++++++++++++++++------ net/netfilter/nft_dynset.c | 2 +- 3 files changed, 73 insertions(+), 17 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 1187929715274..40206fdb3169f 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -641,6 +641,7 @@ extern const struct nft_set_ext_type nft_set_ext_types[]; struct nft_set_ext_tmpl { u16 len; u8 offset[NFT_SET_EXT_NUM]; + u8 ext_len[NFT_SET_EXT_NUM]; }; /** @@ -670,7 +671,8 @@ static inline int nft_set_ext_add_length(struct nft_set_ext_tmpl *tmpl, u8 id, return -EINVAL; tmpl->offset[id] = tmpl->len; - tmpl->len += nft_set_ext_types[id].len + len; + tmpl->ext_len[id] = nft_set_ext_types[id].len + len; + tmpl->len += tmpl->ext_len[id]; return 0; } diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ec4ab381cb337..d217f3e447072 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5686,6 +5686,27 @@ struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx, return ERR_PTR(err); } +static int nft_set_ext_check(const struct nft_set_ext_tmpl *tmpl, u8 id, u32 len) +{ + len += nft_set_ext_types[id].len; + if (len > tmpl->ext_len[id] || + len > U8_MAX) + return -1; + + return 0; +} + +static int nft_set_ext_memcpy(const struct nft_set_ext_tmpl *tmpl, u8 id, + void *to, const void *from, u32 len) +{ + if (nft_set_ext_check(tmpl, id, len) < 0) + return -1; + + memcpy(to, from, len); + + return 0; +} + void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *key_end, @@ -5696,17 +5717,26 @@ void *nft_set_elem_init(const struct nft_set *set, elem = kzalloc(set->ops->elemsize + tmpl->len, gfp); if (elem == NULL) - return NULL; + return ERR_PTR(-ENOMEM); ext = nft_set_elem_ext(set, elem); nft_set_ext_init(ext, tmpl); - if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY)) - memcpy(nft_set_ext_key(ext), key, set->klen); - if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) - memcpy(nft_set_ext_key_end(ext), key_end, set->klen); - if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) - memcpy(nft_set_ext_data(ext), data, set->dlen); + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY) && + nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY, + nft_set_ext_key(ext), key, set->klen) < 0) + goto err_ext_check; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END) && + nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY_END, + nft_set_ext_key_end(ext), key_end, set->klen) < 0) + goto err_ext_check; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && + nft_set_ext_memcpy(tmpl, NFT_SET_EXT_DATA, + nft_set_ext_data(ext), data, set->dlen) < 0) + goto err_ext_check; + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { *nft_set_ext_expiration(ext) = get_jiffies_64() + expiration; if (expiration == 0) @@ -5716,6 +5746,11 @@ void *nft_set_elem_init(const struct nft_set *set, *nft_set_ext_timeout(ext) = timeout; return elem; + +err_ext_check: + kfree(elem); + + return ERR_PTR(-EINVAL); } static void __nft_set_elem_expr_destroy(const struct nft_ctx *ctx, @@ -5803,14 +5838,25 @@ int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, } static int nft_set_elem_expr_setup(struct nft_ctx *ctx, + const struct nft_set_ext_tmpl *tmpl, const struct nft_set_ext *ext, struct 
nft_expr *expr_array[], u32 num_exprs) { struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext); + u32 len = sizeof(struct nft_set_elem_expr); struct nft_expr *expr; int i, err; + if (num_exprs == 0) + return 0; + + for (i = 0; i < num_exprs; i++) + len += expr_array[i]->ops->size; + + if (nft_set_ext_check(tmpl, NFT_SET_EXT_EXPRESSIONS, len) < 0) + return -EINVAL; + for (i = 0; i < num_exprs; i++) { expr = nft_setelem_expr_at(elem_expr, elem_expr->size); err = nft_expr_clone(expr, expr_array[i]); @@ -6304,17 +6350,23 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } } - err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, elem.key_end.val.data, elem.data.val.data, timeout, expiration, GFP_KERNEL); - if (elem.priv == NULL) + if (IS_ERR(elem.priv)) { + err = PTR_ERR(elem.priv); goto err_parse_data; + } ext = nft_set_elem_ext(set, elem.priv); if (flags) *nft_set_ext_flags(ext) = flags; + if (ulen > 0) { + if (nft_set_ext_check(&tmpl, NFT_SET_EXT_USERDATA, ulen) < 0) { + err = -EINVAL; + goto err_elem_userdata; + } udata = nft_set_ext_userdata(ext); udata->len = ulen - 1; nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen); @@ -6323,14 +6375,14 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, *nft_set_ext_obj(ext) = obj; obj->use++; } - err = nft_set_elem_expr_setup(ctx, ext, expr_array, num_exprs); + err = nft_set_elem_expr_setup(ctx, &tmpl, ext, expr_array, num_exprs); if (err < 0) - goto err_elem_expr; + goto err_elem_free; trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) { err = -ENOMEM; - goto err_elem_expr; + goto err_elem_free; } ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; @@ -6376,10 +6428,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_setelem_remove(ctx->net, set, &elem); err_element_clash: kfree(trans); -err_elem_expr: +err_elem_free: if (obj) obj->use--; - +err_elem_userdata: nf_tables_set_elem_destroy(ctx, set, elem.priv); err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) @@ -6554,8 +6606,10 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, elem.key_end.val.data, NULL, 0, 0, GFP_KERNEL); - if (elem.priv == NULL) + if (IS_ERR(elem.priv)) { + err = PTR_ERR(elem.priv); goto fail_elem_key_end; + } ext = nft_set_elem_ext(set, elem.priv); if (flags) diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 57ab41a5fb498..c3bd57be2ee88 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -60,7 +60,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, ®s->data[priv->sreg_key], NULL, ®s->data[priv->sreg_data], timeout, 0, GFP_ATOMIC); - if (elem == NULL) + if (IS_ERR(elem)) goto err1; ext = nft_set_elem_ext(set, elem); From 70c31adf2efe4b5100459728e90897c9fd7388d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:49:54 +0100 Subject: [PATCH 42/93] netfilter: nf_tables: drop map element references from preparation phase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit 628bd3e49cba1c066228e23d71a852c23e26da73 set .destroy callback releases the references to other objects in maps. This is very late and it results in spurious EBUSY errors. 
Drop refcount from the preparation phase instead, update set backend not to drop reference counter from set .destroy path. Exceptions: NFT_TRANS_PREPARE_ERROR does not require to drop the reference counter because the transaction abort path releases the map references for each element since the set is unbound. The abort path also deals with releasing reference counter for new elements added to unbound sets. Fixes: 591054469b3e ("netfilter: nf_tables: revisit chain/object refcounting from elements") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 628bd3e49cba1c066228e23d71a852c23e26da73) Signed-off-by: Marcin Wcisło --- include/net/netfilter/nf_tables.h | 5 +- net/netfilter/nf_tables_api.c | 147 ++++++++++++++++++++++++++---- net/netfilter/nft_set_bitmap.c | 5 +- net/netfilter/nft_set_hash.c | 23 ++++- net/netfilter/nft_set_pipapo.c | 14 ++- net/netfilter/nft_set_rbtree.c | 5 +- 6 files changed, 167 insertions(+), 32 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 40206fdb3169f..7bb24d44f096b 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -435,7 +435,8 @@ struct nft_set_ops { int (*init)(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const nla[]); - void (*destroy)(const struct nft_set *set); + void (*destroy)(const struct nft_ctx *ctx, + const struct nft_set *set); void (*gc_init)(const struct nft_set *set); unsigned int elemsize; @@ -772,6 +773,8 @@ int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[]); void nft_set_elem_destroy(const struct nft_set *set, void *elem, bool destroy_expr); +void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, void *elem); /** * struct nft_set_gc_batch_head - nf_tables set garbage collection batch diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d217f3e447072..c56de1cdc551a 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -561,6 +561,58 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, return __nft_trans_set_add(ctx, msg_type, set, NULL); } +static void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem); + +static int nft_mapelem_deactivate(const struct nft_ctx *ctx, + struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_set_elem *elem) +{ + nft_setelem_data_deactivate(ctx->net, set, elem); + + return 0; +} + +struct nft_set_elem_catchall { + struct list_head list; + struct rcu_head rcu; + void *elem; +}; + +static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, + struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_elem elem; + struct nft_set_ext *ext; + + list_for_each_entry(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + elem.priv = catchall->elem; + nft_setelem_data_deactivate(ctx->net, set, &elem); + break; + } +} + +static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set) +{ + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .fn = nft_mapelem_deactivate, + }; + + set->ops->walk(ctx, set, &iter); + WARN_ON_ONCE(iter.err); + + nft_map_catchall_deactivate(ctx, set); +} + static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) { int 
err; @@ -569,6 +621,9 @@ static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) if (err < 0) return err; + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(ctx, set); + nft_deactivate_next(ctx->net, set); ctx->table->use--; @@ -3438,12 +3493,6 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, return 0; } -struct nft_set_elem_catchall { - struct list_head list; - struct rcu_head rcu; - void *elem; -}; - int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set) { u8 genmask = nft_genmask_next(ctx->net); @@ -4774,7 +4823,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, for (i = 0; i < set->num_exprs; i++) nft_expr_destroy(&ctx, set->exprs[i]); err_set_destroy: - ops->destroy(set); + ops->destroy(&ctx, set); err_set_init: kfree(set->name); err_set_name: @@ -4789,7 +4838,7 @@ static void nft_set_catchall_destroy(const struct nft_ctx *ctx, list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { list_del_rcu(&catchall->list); - nft_set_elem_destroy(set, catchall->elem, true); + nf_tables_set_elem_destroy(ctx, set, catchall->elem); kfree_rcu(catchall, rcu); } } @@ -4804,7 +4853,7 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) for (i = 0; i < set->num_exprs; i++) nft_expr_destroy(ctx, set->exprs[i]); - set->ops->destroy(set); + set->ops->destroy(ctx, set); nft_set_catchall_destroy(ctx, set); kfree(set->name); kvfree(set); @@ -4969,10 +5018,60 @@ static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, } } +static void nft_setelem_data_activate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem); + +static int nft_mapelem_activate(const struct nft_ctx *ctx, + struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_set_elem *elem) +{ + nft_setelem_data_activate(ctx->net, set, elem); + + return 0; +} + +static void nft_map_catchall_activate(const struct nft_ctx *ctx, + struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_elem elem; + struct nft_set_ext *ext; + + list_for_each_entry(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + elem.priv = catchall->elem; + nft_setelem_data_activate(ctx->net, set, &elem); + break; + } +} + +static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set) +{ + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .fn = nft_mapelem_activate, + }; + + set->ops->walk(ctx, set, &iter); + WARN_ON_ONCE(iter.err); + + nft_map_catchall_activate(ctx, set); +} + void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set) { - if (nft_set_is_anonymous(set)) + if (nft_set_is_anonymous(set)) { + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_activate(ctx, set); + nft_clear(ctx->net, set); + } set->use++; } @@ -4993,13 +5092,20 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, set->use--; break; case NFT_TRANS_PREPARE: - if (nft_set_is_anonymous(set)) - nft_deactivate_next(ctx->net, set); + if (nft_set_is_anonymous(set)) { + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(ctx, set); + nft_deactivate_next(ctx->net, set); + } set->use--; return; case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: + if (nft_set_is_anonymous(set) && + set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + 
nft_map_deactivate(ctx, set); + set->use--; fallthrough; default: @@ -5774,6 +5880,7 @@ static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, __nft_set_elem_expr_destroy(ctx, expr); } +/* Drop references and destroy. Called from gc, dynset and abort path. */ void nft_set_elem_destroy(const struct nft_set *set, void *elem, bool destroy_expr) { @@ -5795,11 +5902,11 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); -/* Only called from commit path, nft_setelem_data_deactivate() already deals - * with the refcounting from the preparation phase. +/* Destroy element. References have been already dropped in the preparation + * path via nft_setelem_data_deactivate(). */ -static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, - const struct nft_set *set, void *elem) +void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, void *elem) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem); @@ -6432,7 +6539,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (obj) obj->use--; err_elem_userdata: - nf_tables_set_elem_destroy(ctx, set, elem.priv); + nft_set_elem_destroy(set, elem.priv, true); err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) nft_data_release(&elem.data.val, desc.type); @@ -9508,6 +9615,9 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_DESTROYSET: trans->ctx.table->use++; nft_clear(trans->ctx.net, nft_trans_set(trans)); + if (nft_trans_set(trans)->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_activate(&trans->ctx, nft_trans_set(trans)); + nft_trans_destroy(trans); break; case NFT_MSG_NEWSETELEM: @@ -10274,6 +10384,9 @@ static void __nft_release_table(struct net *net, struct nft_table *table) list_for_each_entry_safe(set, ns, &table->sets, list) { list_del(&set->list); table->use--; + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(&ctx, set); + nft_set_destroy(&ctx, set); } list_for_each_entry_safe(obj, ne, &table->objects, list) { diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index e7ae5914971e7..60122539fee67 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -271,13 +271,14 @@ static int nft_bitmap_init(const struct nft_set *set, return 0; } -static void nft_bitmap_destroy(const struct nft_set *set) +static void nft_bitmap_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_bitmap *priv = nft_set_priv(set); struct nft_bitmap_elem *be, *n; list_for_each_entry_safe(be, n, &priv->list, head) - nft_set_elem_destroy(set, be, true); + nf_tables_set_elem_destroy(ctx, set, be); } static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 76de6c8d98655..0b73cb0e752f7 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -400,19 +400,31 @@ static int nft_rhash_init(const struct nft_set *set, return 0; } +struct nft_rhash_ctx { + const struct nft_ctx ctx; + const struct nft_set *set; +}; + static void nft_rhash_elem_destroy(void *ptr, void *arg) { - nft_set_elem_destroy(arg, ptr, true); + struct nft_rhash_ctx *rhash_ctx = arg; + + nf_tables_set_elem_destroy(&rhash_ctx->ctx, rhash_ctx->set, ptr); } -static void nft_rhash_destroy(const struct nft_set *set) +static void nft_rhash_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_rhash *priv = 
nft_set_priv(set); + struct nft_rhash_ctx rhash_ctx = { + .ctx = *ctx, + .set = set, + }; cancel_delayed_work_sync(&priv->gc_work); rcu_barrier(); rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy, - (void *)set); + (void *)&rhash_ctx); } /* Number of buckets is stored in u32, so cap our result to 1U<<31 */ @@ -643,7 +655,8 @@ static int nft_hash_init(const struct nft_set *set, return 0; } -static void nft_hash_destroy(const struct nft_set *set) +static void nft_hash_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_hash *priv = nft_set_priv(set); struct nft_hash_elem *he; @@ -653,7 +666,7 @@ static void nft_hash_destroy(const struct nft_set *set) for (i = 0; i < priv->buckets; i++) { hlist_for_each_entry_safe(he, next, &priv->table[i], node) { hlist_del_rcu(&he->node); - nft_set_elem_destroy(set, he, true); + nf_tables_set_elem_destroy(ctx, set, he); } } } diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 91a0689e278a2..eda2071f8d05c 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2160,10 +2160,12 @@ static int nft_pipapo_init(const struct nft_set *set, /** * nft_set_pipapo_match_destroy() - Destroy elements from key mapping array + * @ctx: context * @set: nftables API set representation * @m: matching data pointing to key mapping array */ -static void nft_set_pipapo_match_destroy(const struct nft_set *set, +static void nft_set_pipapo_match_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, struct nft_pipapo_match *m) { struct nft_pipapo_field *f; @@ -2180,15 +2182,17 @@ static void nft_set_pipapo_match_destroy(const struct nft_set *set, e = f->mt[r].e; - nft_set_elem_destroy(set, e, true); + nf_tables_set_elem_destroy(ctx, set, e); } } /** * nft_pipapo_destroy() - Free private data for set and all committed elements + * @ctx: context * @set: nftables API set representation */ -static void nft_pipapo_destroy(const struct nft_set *set) +static void nft_pipapo_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m; @@ -2198,7 +2202,7 @@ static void nft_pipapo_destroy(const struct nft_set *set) if (m) { rcu_barrier(); - nft_set_pipapo_match_destroy(set, m); + nft_set_pipapo_match_destroy(ctx, set, m); #ifdef NFT_PIPAPO_ALIGN free_percpu(m->scratch_aligned); @@ -2215,7 +2219,7 @@ static void nft_pipapo_destroy(const struct nft_set *set) m = priv->clone; if (priv->dirty) - nft_set_pipapo_match_destroy(set, m); + nft_set_pipapo_match_destroy(ctx, set, m); #ifdef NFT_PIPAPO_ALIGN free_percpu(priv->clone->scratch_aligned); diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 2f114aa10f1a7..5c05c9b990fba 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -664,7 +664,8 @@ static int nft_rbtree_init(const struct nft_set *set, return 0; } -static void nft_rbtree_destroy(const struct nft_set *set) +static void nft_rbtree_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; @@ -675,7 +676,7 @@ static void nft_rbtree_destroy(const struct nft_set *set) while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); - nft_set_elem_destroy(set, rbe, true); + nf_tables_set_elem_destroy(ctx, set, rbe); } } From 9fc975e897c75f75147f9401c4d31395f665114a Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:49:58 +0100 Subject: [PATCH 43/93] netfilter: nf_tables: fix underflow in object reference counter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit d6b478666ffa6d2c25386d78bf1c4640d4da305e Since ("netfilter: nf_tables: drop map element references from preparation phase"), integration with commit protocol is better, therefore drop the workaround that b91d90368837 ("netfilter: nf_tables: fix leaking object reference count") provides. Signed-off-by: Pablo Neira Ayuso (cherry picked from commit d6b478666ffa6d2c25386d78bf1c4640d4da305e) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c56de1cdc551a..a93a44c4c9469 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6469,19 +6469,19 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (flags) *nft_set_ext_flags(ext) = flags; + if (obj) { + *nft_set_ext_obj(ext) = obj; + obj->use++; + } if (ulen > 0) { if (nft_set_ext_check(&tmpl, NFT_SET_EXT_USERDATA, ulen) < 0) { err = -EINVAL; - goto err_elem_userdata; + goto err_elem_free; } udata = nft_set_ext_userdata(ext); udata->len = ulen - 1; nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen); } - if (obj) { - *nft_set_ext_obj(ext) = obj; - obj->use++; - } err = nft_set_elem_expr_setup(ctx, &tmpl, ext, expr_array, num_exprs); if (err < 0) goto err_elem_free; @@ -6536,9 +6536,6 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err_element_clash: kfree(trans); err_elem_free: - if (obj) - obj->use--; -err_elem_userdata: nft_set_elem_destroy(set, elem.priv, true); err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) From a42ed1038fa2812e666e1c75e50f7e8cd3cfaf3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:50:01 +0100 Subject: [PATCH 44/93] netfilter: nf_tables: disallow element updates of bound anonymous sets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit c88c535b592d3baeee74009f3eceeeaf0fdd5e1b Anonymous sets come with NFT_SET_CONSTANT from userspace. Although API allows to create anonymous sets without NFT_SET_CONSTANT, it makes no sense to allow to add and to delete elements for bound anonymous sets. 
Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit c88c535b592d3baeee74009f3eceeeaf0fdd5e1b) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a93a44c4c9469..32119a0309df5 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6581,7 +6581,8 @@ static int nf_tables_newsetelem(struct sk_buff *skb, if (IS_ERR(set)) return PTR_ERR(set); - if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) + if (!list_empty(&set->bindings) && + (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS))) return -EBUSY; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); @@ -6855,7 +6856,9 @@ static int nf_tables_delsetelem(struct sk_buff *skb, set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) return PTR_ERR(set); - if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) + + if (!list_empty(&set->bindings) && + (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS))) return -EBUSY; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); From 6808ba0a1edd4388d4a1093f79d5cee184c954a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:50:05 +0100 Subject: [PATCH 45/93] netfilter: nf_tables: reject unbound anonymous set before commit phase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit 938154b93be8cd611ddfd7bafc1849f3c4355201 Add a new list to track set transaction and to check for unbound anonymous sets before entering the commit phase. Bail out at the end of the transaction handling if an anonymous set remains unbound. Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 938154b93be8cd611ddfd7bafc1849f3c4355201) Signed-off-by: Marcin Wcisło --- include/net/netfilter/nf_tables.h | 3 +++ net/netfilter/nf_tables_api.c | 35 ++++++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 7bb24d44f096b..ccb3b3e4ce88e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1511,6 +1511,7 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) * struct nft_trans - nf_tables object update in transaction * * @list: used internally + * @binding_list: list of objects with possible bindings * @msg_type: message type * @put_net: ctx->net needs to be put * @ctx: transaction context @@ -1518,6 +1519,7 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) */ struct nft_trans { struct list_head list; + struct list_head binding_list; int msg_type; bool put_net; struct nft_ctx ctx; @@ -1656,6 +1658,7 @@ static inline int nft_request_module(struct net *net, const char *fmt, ...) 
{ re struct nftables_pernet { struct list_head tables; struct list_head commit_list; + struct list_head binding_list; struct list_head module_list; struct list_head notify_list; struct mutex commit_mutex; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 32119a0309df5..d1aaf6259504f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -153,6 +153,7 @@ static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, return NULL; INIT_LIST_HEAD(&trans->list); + INIT_LIST_HEAD(&trans->binding_list); trans->msg_type = msg_type; trans->ctx = *ctx; @@ -165,9 +166,15 @@ static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx, return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL); } -static void nft_trans_destroy(struct nft_trans *trans) +static void nft_trans_list_del(struct nft_trans *trans) { list_del(&trans->list); + list_del(&trans->binding_list); +} + +static void nft_trans_destroy(struct nft_trans *trans) +{ + nft_trans_list_del(trans); kfree(trans); } @@ -359,6 +366,14 @@ static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *tr { struct nftables_pernet *nft_net = nft_pernet(net); + switch (trans->msg_type) { + case NFT_MSG_NEWSET: + if (!nft_trans_set_update(trans) && + nft_set_is_anonymous(nft_trans_set(trans))) + list_add_tail(&trans->binding_list, &nft_net->binding_list); + break; + } + list_add_tail(&trans->list, &nft_net->commit_list); } @@ -8890,7 +8905,7 @@ static void nf_tables_trans_destroy_work(struct work_struct *w) synchronize_rcu(); list_for_each_entry_safe(trans, next, &head, list) { - list_del(&trans->list); + nft_trans_list_del(trans); nft_commit_release(trans); } } @@ -9208,6 +9223,19 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) return 0; } + list_for_each_entry(trans, &nft_net->binding_list, binding_list) { + switch (trans->msg_type) { + case NFT_MSG_NEWSET: + if (!nft_trans_set_update(trans) && + nft_set_is_anonymous(nft_trans_set(trans)) && + !nft_trans_set_bound(trans)) { + pr_warn_once("nftables ruleset with unbound set\n"); + return -EINVAL; + } + break; + } + } + /* 0. Validate ruleset, otherwise roll back for error reporting. */ if (nf_tables_validate(net) < 0) return -EAGAIN; @@ -9698,7 +9726,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list, list) { - list_del(&trans->list); + nft_trans_list_del(trans); nf_tables_abort_release(trans); } @@ -10471,6 +10499,7 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&nft_net->tables); INIT_LIST_HEAD(&nft_net->commit_list); + INIT_LIST_HEAD(&nft_net->binding_list); INIT_LIST_HEAD(&nft_net->module_list); INIT_LIST_HEAD(&nft_net->notify_list); mutex_init(&nft_net->commit_mutex); From 379728bc06257e90a020b376fda6d9be807af79e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:50:07 +0100 Subject: [PATCH 46/93] netfilter: nf_tables: reject unbound chain set before commit phase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit 62e1e94b246e685d89c3163aaef4b160e42ceb02 Use binding list to track set transaction and to check for unbound chains before entering the commit phase. Bail out if chain binding remain unused before entering the commit step. 
Fixes: d0e2c7de92c7 ("netfilter: nf_tables: add NFT_CHAIN_BINDING") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 62e1e94b246e685d89c3163aaef4b160e42ceb02) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d1aaf6259504f..f1f3285642677 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -372,6 +372,11 @@ static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *tr nft_set_is_anonymous(nft_trans_set(trans))) list_add_tail(&trans->binding_list, &nft_net->binding_list); break; + case NFT_MSG_NEWCHAIN: + if (!nft_trans_chain_update(trans) && + nft_chain_binding(nft_trans_chain(trans))) + list_add_tail(&trans->binding_list, &nft_net->binding_list); + break; } list_add_tail(&trans->list, &nft_net->commit_list); @@ -9233,6 +9238,14 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) return -EINVAL; } break; + case NFT_MSG_NEWCHAIN: + if (!nft_trans_chain_update(trans) && + nft_chain_binding(nft_trans_chain(trans)) && + !nft_trans_chain_bound(trans)) { + pr_warn_once("nftables ruleset with unbound chain\n"); + return -EINVAL; + } + break; } } From b3b58763165e124e7b86d4bcab16f0ea3e94da9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:50:11 +0100 Subject: [PATCH 47/93] netfilter: nf_tables: disallow updates of anonymous sets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit b770283c98e0eee9133c47bc03b6cc625dc94723 Disallow updates of set timeout and garbage collection parameters for anonymous sets. Fixes: 123b99619cca ("netfilter: nf_tables: honor set timeout and garbage collection updates") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit b770283c98e0eee9133c47bc03b6cc625dc94723) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f1f3285642677..74b7951b781b8 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4740,6 +4740,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; + if (nft_set_is_anonymous(set)) + return -EOPNOTSUPP; + err = nft_set_expr_alloc(&ctx, set, nla, exprs, &num_exprs, flags); if (err < 0) return err; From 9ffa1d483a1c7cf169438a7c9abb108deb3c6df0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:50:30 +0100 Subject: [PATCH 48/93] netfilter: nf_tables: disallow timeout for anonymous sets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-645 cve CVE-2023-52620 commit-author Pablo Neira Ayuso commit e26d3009efda338f19016df4175f354a9bd0a4ab Never used from userspace, disallow these parameters. 
Signed-off-by: Pablo Neira Ayuso (cherry picked from commit e26d3009efda338f19016df4175f354a9bd0a4ab) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 74b7951b781b8..b4527591dbe85 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4686,6 +4686,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; + if (flags & NFT_SET_ANONYMOUS) + return -EOPNOTSUPP; + err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &desc.timeout); if (err) return err; @@ -4694,6 +4697,10 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (nla[NFTA_SET_GC_INTERVAL] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; + + if (flags & NFT_SET_ANONYMOUS) + return -EOPNOTSUPP; + desc.gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL])); } From cf5d8f5dbaa6bbfe0fcf8b3d8f570401fe3c107e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:50:39 +0100 Subject: [PATCH 49/93] netfilter: nf_tables: fix underflow in chain reference counter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit b389139f12f287b8ed2e2628b72df89a081f0b59 Set element addition error path decrements reference counter on chains twice: once on element release and again via nft_data_release(). Then, d6b478666ffa ("netfilter: nf_tables: fix underflow in object reference counter") incorrectly fixed this by removing the stateful object reference count decrement. Restore the stateful object decrement as in b91d90368837 ("netfilter: nf_tables: fix leaking object reference count") and let nft_data_release() decrement the chain reference counter, so this is done only once. Fixes: d6b478666ffa ("netfilter: nf_tables: fix underflow in object reference counter") Fixes: 628bd3e49cba ("netfilter: nf_tables: drop map element references from preparation phase") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit b389139f12f287b8ed2e2628b72df89a081f0b59) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b4527591dbe85..943bcc9342ea6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6566,7 +6566,9 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err_element_clash: kfree(trans); err_elem_free: - nft_set_elem_destroy(set, elem.priv, true); + nf_tables_set_elem_destroy(ctx, set, elem.priv); + if (obj) + obj->use--; err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) nft_data_release(&elem.data.val, desc.type); From b0c8e943e409740752a9a34c96743088094223e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:51:12 +0100 Subject: [PATCH 50/93] netfilter: nf_tables: report use refcount overflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit 1689f25924ada8fe14a4a82c38925d04994c7142 upstream-diff Used the cleanly applying 9.4 backport 854ec8345abb60f1fb65446a6aef2627f71196ca Overflow use refcount checks are not complete. Add helper function to deal with object reference counter tracking. 
Report -EMFILE in case UINT_MAX is reached. nft_use_dec() splats in case that reference counter underflows, which should not ever happen. Add nft_use_inc_restore() and nft_use_dec_restore() which are used to restore reference counter from error and abort paths. Use u32 in nft_flowtable and nft_object since helper functions cannot work on bitfields. Remove the few early incomplete checks now that the helper functions are in place and used to check for refcount overflow. Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 1689f25924ada8fe14a4a82c38925d04994c7142) Signed-off-by: Marcin Wcisło --- include/net/netfilter/nf_tables.h | 31 +++++- net/netfilter/nf_tables_api.c | 163 ++++++++++++++++++------------ net/netfilter/nft_flow_offload.c | 6 +- net/netfilter/nft_immediate.c | 8 +- net/netfilter/nft_objref.c | 8 +- 5 files changed, 141 insertions(+), 75 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index ccb3b3e4ce88e..3554c8ea03d3e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1145,6 +1145,29 @@ int __nft_release_basechain(struct nft_ctx *ctx); unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); +static inline bool nft_use_inc(u32 *use) +{ + if (*use == UINT_MAX) + return false; + + (*use)++; + + return true; +} + +static inline void nft_use_dec(u32 *use) +{ + WARN_ON_ONCE((*use)-- == 0); +} + +/* For error and abort path: restore use counter to previous state. */ +static inline void nft_use_inc_restore(u32 *use) +{ + WARN_ON_ONCE(!nft_use_inc(use)); +} + +#define nft_use_dec_restore nft_use_dec + /** * struct nft_table - nf_tables table * @@ -1228,8 +1251,8 @@ struct nft_object { struct list_head list; struct rhlist_head rhlhead; struct nft_object_hash_key key; - u32 genmask:2, - use:30; + u32 genmask:2; + u32 use; u64 handle; u16 udlen; u8 *udata; @@ -1331,8 +1354,8 @@ struct nft_flowtable { char *name; int hooknum; int ops_len; - u32 genmask:2, - use:30; + u32 genmask:2; + u32 use; u64 handle; /* runtime data below here */ struct list_head hook_list ____cacheline_aligned; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 943bcc9342ea6..56291ca0d6518 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -255,8 +255,10 @@ int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) if (chain->bound) return -EBUSY; + if (!nft_use_inc(&chain->use)) + return -EMFILE; + chain->bound = true; - chain->use++; nft_chain_trans_bind(ctx, chain); return 0; @@ -439,7 +441,7 @@ static int nft_delchain(struct nft_ctx *ctx) if (IS_ERR(trans)) return PTR_ERR(trans); - ctx->table->use--; + nft_use_dec(&ctx->table->use); nft_deactivate_next(ctx->net, ctx->chain); return 0; @@ -478,7 +480,7 @@ nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule) /* You cannot delete the same rule twice */ if (nft_is_active_next(ctx->net, rule)) { nft_deactivate_next(ctx->net, rule); - ctx->chain->use--; + nft_use_dec(&ctx->chain->use); return 0; } return -ENOENT; @@ -645,7 +647,7 @@ static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) nft_map_deactivate(ctx, set); nft_deactivate_next(ctx->net, set); - ctx->table->use--; + nft_use_dec(&ctx->table->use); return err; } @@ -677,7 +679,7 @@ static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj) return err; nft_deactivate_next(ctx->net, obj); - ctx->table->use--; + nft_use_dec(&ctx->table->use); 
return err; } @@ -712,7 +714,7 @@ static int nft_delflowtable(struct nft_ctx *ctx, return err; nft_deactivate_next(ctx->net, flowtable); - ctx->table->use--; + nft_use_dec(&ctx->table->use); return err; } @@ -2298,9 +2300,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct nft_rule **rules; int err; - if (table->use == UINT_MAX) - return -EOVERFLOW; - if (nla[NFTA_CHAIN_HOOK]) { struct nft_stats __percpu *stats = NULL; struct nft_chain_hook hook; @@ -2397,6 +2396,11 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (err < 0) goto err_destroy_chain; + if (!nft_use_inc(&table->use)) { + err = -EMFILE; + goto err_use; + } + trans = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN); if (IS_ERR(trans)) { err = PTR_ERR(trans); @@ -2413,10 +2417,11 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, goto err_unregister_hook; } - table->use++; - return 0; + err_unregister_hook: + nft_use_dec_restore(&table->use); +err_use: nf_tables_unregister_hook(net, table, chain); err_destroy_chain: nf_tables_chain_destroy(ctx); @@ -3616,9 +3621,6 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, return -EINVAL; handle = nf_tables_alloc_handle(table); - if (chain->use == UINT_MAX) - return -EOVERFLOW; - if (nla[NFTA_RULE_POSITION]) { pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); old_rule = __nft_rule_lookup(chain, pos_handle); @@ -3712,6 +3714,11 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, } } + if (!nft_use_inc(&chain->use)) { + err = -EMFILE; + goto err_release_rule; + } + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { err = nft_delrule(&ctx, old_rule); if (err < 0) @@ -3743,7 +3750,6 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, } } kvfree(expr_info); - chain->use++; if (flow) nft_trans_flow_rule(trans) = flow; @@ -3754,6 +3760,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, return 0; err_destroy_flow_rule: + nft_use_dec_restore(&chain->use); if (flow) nft_flow_rule_destroy(flow); err_release_rule: @@ -4786,9 +4793,15 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, alloc_size = sizeof(*set) + size + udlen; if (alloc_size < size || alloc_size > INT_MAX) return -ENOMEM; + + if (!nft_use_inc(&table->use)) + return -EMFILE; + set = kvzalloc(alloc_size, GFP_KERNEL); - if (!set) - return -ENOMEM; + if (!set) { + err = -ENOMEM; + goto err_alloc; + } name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL); if (!name) { @@ -4846,7 +4859,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, goto err_set_expr_alloc; list_add_tail_rcu(&set->list, &table->sets); - table->use++; + return 0; err_set_expr_alloc: @@ -4858,6 +4871,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, kfree(set->name); err_set_name: kvfree(set); +err_alloc: + nft_use_dec_restore(&table->use); + return err; } @@ -4996,9 +5012,6 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *i; struct nft_set_iter iter; - if (set->use == UINT_MAX) - return -EOVERFLOW; - if (!list_empty(&set->bindings) && nft_set_is_anonymous(set)) return -EBUSY; @@ -5026,10 +5039,12 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, return iter.err; } bind: + if (!nft_use_inc(&set->use)) + return -EMFILE; + binding->chain = ctx->chain; list_add_tail_rcu(&binding->list, 
&set->bindings); nft_set_trans_bind(ctx, set); - set->use++; return 0; } @@ -5103,7 +5118,7 @@ void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set) nft_clear(ctx->net, set); } - set->use++; + nft_use_inc_restore(&set->use); } EXPORT_SYMBOL_GPL(nf_tables_activate_set); @@ -5119,7 +5134,7 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, else list_del_rcu(&binding->list); - set->use--; + nft_use_dec(&set->use); break; case NFT_TRANS_PREPARE: if (nft_set_is_anonymous(set)) { @@ -5128,7 +5143,7 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, nft_deactivate_next(ctx->net, set); } - set->use--; + nft_use_dec(&set->use); return; case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: @@ -5136,7 +5151,7 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) nft_map_deactivate(ctx, set); - set->use--; + nft_use_dec(&set->use); fallthrough; default: nf_tables_unbind_set(ctx, set, binding, @@ -5927,7 +5942,7 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext)); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) - (*nft_set_ext_obj(ext))->use--; + nft_use_dec(&(*nft_set_ext_obj(ext))->use); kfree(elem); } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); @@ -6429,8 +6444,16 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, set->objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); + obj = NULL; goto err_parse_key_end; } + + if (!nft_use_inc(&obj->use)) { + err = -EMFILE; + obj = NULL; + goto err_parse_key_end; + } + err = nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF); if (err < 0) goto err_parse_key_end; @@ -6499,10 +6522,9 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (flags) *nft_set_ext_flags(ext) = flags; - if (obj) { + if (obj) *nft_set_ext_obj(ext) = obj; - obj->use++; - } + if (ulen > 0) { if (nft_set_ext_check(&tmpl, NFT_SET_EXT_USERDATA, ulen) < 0) { err = -EINVAL; @@ -6567,12 +6589,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, kfree(trans); err_elem_free: nf_tables_set_elem_destroy(ctx, set, elem.priv); - if (obj) - obj->use--; err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) nft_data_release(&elem.data.val, desc.type); err_parse_key_end: + if (obj) + nft_use_dec_restore(&obj->use); + nft_data_release(&elem.key_end.val, NFT_DATA_VALUE); err_parse_key: nft_data_release(&elem.key.val, NFT_DATA_VALUE); @@ -6653,7 +6676,7 @@ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) case NFT_JUMP: case NFT_GOTO: chain = data->verdict.chain; - chain->use++; + nft_use_inc_restore(&chain->use); break; } } @@ -6668,7 +6691,7 @@ static void nft_setelem_data_activate(const struct net *net, if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_hold(nft_set_ext_data(ext), set->dtype); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) - (*nft_set_ext_obj(ext))->use++; + nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use); } static void nft_setelem_data_deactivate(const struct net *net, @@ -6680,7 +6703,7 @@ static void nft_setelem_data_deactivate(const struct net *net, if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_release(nft_set_ext_data(ext), set->dtype); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) - (*nft_set_ext_obj(ext))->use--; + nft_use_dec(&(*nft_set_ext_obj(ext))->use); } static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, @@ -7220,9 +7243,14 @@ static int 
nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + if (!nft_use_inc(&table->use)) + return -EMFILE; + type = nft_obj_type_get(net, objtype); - if (IS_ERR(type)) - return PTR_ERR(type); + if (IS_ERR(type)) { + err = PTR_ERR(type); + goto err_type; + } obj = nft_obj_init(&ctx, type, nla[NFTA_OBJ_DATA]); if (IS_ERR(obj)) { @@ -7256,7 +7284,7 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, goto err_obj_ht; list_add_tail_rcu(&obj->list, &table->objects); - table->use++; + return 0; err_obj_ht: /* queued in transaction log */ @@ -7272,6 +7300,9 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, kfree(obj); err_init: module_put(type->owner); +err_type: + nft_use_dec_restore(&table->use); + return err; } @@ -7666,7 +7697,7 @@ void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, case NFT_TRANS_PREPARE: case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: - flowtable->use--; + nft_use_dec(&flowtable->use); fallthrough; default: return; @@ -8014,9 +8045,14 @@ static int nf_tables_newflowtable(struct sk_buff *skb, nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + if (!nft_use_inc(&table->use)) + return -EMFILE; + flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL); - if (!flowtable) - return -ENOMEM; + if (!flowtable) { + err = -ENOMEM; + goto flowtable_alloc; + } flowtable->table = table; flowtable->handle = nf_tables_alloc_handle(table); @@ -8071,7 +8107,6 @@ static int nf_tables_newflowtable(struct sk_buff *skb, goto err5; list_add_tail_rcu(&flowtable->list, &table->flowtables); - table->use++; return 0; err5: @@ -8088,6 +8123,9 @@ static int nf_tables_newflowtable(struct sk_buff *skb, kfree(flowtable->name); err1: kfree(flowtable); +flowtable_alloc: + nft_use_dec_restore(&table->use); + return err; } @@ -9392,7 +9430,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) */ if (nft_set_is_anonymous(nft_trans_set(trans)) && !list_empty(&nft_trans_set(trans)->bindings)) - trans->ctx.table->use--; + nft_use_dec(&trans->ctx.table->use); } nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), NFT_MSG_NEWSET, GFP_KERNEL); @@ -9616,7 +9654,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; } - trans->ctx.table->use--; + nft_use_dec_restore(&trans->ctx.table->use); nft_chain_del(trans->ctx.chain); nf_tables_unregister_hook(trans->ctx.net, trans->ctx.table, @@ -9625,7 +9663,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; case NFT_MSG_DELCHAIN: case NFT_MSG_DESTROYCHAIN: - trans->ctx.table->use++; + nft_use_inc_restore(&trans->ctx.table->use); nft_clear(trans->ctx.net, trans->ctx.chain); nft_trans_destroy(trans); break; @@ -9634,7 +9672,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; } - trans->ctx.chain->use--; + nft_use_dec_restore(&trans->ctx.chain->use); list_del_rcu(&nft_trans_rule(trans)->list); nft_rule_expr_deactivate(&trans->ctx, nft_trans_rule(trans), @@ -9644,7 +9682,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; case NFT_MSG_DELRULE: case NFT_MSG_DESTROYRULE: - trans->ctx.chain->use++; + nft_use_inc_restore(&trans->ctx.chain->use); nft_clear(trans->ctx.net, nft_trans_rule(trans)); nft_rule_expr_activate(&trans->ctx, nft_trans_rule(trans)); if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) @@ 
-9657,7 +9695,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; } - trans->ctx.table->use--; + nft_use_dec_restore(&trans->ctx.table->use); if (nft_trans_set_bound(trans)) { nft_trans_destroy(trans); break; @@ -9666,7 +9704,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; case NFT_MSG_DELSET: case NFT_MSG_DESTROYSET: - trans->ctx.table->use++; + nft_use_inc_restore(&trans->ctx.table->use); nft_clear(trans->ctx.net, nft_trans_set(trans)); if (nft_trans_set(trans)->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) nft_map_activate(&trans->ctx, nft_trans_set(trans)); @@ -9710,13 +9748,13 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_obj_destroy(&trans->ctx, nft_trans_obj_newobj(trans)); nft_trans_destroy(trans); } else { - trans->ctx.table->use--; + nft_use_dec_restore(&trans->ctx.table->use); nft_obj_del(nft_trans_obj(trans)); } break; case NFT_MSG_DELOBJ: case NFT_MSG_DESTROYOBJ: - trans->ctx.table->use++; + nft_use_inc_restore(&trans->ctx.table->use); nft_clear(trans->ctx.net, nft_trans_obj(trans)); nft_trans_destroy(trans); break; @@ -9725,7 +9763,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_unregister_flowtable_net_hooks(net, &nft_trans_flowtable_hooks(trans)); } else { - trans->ctx.table->use--; + nft_use_dec_restore(&trans->ctx.table->use); list_del_rcu(&nft_trans_flowtable(trans)->list); nft_unregister_flowtable_net_hooks(net, &nft_trans_flowtable(trans)->hook_list); @@ -9737,7 +9775,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) list_splice(&nft_trans_flowtable_hooks(trans), &nft_trans_flowtable(trans)->hook_list); } else { - trans->ctx.table->use++; + nft_use_inc_restore(&trans->ctx.table->use); nft_clear(trans->ctx.net, nft_trans_flowtable(trans)); } nft_trans_destroy(trans); @@ -10181,8 +10219,9 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, if (desc->flags & NFT_DATA_DESC_SETELEM && chain->flags & NFT_CHAIN_BINDING) return -EINVAL; + if (!nft_use_inc(&chain->use)) + return -EMFILE; - chain->use++; data->verdict.chain = chain; break; default: @@ -10202,7 +10241,7 @@ static void nft_verdict_uninit(const struct nft_data *data) case NFT_JUMP: case NFT_GOTO: chain = data->verdict.chain; - chain->use--; + nft_use_dec(&chain->use); break; } } @@ -10371,11 +10410,11 @@ int __nft_release_basechain(struct nft_ctx *ctx) nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain); list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { list_del(&rule->list); - ctx->chain->use--; + nft_use_dec(&ctx->chain->use); nf_tables_rule_release(ctx, rule); } nft_chain_del(ctx->chain); - ctx->table->use--; + nft_use_dec(&ctx->table->use); nf_tables_chain_destroy(ctx); return 0; @@ -10425,18 +10464,18 @@ static void __nft_release_table(struct net *net, struct nft_table *table) ctx.chain = chain; list_for_each_entry_safe(rule, nr, &chain->rules, list) { list_del(&rule->list); - chain->use--; + nft_use_dec(&chain->use); nf_tables_rule_release(&ctx, rule); } } list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) { list_del(&flowtable->list); - table->use--; + nft_use_dec(&table->use); nf_tables_flowtable_destroy(flowtable); } list_for_each_entry_safe(set, ns, &table->sets, list) { list_del(&set->list); - table->use--; + nft_use_dec(&table->use); if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) nft_map_deactivate(&ctx, set); @@ -10444,13 
+10483,13 @@ static void __nft_release_table(struct net *net, struct nft_table *table) } list_for_each_entry_safe(obj, ne, &table->objects, list) { nft_obj_del(obj); - table->use--; + nft_use_dec(&table->use); nft_obj_destroy(&ctx, obj); } list_for_each_entry_safe(chain, nc, &table->chains, list) { ctx.chain = chain; nft_chain_del(chain); - table->use--; + nft_use_dec(&table->use); nf_tables_chain_destroy(&ctx); } nf_tables_table_destroy(&ctx); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 6db8c802d5e76..9a05fca9c48b7 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -381,8 +381,10 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx, if (IS_ERR(flowtable)) return PTR_ERR(flowtable); + if (!nft_use_inc(&flowtable->use)) + return -EMFILE; + priv->flowtable = flowtable; - flowtable->use++; return nf_ct_netns_get(ctx->net, ctx->family); } @@ -401,7 +403,7 @@ static void nft_flow_offload_activate(const struct nft_ctx *ctx, { struct nft_flow_offload *priv = nft_expr_priv(expr); - priv->flowtable->use++; + nft_use_inc_restore(&priv->flowtable->use); } static void nft_flow_offload_destroy(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 7c810005a1f9f..11a39289fe49b 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -159,7 +159,7 @@ static void nft_immediate_deactivate(const struct nft_ctx *ctx, default: nft_chain_del(chain); chain->bound = false; - chain->table->use--; + nft_use_dec(&chain->table->use); break; } break; @@ -198,7 +198,7 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx, * let the transaction records release this chain and its rules. */ if (chain->bound) { - chain->use--; + nft_use_dec(&chain->use); break; } @@ -206,9 +206,9 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx, chain_ctx = *ctx; chain_ctx.chain = chain; - chain->use--; + nft_use_dec(&chain->use); list_for_each_entry_safe(rule, n, &chain->rules, list) { - chain->use--; + nft_use_dec(&chain->use); list_del(&rule->list); nf_tables_rule_destroy(&chain_ctx, rule); } diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index e873401182899..10850266221a6 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -41,8 +41,10 @@ static int nft_objref_init(const struct nft_ctx *ctx, if (IS_ERR(obj)) return -ENOENT; + if (!nft_use_inc(&obj->use)) + return -EMFILE; + nft_objref_priv(expr) = obj; - obj->use++; return 0; } @@ -72,7 +74,7 @@ static void nft_objref_deactivate(const struct nft_ctx *ctx, if (phase == NFT_TRANS_COMMIT) return; - obj->use--; + nft_use_dec(&obj->use); } static void nft_objref_activate(const struct nft_ctx *ctx, @@ -80,7 +82,7 @@ static void nft_objref_activate(const struct nft_ctx *ctx, { struct nft_object *obj = nft_objref_priv(expr); - obj->use++; + nft_use_inc_restore(&obj->use); } static struct nft_expr_type nft_objref_type; From 1ee761743001e35c82322a59b37978ad6cddfd4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:52:02 +0100 Subject: [PATCH 51/93] netfilter: nf_tables: fix spurious set element insertion failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Florian Westphal commit ddbd8be68941985f166f5107109a90ce13147c44 On some platforms there is a padding hole in the nft_verdict structure, between the verdict code and the chain 
pointer. On element insertion, if the new element clashes with an existing one and NLM_F_EXCL flag isn't set, we want to ignore the -EEXIST error as long as the data associated with duplicated element is the same as the existing one. The data equality check uses memcmp. For normal data (NFT_DATA_VALUE) this works fine, but for NFT_DATA_VERDICT padding area leads to spurious failure even if the verdict data is the same. This then makes the insertion fail with 'already exists' error, even though the new "key : data" matches an existing entry and userspace told the kernel that it doesn't want to receive an error indication. Fixes: c016c7e45ddf ("netfilter: nf_tables: honor NLM_F_EXCL flag in set element insertion") Signed-off-by: Florian Westphal (cherry picked from commit ddbd8be68941985f166f5107109a90ce13147c44) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 56291ca0d6518..a508db787279d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10183,6 +10183,9 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, if (!tb[NFTA_VERDICT_CODE]) return -EINVAL; + + /* zero padding hole for memcmp */ + memset(data, 0, sizeof(*data)); data->verdict.code = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE])); switch (data->verdict.code) { From 53ae443f32bab1449f4d8e2dc0a4d6b9cd7e500e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 20:52:04 +0100 Subject: [PATCH 52/93] netfilter: nf_tables: skip bound chain in netns release path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-pre CVE-2023-4244 commit-author Pablo Neira Ayuso commit 751d460ccff3137212f47d876221534bf0490996 Skip bound chain from netns release path, the rule that owns this chain releases these objects. Fixes: d0e2c7de92c7 ("netfilter: nf_tables: add NFT_CHAIN_BINDING") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal (cherry picked from commit 751d460ccff3137212f47d876221534bf0490996) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a508db787279d..ed15cfc0abb70 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10464,6 +10464,9 @@ static void __nft_release_table(struct net *net, struct nft_table *table) ctx.family = table->family; ctx.table = table; list_for_each_entry(chain, &table->chains, list) { + if (nft_chain_is_bound(chain)) + continue; + ctx.chain = chain; list_for_each_entry_safe(rule, nr, &chain->rules, list) { list_del(&rule->list); From a25ad63859acb6fff9d4d69f1cad84fb91efec01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:14:14 +0100 Subject: [PATCH 53/93] netfilter: nf_tables: don't skip expired elements during walk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-158864 cve CVE-2023-52924 commit-author Florian Westphal commit 24138933b97b055d486e8064b4a1721702442a9b upstream-diff Context conflict with the cve fix 5d4bb57cb9e7703d390f48a0d8dc69cbd45a5804, which also experienced conflict due to this one missing (wrong application order). The end result of them two - the 'nft_set_elem_active' branch present and the 'nft_set_elem_expired' missing - was preserved. 
There is an asymmetry between commit/abort and preparation phase if the following conditions are met: 1. set is a verdict map ("1.2.3.4 : jump foo") 2. timeouts are enabled In this case, following sequence is problematic: 1. element E in set S refers to chain C 2. userspace requests removal of set S 3. kernel does a set walk to decrement chain->use count for all elements from preparation phase 4. kernel does another set walk to remove elements from the commit phase (or another walk to do a chain->use increment for all elements from abort phase) If E has already expired in 1), it will be ignored during list walk, so its use count won't have been changed. Then, when set is culled, ->destroy callback will zap the element via nf_tables_set_elem_destroy(), but this function is only safe for elements that have been deactivated earlier from the preparation phase: lack of earlier deactivate removes the element but leaks the chain use count, which results in a WARN splat when the chain gets removed later, plus a leak of the nft_chain structure. Update pipapo_get() not to skip expired elements, otherwise flush command reports bogus ENOENT errors. Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") Fixes: 9d0982927e79 ("netfilter: nft_hash: add support for timeouts") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 24138933b97b055d486e8064b4a1721702442a9b) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 4 ++++ net/netfilter/nft_set_hash.c | 2 -- net/netfilter/nft_set_pipapo.c | 19 ++++++++++++------- net/netfilter/nft_set_rbtree.c | 2 -- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ed15cfc0abb70..e20d494728e3f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5369,8 +5369,12 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_set_elem *elem) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); struct nft_set_dump_args *args; + if (nft_set_elem_expired(ext)) + return 0; + args = container_of(iter, struct nft_set_dump_args, iter); return nf_tables_fill_setelem(args->skb, set, elem, args->reset); } diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 0b73cb0e752f7..24caa31fa2310 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -278,8 +278,6 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, if (iter->count < iter->skip) goto cont; - if (nft_set_elem_expired(&he->ext)) - goto cont; if (!nft_set_elem_active(&he->ext, iter->genmask)) goto cont; diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index eda2071f8d05c..d6e5fef2d085a 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -566,8 +566,7 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, goto out; if (last) { - if (nft_set_elem_expired(&f->mt[b].e->ext) || - (genmask && + if ((genmask && !nft_set_elem_active(&f->mt[b].e->ext, genmask))) goto next_match; @@ -601,8 +600,17 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { - return pipapo_get(net, set, (const u8 *)elem->key.val.data, - 
nft_genmask_cur(net)); + struct nft_pipapo_elem *ret; + + ret = pipapo_get(net, set, (const u8 *)elem->key.val.data, + nft_genmask_cur(net)); + if (IS_ERR(ret)) + return ret; + + if (nft_set_elem_expired(&ret->ext)) + return ERR_PTR(-ENOENT); + + return ret; } /** @@ -2010,9 +2018,6 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, if (!nft_set_elem_active(&e->ext, iter->genmask)) goto cont; - if (nft_set_elem_expired(&e->ext)) - goto cont; - elem.priv = e; iter->err = iter->fn(ctx, set, iter, &elem); diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 5c05c9b990fba..ca4d684ffec4a 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -544,8 +544,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, if (iter->count < iter->skip) goto cont; - if (nft_set_elem_expired(&rbe->ext)) - goto cont; if (!nft_set_elem_active(&rbe->ext, iter->genmask)) goto cont; From 66663eb40599b860c1031cc9d655ecdcfe95cc2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:19:57 +0100 Subject: [PATCH 54/93] netfilter: nf_tables: GC transaction API to avoid race with control plane MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Pablo Neira Ayuso commit 5f68718b34a531a556f2f50300ead2862278da26 upstream-diff Used the cleanly applying 9.4 backport bf6d3e9889ced6c5729d979069c7481ae1b38d90 The set types rhashtable and rbtree use a GC worker to reclaim memory. From system work queue, in periodic intervals, a scan of the table is done. The major caveat here is that the nft transaction mutex is not held. This causes a race between control plane and GC when they attempt to delete the same element. We cannot grab the netlink mutex from the work queue, because the control plane has to wait for the GC work queue in case the set is to be removed, so we get following deadlock: cpu 1 cpu2 GC work transaction comes in , lock nft mutex `acquire nft mutex // BLOCKS transaction asks to remove the set set destruction calls cancel_work_sync() cancel_work_sync will now block forever, because it is waiting for the mutex the caller already owns. This patch adds a new API that deals with garbage collection in two steps: 1) Lockless GC of expired elements sets on the NFT_SET_ELEM_DEAD_BIT so they are not visible via lookup. Annotate current GC sequence in the GC transaction. Enqueue GC transaction work as soon as it is full. If ruleset is updated, then GC transaction is aborted and retried later. 2) GC work grabs the mutex. If GC sequence has changed then this GC transaction lost race with control plane, abort it as it contains stale references to objects and let GC try again later. If the ruleset is intact, then this GC transaction deactivates and removes the elements and it uses call_rcu() to destroy elements. Note that no elements are removed from GC lockless path, the _DEAD bit is set and pointers are collected. GC catchall does not remove the elements anymore too. There is a new set->dead flag that is set on to abort the GC transaction to deal with set->ops->destroy() path which removes the remaining elements in the set from commit_release, where no mutex is held. To deal with GC when mutex is held, which allows safe deactivate and removal, add sync GC API which releases the set element object via call_rcu(). This is used by rbtree and pipapo backends which also perform garbage collection from control plane path. 
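Roughly, a set backend's GC worker drives steps 1) and 2) above as follows. This is only a sketch: the element walk, its locking and the requeueing of the delayed work are backend specific, and for_each_backend_elem()/elem/ext are placeholders rather than real helpers.

	gc_seq = READ_ONCE(nft_net->gc_seq);
	gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
	if (!gc)
		goto done;

	for_each_backend_elem(set, elem, ext) {		/* placeholder walk */
		/* ruleset has been updated meanwhile, batch is stale: retry later */
		if (READ_ONCE(nft_net->gc_seq) != gc_seq) {
			nft_trans_gc_destroy(gc);
			gc = NULL;
			goto try_later;
		}

		if (!nft_set_elem_expired(ext))
			continue;

		nft_set_elem_dead(ext);			/* hide from lookup path */
		gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
		if (!gc)
			goto try_later;

		nft_trans_gc_elem_add(gc, elem);
	}

	gc = nft_trans_gc_catchall(gc, gc_seq);
try_later:
	if (gc)
		nft_trans_gc_queue_async_done(gc);	/* worker removes and frees under the mutex */
done:
	/* requeue the delayed GC work */
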
Since element removal from sets can happen from control plane and element garbage collection/timeout, it is necessary to keep the set structure alive until all elements have been deactivated and destroyed. We cannot do a cancel_work_sync or flush_work in nft_set_destroy because its called with the transaction mutex held, but the aforementioned async work queue might be blocked on the very mutex that nft_set_destroy() callchain is sitting on. This gives us the choice of ABBA deadlock or UaF. To avoid both, add set->refs refcount_t member. The GC API can then increment the set refcount and release it once the elements have been free'd. Set backends are adapted to use the GC transaction API in a follow up patch entitled: ("netfilter: nf_tables: use gc transaction API in set backends") This is joint work with Florian Westphal. Fixes: cfed7e1b1f8e ("netfilter: nf_tables: add set garbage collection helpers") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 5f68718b34a531a556f2f50300ead2862278da26) Signed-off-by: Marcin Wcisło --- include/net/netfilter/nf_tables.h | 64 +++++++- net/netfilter/nf_tables_api.c | 248 ++++++++++++++++++++++++++++-- 2 files changed, 300 insertions(+), 12 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3554c8ea03d3e..662238596070c 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -475,6 +475,7 @@ struct nft_set_elem_expr { * * @list: table set list node * @bindings: list of set bindings + * @refs: internal refcounting for async set destruction * @table: table this set belongs to * @net: netnamespace this set belongs to * @name: name of the set @@ -504,6 +505,7 @@ struct nft_set_elem_expr { struct nft_set { struct list_head list; struct list_head bindings; + refcount_t refs; struct nft_table *table; possible_net_t net; char *name; @@ -525,7 +527,8 @@ struct nft_set { struct list_head pending_update; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; - u16 flags:14, + u16 flags:13, + dead:1, genmask:2; u8 klen; u8 dlen; @@ -1530,6 +1533,32 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) clear_bit(NFT_SET_ELEM_BUSY_BIT, word); } +#define NFT_SET_ELEM_DEAD_MASK (1 << 3) + +#if defined(__LITTLE_ENDIAN_BITFIELD) +#define NFT_SET_ELEM_DEAD_BIT 3 +#elif defined(__BIG_ENDIAN_BITFIELD) +#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 3) +#else +#error +#endif + +static inline void nft_set_elem_dead(struct nft_set_ext *ext) +{ + unsigned long *word = (unsigned long *)ext; + + BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); + set_bit(NFT_SET_ELEM_DEAD_BIT, word); +} + +static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext) +{ + unsigned long *word = (unsigned long *)ext; + + BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); + return test_bit(NFT_SET_ELEM_DEAD_BIT, word); +} + /** * struct nft_trans - nf_tables object update in transaction * @@ -1661,6 +1690,38 @@ struct nft_trans_flowtable { #define nft_trans_flowtable_flags(trans) \ (((struct nft_trans_flowtable *)trans->data)->flags) +#define NFT_TRANS_GC_BATCHCOUNT 256 + +struct nft_trans_gc { + struct list_head list; + struct net *net; + struct nft_set *set; + u32 seq; + u8 count; + void *priv[NFT_TRANS_GC_BATCHCOUNT]; + struct rcu_head rcu; +}; + +struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, + unsigned int gc_seq, gfp_t gfp); +void nft_trans_gc_destroy(struct nft_trans_gc *trans); + +struct nft_trans_gc 
*nft_trans_gc_queue_async(struct nft_trans_gc *gc, + unsigned int gc_seq, gfp_t gfp); +void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc); + +struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp); +void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); + +void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv); + +struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, + unsigned int gc_seq); + +void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem); + int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); @@ -1687,6 +1748,7 @@ struct nftables_pernet { struct mutex commit_mutex; u64 table_handle; unsigned int base_seq; + unsigned int gc_seq; u8 validate_state; }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e20d494728e3f..bb3fe2562e27b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -31,7 +31,9 @@ static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); static LIST_HEAD(nf_tables_destroy_list); +static LIST_HEAD(nf_tables_gc_list); static DEFINE_SPINLOCK(nf_tables_destroy_list_lock); +static DEFINE_SPINLOCK(nf_tables_gc_list_lock); enum { NFT_VALIDATE_SKIP = 0, @@ -122,6 +124,9 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state) static void nf_tables_trans_destroy_work(struct work_struct *w); static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work); +static void nft_trans_gc_work(struct work_struct *work); +static DECLARE_WORK(trans_gc_work, nft_trans_gc_work); + static void nft_ctx_init(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, @@ -583,10 +588,6 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, return __nft_trans_set_add(ctx, msg_type, set, NULL); } -static void nft_setelem_data_deactivate(const struct net *net, - const struct nft_set *set, - struct nft_set_elem *elem); - static int nft_mapelem_deactivate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, @@ -4822,6 +4823,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, INIT_LIST_HEAD(&set->bindings); INIT_LIST_HEAD(&set->catchall_list); + refcount_set(&set->refs, 1); set->table = table; write_pnet(&set->net, net); set->ops = ops; @@ -4889,6 +4891,14 @@ static void nft_set_catchall_destroy(const struct nft_ctx *ctx, } } +static void nft_set_put(struct nft_set *set) +{ + if (refcount_dec_and_test(&set->refs)) { + kfree(set->name); + kvfree(set); + } +} + static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) { int i; @@ -4901,8 +4911,7 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) set->ops->destroy(ctx, set); nft_set_catchall_destroy(ctx, set); - kfree(set->name); - kvfree(set); + nft_set_put(set); } static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, @@ -6045,7 +6054,8 @@ struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (nft_set_elem_active(ext, genmask) && - !nft_set_elem_expired(ext)) + !nft_set_elem_expired(ext) && + !nft_set_elem_is_dead(ext)) return ext; } @@ -6698,9 +6708,9 @@ static void nft_setelem_data_activate(const struct net *net, nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use); } -static void 
nft_setelem_data_deactivate(const struct net *net, - const struct nft_set *set, - struct nft_set_elem *elem) +void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); @@ -9107,6 +9117,207 @@ void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } +static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx, + struct nft_trans_gc *trans) +{ + void **priv = trans->priv; + unsigned int i; + + for (i = 0; i < trans->count; i++) { + struct nft_set_elem elem = { + .priv = priv[i], + }; + + nft_setelem_data_deactivate(ctx->net, trans->set, &elem); + nft_setelem_remove(ctx->net, trans->set, &elem); + } +} + +void nft_trans_gc_destroy(struct nft_trans_gc *trans) +{ + nft_set_put(trans->set); + put_net(trans->net); + kfree(trans); +} + +static void nft_trans_gc_trans_free(struct rcu_head *rcu) +{ + struct nft_set_elem elem = {}; + struct nft_trans_gc *trans; + struct nft_ctx ctx = {}; + unsigned int i; + + trans = container_of(rcu, struct nft_trans_gc, rcu); + ctx.net = read_pnet(&trans->set->net); + + for (i = 0; i < trans->count; i++) { + elem.priv = trans->priv[i]; + if (!nft_setelem_is_catchall(trans->set, &elem)) + atomic_dec(&trans->set->nelems); + + nf_tables_set_elem_destroy(&ctx, trans->set, elem.priv); + } + + nft_trans_gc_destroy(trans); +} + +static bool nft_trans_gc_work_done(struct nft_trans_gc *trans) +{ + struct nftables_pernet *nft_net; + struct nft_ctx ctx = {}; + + nft_net = nft_pernet(trans->net); + + mutex_lock(&nft_net->commit_mutex); + + /* Check for race with transaction, otherwise this batch refers to + * stale objects that might not be there anymore. Skip transaction if + * set has been destroyed from control plane transaction in case gc + * worker loses race. 
+ */ + if (READ_ONCE(nft_net->gc_seq) != trans->seq || trans->set->dead) { + mutex_unlock(&nft_net->commit_mutex); + return false; + } + + ctx.net = trans->net; + ctx.table = trans->set->table; + + nft_trans_gc_setelem_remove(&ctx, trans); + mutex_unlock(&nft_net->commit_mutex); + + return true; +} + +static void nft_trans_gc_work(struct work_struct *work) +{ + struct nft_trans_gc *trans, *next; + LIST_HEAD(trans_gc_list); + + spin_lock(&nf_tables_destroy_list_lock); + list_splice_init(&nf_tables_gc_list, &trans_gc_list); + spin_unlock(&nf_tables_destroy_list_lock); + + list_for_each_entry_safe(trans, next, &trans_gc_list, list) { + list_del(&trans->list); + if (!nft_trans_gc_work_done(trans)) { + nft_trans_gc_destroy(trans); + continue; + } + call_rcu(&trans->rcu, nft_trans_gc_trans_free); + } +} + +struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, + unsigned int gc_seq, gfp_t gfp) +{ + struct net *net = read_pnet(&set->net); + struct nft_trans_gc *trans; + + trans = kzalloc(sizeof(*trans), gfp); + if (!trans) + return NULL; + + refcount_inc(&set->refs); + trans->set = set; + trans->net = get_net(net); + trans->seq = gc_seq; + + return trans; +} + +void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv) +{ + trans->priv[trans->count++] = priv; +} + +static void nft_trans_gc_queue_work(struct nft_trans_gc *trans) +{ + spin_lock(&nf_tables_gc_list_lock); + list_add_tail(&trans->list, &nf_tables_gc_list); + spin_unlock(&nf_tables_gc_list_lock); + + schedule_work(&trans_gc_work); +} + +static int nft_trans_gc_space(struct nft_trans_gc *trans) +{ + return NFT_TRANS_GC_BATCHCOUNT - trans->count; +} + +struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, + unsigned int gc_seq, gfp_t gfp) +{ + if (nft_trans_gc_space(gc)) + return gc; + + nft_trans_gc_queue_work(gc); + + return nft_trans_gc_alloc(gc->set, gc_seq, gfp); +} + +void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) +{ + if (trans->count == 0) { + nft_trans_gc_destroy(trans); + return; + } + + nft_trans_gc_queue_work(trans); +} + +struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp) +{ + if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net))) + return NULL; + + if (nft_trans_gc_space(gc)) + return gc; + + call_rcu(&gc->rcu, nft_trans_gc_trans_free); + + return nft_trans_gc_alloc(gc->set, 0, gfp); +} + +void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) +{ + WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net)); + + if (trans->count == 0) { + nft_trans_gc_destroy(trans); + return; + } + + call_rcu(&trans->rcu, nft_trans_gc_trans_free); +} + +struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, + unsigned int gc_seq) +{ + struct nft_set_elem_catchall *catchall; + const struct nft_set *set = gc->set; + struct nft_set_ext *ext; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + + if (!nft_set_elem_expired(ext)) + continue; + if (nft_set_elem_is_dead(ext)) + goto dead_elem; + + nft_set_elem_dead(ext); +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + return NULL; + + nft_trans_gc_elem_add(gc, catchall->elem); + } + + return gc; +} + static void nf_tables_module_autoload_cleanup(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); @@ -9269,11 +9480,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans, *next; + unsigned int base_seq, 
gc_seq; LIST_HEAD(set_update_list); struct nft_trans_elem *te; struct nft_chain *chain; struct nft_table *table; - unsigned int base_seq; LIST_HEAD(adl); int err; @@ -9350,6 +9561,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) WRITE_ONCE(nft_net->base_seq, base_seq); + /* Bump gc counter, it becomes odd, this is the busy mark. */ + gc_seq = READ_ONCE(nft_net->gc_seq); + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); + /* step 3. Start new generation, rules_gen_X now in use. */ net->nft.gencursor = nft_gencursor_next(net); @@ -9442,6 +9657,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) break; case NFT_MSG_DELSET: case NFT_MSG_DESTROYSET: + nft_trans_set(trans)->dead = 1; list_del_rcu(&nft_trans_set(trans)->list); nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), trans->msg_type, GFP_KERNEL); @@ -9544,6 +9760,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); nf_tables_commit_audit_log(&adl, nft_net->base_seq); + + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); nf_tables_commit_release(net); return 0; @@ -10578,6 +10796,7 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&nft_net->notify_list); mutex_init(&nft_net->commit_mutex); nft_net->base_seq = 1; + nft_net->gc_seq = 0; nft_net->validate_state = NFT_VALIDATE_SKIP; return 0; @@ -10607,10 +10826,16 @@ static void __net_exit nf_tables_exit_net(struct net *net) WARN_ON_ONCE(!list_empty(&nft_net->notify_list)); } +static void nf_tables_exit_batch(struct list_head *net_exit_list) +{ + flush_work(&trans_gc_work); +} + static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, .pre_exit = nf_tables_pre_exit_net, .exit = nf_tables_exit_net, + .exit_batch = nf_tables_exit_batch, .id = &nf_tables_net_id, .size = sizeof(struct nftables_pernet), }; @@ -10682,6 +10907,7 @@ static void __exit nf_tables_module_exit(void) nft_chain_filter_fini(); nft_chain_route_fini(); unregister_pernet_subsys(&nf_tables_net_ops); + cancel_work_sync(&trans_gc_work); cancel_work_sync(&trans_destroy_work); rcu_barrier(); rhltable_destroy(&nft_objname_ht); From 1c720353e3073046f685fd53698f4129264f9556 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:21:07 +0100 Subject: [PATCH 55/93] netfilter: nft_set_rbtree: fix overlap expiration walk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-155012 cve CVE-2023-53304 commit-author Florian Westphal commit f718863aca469a109895cb855e6b81fff4827d71 The lazy gc on insert that should remove timed-out entries fails to release the other half of the interval, if any. Can be reproduced with tests/shell/testcases/sets/0044interval_overlap_0 in nftables.git and kmemleak enabled kernel. Second bug is the use of rbe_prev vs. prev pointer. If rbe_prev() returns NULL after at least one iteration, rbe_prev points to element that is not an end interval, hence it should not be removed. Lastly, check the genmask of the end interval if this is active in the current generation. 
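For illustration only (simplified from the function changed below, not an extra hunk): the old walk updated rbe_prev on every iteration, so once prev ran out the stale pointer was still dereferenced and erased even though no end interval had been found:

	rbe_prev = NULL;
	while (prev) {
		rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
		if (nft_rbtree_interval_end(rbe_prev))
			break;
		prev = rb_prev(prev);		/* may become NULL ... */
	}

	if (rbe_prev) {				/* ... yet rbe_prev still points at the last
						 * element visited, which is not an end
						 * interval and must not be erased
						 */
		rb_erase(&rbe_prev->node, &priv->root);
		atomic_dec(&set->nelems);
	}

The fix below tests prev instead and re-derives rbe_prev from it only when an active end interval was actually found.
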
Fixes: c9e6978e2725 ("netfilter: nft_set_rbtree: Switch to node list walk for overlap detection") Signed-off-by: Florian Westphal (cherry picked from commit f718863aca469a109895cb855e6b81fff4827d71) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_set_rbtree.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index ca4d684ffec4a..39956e5341c9e 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -217,29 +217,37 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, static int nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, - struct nft_rbtree_elem *rbe) + struct nft_rbtree_elem *rbe, + u8 genmask) { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); - struct nft_rbtree_elem *rbe_prev = NULL; + struct nft_rbtree_elem *rbe_prev; struct nft_set_gc_batch *gcb; gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC); if (!gcb) return -ENOMEM; - /* search for expired end interval coming before this element. */ + /* search for end interval coming before this element. + * end intervals don't carry a timeout extension, they + * are coupled with the interval start element. + */ while (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); - if (nft_rbtree_interval_end(rbe_prev)) + if (nft_rbtree_interval_end(rbe_prev) && + nft_set_elem_active(&rbe_prev->ext, genmask)) break; prev = rb_prev(prev); } - if (rbe_prev) { + if (prev) { + rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); + rb_erase(&rbe_prev->node, &priv->root); atomic_dec(&set->nelems); + nft_set_gc_batch_add(gcb, rbe_prev); } rb_erase(&rbe->node, &priv->root); @@ -321,7 +329,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, /* perform garbage collection to avoid bogus overlap reports. */ if (nft_set_elem_expired(&rbe->ext)) { - err = nft_rbtree_gc_elem(set, priv, rbe); + err = nft_rbtree_gc_elem(set, priv, rbe, genmask); if (err < 0) return err; From 37541a7ea34a15e34f2b41340a050c6e6775b513 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:39:20 +0100 Subject: [PATCH 56/93] netfilter: nf_tables: skip immediate deactivate in _PREPARE_ERROR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-6613 cve CVE-2023-4015 commit-author Pablo Neira Ayuso commit 0a771f7b266b02d262900c75f1e175c7fe76fec2 On error when building the rule, the immediate expression unbinds the chain, hence objects can be deactivated by the transaction records. 
Otherwise, it is possible to trigger the following warning: WARNING: CPU: 3 PID: 915 at net/netfilter/nf_tables_api.c:2013 nf_tables_chain_destroy+0x1f7/0x210 [nf_tables] CPU: 3 PID: 915 Comm: chain-bind-err- Not tainted 6.1.39 #1 RIP: 0010:nf_tables_chain_destroy+0x1f7/0x210 [nf_tables] Fixes: 4bedf9eee016 ("netfilter: nf_tables: fix chain binding transaction logic") Reported-by: Kevin Rich Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal (cherry picked from commit 0a771f7b266b02d262900c75f1e175c7fe76fec2) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_immediate.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 11a39289fe49b..5e64c4eb5e96e 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -125,15 +125,27 @@ static void nft_immediate_activate(const struct nft_ctx *ctx, return nft_data_hold(&priv->data, nft_dreg_to_type(priv->dreg)); } +static void nft_immediate_chain_deactivate(const struct nft_ctx *ctx, + struct nft_chain *chain, + enum nft_trans_phase phase) +{ + struct nft_ctx chain_ctx; + struct nft_rule *rule; + + chain_ctx = *ctx; + chain_ctx.chain = chain; + + list_for_each_entry(rule, &chain->rules, list) + nft_rule_expr_deactivate(&chain_ctx, rule, phase); +} + static void nft_immediate_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); const struct nft_data *data = &priv->data; - struct nft_ctx chain_ctx; struct nft_chain *chain; - struct nft_rule *rule; if (priv->dreg == NFT_REG_VERDICT) { switch (data->verdict.code) { @@ -143,20 +155,17 @@ static void nft_immediate_deactivate(const struct nft_ctx *ctx, if (!nft_chain_binding(chain)) break; - chain_ctx = *ctx; - chain_ctx.chain = chain; - - list_for_each_entry(rule, &chain->rules, list) - nft_rule_expr_deactivate(&chain_ctx, rule, phase); - switch (phase) { case NFT_TRANS_PREPARE_ERROR: nf_tables_unbind_chain(ctx, chain); - fallthrough; + nft_deactivate_next(ctx->net, chain); + break; case NFT_TRANS_PREPARE: + nft_immediate_chain_deactivate(ctx, chain, phase); nft_deactivate_next(ctx->net, chain); break; default: + nft_immediate_chain_deactivate(ctx, chain, phase); nft_chain_del(chain); chain->bound = false; nft_use_dec(&chain->table->use); From 893218018f7dc8ddf69f2162e68d2a66be88d0e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:39:50 +0100 Subject: [PATCH 57/93] netfilter: nf_tables: adapt set backend to use GC transaction API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-158865 cve CVE-2023-52923 commit-author Pablo Neira Ayuso commit f6c383b8c31a93752a52697f8430a71dcbc46adf Use the GC transaction API to replace the old and buggy gc API and the busy mark approach. No set elements are removed from async garbage collection anymore, instead the _DEAD bit is set on so the set element is not visible from lookup path anymore. Async GC enqueues transaction work that might be aborted and retried later. rbtree and pipapo set backends does not set on the _DEAD bit from the sync GC path since this runs in control plane path where mutex is held. In this case, set elements are deactivated, removed and then released via RCU callback, sync GC never fails. 
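The synchronous pattern used from the control plane paths of rbtree and pipapo boils down to the following sketch; the backend-specific unlink of the element from its data structure is elided here and the error handling is abbreviated:

	gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);	/* seq 0: commit mutex is held */
	if (!gc)
		return;
	...
	/* mutex held: deactivate and unlink directly, no _DEAD bit needed */
	nft_setelem_data_deactivate(net, set, &elem);
	/* backend-specific removal of elem from the set representation */

	gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
	if (WARN_ON_ONCE(!gc))
		return;
	nft_trans_gc_elem_add(gc, elem.priv);
	...
	nft_trans_gc_queue_sync_done(gc);		/* elements released via call_rcu() */
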
Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") Fixes: 9d0982927e79 ("netfilter: nft_hash: add support for timeouts") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit f6c383b8c31a93752a52697f8430a71dcbc46adf) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 7 +- net/netfilter/nft_set_hash.c | 77 +++++++++++------- net/netfilter/nft_set_pipapo.c | 48 ++++++++--- net/netfilter/nft_set_rbtree.c | 144 ++++++++++++++++++++------------- 4 files changed, 173 insertions(+), 103 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index bb3fe2562e27b..14ec9ceaf9b1f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6147,7 +6147,6 @@ static void nft_setelem_activate(struct net *net, struct nft_set *set, if (nft_setelem_is_catchall(set, elem)) { nft_set_elem_change_active(net, set, ext); - nft_set_elem_clear_busy(ext); } else { set->ops->activate(net, set, elem); } @@ -6162,8 +6161,7 @@ static int nft_setelem_catchall_deactivate(const struct net *net, list_for_each_entry(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_is_active(net, ext) || - nft_set_elem_mark_busy(ext)) + if (!nft_is_active(net, ext)) continue; kfree(elem->priv); @@ -6874,8 +6872,7 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx, list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_set_elem_active(ext, genmask) || - nft_set_elem_mark_busy(ext)) + if (!nft_set_elem_active(ext, genmask)) continue; elem.priv = catchall->elem; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 24caa31fa2310..2f067e4596b02 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -59,6 +59,8 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg, if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen)) return 1; + if (nft_set_elem_is_dead(&he->ext)) + return 1; if (nft_set_elem_expired(&he->ext)) return 1; if (!nft_set_elem_active(&he->ext, x->genmask)) @@ -188,7 +190,6 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set, struct nft_rhash_elem *he = elem->priv; nft_set_elem_change_active(net, set, &he->ext); - nft_set_elem_clear_busy(&he->ext); } static bool nft_rhash_flush(const struct net *net, @@ -196,12 +197,9 @@ static bool nft_rhash_flush(const struct net *net, { struct nft_rhash_elem *he = priv; - if (!nft_set_elem_mark_busy(&he->ext) || - !nft_is_active(net, &he->ext)) { - nft_set_elem_change_active(net, set, &he->ext); - return true; - } - return false; + nft_set_elem_change_active(net, set, &he->ext); + + return true; } static void *nft_rhash_deactivate(const struct net *net, @@ -218,9 +216,8 @@ static void *nft_rhash_deactivate(const struct net *net, rcu_read_lock(); he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); - if (he != NULL && - !nft_rhash_flush(net, set, he)) - he = NULL; + if (he) + nft_set_elem_change_active(net, set, &he->ext); rcu_read_unlock(); @@ -312,25 +309,48 @@ static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set, static void nft_rhash_gc(struct work_struct *work) { + struct nftables_pernet *nft_net; struct nft_set *set; struct nft_rhash_elem *he; struct nft_rhash *priv; - struct nft_set_gc_batch *gcb = NULL; struct rhashtable_iter hti; + struct nft_trans_gc *gc; + struct net 
*net; + u32 gc_seq; priv = container_of(work, struct nft_rhash, gc_work.work); set = nft_set_container_of(priv); + net = read_pnet(&set->net); + nft_net = nft_pernet(net); + gc_seq = READ_ONCE(nft_net->gc_seq); + + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + if (!gc) + goto done; rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { - if (PTR_ERR(he) != -EAGAIN) - break; + if (PTR_ERR(he) != -EAGAIN) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; + } continue; } + /* Ruleset has been updated, try later. */ + if (READ_ONCE(nft_net->gc_seq) != gc_seq) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; + } + + if (nft_set_elem_is_dead(&he->ext)) + goto dead_elem; + if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPRESSIONS) && nft_rhash_expr_needs_gc_run(set, &he->ext)) goto needs_gc_run; @@ -338,26 +358,26 @@ static void nft_rhash_gc(struct work_struct *work) if (!nft_set_elem_expired(&he->ext)) continue; needs_gc_run: - if (nft_set_elem_mark_busy(&he->ext)) - continue; + nft_set_elem_dead(&he->ext); +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (gcb == NULL) - break; - rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, he); + nft_trans_gc_elem_add(gc, he); } + + gc = nft_trans_gc_catchall(gc, gc_seq); + +try_later: + /* catchall list iteration requires rcu read side lock. */ rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); - he = nft_set_catchall_gc(set); - if (he) { - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (gcb) - nft_set_gc_batch_add(gcb, he); - } - nft_set_gc_batch_complete(gcb); + if (gc) + nft_trans_gc_queue_async_done(gc); + +done: queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); } @@ -420,7 +440,6 @@ static void nft_rhash_destroy(const struct nft_ctx *ctx, }; cancel_delayed_work_sync(&priv->gc_work); - rcu_barrier(); rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy, (void *)&rhash_ctx); } diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index d6e5fef2d085a..590aeddcfc86a 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1537,16 +1537,34 @@ static void pipapo_drop(struct nft_pipapo_match *m, } } +static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, + struct nft_pipapo_elem *e) + +{ + struct nft_set_elem elem = { + .priv = e, + }; + + nft_setelem_data_deactivate(net, set, &elem); +} + /** * pipapo_gc() - Drop expired entries from set, destroy start and end elements * @set: nftables API set representation * @m: Matching data */ -static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) +static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) { + struct nft_set *set = (struct nft_set *) _set; struct nft_pipapo *priv = nft_set_priv(set); + struct net *net = read_pnet(&set->net); int rules_f0, first_rule = 0; struct nft_pipapo_elem *e; + struct nft_trans_gc *gc; + + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); + if (!gc) + return; while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; @@ -1570,13 +1588,20 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) f--; i--; e = f->mt[rulemap[i].to].e; - if 
(nft_set_elem_expired(&e->ext) && - !nft_set_elem_mark_busy(&e->ext)) { + + /* synchronous gc never fails, there is no need to set on + * NFT_SET_ELEM_DEAD_BIT. + */ + if (nft_set_elem_expired(&e->ext)) { priv->dirty = true; - pipapo_drop(m, rulemap); - rcu_barrier(); - nft_set_elem_destroy(set, e, true); + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (!gc) + break; + + nft_pipapo_gc_deactivate(net, set, e); + pipapo_drop(m, rulemap); + nft_trans_gc_elem_add(gc, e); /* And check again current first rule, which is now the * first we haven't checked. @@ -1586,11 +1611,11 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) } } - e = nft_set_catchall_gc(set); - if (e) - nft_set_elem_destroy(set, e, true); - - priv->last_gc = jiffies; + gc = nft_trans_gc_catchall(gc, 0); + if (gc) { + nft_trans_gc_queue_sync_done(gc); + priv->last_gc = jiffies; + } } /** @@ -1715,7 +1740,6 @@ static void nft_pipapo_activate(const struct net *net, return; nft_set_elem_change_active(net, set, &e->ext); - nft_set_elem_clear_busy(&e->ext); } /** diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 39956e5341c9e..f9d4c8fcbbf82 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -46,6 +46,12 @@ static int nft_rbtree_cmp(const struct nft_set *set, set->klen); } +static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) +{ + return nft_set_elem_expired(&rbe->ext) || + nft_set_elem_is_dead(&rbe->ext); +} + static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext, unsigned int seq) @@ -80,7 +86,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set continue; } - if (nft_set_elem_expired(&rbe->ext)) + if (nft_rbtree_elem_expired(rbe)) return false; if (nft_rbtree_interval_end(rbe)) { @@ -98,7 +104,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && - !nft_set_elem_expired(&interval->ext) && + !nft_rbtree_elem_expired(interval) && nft_rbtree_interval_start(interval)) { *ext = &interval->ext; return true; @@ -215,6 +221,18 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, return rbe; } +static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + struct nft_set_elem elem = { + .priv = rbe, + }; + + nft_setelem_data_deactivate(net, set, &elem); + rb_erase(&rbe->node, &priv->root); +} + static int nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, struct nft_rbtree_elem *rbe, @@ -222,11 +240,12 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); + struct net *net = read_pnet(&set->net); struct nft_rbtree_elem *rbe_prev; - struct nft_set_gc_batch *gcb; + struct nft_trans_gc *gc; - gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC); - if (!gcb) + gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); + if (!gc) return -ENOMEM; /* search for end interval coming before this element. 
@@ -244,17 +263,28 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, if (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); + nft_rbtree_gc_remove(net, set, priv, rbe_prev); - rb_erase(&rbe_prev->node, &priv->root); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe_prev); + /* There is always room in this trans gc for this element, + * memory allocation never actually happens, hence, the warning + * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT, + * this is synchronous gc which never fails. + */ + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (WARN_ON_ONCE(!gc)) + return -ENOMEM; + + nft_trans_gc_elem_add(gc, rbe_prev); } - rb_erase(&rbe->node, &priv->root); - atomic_dec(&set->nelems); + nft_rbtree_gc_remove(net, set, priv, rbe); + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (WARN_ON_ONCE(!gc)) + return -ENOMEM; + + nft_trans_gc_elem_add(gc, rbe); - nft_set_gc_batch_add(gcb, rbe); - nft_set_gc_batch_complete(gcb); + nft_trans_gc_queue_sync_done(gc); return 0; } @@ -482,7 +512,6 @@ static void nft_rbtree_activate(const struct net *net, struct nft_rbtree_elem *rbe = elem->priv; nft_set_elem_change_active(net, set, &rbe->ext); - nft_set_elem_clear_busy(&rbe->ext); } static bool nft_rbtree_flush(const struct net *net, @@ -490,12 +519,9 @@ static bool nft_rbtree_flush(const struct net *net, { struct nft_rbtree_elem *rbe = priv; - if (!nft_set_elem_mark_busy(&rbe->ext) || - !nft_is_active(net, &rbe->ext)) { - nft_set_elem_change_active(net, set, &rbe->ext); - return true; - } - return false; + nft_set_elem_change_active(net, set, &rbe->ext); + + return true; } static void *nft_rbtree_deactivate(const struct net *net, @@ -570,26 +596,40 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, static void nft_rbtree_gc(struct work_struct *work) { - struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL; - struct nft_set_gc_batch *gcb = NULL; + struct nft_rbtree_elem *rbe, *rbe_end = NULL; + struct nftables_pernet *nft_net; struct nft_rbtree *priv; + struct nft_trans_gc *gc; struct rb_node *node; struct nft_set *set; + unsigned int gc_seq; struct net *net; - u8 genmask; priv = container_of(work, struct nft_rbtree, gc_work.work); set = nft_set_container_of(priv); net = read_pnet(&set->net); - genmask = nft_genmask_cur(net); + nft_net = nft_pernet(net); + gc_seq = READ_ONCE(nft_net->gc_seq); + + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + if (!gc) + goto done; write_lock_bh(&priv->lock); write_seqcount_begin(&priv->count); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { + + /* Ruleset has been updated, try later. 
*/ + if (READ_ONCE(nft_net->gc_seq) != gc_seq) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; + } + rbe = rb_entry(node, struct nft_rbtree_elem, node); - if (!nft_set_elem_active(&rbe->ext, genmask)) - continue; + if (nft_set_elem_is_dead(&rbe->ext)) + goto dead_elem; /* elements are reversed in the rbtree for historical reasons, * from highest to lowest value, that is why end element is @@ -602,46 +642,36 @@ static void nft_rbtree_gc(struct work_struct *work) if (!nft_set_elem_expired(&rbe->ext)) continue; - if (nft_set_elem_mark_busy(&rbe->ext)) { - rbe_end = NULL; + nft_set_elem_dead(&rbe->ext); + + if (!rbe_end) continue; - } - if (rbe_prev) { - rb_erase(&rbe_prev->node, &priv->root); - rbe_prev = NULL; - } - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (!gcb) - break; + nft_set_elem_dead(&rbe_end->ext); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe); - rbe_prev = rbe; + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; - if (rbe_end) { - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe_end); - rb_erase(&rbe_end->node, &priv->root); - rbe_end = NULL; - } - node = rb_next(node); - if (!node) - break; + nft_trans_gc_elem_add(gc, rbe_end); + rbe_end = NULL; +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; + + nft_trans_gc_elem_add(gc, rbe); } - if (rbe_prev) - rb_erase(&rbe_prev->node, &priv->root); + + gc = nft_trans_gc_catchall(gc, gc_seq); + +try_later: write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); - rbe = nft_set_catchall_gc(set); - if (rbe) { - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (gcb) - nft_set_gc_batch_add(gcb, rbe); - } - nft_set_gc_batch_complete(gcb); - + if (gc) + nft_trans_gc_queue_async_done(gc); +done: queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); } From 5176fe9324d34fdee5bf78dd9ab75dc240953623 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:43:46 +0100 Subject: [PATCH 58/93] netfilter: nft_set_hash: mark set element as dead when deleting from packet path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Pablo Neira Ayuso commit c92db3030492b8ad1d0faace7a93bbcf53850d0c Set on the NFT_SET_ELEM_DEAD_BIT flag on this element, instead of performing element removal which might race with an ongoing transaction. Enable gc when dynamic flag is set on since dynset deletion requires garbage collection after this patch. 
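For reference, the dead-bit helpers this patch and the rest of the series rely on are introduced earlier in the series; the sketch below shows their assumed shape (they operate on the genmask word of struct nft_set_ext, exact bodies are an approximation, not part of this patch):

static inline void nft_set_elem_dead(struct nft_set_ext *ext)
{
        unsigned long *word = (unsigned long *)ext;

        /* genmask must be the first word so the bit ops land on it */
        BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);
        set_bit(NFT_SET_ELEM_DEAD_BIT, word);
}

static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext)
{
        unsigned long *word = (unsigned long *)ext;

        BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);
        return test_bit(NFT_SET_ELEM_DEAD_BIT, word);
}

With these in place, the packet-path delete below only flips a bit; the async GC worker performs the actual removal, which is why GC must also run for NFT_SET_EVAL sets.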
Fixes: d0a8d877da97 ("netfilter: nft_dynset: support for element deletion") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit c92db3030492b8ad1d0faace7a93bbcf53850d0c) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_set_hash.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 2f067e4596b02..cef5df8460009 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -249,7 +249,9 @@ static bool nft_rhash_delete(const struct nft_set *set, if (he == NULL) return false; - return rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params) == 0; + nft_set_elem_dead(&he->ext); + + return true; } static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, @@ -412,7 +414,7 @@ static int nft_rhash_init(const struct nft_set *set, return err; INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc); - if (set->flags & NFT_SET_TIMEOUT) + if (set->flags & (NFT_SET_TIMEOUT | NFT_SET_EVAL)) nft_rhash_gc_init(set); return 0; From ca6e9cf81866a5d3f0ba66f5f76bd232bfced61a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:44:28 +0100 Subject: [PATCH 59/93] netfilter: nf_tables: remove busy mark and gc batch API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Pablo Neira Ayuso commit a2dd0233cbc4d8a0abb5f64487487ffc9265beb5 Ditch it, it has been replace it by the GC transaction API and it has no clients anymore. Signed-off-by: Pablo Neira Ayuso (cherry picked from commit a2dd0233cbc4d8a0abb5f64487487ffc9265beb5) Signed-off-by: Marcin Wcisło --- include/net/netfilter/nf_tables.h | 98 +------------------------------ net/netfilter/nf_tables_api.c | 48 +-------------- 2 files changed, 4 insertions(+), 142 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 662238596070c..7367efa7bd70f 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -562,7 +562,6 @@ struct nft_set *nft_set_lookup_global(const struct net *net, struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, const struct nft_set *set); -void *nft_set_catchall_gc(const struct nft_set *set); static inline unsigned long nft_set_gc_interval(const struct nft_set *set) { @@ -779,62 +778,6 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, const struct nft_set *set, void *elem); -/** - * struct nft_set_gc_batch_head - nf_tables set garbage collection batch - * - * @rcu: rcu head - * @set: set the elements belong to - * @cnt: count of elements - */ -struct nft_set_gc_batch_head { - struct rcu_head rcu; - const struct nft_set *set; - unsigned int cnt; -}; - -#define NFT_SET_GC_BATCH_SIZE ((PAGE_SIZE - \ - sizeof(struct nft_set_gc_batch_head)) / \ - sizeof(void *)) - -/** - * struct nft_set_gc_batch - nf_tables set garbage collection batch - * - * @head: GC batch head - * @elems: garbage collection elements - */ -struct nft_set_gc_batch { - struct nft_set_gc_batch_head head; - void *elems[NFT_SET_GC_BATCH_SIZE]; -}; - -struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, - gfp_t gfp); -void nft_set_gc_batch_release(struct rcu_head *rcu); - -static inline void nft_set_gc_batch_complete(struct nft_set_gc_batch *gcb) -{ - if (gcb != NULL) - call_rcu(&gcb->head.rcu, nft_set_gc_batch_release); -} - -static inline struct 
nft_set_gc_batch * -nft_set_gc_batch_check(const struct nft_set *set, struct nft_set_gc_batch *gcb, - gfp_t gfp) -{ - if (gcb != NULL) { - if (gcb->head.cnt + 1 < ARRAY_SIZE(gcb->elems)) - return gcb; - nft_set_gc_batch_complete(gcb); - } - return nft_set_gc_batch_alloc(set, gfp); -} - -static inline void nft_set_gc_batch_add(struct nft_set_gc_batch *gcb, - void *elem) -{ - gcb->elems[gcb->head.cnt++] = elem; -} - struct nft_expr_ops; /** * struct nft_expr_type - nf_tables expression type @@ -1498,47 +1441,12 @@ static inline void nft_set_elem_change_active(const struct net *net, #endif /* IS_ENABLED(CONFIG_NF_TABLES) */ -/* - * We use a free bit in the genmask field to indicate the element - * is busy, meaning it is currently being processed either by - * the netlink API or GC. - * - * Even though the genmask is only a single byte wide, this works - * because the extension structure if fully constant once initialized, - * so there are no non-atomic write accesses unless it is already - * marked busy. - */ -#define NFT_SET_ELEM_BUSY_MASK (1 << 2) - -#if defined(__LITTLE_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_BUSY_BIT 2 -#elif defined(__BIG_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_BUSY_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) -#else -#error -#endif - -static inline int nft_set_elem_mark_busy(struct nft_set_ext *ext) -{ - unsigned long *word = (unsigned long *)ext; - - BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); - return test_and_set_bit(NFT_SET_ELEM_BUSY_BIT, word); -} - -static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) -{ - unsigned long *word = (unsigned long *)ext; - - clear_bit(NFT_SET_ELEM_BUSY_BIT, word); -} - -#define NFT_SET_ELEM_DEAD_MASK (1 << 3) +#define NFT_SET_ELEM_DEAD_MASK (1 << 2) #if defined(__LITTLE_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_DEAD_BIT 3 +#define NFT_SET_ELEM_DEAD_BIT 2 #elif defined(__BIG_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 3) +#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) #else #error #endif diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 14ec9ceaf9b1f..a606e28a94177 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6063,29 +6063,6 @@ struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, } EXPORT_SYMBOL_GPL(nft_set_catchall_lookup); -void *nft_set_catchall_gc(const struct nft_set *set) -{ - struct nft_set_elem_catchall *catchall, *next; - struct nft_set_ext *ext; - void *elem = NULL; - - list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { - ext = nft_set_elem_ext(set, catchall->elem); - - if (!nft_set_elem_expired(ext) || - nft_set_elem_mark_busy(ext)) - continue; - - elem = catchall->elem; - list_del_rcu(&catchall->list); - kfree_rcu(catchall, rcu); - break; - } - - return elem; -} -EXPORT_SYMBOL_GPL(nft_set_catchall_gc); - static int nft_setelem_catchall_insert(const struct net *net, struct nft_set *set, const struct nft_set_elem *elem, @@ -6556,7 +6533,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err_elem_free; } - ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; + ext->genmask = nft_genmask_cur(ctx->net); err = nft_setelem_insert(ctx->net, set, &elem, &ext2, flags); if (err) { @@ -6946,29 +6923,6 @@ static int nf_tables_delsetelem(struct sk_buff *skb, return err; } -void nft_set_gc_batch_release(struct rcu_head *rcu) -{ - struct nft_set_gc_batch *gcb; - unsigned int i; - - gcb = container_of(rcu, struct 
nft_set_gc_batch, head.rcu); - for (i = 0; i < gcb->head.cnt; i++) - nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true); - kfree(gcb); -} - -struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, - gfp_t gfp) -{ - struct nft_set_gc_batch *gcb; - - gcb = kzalloc(sizeof(*gcb), gfp); - if (gcb == NULL) - return gcb; - gcb->head.set = set; - return gcb; -} - /* * Stateful objects */ From a0a91f71d3ea17d66e9578aa9cbc5f052a086ca7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:44:33 +0100 Subject: [PATCH 60/93] netfilter: nf_tables: fix false-positive lockdep splat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Florian Westphal commit b9f052dc68f69dac89fe1e24693354c033daa091 ->abort invocation may cause splat on debug kernels: WARNING: suspicious RCU usage net/netfilter/nft_set_pipapo.c:1697 suspicious rcu_dereference_check() usage! [..] rcu_scheduler_active = 2, debug_locks = 1 1 lock held by nft/133554: [..] (nft_net->commit_mutex){+.+.}-{3:3}, at: nf_tables_valid_genid [..] lockdep_rcu_suspicious+0x1ad/0x260 nft_pipapo_abort+0x145/0x180 __nf_tables_abort+0x5359/0x63d0 nf_tables_abort+0x24/0x40 nfnetlink_rcv+0x1a0a/0x22c0 netlink_unicast+0x73c/0x900 netlink_sendmsg+0x7f0/0xc20 ____sys_sendmsg+0x48d/0x760 Transaction mutex is held, so parallel updates are not possible. Switch to _protected and check mutex is held for lockdep enabled builds. Fixes: 212ed75dc5fb ("netfilter: nf_tables: integrate pipapo into commit protocol") Signed-off-by: Florian Westphal (cherry picked from commit b9f052dc68f69dac89fe1e24693354c033daa091) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_set_pipapo.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 590aeddcfc86a..5b8ef50f6dcf1 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1698,6 +1698,17 @@ static void nft_pipapo_commit(const struct nft_set *set) priv->clone = new_clone; } +static bool nft_pipapo_transaction_mutex_held(const struct nft_set *set) +{ +#ifdef CONFIG_PROVE_LOCKING + const struct net *net = read_pnet(&set->net); + + return lockdep_is_held(&nft_pernet(net)->commit_mutex); +#else + return true; +#endif +} + static void nft_pipapo_abort(const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); @@ -1706,7 +1717,7 @@ static void nft_pipapo_abort(const struct nft_set *set) if (!priv->dirty) return; - m = rcu_dereference(priv->match); + m = rcu_dereference_protected(priv->match, nft_pipapo_transaction_mutex_held(set)); new_clone = pipapo_clone(m); if (IS_ERR(new_clone)) From fa956eac85fe5e8ad6094e54b803745974207120 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:44:48 +0100 Subject: [PATCH 61/93] netfilter: nf_tables: fix kdoc warnings after gc rework MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Florian Westphal commit 08713cb006b6f07434f276c5ee214fb20c7fd965 Jakub Kicinski says: We've got some new kdoc warnings here: net/netfilter/nft_set_pipapo.c:1557: warning: Function parameter or member '_set' not described in 'pipapo_gc' net/netfilter/nft_set_pipapo.c:1557: warning: Excess function parameter 'set' description in 'pipapo_gc' include/net/netfilter/nf_tables.h:577: warning: Function parameter or member 'dead' not 
described in 'nft_set' Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20230810104638.746e46f1@kernel.org/ Signed-off-by: Florian Westphal (cherry picked from commit 08713cb006b6f07434f276c5ee214fb20c7fd965) Signed-off-by: Marcin Wcisło --- include/net/netfilter/nf_tables.h | 1 + net/netfilter/nft_set_pipapo.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 7367efa7bd70f..0d1dc61258189 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -497,6 +497,7 @@ struct nft_set_elem_expr { * @expr: stateful expression * @ops: set ops * @flags: set flags + * @dead: set will be freed, never cleared * @genmask: generation mask * @klen: key length * @dlen: data length diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 5b8ef50f6dcf1..3d8be3ce11fdc 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1550,7 +1550,7 @@ static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, /** * pipapo_gc() - Drop expired entries from set, destroy start and end elements - * @set: nftables API set representation + * @_set: nftables API set representation * @m: Matching data */ static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) From 136a919f34264f8f7d46471753a8c38e1a2487d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:45:53 +0100 Subject: [PATCH 62/93] netfilter: nf_tables: deactivate catchall elements in next generation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cve CVE-2023-4569 commit-author Florian Westphal commit 90e5b3462efa37b8bba82d7c4e63683856e188af When flushing, individual set elements are disabled in the next generation via the ->flush callback. Catchall elements are not disabled. This is incorrect and may lead to double-deactivations of catchall elements which then results in memory leaks: WARNING: CPU: 1 PID: 3300 at include/net/netfilter/nf_tables.h:1172 nft_map_deactivate+0x549/0x730 CPU: 1 PID: 3300 Comm: nft Not tainted 6.5.0-rc5+ #60 RIP: 0010:nft_map_deactivate+0x549/0x730 [..] ? nft_map_deactivate+0x549/0x730 nf_tables_delset+0xb66/0xeb0 (the warn is due to nft_use_dec() detecting underflow). 
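For context, the underflow detection mentioned above comes from the use-counter helper; a rough sketch of its assumed shape (not part of this patch):

/* Each deactivation drops the use counter of the objects an element
 * references; deactivating the same catchall element twice drives the
 * counter below zero and trips this warning.
 */
static inline void nft_use_dec(u32 *use)
{
        WARN_ON_ONCE((*use)-- == 0);
}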
Fixes: aaa31047a6d2 ("netfilter: nftables: add catch-all set element support") Reported-by: lonial con Signed-off-by: Florian Westphal (cherry picked from commit 90e5b3462efa37b8bba82d7c4e63683856e188af) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a606e28a94177..0d21d6c88a1c6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6856,6 +6856,7 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx, ret = __nft_set_catchall_flush(ctx, set, &elem); if (ret < 0) break; + nft_set_elem_change_active(ctx->net, set, ext); } return ret; From 394c480060004b6c89510592cb782781315f3c9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:46:50 +0100 Subject: [PATCH 63/93] netfilter: nf_tables: don't fail inserts if duplicate has expired MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-158863 cve CVE-2023-52925 commit-author Florian Westphal commit 7845914f45f066497ac75b30c50dbc735e84e884 nftables selftests fail: run-tests.sh testcases/sets/0044interval_overlap_0 Expected: 0-2 . 0-3, got: W: [FAILED] ./testcases/sets/0044interval_overlap_0: got 1 Insertion must ignore duplicate but expired entries. Moreover, there is a strange asymmetry in nft_pipapo_activate: It refetches the current element, whereas the other ->activate callbacks (bitmap, hash, rhash, rbtree) use elem->priv. Same for .remove: other set implementations take elem->priv, nft_pipapo_remove fetches elem->priv, then does a relookup, remove this. I suspect this was the reason for the change that prompted the removal of the expired check in pipapo_get() in the first place, but skipping exired elements there makes no sense to me, this helper is used for normal get requests, insertions (duplicate check) and deactivate callback. In first two cases expired elements must be skipped. For ->deactivate(), this gets called for DELSETELEM, so it seems to me that expired elements should be skipped as well, i.e. delete request should fail with -ENOENT error. 
Fixes: 24138933b97b ("netfilter: nf_tables: don't skip expired elements during walk") Signed-off-by: Florian Westphal (cherry picked from commit 7845914f45f066497ac75b30c50dbc735e84e884) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_set_pipapo.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 3d8be3ce11fdc..ae3bba6f2cba4 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -566,6 +566,8 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, goto out; if (last) { + if (nft_set_elem_expired(&f->mt[b].e->ext)) + goto next_match; if ((genmask && !nft_set_elem_active(&f->mt[b].e->ext, genmask))) goto next_match; @@ -600,17 +602,8 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { - struct nft_pipapo_elem *ret; - - ret = pipapo_get(net, set, (const u8 *)elem->key.val.data, + return pipapo_get(net, set, (const u8 *)elem->key.val.data, nft_genmask_cur(net)); - if (IS_ERR(ret)) - return ret; - - if (nft_set_elem_expired(&ret->ext)) - return ERR_PTR(-ENOENT); - - return ret; } /** @@ -1744,11 +1737,7 @@ static void nft_pipapo_activate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { - struct nft_pipapo_elem *e; - - e = pipapo_get(net, set, (const u8 *)elem->key.val.data, 0); - if (IS_ERR(e)) - return; + struct nft_pipapo_elem *e = elem->priv; nft_set_elem_change_active(net, set, &e->ext); } @@ -1962,10 +1951,6 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, data = (const u8 *)nft_set_ext_key(&e->ext); - e = pipapo_get(net, set, data, 0); - if (IS_ERR(e)) - return; - while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const u8 *match_start, *match_end; From ca02f6eceafabb4fc83af86042eacc609bfb77d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:47:17 +0100 Subject: [PATCH 64/93] netfilter: nf_tables: fix GC transaction races with netns and netlink event exit path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Pablo Neira Ayuso commit 6a33d8b73dfac0a41f3877894b38082bd0c9a5bc Netlink event path is missing a synchronization point with GC transactions. Add GC sequence number update to netns release path and netlink event path, any GC transaction losing race will be discarded. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal (cherry picked from commit 6a33d8b73dfac0a41f3877894b38082bd0c9a5bc) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 36 +++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0d21d6c88a1c6..df09016ecc1c5 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9428,6 +9428,22 @@ static void nft_set_commit_update(struct list_head *set_update_list) } } +static unsigned int nft_gc_seq_begin(struct nftables_pernet *nft_net) +{ + unsigned int gc_seq; + + /* Bump gc counter, it becomes odd, this is the busy mark. 
*/ + gc_seq = READ_ONCE(nft_net->gc_seq); + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); + + return gc_seq; +} + +static void nft_gc_seq_end(struct nftables_pernet *nft_net, unsigned int gc_seq) +{ + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); +} + static int nf_tables_commit(struct net *net, struct sk_buff *skb) { struct nftables_pernet *nft_net = nft_pernet(net); @@ -9513,9 +9529,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) WRITE_ONCE(nft_net->base_seq, base_seq); - /* Bump gc counter, it becomes odd, this is the busy mark. */ - gc_seq = READ_ONCE(nft_net->gc_seq); - WRITE_ONCE(nft_net->gc_seq, ++gc_seq); + gc_seq = nft_gc_seq_begin(nft_net); /* step 3. Start new generation, rules_gen_X now in use. */ net->nft.gencursor = nft_gencursor_next(net); @@ -9713,7 +9727,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); nf_tables_commit_audit_log(&adl, nft_net->base_seq); - WRITE_ONCE(nft_net->gc_seq, ++gc_seq); + nft_gc_seq_end(nft_net, gc_seq); nf_tables_commit_release(net); return 0; @@ -10699,6 +10713,7 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, struct net *net = n->net; unsigned int deleted; bool restart = false; + unsigned int gc_seq; if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER) return NOTIFY_DONE; @@ -10706,6 +10721,9 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, nft_net = nft_pernet(net); deleted = 0; mutex_lock(&nft_net->commit_mutex); + + gc_seq = nft_gc_seq_begin(nft_net); + if (!list_empty(&nf_tables_destroy_list)) rcu_barrier(); again: @@ -10728,6 +10746,8 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, if (restart) goto again; } + nft_gc_seq_end(nft_net, gc_seq); + mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; @@ -10766,12 +10786,20 @@ static void __net_exit nf_tables_pre_exit_net(struct net *net) static void __net_exit nf_tables_exit_net(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); + unsigned int gc_seq; mutex_lock(&nft_net->commit_mutex); + + gc_seq = nft_gc_seq_begin(nft_net); + if (!list_empty(&nft_net->commit_list) || !list_empty(&nft_net->module_list)) __nf_tables_abort(net, NFNL_ABORT_NONE); + __nft_release_tables(net); + + nft_gc_seq_end(nft_net, gc_seq); + mutex_unlock(&nft_net->commit_mutex); WARN_ON_ONCE(!list_empty(&nft_net->tables)); WARN_ON_ONCE(!list_empty(&nft_net->module_list)); From 819d0ed978bd966ab401951c910675b376ca8484 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:47:29 +0100 Subject: [PATCH 65/93] netfilter: nf_tables: GC transaction race with netns dismantle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Pablo Neira Ayuso commit 02c6c24402bf1c1e986899c14ba22a10b510916b Use maybe_get_net() since GC workqueue might race with netns exit path. 
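A sketch of the contract being relied on here (simplified from the hunk below, member details assumed):

        struct net *net = read_pnet(&set->net);

        /* get_net() bumps the refcount unconditionally and could
         * resurrect a namespace that is already being dismantled.
         * maybe_get_net() only takes a reference while the refcount
         * is still non-zero and returns NULL otherwise, so a GC
         * transaction that loses the race is simply not allocated.
         */
        if (!maybe_get_net(net))
                return NULL;    /* netns exit path won the race */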
Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal (cherry picked from commit 02c6c24402bf1c1e986899c14ba22a10b510916b) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index df09016ecc1c5..45bafaed34c0b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9170,9 +9170,14 @@ struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, if (!trans) return NULL; + trans->net = maybe_get_net(net); + if (!trans->net) { + kfree(trans); + return NULL; + } + refcount_inc(&set->refs); trans->set = set; - trans->net = get_net(net); trans->seq = gc_seq; return trans; From f6014c33da8808895e913d0d26d03e9b83a8324a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:47:35 +0100 Subject: [PATCH 66/93] netfilter: nft_dynset: disallow object maps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Pablo Neira Ayuso commit 23185c6aed1ffb8fc44087880ba2767aba493779 Do not allow to insert elements from datapath to objects maps. Fixes: 8aeff920dcc9 ("netfilter: nf_tables: add stateful object reference to set elements") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal (cherry picked from commit 23185c6aed1ffb8fc44087880ba2767aba493779) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_dynset.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index c3bd57be2ee88..490e1a26a9c6a 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -191,6 +191,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (IS_ERR(set)) return PTR_ERR(set); + if (set->flags & NFT_SET_OBJECT) + return -EOPNOTSUPP; + if (set->ops->update == NULL) return -EOPNOTSUPP; From 63df5313037e7e0fc8bb42e9cba58e60b094e8b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:47:41 +0100 Subject: [PATCH 67/93] netfilter: nf_tables: flush pending destroy work before netlink notifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Pablo Neira Ayuso commit 2c9f0293280e258606e54ed2b96fa71498432eae Destroy work waits for the RCU grace period then it releases the objects with no mutex held. All releases objects follow this path for transactions, therefore, order is guaranteed and references to top-level objects in the hierarchy remain valid. However, netlink notifier might interfer with pending destroy work. rcu_barrier() is not correct because objects are not release via RCU callback. Flush destroy work before releasing objects from netlink notifier path. 
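The helper used below is assumed to be a thin wrapper around flushing the destroy work item, roughly:

/* Unlike rcu_barrier(), this waits for the destroy work item itself,
 * which is what actually frees the objects after the grace period.
 */
void nf_tables_trans_destroy_flush_work(void)
{
        flush_work(&trans_destroy_work);
}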
Fixes: d4bc8271db21 ("netfilter: nf_tables: netlink notifier might race to release objects") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal (cherry picked from commit 2c9f0293280e258606e54ed2b96fa71498432eae) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 45bafaed34c0b..2545522adb14e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10730,7 +10730,7 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, gc_seq = nft_gc_seq_begin(nft_net); if (!list_empty(&nf_tables_destroy_list)) - rcu_barrier(); + nf_tables_trans_destroy_flush_work(); again: list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table) && From 9ef4dc2d09ef45505acecdb4dd250d8ee9a41315 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:47:44 +0100 Subject: [PATCH 68/93] netfilter: nf_tables: GC transaction race with abort path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Pablo Neira Ayuso commit 720344340fb9be2765bbaab7b292ece0a4570eae Abort path is missing a synchronization point with GC transactions. Add GC sequence number hence any GC transaction losing race will be discarded. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal (cherry picked from commit 720344340fb9be2765bbaab7b292ece0a4570eae) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2545522adb14e..ac1f46c97ac42 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9998,8 +9998,12 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, enum nfnl_abort_action action) { struct nftables_pernet *nft_net = nft_pernet(net); - int ret = __nf_tables_abort(net, action); + unsigned int gc_seq; + int ret; + gc_seq = nft_gc_seq_begin(nft_net); + ret = __nf_tables_abort(net, action); + nft_gc_seq_end(nft_net, gc_seq); mutex_unlock(&nft_net->commit_mutex); return ret; From 4572356c310679b151fe76416a4997f99b8d08dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:47:54 +0100 Subject: [PATCH 69/93] netfilter: nf_tables: use correct lock to protect gc_list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Pablo Neira Ayuso commit 8357bc946a2abc2a10ca40e5a2105d2b4c57515e Use nf_tables_gc_list_lock spinlock, not nf_tables_destroy_list_lock to protect the gc list. 
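Each pending-work list has its own spinlock; a sketch of the assumed producer side that the worker in the hunk below pairs with (names match the diff, bodies are an approximation):

static LIST_HEAD(nf_tables_gc_list);
static DEFINE_SPINLOCK(nf_tables_gc_list_lock);

static void nft_trans_gc_queue_work(struct nft_trans_gc *trans)
{
        /* Async GC workers and the commit path can queue transactions
         * concurrently; the worker splices nf_tables_gc_list under the
         * same lock, so taking the destroy-list lock here would leave
         * the gc list unprotected.
         */
        spin_lock(&nf_tables_gc_list_lock);
        list_add_tail(&trans->list, &nf_tables_gc_list);
        spin_unlock(&nf_tables_gc_list_lock);

        schedule_work(&trans_gc_work);
}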
Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal (cherry picked from commit 8357bc946a2abc2a10ca40e5a2105d2b4c57515e) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ac1f46c97ac42..461a31d1d3113 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9146,9 +9146,9 @@ static void nft_trans_gc_work(struct work_struct *work) struct nft_trans_gc *trans, *next; LIST_HEAD(trans_gc_list); - spin_lock(&nf_tables_destroy_list_lock); + spin_lock(&nf_tables_gc_list_lock); list_splice_init(&nf_tables_gc_list, &trans_gc_list); - spin_unlock(&nf_tables_destroy_list_lock); + spin_unlock(&nf_tables_gc_list_lock); list_for_each_entry_safe(trans, next, &trans_gc_list, list) { list_del(&trans->list); From 3313a980222cf7c1fe12b07bcddbf963314c4665 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:47:56 +0100 Subject: [PATCH 70/93] netfilter: nf_tables: fix out of memory error handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Florian Westphal commit 5e1be4cdc98c989d5387ce94ff15b5ad06a5b681 Several instances of pipapo_resize() don't propagate allocation failures, this causes a crash when fault injection is enabled for gfp_kernel slabs. Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Signed-off-by: Florian Westphal Reviewed-by: Stefano Brivio (cherry picked from commit 5e1be4cdc98c989d5387ce94ff15b5ad06a5b681) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_set_pipapo.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index ae3bba6f2cba4..8ae43164e0b5f 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -902,12 +902,14 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k, int mask_bits) { - int rule = f->rules++, group, ret, bit_offset = 0; + int rule = f->rules, group, ret, bit_offset = 0; - ret = pipapo_resize(f, f->rules - 1, f->rules); + ret = pipapo_resize(f, f->rules, f->rules + 1); if (ret) return ret; + f->rules++; + for (group = 0; group < f->groups; group++) { int i, v; u8 mask; @@ -1052,7 +1054,9 @@ static int pipapo_expand(struct nft_pipapo_field *f, step++; if (step >= len) { if (!masks) { - pipapo_insert(f, base, 0); + err = pipapo_insert(f, base, 0); + if (err < 0) + return err; masks = 1; } goto out; @@ -1235,6 +1239,9 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, else ret = pipapo_expand(f, start, end, f->groups * f->bb); + if (ret < 0) + return ret; + if (f->bsize > bsize_max) bsize_max = f->bsize; From 9744f3c822e0561629a104020a4fb4f97b96e7a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:48:02 +0100 Subject: [PATCH 71/93] netfilter: nf_tables: defer gc run if previous batch is still pending MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Florian Westphal commit 8e51830e29e12670b4c10df070a4ea4c9593e961 Don't queue more gc work, else we may queue the same elements 
multiple times. If an element is flagged as dead, this can mean that either the previous gc request was invalidated/discarded by a transaction or that the previous request is still pending in the system work queue. The latter will happen if the gc interval is set to a very low value, e.g. 1ms, and system work queue is backlogged. The sets refcount is 1 if no previous gc requeusts are queued, so add a helper for this and skip gc run if old requests are pending. Add a helper for this and skip the gc run in this case. Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") Signed-off-by: Florian Westphal Reviewed-by: Pablo Neira Ayuso (cherry picked from commit 8e51830e29e12670b4c10df070a4ea4c9593e961) Signed-off-by: Marcin Wcisło --- include/net/netfilter/nf_tables.h | 5 +++++ net/netfilter/nft_set_hash.c | 3 +++ net/netfilter/nft_set_rbtree.c | 3 +++ 3 files changed, 11 insertions(+) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 0d1dc61258189..ed37e8d2d80b1 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -550,6 +550,11 @@ static inline void *nft_set_priv(const struct nft_set *set) return (void *)set->data; } +static inline bool nft_set_gc_is_pending(const struct nft_set *s) +{ + return refcount_read(&s->refs) != 1; +} + static inline struct nft_set *nft_set_container_of(const void *priv) { return (void *)priv - offsetof(struct nft_set, data); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index cef5df8460009..524763659f251 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -326,6 +326,9 @@ static void nft_rhash_gc(struct work_struct *work) nft_net = nft_pernet(net); gc_seq = READ_ONCE(nft_net->gc_seq); + if (nft_set_gc_is_pending(set)) + goto done; + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); if (!gc) goto done; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index f9d4c8fcbbf82..c6435e7092319 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -611,6 +611,9 @@ static void nft_rbtree_gc(struct work_struct *work) nft_net = nft_pernet(net); gc_seq = READ_ONCE(nft_net->gc_seq); + if (nft_set_gc_is_pending(set)) + goto done; + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); if (!gc) goto done; From ff501e701a59b4ea291f91088842180010954464 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:48:14 +0100 Subject: [PATCH 72/93] netfilter: nft_set_rbtree: skip sync GC for new elements in this transaction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-158866 cve CVE-2023-52433 commit-author Pablo Neira Ayuso commit 2ee52ae94baabf7ee09cf2a8d854b990dac5d0e4 New elements in this transaction might expired before such transaction ends. Skip sync GC for such elements otherwise commit path might walk over an already released object. Once transaction is finished, async GC will collect such expired element. 
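The "new element in this transaction" test in the hunk below relies on the two generation masks; their shape, roughly (assumed, not part of this patch):

/* An element inserted in the current, still uncommitted transaction is
 * only active in the *next* generation.  Requiring activity in the
 * *current* generation therefore keeps such elements out of the
 * synchronous GC performed during insertion.
 */
static inline u8 nft_genmask_cur(const struct net *net)
{
        return 1 << READ_ONCE(net->nft.gencursor);
}

static inline u8 nft_genmask_next(const struct net *net)
{
        return 1 << !READ_ONCE(net->nft.gencursor);
}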
Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal (cherry picked from commit 2ee52ae94baabf7ee09cf2a8d854b990dac5d0e4) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_set_rbtree.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index c6435e7092319..f250b5399344a 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -312,6 +312,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; struct rb_node *node, *next, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); + u8 cur_genmask = nft_genmask_cur(net); u8 genmask = nft_genmask_next(net); int d, err; @@ -357,8 +358,11 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, if (!nft_set_elem_active(&rbe->ext, genmask)) continue; - /* perform garbage collection to avoid bogus overlap reports. */ - if (nft_set_elem_expired(&rbe->ext)) { + /* perform garbage collection to avoid bogus overlap reports + * but skip new elements in this transaction. + */ + if (nft_set_elem_expired(&rbe->ext) && + nft_set_elem_active(&rbe->ext, cur_genmask)) { err = nft_rbtree_gc_elem(set, priv, rbe, genmask); if (err < 0) return err; From 89d6ab215ddc672a62310cd585bac60bac74bd66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:49:18 +0100 Subject: [PATCH 73/93] netfilter: nf_tables: disallow rule removal from chain binding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-34732 cve CVE-2023-5197 commit-author Pablo Neira Ayuso commit f15f29fd4779be8a418b66e9d52979bb6d6c2325 upstream-diff Used the cleanly applying 9.4 backport 29530d262938905b78522612a3acdf63c12820d4 Chain binding only requires the rule addition/insertion command within the same transaction. Removal of rules from chain bindings within the same transaction makes no sense, userspace does not utilize this feature. Replace nft_chain_is_bound() check to nft_chain_binding() in rule deletion commands. Replace command implies a rule deletion, reject this command too. Rule flush command can also safely rely on this nft_chain_binding() check because unbound chains are not allowed since 62e1e94b246e ("netfilter: nf_tables: reject unbound chain set before commit phase"). 
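The distinction between the two predicates is what the change below hinges on; their assumed definitions, for reference:

/* NFT_CHAIN_BINDING marks a chain that may only be used through a rule
 * binding it (e.g. an anonymous chain), whether or not that binding has
 * happened yet.  nft_chain_is_bound() additionally requires that the
 * binding already took place in this transaction.
 */
static inline bool nft_chain_binding(const struct nft_chain *chain)
{
        return chain->flags & NFT_CHAIN_BINDING;
}

static inline bool nft_chain_is_bound(struct nft_chain *chain)
{
        return (chain->flags & NFT_CHAIN_BINDING) && chain->bound;
}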
Fixes: d0e2c7de92c7 ("netfilter: nf_tables: add NFT_CHAIN_BINDING") Reported-by: Kevin Rich Signed-off-by: Pablo Neira Ayuso (cherry picked from commit f15f29fd4779be8a418b66e9d52979bb6d6c2325) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 461a31d1d3113..48283a57af2e0 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1371,7 +1371,7 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, chain)) continue; - if (nft_chain_is_bound(chain)) + if (nft_chain_binding(chain)) continue; ctx->chain = chain; @@ -1416,7 +1416,7 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, chain)) continue; - if (nft_chain_is_bound(chain)) + if (nft_chain_binding(chain)) continue; ctx->chain = chain; @@ -2733,6 +2733,9 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, return PTR_ERR(chain); } + if (nft_chain_binding(chain)) + return -EOPNOTSUPP; + if (info->nlh->nlmsg_flags & NLM_F_NONREC && chain->use > 0) return -EBUSY; @@ -3721,6 +3724,11 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, } if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { + if (nft_chain_binding(chain)) { + err = -EOPNOTSUPP; + goto err_destroy_flow_rule; + } + err = nft_delrule(&ctx, old_rule); if (err < 0) goto err_destroy_flow_rule; @@ -3828,7 +3836,7 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } - if (nft_chain_is_bound(chain)) + if (nft_chain_binding(chain)) return -EOPNOTSUPP; } @@ -3862,7 +3870,7 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, list_for_each_entry(chain, &table->chains, list) { if (!nft_is_active_next(net, chain)) continue; - if (nft_chain_is_bound(chain)) + if (nft_chain_binding(chain)) continue; ctx.chain = chain; @@ -10661,7 +10669,7 @@ static void __nft_release_table(struct net *net, struct nft_table *table) ctx.family = table->family; ctx.table = table; list_for_each_entry(chain, &table->chains, list) { - if (nft_chain_is_bound(chain)) + if (nft_chain_binding(chain)) continue; ctx.chain = chain; From e7ebc3994570b232cfece86bd518f1f8977f1062 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:50:13 +0100 Subject: [PATCH 74/93] netfilter: nft_set_pipapo: call nft_trans_gc_queue_sync() in catchall GC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Pablo Neira Ayuso commit 4a9e12ea7e70223555ec010bec9f711089ce96f6 pipapo needs to enqueue GC transactions for catchall elements through nft_trans_gc_queue_sync(). Add nft_trans_gc_catchall_sync() and nft_trans_gc_catchall_async() to handle GC transaction queueing accordingly. 
Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 4a9e12ea7e70223555ec010bec9f711089ce96f6) Signed-off-by: Marcin Wcisło --- include/net/netfilter/nf_tables.h | 5 +++-- net/netfilter/nf_tables_api.c | 22 +++++++++++++++++++--- net/netfilter/nft_set_hash.c | 2 +- net/netfilter/nft_set_pipapo.c | 2 +- net/netfilter/nft_set_rbtree.c | 2 +- 5 files changed, 25 insertions(+), 8 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index ed37e8d2d80b1..623ab9f9afb2f 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1629,8 +1629,9 @@ void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv); -struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, - unsigned int gc_seq); +struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, + unsigned int gc_seq); +struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc); void nft_setelem_data_deactivate(const struct net *net, const struct nft_set *set, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 48283a57af2e0..cccc22f66b3aa 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9256,8 +9256,9 @@ void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) call_rcu(&trans->rcu, nft_trans_gc_trans_free); } -struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, - unsigned int gc_seq) +static struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, + unsigned int gc_seq, + bool sync) { struct nft_set_elem_catchall *catchall; const struct nft_set *set = gc->set; @@ -9273,7 +9274,11 @@ struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, nft_set_elem_dead(ext); dead_elem: - gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (sync) + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + else + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) return NULL; @@ -9283,6 +9288,17 @@ struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, return gc; } +struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, + unsigned int gc_seq) +{ + return nft_trans_gc_catchall(gc, gc_seq, false); +} + +struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc) +{ + return nft_trans_gc_catchall(gc, 0, true); +} + static void nf_tables_module_autoload_cleanup(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 524763659f251..eca20dc601384 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -372,7 +372,7 @@ static void nft_rhash_gc(struct work_struct *work) nft_trans_gc_elem_add(gc, he); } - gc = nft_trans_gc_catchall(gc, gc_seq); + gc = nft_trans_gc_catchall_async(gc, gc_seq); try_later: /* catchall list iteration requires rcu read side lock. 
*/ diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 8ae43164e0b5f..929edbcbdec47 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1611,7 +1611,7 @@ static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) } } - gc = nft_trans_gc_catchall(gc, 0); + gc = nft_trans_gc_catchall_sync(gc); if (gc) { nft_trans_gc_queue_sync_done(gc); priv->last_gc = jiffies; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index f250b5399344a..f100d5ab96762 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -670,7 +670,7 @@ static void nft_rbtree_gc(struct work_struct *work) nft_trans_gc_elem_add(gc, rbe); } - gc = nft_trans_gc_catchall(gc, gc_seq); + gc = nft_trans_gc_catchall_async(gc, gc_seq); try_later: write_seqcount_end(&priv->count); From fd09e77c27efbc816ee89cfe8b48caadb914189e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:50:21 +0100 Subject: [PATCH 75/93] netfilter: nft_set_rbtree: use read spinlock to avoid datapath contention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Pablo Neira Ayuso commit 96b33300fba880ec0eafcf3d82486f3463b4b6da rbtree GC does not modify the datastructure, instead it collects expired elements and it enqueues a GC transaction. Use a read spinlock instead to avoid data contention while GC worker is running. Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 96b33300fba880ec0eafcf3d82486f3463b4b6da) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_set_rbtree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index f100d5ab96762..487572dcd6144 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -622,8 +622,7 @@ static void nft_rbtree_gc(struct work_struct *work) if (!gc) goto done; - write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); + read_lock_bh(&priv->lock); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { /* Ruleset has been updated, try later. */ @@ -673,8 +672,7 @@ static void nft_rbtree_gc(struct work_struct *work) gc = nft_trans_gc_catchall_async(gc, gc_seq); try_later: - write_seqcount_end(&priv->count); - write_unlock_bh(&priv->lock); + read_unlock_bh(&priv->lock); if (gc) nft_trans_gc_queue_async_done(gc); From 7bb17203c3eda715a819b76756ed643c5535e2d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:50:24 +0100 Subject: [PATCH 76/93] netfilter: nft_set_pipapo: stop GC iteration if GC transaction allocation fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Pablo Neira Ayuso commit 6d365eabce3c018a80f6e0379b17df2abb17405e nft_trans_gc_queue_sync() enqueues the GC transaction and it allocates a new one. If this allocation fails, then stop this GC sync run and retry later. 
Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 6d365eabce3c018a80f6e0379b17df2abb17405e) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_set_pipapo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 929edbcbdec47..4e1cc31729b80 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1597,7 +1597,7 @@ static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); if (!gc) - break; + return; nft_pipapo_gc_deactivate(net, set, e); pipapo_drop(m, rulemap); From b5c6baaa7effab16ce01062a5d437522b34e0ba0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:50:26 +0100 Subject: [PATCH 77/93] netfilter: nft_set_hash: try later when GC hits EAGAIN on iteration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Pablo Neira Ayuso commit b079155faae94e9b3ab9337e82100a914ebb4e8d Skip GC run if iterator rewinds to the beginning with EAGAIN, otherwise GC might collect the same element more than once. Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit b079155faae94e9b3ab9337e82100a914ebb4e8d) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_set_hash.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index eca20dc601384..2013de934cef0 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -338,12 +338,9 @@ static void nft_rhash_gc(struct work_struct *work) while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { - if (PTR_ERR(he) != -EAGAIN) { - nft_trans_gc_destroy(gc); - gc = NULL; - goto try_later; - } - continue; + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; } /* Ruleset has been updated, try later. */ From 80b94426191a4377d775ac307a1b3ede639fe344 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:50:32 +0100 Subject: [PATCH 78/93] netfilter: nf_tables: disallow element removal on anonymous sets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Pablo Neira Ayuso commit 23a3bfd4ba7acd36abf52b78605f61b21bdac216 Anonymous sets need to be populated once at creation and then they are bound to rule since 938154b93be8 ("netfilter: nf_tables: reject unbound anonymous set before commit phase"), otherwise transaction reports EINVAL. Userspace does not need to delete elements of anonymous sets that are not yet bound, reject this with EOPNOTSUPP. From flush command path, skip anonymous sets, they are expected to be bound already. Otherwise, EINVAL is hit at the end of this transaction for unbound sets. 
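The anonymous-set check used below is assumed to be the usual flag test:

/* Anonymous sets are created and populated in one transaction and then
 * bound to a rule; userspace never removes individual elements from
 * them, so rejecting such requests early is safe.
 */
static inline bool nft_set_is_anonymous(const struct nft_set *set)
{
        return set->flags & NFT_SET_ANONYMOUS;
}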
Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 23a3bfd4ba7acd36abf52b78605f61b21bdac216) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index cccc22f66b3aa..dd0794230442d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1385,8 +1385,7 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, set)) continue; - if (nft_set_is_anonymous(set) && - !list_empty(&set->bindings)) + if (nft_set_is_anonymous(set)) continue; err = nft_delset(ctx, set); @@ -6909,8 +6908,10 @@ static int nf_tables_delsetelem(struct sk_buff *skb, if (IS_ERR(set)) return PTR_ERR(set); - if (!list_empty(&set->bindings) && - (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS))) + if (nft_set_is_anonymous(set)) + return -EOPNOTSUPP; + + if (!list_empty(&set->bindings) && (set->flags & NFT_SET_CONSTANT)) return -EBUSY; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); From 29b8b9a29b5264dce370dd93c349d0c4ab56a51d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:50:35 +0100 Subject: [PATCH 79/93] netfilter: nf_tables: disable toggling dormant table state more than once MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Florian Westphal commit c9bd26513b3a11b3adb3c2ed8a31a01a87173ff1 nft -f -< Cc: Bing-Jhong Billy Jheng Cc: info@starlabs.sg Signed-off-by: Florian Westphal (cherry picked from commit c9bd26513b3a11b3adb3c2ed8a31a01a87173ff1) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index dd0794230442d..3e246afc4ec47 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1159,6 +1159,10 @@ static int nf_tables_updtable(struct nft_ctx *ctx) flags & NFT_TABLE_F_OWNER)) return -EOPNOTSUPP; + /* No dormant off/on/off/on games in single transaction */ + if (ctx->table->flags & __NFT_TABLE_F_UPDATE) + return -EINVAL; + trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, sizeof(struct nft_trans_table)); if (trans == NULL) From 25dfeadee5f01f2162ba010e69a5629598472189 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:50:56 +0100 Subject: [PATCH 80/93] netfilter: nf_tables: fix memleak when more than 255 elements expired MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-598 cve CVE-2023-52581 commit-author Florian Westphal commit cf5000a7787cbc10341091d37245a42c119d26c5 When more than 255 elements expired we're supposed to switch to a new gc container structure. This never happens: u8 type will wrap before reaching the boundary and nft_trans_gc_space() always returns true. This means we recycle the initial gc container structure and lose track of the elements that came before. While at it, don't deref 'gc' after we've passed it to call_rcu. 
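The wrap is easiest to see from the batch-space check; a sketch of the relevant pieces as they are assumed to look before this fix:

#define NFT_TRANS_GC_BATCHCOUNT 256

/* With a u8 'count', the counter rolls over from 255 back to 0 on the
 * 256th element, so this difference never reaches zero and the batch is
 * never handed off to a fresh container; widening the counter to u16
 * lets it actually reach NFT_TRANS_GC_BATCHCOUNT.
 */
static int nft_trans_gc_space(struct nft_trans_gc *trans)
{
        return NFT_TRANS_GC_BATCHCOUNT - trans->count;
}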
Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Reported-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal (cherry picked from commit cf5000a7787cbc10341091d37245a42c119d26c5) Signed-off-by: Marcin Wcisło --- include/net/netfilter/nf_tables.h | 2 +- net/netfilter/nf_tables_api.c | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 623ab9f9afb2f..b44263aef4957 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1611,7 +1611,7 @@ struct nft_trans_gc { struct net *net; struct nft_set *set; u32 seq; - u8 count; + u16 count; void *priv[NFT_TRANS_GC_BATCHCOUNT]; struct rcu_head rcu; }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3e246afc4ec47..9dfb00df50dd6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9218,12 +9218,15 @@ static int nft_trans_gc_space(struct nft_trans_gc *trans) struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, unsigned int gc_seq, gfp_t gfp) { + struct nft_set *set; + if (nft_trans_gc_space(gc)) return gc; + set = gc->set; nft_trans_gc_queue_work(gc); - return nft_trans_gc_alloc(gc->set, gc_seq, gfp); + return nft_trans_gc_alloc(set, gc_seq, gfp); } void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) @@ -9238,15 +9241,18 @@ void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp) { + struct nft_set *set; + if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net))) return NULL; if (nft_trans_gc_space(gc)) return gc; + set = gc->set; call_rcu(&gc->rcu, nft_trans_gc_trans_free); - return nft_trans_gc_alloc(gc->set, 0, gfp); + return nft_trans_gc_alloc(set, 0, gfp); } void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) From 66a1d3cb56fd68c88d71d1450c9fe039e5469026 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:51:05 +0100 Subject: [PATCH 81/93] netfilter: nf_tables: nft_set_rbtree: fix spurious insertion failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Florian Westphal commit 087388278e0f301f4c61ddffb1911d3a180f84b8 nft_rbtree_gc_elem() walks back and removes the end interval element that comes before the expired element. There is a small chance that we've cached this element as 'rbe_ge'. If this happens, we hold and test a pointer that has been queued for freeing. It also causes spurious insertion failures: $ cat test-testcases-sets-0044interval_overlap_0.1/testout.log Error: Could not process rule: File exists add element t s { 0 - 2 } ^^^^^^ Failed to insert 0 - 2 given: table ip t { set s { type inet_service flags interval,timeout timeout 2s gc-interval 2s } } The set (rbtree) is empty. The 'failure' doesn't happen on next attempt. Reason is that when we try to insert, the tree may hold an expired element that collides with the range we're adding. While we do evict/erase this element, we can trip over this check: if (rbe_ge && nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new)) return -ENOTEMPTY; rbe_ge was erased by the synchronous gc, we should not have done this check. Next attempt won't find it, so retry results in successful insertion. Restart in-kernel to avoid such spurious errors. 
Such restart are rare, unless userspace intentionally adds very large numbers of elements with very short timeouts while setting a huge gc interval. Even in this case, this cannot loop forever, on each retry an existing element has been removed. As the caller is holding the transaction mutex, its impossible for a second entity to add more expiring elements to the tree. After this it also becomes feasible to remove the async gc worker and perform all garbage collection from the commit path. Fixes: c9e6978e2725 ("netfilter: nft_set_rbtree: Switch to node list walk for overlap detection") Signed-off-by: Florian Westphal (cherry picked from commit 087388278e0f301f4c61ddffb1911d3a180f84b8) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_set_rbtree.c | 46 +++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 487572dcd6144..2660ceab3759d 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -233,10 +233,9 @@ static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, rb_erase(&rbe->node, &priv->root); } -static int nft_rbtree_gc_elem(const struct nft_set *__set, - struct nft_rbtree *priv, - struct nft_rbtree_elem *rbe, - u8 genmask) +static const struct nft_rbtree_elem * +nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe, u8 genmask) { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); @@ -246,7 +245,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); if (!gc) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* search for end interval coming before this element. * end intervals don't carry a timeout extension, they @@ -261,6 +260,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, prev = rb_prev(prev); } + rbe_prev = NULL; if (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); nft_rbtree_gc_remove(net, set, priv, rbe_prev); @@ -272,7 +272,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, */ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); if (WARN_ON_ONCE(!gc)) - return -ENOMEM; + return ERR_PTR(-ENOMEM); nft_trans_gc_elem_add(gc, rbe_prev); } @@ -280,13 +280,13 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, nft_rbtree_gc_remove(net, set, priv, rbe); gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); if (WARN_ON_ONCE(!gc)) - return -ENOMEM; + return ERR_PTR(-ENOMEM); nft_trans_gc_elem_add(gc, rbe); nft_trans_gc_queue_sync_done(gc); - return 0; + return rbe_prev; } static bool nft_rbtree_update_first(const struct nft_set *set, @@ -314,7 +314,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree *priv = nft_set_priv(set); u8 cur_genmask = nft_genmask_cur(net); u8 genmask = nft_genmask_next(net); - int d, err; + int d; /* Descend the tree to search for an existing element greater than the * key value to insert that is greater than the new element. 
This is the @@ -363,9 +363,14 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, */ if (nft_set_elem_expired(&rbe->ext) && nft_set_elem_active(&rbe->ext, cur_genmask)) { - err = nft_rbtree_gc_elem(set, priv, rbe, genmask); - if (err < 0) - return err; + const struct nft_rbtree_elem *removed_end; + + removed_end = nft_rbtree_gc_elem(set, priv, rbe, genmask); + if (IS_ERR(removed_end)) + return PTR_ERR(removed_end); + + if (removed_end == rbe_le || removed_end == rbe_ge) + return -EAGAIN; continue; } @@ -486,11 +491,18 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *rbe = elem->priv; int err; - write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); - err = __nft_rbtree_insert(net, set, rbe, ext); - write_seqcount_end(&priv->count); - write_unlock_bh(&priv->lock); + do { + if (fatal_signal_pending(current)) + return -EINTR; + + cond_resched(); + + write_lock_bh(&priv->lock); + write_seqcount_begin(&priv->count); + err = __nft_rbtree_insert(net, set, rbe, ext); + write_seqcount_end(&priv->count); + write_unlock_bh(&priv->lock); + } while (err == -EAGAIN); return err; } From 1721738075888bd1b8dee5caf83920dfad46b39e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:51:30 +0100 Subject: [PATCH 82/93] netfilter: nf_tables: work around newrule after chain binding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Florian Westphal commit d2fd2e46cb93b7661686d29b08c8b6ae15996712 upstream-diff There is no upstream commit to diff with. Picked directly from RH's branch. See message below for details. JIRA: https://issues.redhat.com/browse/RHEL-1720 JIRA: https://issues.redhat.com/browse/RHEL-1721 Upstream Status: RHEL only RHEL only. Proposed upstream but was rejected. I don't think we can force a rebase of nftables userland in RHEL <= 9.4. Even if we can do this, we would still need this change for z-stream. This change SHOULD NOT be forwarded into versions later than RHEL 9.4. For those releases nftables userspace should be updated to release 1.0.7 or later instead. nftables versions prior to commit 3975430b12d9 ("src: expand table command before evaluation"), i.e. 1.0.6 and earlier, will handle the following snippet in the wrong order: table ip t { chain c { jump { counter; } } } 1. create the table, chain,c and an anon chain. 2. append a rule to chain c to jump to the anon chain. 3. append the rule(s) (here: "counter") to the anon chain. (step 3 should be before 2). With below commit, this is now rejected by the kernel. Reason is that the 'jump {' rule added to chain c adds an explicit binding (dependency), i.e. the kernel will automatically remove the anon chain when userspace later asks to delete the 'jump {' rule from chain c. This caused crashes in the kernel in case of a errors further down in the same transaction. The abort part has to unroll all pending changes, including the request to add the rule 'jump {'. As its already bound, all the rules added to it get deleted as well. Because we tolerated late-add-after-bind, the transaction log also contains the NEWRULE requests (here "counter"), so those were deleted again. Instead of rejecting newrule-to-bound-chain, allow it iff the anon chain is new in this transaction and we are appending. Mark the newrule transaction as already_bound so abort path skips them. 
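A condensed, hedged model of the acceptance test the workaround introduces: only a plain append to a not-yet-active bound chain is tolerated. The chain's "active" state is represented below by a plain boolean standing in for nft_is_active(); the flag constants come from <linux/netlink.h>, so this sketch assumes a Linux build environment.

  #include <linux/netlink.h>
  #include <stdbool.h>
  #include <stdio.h>

  /* 'chain_active' stands in for nft_is_active() on the bound anon chain */
  static bool old_nft_append_allowed(bool chain_active, unsigned int nlmsg_flags)
  {
      if (chain_active)                                   /* chain already in use */
          return false;
      if (nlmsg_flags & (NLM_F_REPLACE | NLM_F_EXCL))     /* nft never asks for these */
          return false;
      if (!(nlmsg_flags & NLM_F_APPEND))                  /* nft only ever appends */
          return false;
      return true;
  }

  int main(void)
  {
      printf("append to freshly bound chain: %d\n",
             old_nft_append_allowed(false, NLM_F_CREATE | NLM_F_APPEND));
      printf("replace on bound chain:        %d\n",
             old_nft_append_allowed(false, NLM_F_REPLACE));
      printf("append to active chain:        %d\n",
             old_nft_append_allowed(true, NLM_F_APPEND));
      return 0;
  }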
Fixes: 0ebc1064e487 ("netfilter: nf_tables: disallow rule addition to bound chain via NFTA_RULE_CHAIN_ID") Reported-by: Timo Sigurdsson Closes: https://lore.kernel.org/netfilter-devel/20230911213750.5B4B663206F5@dd20004.kasserver.com/ Signed-off-by: Florian Westphal Signed-off-by: Florian Westphal (cherry picked from commit d2fd2e46cb93b7661686d29b08c8b6ae15996712) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9dfb00df50dd6..4ff7355bce474 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3551,6 +3551,26 @@ static struct nft_rule *nft_rule_lookup_byid(const struct net *net, const struct nft_chain *chain, const struct nlattr *nla); +/* nft <= 1.0.6 appends rules to anon chains after they have been bound */ +static bool nft_rule_work_around_old_version(const struct nfnl_info *info, + struct nft_chain *chain) +{ + /* bound (anonymous) chain is already used */ + if (nft_is_active(info->net, chain)) + return false; + + /* nft never asks to replace rules here */ + if (info->nlh->nlmsg_flags & (NLM_F_REPLACE | NLM_F_EXCL)) + return false; + + /* nft and it only ever appends. */ + if ((info->nlh->nlmsg_flags & NLM_F_APPEND) == 0) + return false; + + pr_warn_once("enabling workaround for nftables 1.0.6 and older\n"); + return true; +} + #define NFT_RULE_MAXEXPRS 128 static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, @@ -3564,6 +3584,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, struct nft_expr_info *expr_info = NULL; u8 family = info->nfmsg->nfgen_family; struct nft_flow_rule *flow = NULL; + bool add_after_bind = false; struct net *net = info->net; struct nft_userdata *udata; struct nft_table *table; @@ -3603,8 +3624,12 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, return -EINVAL; } - if (nft_chain_is_bound(chain)) - return -EOPNOTSUPP; + if (nft_chain_is_bound(chain)) { + if (!nft_rule_work_around_old_version(info, chain)) + return -EOPNOTSUPP; + + add_after_bind = true; + } if (nla[NFTA_RULE_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE])); @@ -3749,6 +3774,9 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, goto err_destroy_flow_rule; } + if (add_after_bind) + nft_trans_rule_bound(trans) = true; + if (info->nlh->nlmsg_flags & NLM_F_APPEND) { if (old_rule) list_add_rcu(&rule->list, &old_rule->list); From 7c0a2d993e49959a086c422781486567d3efa54c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:55:45 +0100 Subject: [PATCH 83/93] netfilter: nft_set_pipapo: no need to call pipapo_deactivate() from flush MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Pablo Neira Ayuso commit 26cec9d4144eb23c45cd5c033d5c141f04d61a9c Use the element object that is already offered instead. 
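The deactivate call can be dropped because a flush only needs to flip the element's next-generation visibility. A conceptual stand-alone model of that two-generation state is sketched below (a set bit meaning "inactive in that generation"); it is a simplification, not the kernel's actual genmask helpers, but it shows why such an operation can never fail.

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define GEN_CUR  0x1
  #define GEN_NEXT 0x2

  struct elem {
      uint8_t genmask;            /* 0 = visible in both generations */
  };

  static bool elem_active(const struct elem *e, uint8_t gen)
  {
      return !(e->genmask & gen);
  }

  static void elem_change_active(struct elem *e)
  {
      e->genmask ^= GEN_NEXT;     /* the whole effect of a flush-style op */
  }

  int main(void)
  {
      struct elem e = { .genmask = 0 };

      elem_change_active(&e);     /* flushed: disappears in the next generation */
      printf("current gen: %s, next gen: %s\n",
             elem_active(&e, GEN_CUR)  ? "active" : "inactive",
             elem_active(&e, GEN_NEXT) ? "active" : "inactive");

      elem_change_active(&e);     /* transaction aborted: toggle back, no harm done */
      printf("after abort, next gen: %s\n",
             elem_active(&e, GEN_NEXT) ? "active" : "inactive");
      return 0;
  }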
Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 26cec9d4144eb23c45cd5c033d5c141f04d61a9c) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_set_pipapo.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 4e1cc31729b80..63e82a283d8be 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1816,8 +1816,9 @@ static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set, { struct nft_pipapo_elem *e = elem; - return pipapo_deactivate(net, set, (const u8 *)nft_set_ext_key(&e->ext), - &e->ext); + nft_set_elem_change_active(net, set, &e->ext); + + return true; } /** From f21727c8efaf0d3f5eb3f2293c2bec67b323b733 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 21:55:51 +0100 Subject: [PATCH 84/93] netfilter: nf_tables: set backend .flush always succeeds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Pablo Neira Ayuso commit 6509a2e410c3cb36c78a0a85c6102debe171337e .flush is always successful since this results from iterating over the set elements to toggle mark the element as inactive in the next generation. Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 6509a2e410c3cb36c78a0a85c6102debe171337e) Signed-off-by: Marcin Wcisło --- include/net/netfilter/nf_tables.h | 2 +- net/netfilter/nf_tables_api.c | 9 +-------- net/netfilter/nft_set_bitmap.c | 4 +--- net/netfilter/nft_set_hash.c | 7 ++----- net/netfilter/nft_set_pipapo.c | 4 +--- net/netfilter/nft_set_rbtree.c | 4 +--- 6 files changed, 7 insertions(+), 23 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index b44263aef4957..1a70b72c863cc 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -412,7 +412,7 @@ struct nft_set_ops { void * (*deactivate)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); - bool (*flush)(const struct net *net, + void (*flush)(const struct net *net, const struct nft_set *set, void *priv); void (*remove)(const struct net *net, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4ff7355bce474..7341859af6c80 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6834,17 +6834,13 @@ static int nft_setelem_flush(const struct nft_ctx *ctx, struct nft_set_elem *elem) { struct nft_trans *trans; - int err; trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, sizeof(struct nft_trans_elem), GFP_ATOMIC); if (!trans) return -ENOMEM; - if (!set->ops->flush(ctx->net, set, elem->priv)) { - err = -ENOENT; - goto err1; - } + set->ops->flush(ctx->net, set, elem->priv); set->ndeact++; nft_setelem_data_deactivate(ctx->net, set, elem); @@ -6853,9 +6849,6 @@ static int nft_setelem_flush(const struct nft_ctx *ctx, nft_trans_commit_list_add_tail(ctx->net, trans); return 0; -err1: - kfree(trans); - return err; } static int __nft_set_catchall_flush(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 60122539fee67..707677ba3ee60 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -174,7 +174,7 @@ static void nft_bitmap_activate(const struct net *net, nft_set_elem_change_active(net, set, &be->ext); } -static bool nft_bitmap_flush(const struct net *net, +static void nft_bitmap_flush(const struct net *net, const 
struct nft_set *set, void *_be) { struct nft_bitmap *priv = nft_set_priv(set); @@ -186,8 +186,6 @@ static bool nft_bitmap_flush(const struct net *net, /* Enter 10 state, similar to deactivation. */ priv->bitmap[idx] &= ~(genmask << off); nft_set_elem_change_active(net, set, &be->ext); - - return true; } static void *nft_bitmap_deactivate(const struct net *net, diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 2013de934cef0..e758b887ad863 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -192,14 +192,12 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set, nft_set_elem_change_active(net, set, &he->ext); } -static bool nft_rhash_flush(const struct net *net, +static void nft_rhash_flush(const struct net *net, const struct nft_set *set, void *priv) { struct nft_rhash_elem *he = priv; nft_set_elem_change_active(net, set, &he->ext); - - return true; } static void *nft_rhash_deactivate(const struct net *net, @@ -590,13 +588,12 @@ static void nft_hash_activate(const struct net *net, const struct nft_set *set, nft_set_elem_change_active(net, set, &he->ext); } -static bool nft_hash_flush(const struct net *net, +static void nft_hash_flush(const struct net *net, const struct nft_set *set, void *priv) { struct nft_hash_elem *he = priv; nft_set_elem_change_active(net, set, &he->ext); - return true; } static void *nft_hash_deactivate(const struct net *net, diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 63e82a283d8be..562c9da15cdcc 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1811,14 +1811,12 @@ static void *nft_pipapo_deactivate(const struct net *net, * * Return: true if element was found and deactivated. */ -static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set, +static void nft_pipapo_flush(const struct net *net, const struct nft_set *set, void *elem) { struct nft_pipapo_elem *e = elem; nft_set_elem_change_active(net, set, &e->ext); - - return true; } /** diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 2660ceab3759d..6195c87763a8c 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -530,14 +530,12 @@ static void nft_rbtree_activate(const struct net *net, nft_set_elem_change_active(net, set, &rbe->ext); } -static bool nft_rbtree_flush(const struct net *net, +static void nft_rbtree_flush(const struct net *net, const struct nft_set *set, void *priv) { struct nft_rbtree_elem *rbe = priv; nft_set_elem_change_active(net, set, &rbe->ext); - - return true; } static void *nft_rbtree_deactivate(const struct net *net, From a35dbfa4febc6a82578a6b27d5f0aecf521905aa Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 9 Nov 2023 11:17:11 +0100 Subject: [PATCH 85/93] netfilter: nf_tables: expose opaque set element as struct nft_elem_priv MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Pablo Neira Ayuso commit 9dad402b89e81a0516bad5e0ac009b7a0a80898f upstream-diff Context conflict with the cve fix 5d4bb57cb9e7703d390f48a0d8dc69cbd45a5804 (wrong application order). Add placeholder structure and place it at the beginning of each struct nft_*_elem for each existing set backend, instead of exposing elements as void type to the frontend which defeats compiler type checks. Use this pointer to this new type to replace void *. 
This patch updates the following set backend API to use this new struct nft_elem_priv placeholder structure: - update - deactivate - flush - get as well as the following helper functions: - nft_set_elem_ext() - nft_set_elem_init() - nft_set_elem_destroy() - nf_tables_set_elem_destroy() This patch adds nft_elem_priv_cast() to cast struct nft_elem_priv to native element representation from the corresponding set backend. BUILD_BUG_ON() makes sure this .priv placeholder is always at the top of the opaque set element representation. Suggested-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 9dad402b89e81a0516bad5e0ac009b7a0a80898f) Signed-off-by: Marcin Wcisło --- include/net/netfilter/nf_tables.h | 38 ++++++++++----- net/netfilter/nf_tables_api.c | 27 ++++++----- net/netfilter/nft_dynset.c | 23 ++++----- net/netfilter/nft_set_bitmap.c | 35 ++++++++------ net/netfilter/nft_set_hash.c | 80 ++++++++++++++++++------------- net/netfilter/nft_set_pipapo.c | 41 ++++++++++------ net/netfilter/nft_set_pipapo.h | 4 +- net/netfilter/nft_set_rbtree.c | 46 ++++++++++-------- 8 files changed, 173 insertions(+), 121 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 1a70b72c863cc..a4fc4322f3f24 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -243,6 +243,9 @@ struct nft_userdata { unsigned char data[]; }; +/* placeholder structure for opaque set element backend representation. */ +struct nft_elem_priv { }; + /** * struct nft_set_elem - generic representation of set elements * @@ -263,9 +266,14 @@ struct nft_set_elem { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } data; - void *priv; + struct nft_elem_priv *priv; }; +static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv) +{ + return (void *)priv; +} + struct nft_set; struct nft_set_iter { u8 genmask; @@ -393,7 +401,8 @@ struct nft_set_ops { const struct nft_set_ext **ext); bool (*update)(struct nft_set *set, const u32 *key, - void *(*new)(struct nft_set *, + struct nft_elem_priv * + (*new)(struct nft_set *, const struct nft_expr *, struct nft_regs *), const struct nft_expr *expr, @@ -409,19 +418,19 @@ struct nft_set_ops { void (*activate)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); - void * (*deactivate)(const struct net *net, + struct nft_elem_priv * (*deactivate)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); void (*flush)(const struct net *net, const struct nft_set *set, - void *priv); + struct nft_elem_priv *priv); void (*remove)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); void (*walk)(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter); - void * (*get)(const struct net *net, + struct nft_elem_priv * (*get)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags); @@ -759,9 +768,9 @@ static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) } static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, - void *elem) + const struct nft_elem_priv *elem_priv) { - return elem + set->ops->elemsize; + return (void *)elem_priv + set->ops->elemsize; } static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext) @@ -773,16 +782,19 @@ struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx, const struct nft_set *set, const struct nlattr *attr); -void 
*nft_set_elem_init(const struct nft_set *set, - const struct nft_set_ext_tmpl *tmpl, - const u32 *key, const u32 *key_end, const u32 *data, - u64 timeout, u64 expiration, gfp_t gfp); +struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set, + const struct nft_set_ext_tmpl *tmpl, + const u32 *key, const u32 *key_end, + const u32 *data, + u64 timeout, u64 expiration, gfp_t gfp); int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[]); -void nft_set_elem_destroy(const struct nft_set *set, void *elem, +void nft_set_elem_destroy(const struct nft_set *set, + const struct nft_elem_priv *elem_priv, bool destroy_expr); void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, - const struct nft_set *set, void *elem); + const struct nft_set *set, + const struct nft_elem_priv *elem_priv); struct nft_expr_ops; /** diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7341859af6c80..34e2b3a717b57 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -601,7 +601,7 @@ static int nft_mapelem_deactivate(const struct nft_ctx *ctx, struct nft_set_elem_catchall { struct list_head list; struct rcu_head rcu; - void *elem; + struct nft_elem_priv *elem; }; static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, @@ -5910,10 +5910,11 @@ static int nft_set_ext_memcpy(const struct nft_set_ext_tmpl *tmpl, u8 id, return 0; } -void *nft_set_elem_init(const struct nft_set *set, - const struct nft_set_ext_tmpl *tmpl, - const u32 *key, const u32 *key_end, - const u32 *data, u64 timeout, u64 expiration, gfp_t gfp) +struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set, + const struct nft_set_ext_tmpl *tmpl, + const u32 *key, const u32 *key_end, + const u32 *data, + u64 timeout, u64 expiration, gfp_t gfp) { struct nft_set_ext *ext; void *elem; @@ -5978,10 +5979,11 @@ static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, } /* Drop references and destroy. Called from gc, dynset and abort path. */ -void nft_set_elem_destroy(const struct nft_set *set, void *elem, +void nft_set_elem_destroy(const struct nft_set *set, + const struct nft_elem_priv *elem_priv, bool destroy_expr) { - struct nft_set_ext *ext = nft_set_elem_ext(set, elem); + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_ctx ctx = { .net = read_pnet(&set->net), .family = set->table->family, @@ -5992,10 +5994,10 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, nft_data_release(nft_set_ext_data(ext), set->dtype); if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext)); - if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) nft_use_dec(&(*nft_set_ext_obj(ext))->use); - kfree(elem); + + kfree(elem_priv); } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); @@ -6003,14 +6005,15 @@ EXPORT_SYMBOL_GPL(nft_set_elem_destroy); * path via nft_setelem_data_deactivate(). 
*/ void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, - const struct nft_set *set, void *elem) + const struct nft_set *set, + const struct nft_elem_priv *elem_priv) { - struct nft_set_ext *ext = nft_set_elem_ext(set, elem); + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext)); - kfree(elem); + kfree(elem_priv); } int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 490e1a26a9c6a..12a03c25629c1 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -44,33 +44,34 @@ static int nft_dynset_expr_setup(const struct nft_dynset *priv, return 0; } -static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, - struct nft_regs *regs) +static struct nft_elem_priv *nft_dynset_new(struct nft_set *set, + const struct nft_expr *expr, + struct nft_regs *regs) { const struct nft_dynset *priv = nft_expr_priv(expr); struct nft_set_ext *ext; + void *elem_priv; u64 timeout; - void *elem; if (!atomic_add_unless(&set->nelems, 1, set->size)) return NULL; timeout = priv->timeout ? : set->timeout; - elem = nft_set_elem_init(set, &priv->tmpl, - ®s->data[priv->sreg_key], NULL, - ®s->data[priv->sreg_data], - timeout, 0, GFP_ATOMIC); - if (IS_ERR(elem)) + elem_priv = nft_set_elem_init(set, &priv->tmpl, + ®s->data[priv->sreg_key], NULL, + ®s->data[priv->sreg_data], + timeout, 0, GFP_ATOMIC); + if (IS_ERR(elem_priv)) goto err1; - ext = nft_set_elem_ext(set, elem); + ext = nft_set_elem_ext(set, elem_priv); if (priv->num_exprs && nft_dynset_expr_setup(priv, ext) < 0) goto err2; - return elem; + return elem_priv; err2: - nft_set_elem_destroy(set, elem, false); + nft_set_elem_destroy(set, elem_priv, false); err1: if (set->size) atomic_dec(&set->nelems); diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 707677ba3ee60..30213bcfce648 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -13,6 +13,7 @@ #include struct nft_bitmap_elem { + struct nft_elem_priv priv; struct list_head head; struct nft_set_ext ext; }; @@ -104,8 +105,9 @@ nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this, return NULL; } -static void *nft_bitmap_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static struct nft_elem_priv * +nft_bitmap_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) { const struct nft_bitmap *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); @@ -116,7 +118,7 @@ static void *nft_bitmap_get(const struct net *net, const struct nft_set *set, !nft_set_elem_active(&be->ext, genmask)) continue; - return be; + return &be->priv; } return ERR_PTR(-ENOENT); } @@ -125,8 +127,8 @@ static int nft_bitmap_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_set_ext **ext) { + struct nft_bitmap_elem *new = nft_elem_priv_cast(elem->priv), *be; struct nft_bitmap *priv = nft_set_priv(set); - struct nft_bitmap_elem *new = elem->priv, *be; u8 genmask = nft_genmask_next(net); u32 idx, off; @@ -148,8 +150,8 @@ static void nft_bitmap_remove(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { + struct nft_bitmap_elem *be = nft_elem_priv_cast(elem->priv); struct nft_bitmap *priv = nft_set_priv(set); - struct 
nft_bitmap_elem *be = elem->priv; u8 genmask = nft_genmask_next(net); u32 idx, off; @@ -163,8 +165,8 @@ static void nft_bitmap_activate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { + struct nft_bitmap_elem *be = nft_elem_priv_cast(elem->priv); struct nft_bitmap *priv = nft_set_priv(set); - struct nft_bitmap_elem *be = elem->priv; u8 genmask = nft_genmask_next(net); u32 idx, off; @@ -175,11 +177,12 @@ static void nft_bitmap_activate(const struct net *net, } static void nft_bitmap_flush(const struct net *net, - const struct nft_set *set, void *_be) + const struct nft_set *set, + struct nft_elem_priv *elem_priv) { + struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv); struct nft_bitmap *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); - struct nft_bitmap_elem *be = _be; u32 idx, off; nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off); @@ -188,12 +191,12 @@ static void nft_bitmap_flush(const struct net *net, nft_set_elem_change_active(net, set, &be->ext); } -static void *nft_bitmap_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static struct nft_elem_priv * +nft_bitmap_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { + struct nft_bitmap_elem *this = nft_elem_priv_cast(elem->priv), *be; struct nft_bitmap *priv = nft_set_priv(set); - struct nft_bitmap_elem *this = elem->priv, *be; u8 genmask = nft_genmask_next(net); u32 idx, off; @@ -207,7 +210,7 @@ static void *nft_bitmap_deactivate(const struct net *net, priv->bitmap[idx] &= ~(genmask << off); nft_set_elem_change_active(net, set, &be->ext); - return be; + return &be->priv; } static void nft_bitmap_walk(const struct nft_ctx *ctx, @@ -224,7 +227,7 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx, if (!nft_set_elem_active(&be->ext, iter->genmask)) goto cont; - elem.priv = be; + elem.priv = &be->priv; iter->err = iter->fn(ctx, set, iter, &elem); @@ -263,6 +266,8 @@ static int nft_bitmap_init(const struct nft_set *set, { struct nft_bitmap *priv = nft_set_priv(set); + BUILD_BUG_ON(offsetof(struct nft_bitmap_elem, priv) != 0); + INIT_LIST_HEAD(&priv->list); priv->bitmap_size = nft_bitmap_size(set->klen); @@ -276,7 +281,7 @@ static void nft_bitmap_destroy(const struct nft_ctx *ctx, struct nft_bitmap_elem *be, *n; list_for_each_entry_safe(be, n, &priv->list, head) - nf_tables_set_elem_destroy(ctx, set, be); + nf_tables_set_elem_destroy(ctx, set, &be->priv); } static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index e758b887ad863..0691565caa81b 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -27,6 +27,7 @@ struct nft_rhash { }; struct nft_rhash_elem { + struct nft_elem_priv priv; struct rhash_head node; struct nft_set_ext ext; }; @@ -95,8 +96,9 @@ bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, return !!he; } -static void *nft_rhash_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static struct nft_elem_priv * +nft_rhash_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) { struct nft_rhash *priv = nft_set_priv(set); struct nft_rhash_elem *he; @@ -108,13 +110,14 @@ static void *nft_rhash_get(const struct net *net, const struct nft_set *set, he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); if (he 
!= NULL) - return he; + return &he->priv; return ERR_PTR(-ENOENT); } static bool nft_rhash_update(struct nft_set *set, const u32 *key, - void *(*new)(struct nft_set *, + struct nft_elem_priv * + (*new)(struct nft_set *, const struct nft_expr *, struct nft_regs *regs), const struct nft_expr *expr, @@ -123,6 +126,7 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key, { struct nft_rhash *priv = nft_set_priv(set); struct nft_rhash_elem *he, *prev; + struct nft_elem_priv *elem_priv; struct nft_rhash_cmp_arg arg = { .genmask = NFT_GENMASK_ANY, .set = set, @@ -133,10 +137,11 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key, if (he != NULL) goto out; - he = new(set, expr, regs); - if (he == NULL) + elem_priv = new(set, expr, regs); + if (!elem_priv) goto err1; + he = nft_elem_priv_cast(elem_priv); prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, nft_rhash_params); if (IS_ERR(prev)) @@ -144,7 +149,7 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key, /* Another cpu may race to insert the element with the same key */ if (prev) { - nft_set_elem_destroy(set, he, true); + nft_set_elem_destroy(set, &he->priv, true); atomic_dec(&set->nelems); he = prev; } @@ -154,7 +159,7 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key, return true; err2: - nft_set_elem_destroy(set, he, true); + nft_set_elem_destroy(set, &he->priv, true); atomic_dec(&set->nelems); err1: return false; @@ -164,8 +169,8 @@ static int nft_rhash_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_set_ext **ext) { + struct nft_rhash_elem *he = nft_elem_priv_cast(elem->priv); struct nft_rhash *priv = nft_set_priv(set); - struct nft_rhash_elem *he = elem->priv; struct nft_rhash_cmp_arg arg = { .genmask = nft_genmask_next(net), .set = set, @@ -187,22 +192,23 @@ static int nft_rhash_insert(const struct net *net, const struct nft_set *set, static void nft_rhash_activate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { - struct nft_rhash_elem *he = elem->priv; + struct nft_rhash_elem *he = nft_elem_priv_cast(elem->priv); nft_set_elem_change_active(net, set, &he->ext); } static void nft_rhash_flush(const struct net *net, - const struct nft_set *set, void *priv) + const struct nft_set *set, + struct nft_elem_priv *elem_priv) { - struct nft_rhash_elem *he = priv; + struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv); nft_set_elem_change_active(net, set, &he->ext); } -static void *nft_rhash_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static struct nft_elem_priv * +nft_rhash_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { struct nft_rhash *priv = nft_set_priv(set); struct nft_rhash_elem *he; @@ -219,15 +225,15 @@ static void *nft_rhash_deactivate(const struct net *net, rcu_read_unlock(); - return he; + return &he->priv; } static void nft_rhash_remove(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { + struct nft_rhash_elem *he = nft_elem_priv_cast(elem->priv); struct nft_rhash *priv = nft_set_priv(set); - struct nft_rhash_elem *he = elem->priv; rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); } @@ -278,7 +284,7 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, if (!nft_set_elem_active(&he->ext, iter->genmask)) goto cont; - elem.priv = he; + elem.priv = &he->priv; iter->err = iter->fn(ctx, set, 
iter, &elem); if (iter->err < 0) @@ -404,6 +410,8 @@ static int nft_rhash_init(const struct nft_set *set, struct rhashtable_params params = nft_rhash_params; int err; + BUILD_BUG_ON(offsetof(struct nft_rhash_elem, priv) != 0); + params.nelem_hint = desc->size ?: NFT_RHASH_ELEMENT_HINT; params.key_len = set->klen; @@ -426,8 +434,9 @@ struct nft_rhash_ctx { static void nft_rhash_elem_destroy(void *ptr, void *arg) { struct nft_rhash_ctx *rhash_ctx = arg; + struct nft_rhash_elem *he = ptr; - nf_tables_set_elem_destroy(&rhash_ctx->ctx, rhash_ctx->set, ptr); + nf_tables_set_elem_destroy(&rhash_ctx->ctx, rhash_ctx->set, &he->priv); } static void nft_rhash_destroy(const struct nft_ctx *ctx, @@ -474,6 +483,7 @@ struct nft_hash { }; struct nft_hash_elem { + struct nft_elem_priv priv; struct hlist_node node; struct nft_set_ext ext; }; @@ -499,8 +509,9 @@ bool nft_hash_lookup(const struct net *net, const struct nft_set *set, return false; } -static void *nft_hash_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static struct nft_elem_priv * +nft_hash_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) { struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); @@ -512,7 +523,7 @@ static void *nft_hash_get(const struct net *net, const struct nft_set *set, hlist_for_each_entry_rcu(he, &priv->table[hash], node) { if (!memcmp(nft_set_ext_key(&he->ext), elem->key.val.data, set->klen) && nft_set_elem_active(&he->ext, genmask)) - return he; + return &he->priv; } return ERR_PTR(-ENOENT); } @@ -562,7 +573,7 @@ static int nft_hash_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_set_ext **ext) { - struct nft_hash_elem *this = elem->priv, *he; + struct nft_hash_elem *this = nft_elem_priv_cast(elem->priv), *he; struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); u32 hash; @@ -583,25 +594,26 @@ static int nft_hash_insert(const struct net *net, const struct nft_set *set, static void nft_hash_activate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { - struct nft_hash_elem *he = elem->priv; + struct nft_hash_elem *he = nft_elem_priv_cast(elem->priv); nft_set_elem_change_active(net, set, &he->ext); } static void nft_hash_flush(const struct net *net, - const struct nft_set *set, void *priv) + const struct nft_set *set, + struct nft_elem_priv *elem_priv) { - struct nft_hash_elem *he = priv; + struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv); nft_set_elem_change_active(net, set, &he->ext); } -static void *nft_hash_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static struct nft_elem_priv * +nft_hash_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { + struct nft_hash_elem *this = nft_elem_priv_cast(elem->priv), *he; struct nft_hash *priv = nft_set_priv(set); - struct nft_hash_elem *this = elem->priv, *he; u8 genmask = nft_genmask_next(net); u32 hash; @@ -611,7 +623,7 @@ static void *nft_hash_deactivate(const struct net *net, set->klen) && nft_set_elem_active(&he->ext, genmask)) { nft_set_elem_change_active(net, set, &he->ext); - return he; + return &he->priv; } } return NULL; @@ -621,7 +633,7 @@ static void nft_hash_remove(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { - struct nft_hash_elem *he = elem->priv; + struct 
nft_hash_elem *he = nft_elem_priv_cast(elem->priv); hlist_del_rcu(&he->node); } @@ -641,7 +653,7 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set, if (!nft_set_elem_active(&he->ext, iter->genmask)) goto cont; - elem.priv = he; + elem.priv = &he->priv; iter->err = iter->fn(ctx, set, iter, &elem); if (iter->err < 0) @@ -682,7 +694,7 @@ static void nft_hash_destroy(const struct nft_ctx *ctx, for (i = 0; i < priv->buckets; i++) { hlist_for_each_entry_safe(he, next, &priv->table[i], node) { hlist_del_rcu(&he->node); - nf_tables_set_elem_destroy(ctx, set, he); + nf_tables_set_elem_destroy(ctx, set, &he->priv); } } } diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 562c9da15cdcc..c4da1a1c1a1bf 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -599,11 +599,18 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, * @elem: nftables API element representation containing key data * @flags: Unused */ -static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static struct nft_elem_priv * +nft_pipapo_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) { - return pipapo_get(net, set, (const u8 *)elem->key.val.data, - nft_genmask_cur(net)); + static struct nft_pipapo_elem *e; + + e = pipapo_get(net, set, (const u8 *)elem->key.val.data, + nft_genmask_cur(net)); + if (IS_ERR(e)) + return ERR_CAST(e); + + return &e->priv; } /** @@ -1162,10 +1169,10 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const u8 *start = (const u8 *)elem->key.val.data, *end; - struct nft_pipapo_elem *e = elem->priv, *dup; struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m = priv->clone; u8 genmask = nft_genmask_next(net); + struct nft_pipapo_elem *e, *dup; struct nft_pipapo_field *f; const u8 *start_p, *end_p; int i, bsize_max, err = 0; @@ -1263,6 +1270,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, put_cpu_ptr(m->scratch); } + e = nft_elem_priv_cast(elem->priv); *ext2 = &e->ext; pipapo_map(m, rulemap, e); @@ -1542,7 +1550,7 @@ static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, { struct nft_set_elem elem = { - .priv = e, + .priv = &e->priv, }; nft_setelem_data_deactivate(net, set, &elem); @@ -1744,7 +1752,7 @@ static void nft_pipapo_activate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { - struct nft_pipapo_elem *e = elem->priv; + struct nft_pipapo_elem *e = nft_elem_priv_cast(elem->priv); nft_set_elem_change_active(net, set, &e->ext); } @@ -1784,9 +1792,9 @@ static void *pipapo_deactivate(const struct net *net, const struct nft_set *set, * * Return: deactivated element if found, NULL otherwise. */ -static void *nft_pipapo_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static struct nft_elem_priv * +nft_pipapo_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); @@ -1812,9 +1820,9 @@ static void *nft_pipapo_deactivate(const struct net *net, * Return: true if element was found and deactivated. 
*/ static void nft_pipapo_flush(const struct net *net, const struct nft_set *set, - void *elem) + struct nft_elem_priv *elem_priv) { - struct nft_pipapo_elem *e = elem; + struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv); nft_set_elem_change_active(net, set, &e->ext); } @@ -1951,10 +1959,11 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m = priv->clone; - struct nft_pipapo_elem *e = elem->priv; int rules_f0, first_rule = 0; + struct nft_pipapo_elem *e; const u8 *data; + e = nft_elem_priv_cast(elem->priv); data = (const u8 *)nft_set_ext_key(&e->ext); while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { @@ -2044,7 +2053,7 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, if (!nft_set_elem_active(&e->ext, iter->genmask)) goto cont; - elem.priv = e; + elem.priv = &e->priv; iter->err = iter->fn(ctx, set, iter, &elem); if (iter->err < 0) @@ -2118,6 +2127,8 @@ static int nft_pipapo_init(const struct nft_set *set, struct nft_pipapo_field *f; int err, i, field_count; + BUILD_BUG_ON(offsetof(struct nft_pipapo_elem, priv) != 0); + field_count = desc->field_count ? : 1; if (field_count > NFT_PIPAPO_MAX_FIELDS) @@ -2213,7 +2224,7 @@ static void nft_set_pipapo_match_destroy(const struct nft_ctx *ctx, e = f->mt[r].e; - nf_tables_set_elem_destroy(ctx, set, e); + nf_tables_set_elem_destroy(ctx, set, &e->priv); } } diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index 25a75591583eb..2a64b416a2306 100644 --- a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -170,10 +170,12 @@ struct nft_pipapo_elem; /** * struct nft_pipapo_elem - API-facing representation of single set element + * @priv: element placeholder * @ext: nftables API extensions */ struct nft_pipapo_elem { - struct nft_set_ext ext; + struct nft_elem_priv priv; + struct nft_set_ext ext; }; int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 6195c87763a8c..f5040bc557b08 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -23,6 +23,7 @@ struct nft_rbtree { }; struct nft_rbtree_elem { + struct nft_elem_priv priv; struct rb_node node; struct nft_set_ext ext; }; @@ -197,8 +198,9 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, return false; } -static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static struct nft_elem_priv * +nft_rbtree_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) { struct nft_rbtree *priv = nft_set_priv(set); unsigned int seq = read_seqcount_begin(&priv->count); @@ -209,16 +211,17 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); if (ret || !read_seqcount_retry(&priv->count, seq)) - return rbe; + return &rbe->priv; read_lock_bh(&priv->lock); seq = read_seqcount_begin(&priv->count); ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); - if (!ret) - rbe = ERR_PTR(-ENOENT); read_unlock_bh(&priv->lock); - return rbe; + if (!ret) + return ERR_PTR(-ENOENT); + + return &rbe->priv; } static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, @@ -226,7 +229,7 @@ static void nft_rbtree_gc_remove(struct net 
*net, struct nft_set *set, struct nft_rbtree_elem *rbe) { struct nft_set_elem elem = { - .priv = rbe, + .priv = &rbe->priv, }; nft_setelem_data_deactivate(net, set, &elem); @@ -487,8 +490,8 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_set_ext **ext) { + struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv); struct nft_rbtree *priv = nft_set_priv(set); - struct nft_rbtree_elem *rbe = elem->priv; int err; do { @@ -511,8 +514,8 @@ static void nft_rbtree_remove(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { + struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv); struct nft_rbtree *priv = nft_set_priv(set); - struct nft_rbtree_elem *rbe = elem->priv; write_lock_bh(&priv->lock); write_seqcount_begin(&priv->count); @@ -525,26 +528,27 @@ static void nft_rbtree_activate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { - struct nft_rbtree_elem *rbe = elem->priv; + struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv); nft_set_elem_change_active(net, set, &rbe->ext); } static void nft_rbtree_flush(const struct net *net, - const struct nft_set *set, void *priv) + const struct nft_set *set, + struct nft_elem_priv *elem_priv) { - struct nft_rbtree_elem *rbe = priv; + struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); nft_set_elem_change_active(net, set, &rbe->ext); } -static void *nft_rbtree_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static struct nft_elem_priv * +nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { + struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(elem->priv); const struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent = priv->root.rb_node; - struct nft_rbtree_elem *rbe, *this = elem->priv; u8 genmask = nft_genmask_next(net); int d; @@ -570,8 +574,8 @@ static void *nft_rbtree_deactivate(const struct net *net, parent = parent->rb_left; continue; } - nft_rbtree_flush(net, set, rbe); - return rbe; + nft_rbtree_flush(net, set, &rbe->priv); + return &rbe->priv; } } return NULL; @@ -595,7 +599,7 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, if (!nft_set_elem_active(&rbe->ext, iter->genmask)) goto cont; - elem.priv = rbe; + elem.priv = &rbe->priv; iter->err = iter->fn(ctx, set, iter, &elem); if (iter->err < 0) { @@ -703,6 +707,8 @@ static int nft_rbtree_init(const struct nft_set *set, { struct nft_rbtree *priv = nft_set_priv(set); + BUILD_BUG_ON(offsetof(struct nft_rbtree_elem, priv) != 0); + rwlock_init(&priv->lock); seqcount_rwlock_init(&priv->count, &priv->lock); priv->root = RB_ROOT; @@ -727,7 +733,7 @@ static void nft_rbtree_destroy(const struct nft_ctx *ctx, while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); - nf_tables_set_elem_destroy(ctx, set, rbe); + nf_tables_set_elem_destroy(ctx, set, &rbe->priv); } } From 1b3a51cda6b2809e383eae05c9a8ce68666e3ab6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 22:14:07 +0100 Subject: [PATCH 86/93] netfilter: nf_tables: remove catchall element in GC sync path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cve CVE-2023-6111 commit-author Pablo Neira Ayuso commit 93995bf4af2c5a99e2a87f0cd5ce547d31eb7630 The expired catchall element is not deactivated and removed 
from GC sync path. This path holds mutex so just call nft_setelem_data_deactivate() and nft_setelem_catchall_remove() before queueing the GC work. Fixes: 4a9e12ea7e70 ("netfilter: nft_set_pipapo: call nft_trans_gc_queue_sync() in catchall GC") Reported-by: lonial con Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 93995bf4af2c5a99e2a87f0cd5ce547d31eb7630) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 34e2b3a717b57..a5d755a0cf285 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6223,6 +6223,12 @@ static int nft_setelem_deactivate(const struct net *net, return ret; } +static void nft_setelem_catchall_destroy(struct nft_set_elem_catchall *catchall) +{ + list_del_rcu(&catchall->list); + kfree_rcu(catchall, rcu); +} + static void nft_setelem_catchall_remove(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) @@ -6231,8 +6237,7 @@ static void nft_setelem_catchall_remove(const struct net *net, list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { if (catchall->elem == elem->priv) { - list_del_rcu(&catchall->list); - kfree_rcu(catchall, rcu); + nft_setelem_catchall_destroy(catchall); break; } } @@ -9295,11 +9300,12 @@ static struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, unsigned int gc_seq, bool sync) { - struct nft_set_elem_catchall *catchall; + struct nft_set_elem_catchall *catchall, *next; const struct nft_set *set = gc->set; + struct nft_elem_priv *elem_priv; struct nft_set_ext *ext; - list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_expired(ext)) @@ -9317,7 +9323,17 @@ static struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, if (!gc) return NULL; - nft_trans_gc_elem_add(gc, catchall->elem); + elem_priv = catchall->elem; + if (sync) { + struct nft_set_elem elem = { + .priv = elem_priv, + }; + + nft_setelem_data_deactivate(gc->net, gc->set, &elem); + nft_setelem_catchall_destroy(catchall); + } + + nft_trans_gc_elem_add(gc, elem_priv); } return gc; From e774e891da6d616d4a44c9a7b56f26c34ce187ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 22:16:10 +0100 Subject: [PATCH 87/93] netfilter: nf_tables: split async and sync catchall in two functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve CVE-2023-4244 commit-author Pablo Neira Ayuso commit 8837ba3e58ea1e3d09ae36db80b1e80853aada95 list_for_each_entry_safe() does not work for the async case which runs under RCU, therefore, split GC logic for catchall in two functions instead, one for each of the sync and async GC variants. The catchall sync GC variant never sees a _DEAD bit set on ever, thus, this handling is removed in such case, moreover, allocate GC sync batch via GFP_KERNEL. 
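The split matters because the sync variant unlinks and frees catchall entries while walking the list, which requires a "_safe" style iteration, whereas the async variant only marks entries dead under RCU. A stand-alone illustration (plain C list, not the kernel list API) of why the next pointer must be saved before the current node is destroyed:

  #include <stdlib.h>
  #include <stdio.h>

  struct node {
      int expired;
      struct node *next;
  };

  int main(void)
  {
      struct node *head = NULL;

      /* build a small list, every second entry "expired" */
      for (int i = 0; i < 6; i++) {
          struct node *n = malloc(sizeof(*n));
          n->expired = (i % 2 == 0);
          n->next = head;
          head = n;
      }

      /* "_safe"-style walk: fetch next before freeing the current node */
      struct node **pprev = &head;
      struct node *cur = head, *next;

      for (; cur; cur = next) {
          next = cur->next;
          if (cur->expired) {
              *pprev = next;      /* unlink */
              free(cur);          /* without 'next' this walk would be use-after-free */
              continue;
          }
          pprev = &cur->next;
      }

      for (cur = head; cur; cur = cur->next)
          printf("kept node, expired=%d\n", cur->expired);

      for (cur = head; cur; cur = next) {   /* cleanup */
          next = cur->next;
          free(cur);
      }
      return 0;
  }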
Fixes: 93995bf4af2c ("netfilter: nf_tables: remove catchall element in GC sync path") Reported-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 8837ba3e58ea1e3d09ae36db80b1e80853aada95) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 62 +++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a5d755a0cf285..2d16e07e19178 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9296,16 +9296,14 @@ void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) call_rcu(&trans->rcu, nft_trans_gc_trans_free); } -static struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, - unsigned int gc_seq, - bool sync) +struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, + unsigned int gc_seq) { - struct nft_set_elem_catchall *catchall, *next; + struct nft_set_elem_catchall *catchall; const struct nft_set *set = gc->set; - struct nft_elem_priv *elem_priv; struct nft_set_ext *ext; - list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_expired(ext)) @@ -9315,41 +9313,49 @@ static struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, nft_set_elem_dead(ext); dead_elem: - if (sync) - gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); - else - gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + return NULL; + + nft_trans_gc_elem_add(gc, catchall->elem); + } + + return gc; +} +struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc) +{ + struct nft_set_elem_catchall *catchall, *next; + const struct nft_set *set = gc->set; + struct nft_elem_priv *elem_priv; + struct nft_set_elem elem; + struct nft_set_ext *ext; + + WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net)); + + list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + + if (!nft_set_elem_expired(ext)) + continue; + + gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); if (!gc) return NULL; elem_priv = catchall->elem; - if (sync) { - struct nft_set_elem elem = { - .priv = elem_priv, - }; - nft_setelem_data_deactivate(gc->net, gc->set, &elem); - nft_setelem_catchall_destroy(catchall); - } + memset(&elem, 0, sizeof(elem)); + elem.priv = catchall->elem; + nft_setelem_data_deactivate(gc->net, gc->set, &elem); + nft_setelem_catchall_destroy(catchall); nft_trans_gc_elem_add(gc, elem_priv); } return gc; } -struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, - unsigned int gc_seq) -{ - return nft_trans_gc_catchall(gc, gc_seq, false); -} - -struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc) -{ - return nft_trans_gc_catchall(gc, 0, true); -} - static void nf_tables_module_autoload_cleanup(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); From d9e0b4869cf12e5a6064289608e85612eb3f9423 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 22:20:22 +0100 Subject: [PATCH 88/93] netfilter, bpf: Adjust timeouts of non-confirmed CTs in bpf_ct_insert_entry() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-430 cve-bf CVE-2023-4244 commit-author Ilya Leoshkevich commit 837723b22a63cfbff584655b009b9d488d0e9087 bpf_nf 
testcase fails on s390x: bpf_skb_ct_lookup() cannot find the entry that was added by bpf_ct_insert_entry() within the same BPF function. The reason is that this entry is deleted by nf_ct_gc_expired(). The CT timeout starts ticking after the CT confirmation; therefore nf_conn.timeout is initially set to the timeout value, and __nf_conntrack_confirm() sets it to the deadline value. bpf_ct_insert_entry() sets IPS_CONFIRMED_BIT, but does not adjust the timeout, making its value meaningless and causing false positives. Fix the problem by making bpf_ct_insert_entry() adjust the timeout, like __nf_conntrack_confirm(). Fixes: 2cdaa3eefed8 ("netfilter: conntrack: restore IPS_CONFIRMED out of nf_conntrack_hash_check_insert()") Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Cc: Florian Westphal Link: https://lore.kernel.org/bpf/20230830011128.1415752-3-iii@linux.ibm.com Signed-off-by: Alexei Starovoitov (cherry picked from commit 837723b22a63cfbff584655b009b9d488d0e9087) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_conntrack_bpf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 2bc6b9bfbf95e..c4108e3ce637b 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -339,6 +339,8 @@ struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i) struct nf_conn *nfct = (struct nf_conn *)nfct_i; int err; + if (!nf_ct_is_confirmed(nfct)) + nfct->timeout += nfct_time_stamp; nfct->status |= IPS_CONFIRMED; err = nf_conntrack_hash_check_insert(nfct); if (err < 0) { From b301e6dcf14dc3496a3ee4c474d2ce48a9a8b3c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 22:21:21 +0100 Subject: [PATCH 89/93] netfilter: ipset: Fix race between IPSET_CMD_CREATE and IPSET_CMD_SWAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-34700 cve CVE-2023-42756 commit-author Jozsef Kadlecsik commit 7433b6d2afd512d04398c73aa984d1e285be125b Kyle Zeng reported that there is a race between IPSET_CMD_ADD and IPSET_CMD_SWAP in netfilter/ip_set, which can lead to the invocation of `__ip_set_put` on a wrong `set`, triggering the `BUG_ON(set->ref == 0);` check in it. The race is caused by using the wrong reference counter, i.e. the ref counter instead of ref_netlink. 
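Roughly, the ref counter follows the set through IPSET_CMD_SWAP, whereas a held ref_netlink makes SWAP bail out as busy; pinning the set with ref_netlink across the window where the nfnl mutex is dropped therefore keeps SWAP out of that window entirely. A small userspace model of that relationship (illustrative only; just the two field names mirror ip_set, everything else is made up):

/* Userspace model of the two ipset reference counters (illustrative). */
#include <assert.h>

struct model_set {
	int ref;          /* follows the set through a swap */
	int ref_netlink;  /* a held netlink reference makes swap refuse */
};

/* Simplified SWAP: only allowed when no netlink reference is held. */
static int swap_sets(struct model_set *a, struct model_set *b)
{
	if (a->ref_netlink || b->ref_netlink)
		return -1;	/* -EBUSY in the kernel */
	int tmp = a->ref;
	a->ref = b->ref;
	b->ref = tmp;
	return 0;
}

int main(void)
{
	struct model_set s0 = { .ref = 0 }, s1 = { .ref = 1 };

	/* Holding ref_netlink across the unlocked window keeps SWAP out,
	 * so the later put cannot land on a set whose ref is already 0. */
	s1.ref_netlink++;
	assert(swap_sets(&s0, &s1) != 0);
	s1.ref_netlink--;
	return 0;
}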
Fixes: 24e227896bbf ("netfilter: ipset: Add schedule point in call_ad().") Reported-by: Kyle Zeng Closes: https://lore.kernel.org/netfilter-devel/ZPZqetxOmH+w%2Fmyc@westworld/#r Tested-by: Kyle Zeng Signed-off-by: Jozsef Kadlecsik Signed-off-by: Florian Westphal (cherry picked from commit 7433b6d2afd512d04398c73aa984d1e285be125b) Signed-off-by: Marcin Wcisło --- net/netfilter/ipset/ip_set_core.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index b225e2752169f..a3e380050ebe7 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -684,6 +684,14 @@ __ip_set_put(struct ip_set *set) /* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need * a separate reference counter */ +static void +__ip_set_get_netlink(struct ip_set *set) +{ + write_lock_bh(&ip_set_ref_lock); + set->ref_netlink++; + write_unlock_bh(&ip_set_ref_lock); +} + static void __ip_set_put_netlink(struct ip_set *set) { @@ -1699,11 +1707,11 @@ call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, do { if (retried) { - __ip_set_get(set); + __ip_set_get_netlink(set); nfnl_unlock(NFNL_SUBSYS_IPSET); cond_resched(); nfnl_lock(NFNL_SUBSYS_IPSET); - __ip_set_put(set); + __ip_set_put_netlink(set); } ip_set_lock(set); From 45724b2b5e9272095883e35432b8b0c56913fd9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 22:22:26 +0100 Subject: [PATCH 90/93] netfilter: nft_set_rbtree: skip end interval element from gc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-8184 cve CVE-2024-26581 commit-author Pablo Neira Ayuso commit 60c0c230c6f046da536d3df8b39a20b9a9fd6af0 rbtree lazy gc on insert might collect an end interval element that has been just added in this transactions, skip end interval elements that are not yet active. 
Fixes: f718863aca46 ("netfilter: nft_set_rbtree: fix overlap expiration walk") Cc: stable@vger.kernel.org Reported-by: lonial con Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 60c0c230c6f046da536d3df8b39a20b9a9fd6af0) Signed-off-by: Marcin Wcisło --- net/netfilter/nft_set_rbtree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index f5040bc557b08..7567a96b3304a 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -238,7 +238,7 @@ static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, static const struct nft_rbtree_elem * nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, - struct nft_rbtree_elem *rbe, u8 genmask) + struct nft_rbtree_elem *rbe) { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); @@ -257,7 +257,7 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, while (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); if (nft_rbtree_interval_end(rbe_prev) && - nft_set_elem_active(&rbe_prev->ext, genmask)) + nft_set_elem_active(&rbe_prev->ext, NFT_GENMASK_ANY)) break; prev = rb_prev(prev); @@ -368,7 +368,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, nft_set_elem_active(&rbe->ext, cur_genmask)) { const struct nft_rbtree_elem *removed_end; - removed_end = nft_rbtree_gc_elem(set, priv, rbe, genmask); + removed_end = nft_rbtree_gc_elem(set, priv, rbe); if (IS_ERR(removed_end)) return PTR_ERR(removed_end); From 46a1bbbb6461e7534d2bac31657c9e6c02430f87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Thu, 6 Nov 2025 21:31:36 +0100 Subject: [PATCH 91/93] netfilter: nf_tables: mark set as dead when unbinding anonymous set with timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-836 cve CVE-2024-26643 commit-author Pablo Neira Ayuso commit 552705a3650bbf46a22b1adedc1b04181490fc36 While the rhashtable set gc runs asynchronously, a race allows it to collect elements from anonymous sets with timeouts while it is being released from the commit path. Mingi Cho originally reported this issue in a different path in 6.1.x with a pipapo set with low timeouts which is not possible upstream since 7395dfacfff6 ("netfilter: nf_tables: use timestamp to check for set element timeout"). Fix this by setting on the dead flag for anonymous sets to skip async gc in this case. According to 08e4c8c5919f ("netfilter: nf_tables: mark newset as dead on transaction abort"), Florian plans to accelerate abort path by releasing objects via workqueue, therefore, this sets on the dead flag for abort path too. 
Cc: stable@vger.kernel.org Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Reported-by: Mingi Cho Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 552705a3650bbf46a22b1adedc1b04181490fc36) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2d16e07e19178..a1ca8653c2050 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5105,6 +5105,7 @@ static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) { list_del_rcu(&set->list); + set->dead = 1; if (event) nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_KERNEL); From 6e190e59791c4ce28c0bc0bae5653b5709bb37b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 22:24:33 +0100 Subject: [PATCH 92/93] netfilter: nf_tables: release batch on table validation from abort path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-4906 cve-pre CVE-2024-26925 commit-author Pablo Neira Ayuso commit a45e6889575c2067d3c0212b6bc1022891e65b91 Unlike early commit path stage which triggers a call to abort, an explicit release of the batch is required on abort, otherwise mutex is released and commit_list remains in place. Add WARN_ON_ONCE to ensure commit_list is empty from the abort path before releasing the mutex. After this patch, commit_list is always assumed to be empty before grabbing the mutex, therefore 03c1f1ef1584 ("netfilter: Cleanup nft_net->module_list from nf_tables_exit_net()") only needs to release the pending modules for registration. 
Cc: stable@vger.kernel.org Fixes: c0391b6ab810 ("netfilter: nf_tables: missing validation from the abort path") Signed-off-by: Pablo Neira Ayuso (cherry picked from commit a45e6889575c2067d3c0212b6bc1022891e65b91) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a1ca8653c2050..7a9559b993d65 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9888,10 +9888,11 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) struct nft_trans *trans, *next; LIST_HEAD(set_update_list); struct nft_trans_elem *te; + int err = 0; if (action == NFNL_ABORT_VALIDATE && nf_tables_validate(net) < 0) - return -EAGAIN; + err = -EAGAIN; list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list, list) { @@ -10073,7 +10074,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) else nf_tables_module_autoload_cleanup(net); - return 0; + return err; } static int nf_tables_abort(struct net *net, struct sk_buff *skb, @@ -10086,6 +10087,9 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, gc_seq = nft_gc_seq_begin(nft_net); ret = __nf_tables_abort(net, action); nft_gc_seq_end(nft_net, gc_seq); + + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + mutex_unlock(&nft_net->commit_mutex); return ret; @@ -10883,9 +10887,10 @@ static void __net_exit nf_tables_exit_net(struct net *net) gc_seq = nft_gc_seq_begin(nft_net); - if (!list_empty(&nft_net->commit_list) || - !list_empty(&nft_net->module_list)) - __nf_tables_abort(net, NFNL_ABORT_NONE); + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + + if (!list_empty(&nft_net->module_list)) + nf_tables_module_autoload_cleanup(net); __nft_release_tables(net); From 520390b3b9df5884c60e883e1f4faaf2af9cd9c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 4 Nov 2025 22:25:48 +0100 Subject: [PATCH 93/93] netfilter: nf_tables: release mutex after nft_gc_seq_end from abort path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-4906 cve CVE-2024-26925 commit-author Pablo Neira Ayuso commit 0d459e2ffb541841714839e8228b845458ed3b27 The commit mutex should not be released during the critical section between nft_gc_seq_begin() and nft_gc_seq_end(), otherwise, async GC worker could collect expired objects and get the released commit lock within the same GC sequence. nf_tables_module_autoload() temporarily releases the mutex to load module dependencies, then it goes back to replay the transaction again. Move it at the end of the abort phase after nft_gc_seq_end() is called. 
Cc: stable@vger.kernel.org Fixes: 720344340fb9 ("netfilter: nf_tables: GC transaction race with abort path") Reported-by: Kuan-Ting Chen Signed-off-by: Pablo Neira Ayuso (cherry picked from commit 0d459e2ffb541841714839e8228b845458ed3b27) Signed-off-by: Marcin Wcisło --- net/netfilter/nf_tables_api.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7a9559b993d65..0b97c0a744227 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10069,11 +10069,6 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nf_tables_abort_release(trans); } - if (action == NFNL_ABORT_AUTOLOAD) - nf_tables_module_autoload(net); - else - nf_tables_module_autoload_cleanup(net); - return err; } @@ -10090,6 +10085,14 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + /* module autoload needs to happen after GC sequence update because it + * temporarily releases and grabs mutex again. + */ + if (action == NFNL_ABORT_AUTOLOAD) + nf_tables_module_autoload(net); + else + nf_tables_module_autoload_cleanup(net); + mutex_unlock(&nft_net->commit_mutex); return ret;