From 226b67bc515b1a02583934084388a79c2f7a00ba Mon Sep 17 00:00:00 2001
From: Brett Mastbergen <bmastbergen@ciq.com>
Date: Thu, 18 Sep 2025 14:56:16 -0400
Subject: [PATCH 1/8] bpf, test_run: Fix alignment problem in
 bpf_prog_test_run_skb()

jira VULN-66054
cve CVE-2022-49840
commit-author Baisong Zhong <zhongbaisong@huawei.com>
commit d3fd203f36d46aa29600a72d57a1b61af80e4a25

We got a syzkaller problem because of aarch64 alignment fault
if KFENCE enabled. When the size from user bpf program is an odd
number, like 399, 407, etc, it will cause the struct skb_shared_info's
unaligned access. As seen below:

  BUG: KFENCE: use-after-free read in __skb_clone+0x23c/0x2a0 net/core/skbuff.c:1032

  Use-after-free read at 0xffff6254fffac077 (in kfence-#213):
   __lse_atomic_add arch/arm64/include/asm/atomic_lse.h:26 [inline]
   arch_atomic_add arch/arm64/include/asm/atomic.h:28 [inline]
   arch_atomic_inc include/linux/atomic-arch-fallback.h:270 [inline]
   atomic_inc include/asm-generic/atomic-instrumented.h:241 [inline]
   __skb_clone+0x23c/0x2a0 net/core/skbuff.c:1032
   skb_clone+0xf4/0x214 net/core/skbuff.c:1481
   ____bpf_clone_redirect net/core/filter.c:2433 [inline]
   bpf_clone_redirect+0x78/0x1c0 net/core/filter.c:2420
   bpf_prog_d3839dd9068ceb51+0x80/0x330
   bpf_dispatcher_nop_func include/linux/bpf.h:728 [inline]
   bpf_test_run+0x3c0/0x6c0 net/bpf/test_run.c:53
   bpf_prog_test_run_skb+0x638/0xa7c net/bpf/test_run.c:594
   bpf_prog_test_run kernel/bpf/syscall.c:3148 [inline]
   __do_sys_bpf kernel/bpf/syscall.c:4441 [inline]
   __se_sys_bpf+0xad0/0x1634 kernel/bpf/syscall.c:4381

  kfence-#213: 0xffff6254fffac000-0xffff6254fffac196, size=407, cache=kmalloc-512

  allocated by task 15074 on cpu 0 at 1342.585390s:
   kmalloc include/linux/slab.h:568 [inline]
   kzalloc include/linux/slab.h:675 [inline]
   bpf_test_init.isra.0+0xac/0x290 net/bpf/test_run.c:191
   bpf_prog_test_run_skb+0x11c/0xa7c net/bpf/test_run.c:512
   bpf_prog_test_run kernel/bpf/syscall.c:3148 [inline]
   __do_sys_bpf kernel/bpf/syscall.c:4441 [inline]
   __se_sys_bpf+0xad0/0x1634 kernel/bpf/syscall.c:4381
   __arm64_sys_bpf+0x50/0x60 kernel/bpf/syscall.c:4381

To fix the problem, we adjust @size so that (@size + @hearoom) is a
multiple of SMP_CACHE_BYTES. So we make sure the struct skb_shared_info
is aligned to a cache line.

Fixes: 1cf1cae963c2 ("bpf: introduce BPF_PROG_TEST_RUN command")
	Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
	Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
	Cc: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/bpf/20221102081620.1465154-1-zhongbaisong@huawei.com
(cherry picked from commit d3fd203f36d46aa29600a72d57a1b61af80e4a25)
	Signed-off-by: Brett Mastbergen <bmastbergen@ciq.com>
---
 net/bpf/test_run.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index e4133baf5f72c..5d53332ea3c9d 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -733,6 +733,7 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size,
 	if (user_size > size)
 		return ERR_PTR(-EMSGSIZE);
 
+	size = SKB_DATA_ALIGN(size);
 	data = kzalloc(size + headroom + tailroom, GFP_USER);
 	if (!data)
 		return ERR_PTR(-ENOMEM);

From b5e1afa49ee872e28a641672e173d29cc4224465 Mon Sep 17 00:00:00 2001
From: Brett Mastbergen <bmastbergen@ciq.com>
Date: Thu, 18 Sep 2025 14:56:54 -0400
Subject: [PATCH 2/8] bpf, test_run: Fix use-after-free issue in
 eth_skb_pkt_type()

jira VULN-55840
cve CVE-2025-21867
commit-author Shigeru Yoshida <syoshida@redhat.com>
commit 6b3d638ca897e099fa99bd6d02189d3176f80a47

KMSAN reported a use-after-free issue in eth_skb_pkt_type()[1]. The
cause of the issue was that eth_skb_pkt_type() accessed skb's data
that didn't contain an Ethernet header. This occurs when
bpf_prog_test_run_xdp() passes an invalid value as the user_data
argument to bpf_test_init().

Fix this by returning an error when user_data is less than ETH_HLEN in
bpf_test_init(). Additionally, remove the check for "if (user_size >
size)" as it is unnecessary.

[1]
BUG: KMSAN: use-after-free in eth_skb_pkt_type include/linux/etherdevice.h:627 [inline]
BUG: KMSAN: use-after-free in eth_type_trans+0x4ee/0x980 net/ethernet/eth.c:165
 eth_skb_pkt_type include/linux/etherdevice.h:627 [inline]
 eth_type_trans+0x4ee/0x980 net/ethernet/eth.c:165
 __xdp_build_skb_from_frame+0x5a8/0xa50 net/core/xdp.c:635
 xdp_recv_frames net/bpf/test_run.c:272 [inline]
 xdp_test_run_batch net/bpf/test_run.c:361 [inline]
 bpf_test_run_xdp_live+0x2954/0x3330 net/bpf/test_run.c:390
 bpf_prog_test_run_xdp+0x148e/0x1b10 net/bpf/test_run.c:1318
 bpf_prog_test_run+0x5b7/0xa30 kernel/bpf/syscall.c:4371
 __sys_bpf+0x6a6/0xe20 kernel/bpf/syscall.c:5777
 __do_sys_bpf kernel/bpf/syscall.c:5866 [inline]
 __se_sys_bpf kernel/bpf/syscall.c:5864 [inline]
 __x64_sys_bpf+0xa4/0xf0 kernel/bpf/syscall.c:5864
 x64_sys_call+0x2ea0/0x3d90 arch/x86/include/generated/asm/syscalls_64.h:322
 do_syscall_x64 arch/x86/entry/common.c:52 [inline]
 do_syscall_64+0xd9/0x1d0 arch/x86/entry/common.c:83
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

Uninit was created at:
 free_pages_prepare mm/page_alloc.c:1056 [inline]
 free_unref_page+0x156/0x1320 mm/page_alloc.c:2657
 __free_pages+0xa3/0x1b0 mm/page_alloc.c:4838
 bpf_ringbuf_free kernel/bpf/ringbuf.c:226 [inline]
 ringbuf_map_free+0xff/0x1e0 kernel/bpf/ringbuf.c:235
 bpf_map_free kernel/bpf/syscall.c:838 [inline]
 bpf_map_free_deferred+0x17c/0x310 kernel/bpf/syscall.c:862
 process_one_work kernel/workqueue.c:3229 [inline]
 process_scheduled_works+0xa2b/0x1b60 kernel/workqueue.c:3310
 worker_thread+0xedf/0x1550 kernel/workqueue.c:3391
 kthread+0x535/0x6b0 kernel/kthread.c:389
 ret_from_fork+0x6e/0x90 arch/x86/kernel/process.c:147
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244

CPU: 1 UID: 0 PID: 17276 Comm: syz.1.16450 Not tainted 6.12.0-05490-g9bb88c659673 #8
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-3.fc41 04/01/2014

Fixes: be3d72a2896c ("bpf: move user_size out of bpf_test_init")
	Reported-by: syzkaller <syzkaller@googlegroups.com>
	Suggested-by: Martin KaFai Lau <martin.lau@linux.dev>
	Signed-off-by: Shigeru Yoshida <syoshida@redhat.com>
	Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
	Acked-by: Stanislav Fomichev <sdf@fomichev.me>
	Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://patch.msgid.link/20250121150643.671650-1-syoshida@redhat.com
	Signed-off-by: Alexei Starovoitov <ast@kernel.org>
(cherry picked from commit 6b3d638ca897e099fa99bd6d02189d3176f80a47)
	Signed-off-by: Brett Mastbergen <bmastbergen@ciq.com>
---
 net/bpf/test_run.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 5d53332ea3c9d..c337034ab9a30 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -727,12 +727,9 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size,
 	void __user *data_in = u64_to_user_ptr(kattr->test.data_in);
 	void *data;
 
-	if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom)
+	if (user_size < ETH_HLEN || user_size > PAGE_SIZE - headroom - tailroom)
 		return ERR_PTR(-EINVAL);
 
-	if (user_size > size)
-		return ERR_PTR(-EMSGSIZE);
-
 	size = SKB_DATA_ALIGN(size);
 	data = kzalloc(size + headroom + tailroom, GFP_USER);
 	if (!data)

From ec99fcb7a4489409b5a9eb892c2187f60e578032 Mon Sep 17 00:00:00 2001
From: Brett Mastbergen <bmastbergen@ciq.com>
Date: Thu, 18 Sep 2025 15:12:09 -0400
Subject: [PATCH 3/8] RDMA/iwcm: Fix a use-after-free related to destroying CM
 IDs

jira VULN-38769
cve CVE-2024-42285
commit-author Bart Van Assche <bvanassche@acm.org>
commit aee2424246f9f1dadc33faa78990c1e2eb7826e4

iw_conn_req_handler() associates a new struct rdma_id_private (conn_id) with
an existing struct iw_cm_id (cm_id) as follows:

        conn_id->cm_id.iw = cm_id;
        cm_id->context = conn_id;
        cm_id->cm_handler = cma_iw_handler;

rdma_destroy_id() frees both the cm_id and the struct rdma_id_private. Make
sure that cm_work_handler() does not trigger a use-after-free by only
freeing of the struct rdma_id_private after all pending work has finished.

	Cc: stable@vger.kernel.org
Fixes: 59c68ac31e15 ("iw_cm: free cm_id resources on the last deref")
	Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
	Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
	Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20240605145117.397751-6-bvanassche@acm.org
	Signed-off-by: Leon Romanovsky <leon@kernel.org>
(cherry picked from commit aee2424246f9f1dadc33faa78990c1e2eb7826e4)
	Signed-off-by: Brett Mastbergen <bmastbergen@ciq.com>
---
 drivers/infiniband/core/iwcm.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index 2b47073c61a65..2d09d1be38f19 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -369,8 +369,10 @@ EXPORT_SYMBOL(iw_cm_disconnect);
  *
  * Clean up all resources associated with the connection and release
  * the initial reference taken by iw_create_cm_id.
+ *
+ * Returns true if and only if the last cm_id_priv reference has been dropped.
  */
-static void destroy_cm_id(struct iw_cm_id *cm_id)
+static bool destroy_cm_id(struct iw_cm_id *cm_id)
 {
 	struct iwcm_id_private *cm_id_priv;
 	struct ib_qp *qp;
@@ -440,7 +442,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
 		iwpm_remove_mapping(&cm_id->local_addr, RDMA_NL_IWCM);
 	}
 
-	(void)iwcm_deref_id(cm_id_priv);
+	return iwcm_deref_id(cm_id_priv);
 }
 
 /*
@@ -451,7 +453,8 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
  */
 void iw_destroy_cm_id(struct iw_cm_id *cm_id)
 {
-	destroy_cm_id(cm_id);
+	if (!destroy_cm_id(cm_id))
+		flush_workqueue(iwcm_wq);
 }
 EXPORT_SYMBOL(iw_destroy_cm_id);
 
@@ -1035,7 +1038,7 @@ static void cm_work_handler(struct work_struct *_work)
 		if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
 			ret = process_event(cm_id_priv, &levent);
 			if (ret)
-				destroy_cm_id(&cm_id_priv->id);
+				WARN_ON_ONCE(destroy_cm_id(&cm_id_priv->id));
 		} else
 			pr_debug("dropping event %d\n", levent.event);
 		if (iwcm_deref_id(cm_id_priv))

From b1db60a19be3a29311079a171432e7350055507a Mon Sep 17 00:00:00 2001
From: Brett Mastbergen <bmastbergen@ciq.com>
Date: Thu, 18 Sep 2025 15:13:01 -0400
Subject: [PATCH 4/8] RDMA/iwcm: Fix use-after-free of work objects after cm_id
 destruction

jira VULN-72085
cve CVE-2025-38211
commit-author Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
commit 6883b680e703c6b2efddb4e7a8d891ce1803d06b

The commit 59c68ac31e15 ("iw_cm: free cm_id resources on the last
deref") simplified cm_id resource management by freeing cm_id once all
references to the cm_id were removed. The references are removed either
upon completion of iw_cm event handlers or when the application destroys
the cm_id. This commit introduced the use-after-free condition where
cm_id_private object could still be in use by event handler works during
the destruction of cm_id. The commit aee2424246f9 ("RDMA/iwcm: Fix a
use-after-free related to destroying CM IDs") addressed this use-after-
free by flushing all pending works at the cm_id destruction.

However, still another use-after-free possibility remained. It happens
with the work objects allocated for each cm_id_priv within
alloc_work_entries() during cm_id creation, and subsequently freed in
dealloc_work_entries() once all references to the cm_id are removed.
If the cm_id's last reference is decremented in the event handler work,
the work object for the work itself gets removed, and causes the use-
after-free BUG below:

  BUG: KASAN: slab-use-after-free in __pwq_activate_work+0x1ff/0x250
  Read of size 8 at addr ffff88811f9cf800 by task kworker/u16:1/147091

  CPU: 2 UID: 0 PID: 147091 Comm: kworker/u16:1 Not tainted 6.15.0-rc2+ #27 PREEMPT(voluntary)
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-3.fc41 04/01/2014
  Workqueue:  0x0 (iw_cm_wq)
  Call Trace:
   <TASK>
   dump_stack_lvl+0x6a/0x90
   print_report+0x174/0x554
   ? __virt_addr_valid+0x208/0x430
   ? __pwq_activate_work+0x1ff/0x250
   kasan_report+0xae/0x170
   ? __pwq_activate_work+0x1ff/0x250
   __pwq_activate_work+0x1ff/0x250
   pwq_dec_nr_in_flight+0x8c5/0xfb0
   process_one_work+0xc11/0x1460
   ? __pfx_process_one_work+0x10/0x10
   ? assign_work+0x16c/0x240
   worker_thread+0x5ef/0xfd0
   ? __pfx_worker_thread+0x10/0x10
   kthread+0x3b0/0x770
   ? __pfx_kthread+0x10/0x10
   ? rcu_is_watching+0x11/0xb0
   ? _raw_spin_unlock_irq+0x24/0x50
   ? rcu_is_watching+0x11/0xb0
   ? __pfx_kthread+0x10/0x10
   ret_from_fork+0x30/0x70
   ? __pfx_kthread+0x10/0x10
   ret_from_fork_asm+0x1a/0x30
   </TASK>

  Allocated by task 147416:
   kasan_save_stack+0x2c/0x50
   kasan_save_track+0x10/0x30
   __kasan_kmalloc+0xa6/0xb0
   alloc_work_entries+0xa9/0x260 [iw_cm]
   iw_cm_connect+0x23/0x4a0 [iw_cm]
   rdma_connect_locked+0xbfd/0x1920 [rdma_cm]
   nvme_rdma_cm_handler+0x8e5/0x1b60 [nvme_rdma]
   cma_cm_event_handler+0xae/0x320 [rdma_cm]
   cma_work_handler+0x106/0x1b0 [rdma_cm]
   process_one_work+0x84f/0x1460
   worker_thread+0x5ef/0xfd0
   kthread+0x3b0/0x770
   ret_from_fork+0x30/0x70
   ret_from_fork_asm+0x1a/0x30

  Freed by task 147091:
   kasan_save_stack+0x2c/0x50
   kasan_save_track+0x10/0x30
   kasan_save_free_info+0x37/0x60
   __kasan_slab_free+0x4b/0x70
   kfree+0x13a/0x4b0
   dealloc_work_entries+0x125/0x1f0 [iw_cm]
   iwcm_deref_id+0x6f/0xa0 [iw_cm]
   cm_work_handler+0x136/0x1ba0 [iw_cm]
   process_one_work+0x84f/0x1460
   worker_thread+0x5ef/0xfd0
   kthread+0x3b0/0x770
   ret_from_fork+0x30/0x70
   ret_from_fork_asm+0x1a/0x30

  Last potentially related work creation:
   kasan_save_stack+0x2c/0x50
   kasan_record_aux_stack+0xa3/0xb0
   __queue_work+0x2ff/0x1390
   queue_work_on+0x67/0xc0
   cm_event_handler+0x46a/0x820 [iw_cm]
   siw_cm_upcall+0x330/0x650 [siw]
   siw_cm_work_handler+0x6b9/0x2b20 [siw]
   process_one_work+0x84f/0x1460
   worker_thread+0x5ef/0xfd0
   kthread+0x3b0/0x770
   ret_from_fork+0x30/0x70
   ret_from_fork_asm+0x1a/0x30

This BUG is reproducible by repeating the blktests test case nvme/061
for the rdma transport and the siw driver.

To avoid the use-after-free of cm_id_private work objects, ensure that
the last reference to the cm_id is decremented not in the event handler
works, but in the cm_id destruction context. For that purpose, move
iwcm_deref_id() call from destroy_cm_id() to the callers of
destroy_cm_id(). In iw_destroy_cm_id(), call iwcm_deref_id() after
flushing the pending works.

During the fix work, I noticed that iw_destroy_cm_id() is called from
cm_work_handler() and process_event() context. However, the comment of
iw_destroy_cm_id() notes that the function "cannot be called by the
event thread". Drop the false comment.

Closes: https://lore.kernel.org/linux-rdma/r5676e754sv35aq7cdsqrlnvyhiq5zktteaurl7vmfih35efko@z6lay7uypy3c/
Fixes: 59c68ac31e15 ("iw_cm: free cm_id resources on the last deref")
	Cc: stable@vger.kernel.org
	Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Link: https://patch.msgid.link/20250510101036.1756439-1-shinichiro.kawasaki@wdc.com
	Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
	Signed-off-by: Leon Romanovsky <leon@kernel.org>
(cherry picked from commit 6883b680e703c6b2efddb4e7a8d891ce1803d06b)
	Signed-off-by: Brett Mastbergen <bmastbergen@ciq.com>
---
 drivers/infiniband/core/iwcm.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index 2d09d1be38f19..9abc2191eece2 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -367,12 +367,9 @@ EXPORT_SYMBOL(iw_cm_disconnect);
 /*
  * CM_ID <-- DESTROYING
  *
- * Clean up all resources associated with the connection and release
- * the initial reference taken by iw_create_cm_id.
- *
- * Returns true if and only if the last cm_id_priv reference has been dropped.
+ * Clean up all resources associated with the connection.
  */
-static bool destroy_cm_id(struct iw_cm_id *cm_id)
+static void destroy_cm_id(struct iw_cm_id *cm_id)
 {
 	struct iwcm_id_private *cm_id_priv;
 	struct ib_qp *qp;
@@ -441,20 +438,22 @@ static bool destroy_cm_id(struct iw_cm_id *cm_id)
 		iwpm_remove_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr);
 		iwpm_remove_mapping(&cm_id->local_addr, RDMA_NL_IWCM);
 	}
-
-	return iwcm_deref_id(cm_id_priv);
 }
 
 /*
- * This function is only called by the application thread and cannot
- * be called by the event thread. The function will wait for all
- * references to be released on the cm_id and then kfree the cm_id
- * object.
+ * Destroy cm_id. If the cm_id still has other references, wait for all
+ * references to be released on the cm_id and then release the initial
+ * reference taken by iw_create_cm_id.
  */
 void iw_destroy_cm_id(struct iw_cm_id *cm_id)
 {
-	if (!destroy_cm_id(cm_id))
+	struct iwcm_id_private *cm_id_priv;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	destroy_cm_id(cm_id);
+	if (refcount_read(&cm_id_priv->refcount) > 1)
 		flush_workqueue(iwcm_wq);
+	iwcm_deref_id(cm_id_priv);
 }
 EXPORT_SYMBOL(iw_destroy_cm_id);
 
@@ -1037,8 +1036,10 @@ static void cm_work_handler(struct work_struct *_work)
 
 		if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
 			ret = process_event(cm_id_priv, &levent);
-			if (ret)
-				WARN_ON_ONCE(destroy_cm_id(&cm_id_priv->id));
+			if (ret) {
+				destroy_cm_id(&cm_id_priv->id);
+				WARN_ON_ONCE(iwcm_deref_id(cm_id_priv));
+			}
 		} else
 			pr_debug("dropping event %d\n", levent.event);
 		if (iwcm_deref_id(cm_id_priv))

From 1c6c444a4582d589c64dafb42d6137907503cffd Mon Sep 17 00:00:00 2001
From: Brett Mastbergen <bmastbergen@ciq.com>
Date: Thu, 18 Sep 2025 15:48:11 -0400
Subject: [PATCH 5/8] RDMA/iwcm: Fix
 WARNING:at_kernel/workqueue.c:#check_flush_dependency

jira VULN-45117
cve CVE-2024-47696
commit-author Zhu Yanjun <yanjun.zhu@linux.dev>
commit 86dfdd8288907f03c18b7fb462e0e232c4f98d89

In the commit aee2424246f9 ("RDMA/iwcm: Fix a use-after-free related to
destroying CM IDs"), the function flush_workqueue is invoked to flush the
work queue iwcm_wq.

But at that time, the work queue iwcm_wq was created via the function
alloc_ordered_workqueue without the flag WQ_MEM_RECLAIM.

Because the current process is trying to flush the whole iwcm_wq, if
iwcm_wq doesn't have the flag WQ_MEM_RECLAIM, verify that the current
process is not reclaiming memory or running on a workqueue which doesn't
have the flag WQ_MEM_RECLAIM as that can break forward-progress guarantee
leading to a deadlock.

The call trace is as below:

[  125.350876][ T1430] Call Trace:
[  125.356281][ T1430]  <TASK>
[ 125.361285][ T1430] ? __warn (kernel/panic.c:693)
[ 125.367640][ T1430] ? check_flush_dependency (kernel/workqueue.c:3706 (discriminator 9))
[ 125.375689][ T1430] ? report_bug (lib/bug.c:180 lib/bug.c:219)
[ 125.382505][ T1430] ? handle_bug (arch/x86/kernel/traps.c:239)
[ 125.388987][ T1430] ? exc_invalid_op (arch/x86/kernel/traps.c:260 (discriminator 1))
[ 125.395831][ T1430] ? asm_exc_invalid_op (arch/x86/include/asm/idtentry.h:621)
[ 125.403125][ T1430] ? check_flush_dependency (kernel/workqueue.c:3706 (discriminator 9))
[ 125.410984][ T1430] ? check_flush_dependency (kernel/workqueue.c:3706 (discriminator 9))
[ 125.418764][ T1430] __flush_workqueue (kernel/workqueue.c:3970)
[ 125.426021][ T1430] ? __pfx___might_resched (kernel/sched/core.c:10151)
[ 125.433431][ T1430] ? destroy_cm_id (drivers/infiniband/core/iwcm.c:375) iw_cm
[ 125.441209][ T1430] ? __pfx___flush_workqueue (kernel/workqueue.c:3910)
[ 125.473900][ T1430] ? _raw_spin_lock_irqsave (arch/x86/include/asm/atomic.h:107 include/linux/atomic/atomic-arch-fallback.h:2170 include/linux/atomic/atomic-instrumented.h:1302 include/asm-generic/qspinlock.h:111 include/linux/spinlock.h:187 include/linux/spinlock_api_smp.h:111 kernel/locking/spinlock.c:162)
[ 125.473909][ T1430] ? __pfx__raw_spin_lock_irqsave (kernel/locking/spinlock.c:161)
[ 125.482537][ T1430] _destroy_id (drivers/infiniband/core/cma.c:2044) rdma_cm
[ 125.495072][ T1430] nvme_rdma_free_queue (drivers/nvme/host/rdma.c:656 drivers/nvme/host/rdma.c:650) nvme_rdma
[ 125.505827][ T1430] nvme_rdma_reset_ctrl_work (drivers/nvme/host/rdma.c:2180) nvme_rdma
[ 125.505831][ T1430] process_one_work (kernel/workqueue.c:3231)
[ 125.515122][ T1430] worker_thread (kernel/workqueue.c:3306 kernel/workqueue.c:3393)
[ 125.515127][ T1430] ? __pfx_worker_thread (kernel/workqueue.c:3339)
[ 125.531837][ T1430] kthread (kernel/kthread.c:389)
[ 125.539864][ T1430] ? __pfx_kthread (kernel/kthread.c:342)
[ 125.550628][ T1430] ret_from_fork (arch/x86/kernel/process.c:147)
[ 125.558840][ T1430] ? __pfx_kthread (kernel/kthread.c:342)
[ 125.558844][ T1430] ret_from_fork_asm (arch/x86/entry/entry_64.S:257)
[  125.566487][ T1430]  </TASK>
[  125.566488][ T1430] ---[ end trace 0000000000000000 ]---

Fixes: aee2424246f9 ("RDMA/iwcm: Fix a use-after-free related to destroying CM IDs")
Link: https://patch.msgid.link/r/20240820113336.19860-1-yanjun.zhu@linux.dev
	Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202408151633.fc01893c-oliver.sang@intel.com
	Tested-by: kernel test robot <oliver.sang@intel.com>
	Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
	Reviewed-by: Bart Van Assche <bvanassche@acm.org>
	Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
(cherry picked from commit 86dfdd8288907f03c18b7fb462e0e232c4f98d89)
	Signed-off-by: Brett Mastbergen <bmastbergen@ciq.com>
---
 drivers/infiniband/core/iwcm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index 9abc2191eece2..ce41f235af253 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -1192,7 +1192,7 @@ static int __init iw_cm_init(void)
 	if (ret)
 		return ret;
 
-	iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", 0);
+	iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", WQ_MEM_RECLAIM);
 	if (!iwcm_wq)
 		goto err_alloc;
 

From 9e166df3aafbea9491c1ce0c1b1e5115b6956104 Mon Sep 17 00:00:00 2001
From: Brett Mastbergen <bmastbergen@ciq.com>
Date: Thu, 18 Sep 2025 15:33:53 -0400
Subject: [PATCH 6/8] net: forward_alloc_get depends on CONFIG_MPTCP

jira VULN-65244
cve-pre CVE-2025-22058
commit-author Eric Dumazet <edumazet@google.com>
commit 6c302e799a0d4be1362f505453b714fe05d91f2a

(struct proto)->sk_forward_alloc is currently only used by MPTCP.

	Signed-off-by: Eric Dumazet <edumazet@google.com>
	Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 6c302e799a0d4be1362f505453b714fe05d91f2a)
	Signed-off-by: Brett Mastbergen <bmastbergen@ciq.com>
---
 include/net/sock.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index f4a40f53441d7..074c9b7d25c71 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1273,7 +1273,9 @@ struct proto {
 	unsigned int		inuse_idx;
 #endif
 
+#if IS_ENABLED(CONFIG_MPTCP)
 	int			(*forward_alloc_get)(const struct sock *sk);
+#endif
 
 	bool			(*stream_memory_free)(const struct sock *sk, int wake);
 	bool			(*sock_is_readable)(struct sock *sk);
@@ -1363,10 +1365,11 @@ INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int
 
 static inline int sk_forward_alloc_get(const struct sock *sk)
 {
-	if (!sk->sk_prot->forward_alloc_get)
-		return sk->sk_forward_alloc;
-
-	return sk->sk_prot->forward_alloc_get(sk);
+#if IS_ENABLED(CONFIG_MPTCP)
+	if (sk->sk_prot->forward_alloc_get)
+		return sk->sk_prot->forward_alloc_get(sk);
+#endif
+	return sk->sk_forward_alloc;
 }
 
 static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)

From 4c81ea907fa325347c2a791c56e29d1fd176c9a0 Mon Sep 17 00:00:00 2001
From: Brett Mastbergen <bmastbergen@ciq.com>
Date: Thu, 18 Sep 2025 15:34:29 -0400
Subject: [PATCH 7/8] net: annotate data-races around sk->sk_forward_alloc

jira VULN-65244
cve-pre CVE-2025-22058
commit-author Eric Dumazet <edumazet@google.com>
commit 5e6300e7b3a4ab5b72a82079753868e91fbf9efc

Every time sk->sk_forward_alloc is read locklessly,
add a READ_ONCE().

Add sk_forward_alloc_add() helper to centralize updates,
to reduce number of WRITE_ONCE().

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
	Signed-off-by: Eric Dumazet <edumazet@google.com>
	Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 5e6300e7b3a4ab5b72a82079753868e91fbf9efc)
	Signed-off-by: Brett Mastbergen <bmastbergen@ciq.com>
---
 include/net/sock.h    | 12 +++++++++---
 net/core/sock.c       |  8 ++++----
 net/ipv4/tcp_output.c |  2 +-
 net/ipv4/udp.c        |  6 +++---
 net/mptcp/protocol.c  |  6 +++---
 5 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 074c9b7d25c71..a9d88975c5aec 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1033,6 +1033,12 @@ static inline void sk_wmem_queued_add(struct sock *sk, int val)
 	WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val);
 }
 
+static inline void sk_forward_alloc_add(struct sock *sk, int val)
+{
+	/* Paired with lockless reads of sk->sk_forward_alloc */
+	WRITE_ONCE(sk->sk_forward_alloc, sk->sk_forward_alloc + val);
+}
+
 void sk_stream_write_space(struct sock *sk);
 
 /* OOB backlog add */
@@ -1369,7 +1375,7 @@ static inline int sk_forward_alloc_get(const struct sock *sk)
 	if (sk->sk_prot->forward_alloc_get)
 		return sk->sk_prot->forward_alloc_get(sk);
 #endif
-	return sk->sk_forward_alloc;
+	return READ_ONCE(sk->sk_forward_alloc);
 }
 
 static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
@@ -1668,7 +1674,7 @@ static inline void sk_mem_charge(struct sock *sk, int size)
 {
 	if (!sk_has_account(sk))
 		return;
-	sk->sk_forward_alloc -= size;
+	sk_forward_alloc_add(sk, -size);
 }
 
 /* the following macros control memory reclaiming in mptcp_rmem_uncharge()
@@ -1680,7 +1686,7 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
 {
 	if (!sk_has_account(sk))
 		return;
-	sk->sk_forward_alloc += size;
+	sk_forward_alloc_add(sk, size);
 	sk_mem_reclaim(sk);
 }
 
diff --git a/net/core/sock.c b/net/core/sock.c
index b86b905c1bf9a..39f201d7377bf 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1021,7 +1021,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
 		return -ENOMEM;
 	}
-	sk->sk_forward_alloc += pages << PAGE_SHIFT;
+	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
 
 	sk->sk_reserved_mem += pages << PAGE_SHIFT;
 
@@ -2974,10 +2974,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
 {
 	int ret, amt = sk_mem_pages(size);
 
-	sk->sk_forward_alloc += amt << PAGE_SHIFT;
+	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
 	if (!ret)
-		sk->sk_forward_alloc -= amt << PAGE_SHIFT;
+		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
 	return ret;
 }
 EXPORT_SYMBOL(__sk_mem_schedule);
@@ -3010,7 +3010,7 @@ EXPORT_SYMBOL(__sk_mem_reduce_allocated);
 void __sk_mem_reclaim(struct sock *sk, int amount)
 {
 	amount >>= PAGE_SHIFT;
-	sk->sk_forward_alloc -= amount << PAGE_SHIFT;
+	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
 	__sk_mem_reduce_allocated(sk, amount);
 }
 EXPORT_SYMBOL(__sk_mem_reclaim);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 967356a6d45ef..c08ac464cc218 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3386,7 +3386,7 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
 	if (delta <= 0)
 		return;
 	amt = sk_mem_pages(delta);
-	sk->sk_forward_alloc += amt << PAGE_SHIFT;
+	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
 	sk_memory_allocated_add(sk, amt);
 
 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9d695d5a21230..157da8d986cc1 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1462,9 +1462,9 @@ static void udp_rmem_release(struct sock *sk, int size, int partial,
 		spin_lock(&sk_queue->lock);
 
 
-	sk->sk_forward_alloc += size;
+	sk_forward_alloc_add(sk, size);
 	amt = (sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1);
-	sk->sk_forward_alloc -= amt;
+	sk_forward_alloc_add(sk, -amt);
 
 	if (amt)
 		__sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT);
@@ -1570,7 +1570,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 		sk->sk_forward_alloc += delta;
 	}
 
-	sk->sk_forward_alloc -= size;
+	sk_forward_alloc_add(sk, -size);
 
 	/* no need to setup a destructor, we will explicitly release the
 	 * forward allocated memory on dequeue
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index cf9b7fab4461c..139fc36f62645 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1794,7 +1794,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		}
 
 		/* data successfully copied into the write queue */
-		sk->sk_forward_alloc -= total_ts;
+		sk_forward_alloc_add(sk, -total_ts);
 		copied += psize;
 		dfrag->data_len += psize;
 		frag_truesize += psize;
@@ -3198,7 +3198,7 @@ void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags)
 	/* move all the rx fwd alloc into the sk_mem_reclaim_final in
 	 * inet_sock_destruct() will dispose it
 	 */
-	sk->sk_forward_alloc += msk->rmem_fwd_alloc;
+	sk_forward_alloc_add(sk, msk->rmem_fwd_alloc);
 	msk->rmem_fwd_alloc = 0;
 	mptcp_token_destroy(msk);
 	mptcp_pm_free_anno_list(msk);
@@ -3479,7 +3479,7 @@ static void mptcp_shutdown(struct sock *sk, int how)
 
 static int mptcp_forward_alloc_get(const struct sock *sk)
 {
-	return sk->sk_forward_alloc + mptcp_sk(sk)->rmem_fwd_alloc;
+	return READ_ONCE(sk->sk_forward_alloc) + mptcp_sk(sk)->rmem_fwd_alloc;
 }
 
 static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v)

From d61e53eee6e1a8e4e484f0b25a7ec2c3b35ff795 Mon Sep 17 00:00:00 2001
From: Brett Mastbergen <bmastbergen@ciq.com>
Date: Thu, 18 Sep 2025 15:34:39 -0400
Subject: [PATCH 8/8] udp: Fix memory accounting leak.

jira VULN-65244
cve CVE-2025-22058
commit-author Kuniyuki Iwashima <kuniyu@amazon.com>
commit df207de9d9e7a4d92f8567e2c539d9c8c12fd99d

Matt Dowling reported a weird UDP memory usage issue.

Under normal operation, the UDP memory usage reported in /proc/net/sockstat
remains close to zero.  However, it occasionally spiked to 524,288 pages
and never dropped.  Moreover, the value doubled when the application was
terminated.  Finally, it caused intermittent packet drops.

We can reproduce the issue with the script below [0]:

  1. /proc/net/sockstat reports 0 pages

    # cat /proc/net/sockstat | grep UDP:
    UDP: inuse 1 mem 0

  2. Run the script till the report reaches 524,288

    # python3 test.py & sleep 5
    # cat /proc/net/sockstat | grep UDP:
    UDP: inuse 3 mem 524288  <-- (INT_MAX + 1) >> PAGE_SHIFT

  3. Kill the socket and confirm the number never drops

    # pkill python3 && sleep 5
    # cat /proc/net/sockstat | grep UDP:
    UDP: inuse 1 mem 524288

  4. (necessary since v6.0) Trigger proto_memory_pcpu_drain()

    # python3 test.py & sleep 1 && pkill python3

  5. The number doubles

    # cat /proc/net/sockstat | grep UDP:
    UDP: inuse 1 mem 1048577

The application set INT_MAX to SO_RCVBUF, which triggered an integer
overflow in udp_rmem_release().

When a socket is close()d, udp_destruct_common() purges its receive
queue and sums up skb->truesize in the queue.  This total is calculated
and stored in a local unsigned integer variable.

The total size is then passed to udp_rmem_release() to adjust memory
accounting.  However, because the function takes a signed integer
argument, the total size can wrap around, causing an overflow.

Then, the released amount is calculated as follows:

  1) Add size to sk->sk_forward_alloc.
  2) Round down sk->sk_forward_alloc to the nearest lower multiple of
      PAGE_SIZE and assign it to amount.
  3) Subtract amount from sk->sk_forward_alloc.
  4) Pass amount >> PAGE_SHIFT to __sk_mem_reduce_allocated().

When the issue occurred, the total in udp_destruct_common() was 2147484480
(INT_MAX + 833), which was cast to -2147482816 in udp_rmem_release().

At 1) sk->sk_forward_alloc is changed from 3264 to -2147479552, and
2) sets -2147479552 to amount.  3) reverts the wraparound, so we don't
see a warning in inet_sock_destruct().  However, udp_memory_allocated
ends up doubling at 4).

Since commit 3cd3399dd7a8 ("net: implement per-cpu reserves for
memory_allocated"), memory usage no longer doubles immediately after
a socket is close()d because __sk_mem_reduce_allocated() caches the
amount in udp_memory_per_cpu_fw_alloc.  However, the next time a UDP
socket receives a packet, the subtraction takes effect, causing UDP
memory usage to double.

This issue makes further memory allocation fail once the socket's
sk->sk_rmem_alloc exceeds net.ipv4.udp_rmem_min, resulting in packet
drops.

To prevent this issue, let's use unsigned int for the calculation and
call sk_forward_alloc_add() only once for the small delta.

Note that first_packet_length() also potentially has the same problem.

[0]:
from socket import *

SO_RCVBUFFORCE = 33
INT_MAX = (2 ** 31) - 1

s = socket(AF_INET, SOCK_DGRAM)
s.bind(('', 0))
s.setsockopt(SOL_SOCKET, SO_RCVBUFFORCE, INT_MAX)

c = socket(AF_INET, SOCK_DGRAM)
c.connect(s.getsockname())

data = b'a' * 100

while True:
    c.send(data)

Fixes: f970bd9e3a06 ("udp: implement memory accounting helpers")
	Reported-by: Matt Dowling <madowlin@amazon.com>
	Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
	Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250401184501.67377-3-kuniyu@amazon.com
	Signed-off-by: Jakub Kicinski <kuba@kernel.org>
(cherry picked from commit df207de9d9e7a4d92f8567e2c539d9c8c12fd99d)
	Signed-off-by: Brett Mastbergen <bmastbergen@ciq.com>
---
 net/ipv4/udp.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 157da8d986cc1..2f2fea95038d0 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1436,12 +1436,12 @@ static bool udp_skb_has_head_state(struct sk_buff *skb)
 }
 
 /* fully reclaim rmem/fwd memory allocated for skb */
-static void udp_rmem_release(struct sock *sk, int size, int partial,
-			     bool rx_queue_lock_held)
+static void udp_rmem_release(struct sock *sk, unsigned int size,
+			     int partial, bool rx_queue_lock_held)
 {
 	struct udp_sock *up = udp_sk(sk);
 	struct sk_buff_head *sk_queue;
-	int amt;
+	unsigned int amt;
 
 	if (likely(partial)) {
 		up->forward_deficit += size;
@@ -1461,10 +1461,8 @@ static void udp_rmem_release(struct sock *sk, int size, int partial,
 	if (!rx_queue_lock_held)
 		spin_lock(&sk_queue->lock);
 
-
-	sk_forward_alloc_add(sk, size);
-	amt = (sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1);
-	sk_forward_alloc_add(sk, -amt);
+	amt = (size + sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1);
+	sk_forward_alloc_add(sk, size - amt);
 
 	if (amt)
 		__sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT);
@@ -1648,7 +1646,7 @@ EXPORT_SYMBOL_GPL(skb_consume_udp);
 
 static struct sk_buff *__first_packet_length(struct sock *sk,
 					     struct sk_buff_head *rcvq,
-					     int *total)
+					     unsigned int *total)
 {
 	struct sk_buff *skb;
 
@@ -1681,8 +1679,8 @@ static int first_packet_length(struct sock *sk)
 {
 	struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue;
 	struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
+	unsigned int total = 0;
 	struct sk_buff *skb;
-	int total = 0;
 	int res;
 
 	spin_lock_bh(&rcvq->lock);