From df02150025a3ac134a29c3a847b334f53d4b0c4a Mon Sep 17 00:00:00 2001 From: Siddh Raman Pant Date: Tue, 19 Dec 2023 23:19:43 +0530 Subject: [PATCH 01/26] nfc: llcp_core: Hold a ref to llcp_local->dev when holding a ref to llcp_local [ Upstream commit c95f919567d6f1914f13350af61a1b044ac85014 ] llcp_sock_sendmsg() calls nfc_llcp_send_ui_frame() which in turn calls nfc_alloc_send_skb(), which accesses the nfc_dev from the llcp_sock for getting the headroom and tailroom needed for skb allocation. Parallelly the nfc_dev can be freed, as the refcount is decreased via nfc_free_device(), leading to a UAF reported by Syzkaller, which can be summarized as follows: (1) llcp_sock_sendmsg() -> nfc_llcp_send_ui_frame() -> nfc_alloc_send_skb() -> Dereference *nfc_dev (2) virtual_ncidev_close() -> nci_free_device() -> nfc_free_device() -> put_device() -> nfc_release() -> Free *nfc_dev When a reference to llcp_local is acquired, we do not acquire the same for the nfc_dev. This leads to freeing even when the llcp_local is in use, and this is the case with the UAF described above too. Thus, when we acquire a reference to llcp_local, we should acquire a reference to nfc_dev, and release the references appropriately later. References for llcp_local is initialized in nfc_llcp_register_device() (which is called by nfc_register_device()). Thus, we should acquire a reference to nfc_dev there. nfc_unregister_device() calls nfc_llcp_unregister_device() which in turn calls nfc_llcp_local_put(). Thus, the reference to nfc_dev is appropriately released later. Reported-and-tested-by: syzbot+bbe84a4010eeea00982d@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=bbe84a4010eeea00982d Fixes: c7aa12252f51 ("NFC: Take a reference on the LLCP local pointer when creating a socket") Reviewed-by: Suman Ghosh Signed-off-by: Siddh Raman Pant Reviewed-by: Krzysztof Kozlowski Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/nfc/llcp_core.c | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index c30b28465e644..a217830f0f34c 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -157,6 +157,13 @@ static void nfc_llcp_socket_release(struct nfc_llcp_local *local, bool device, struct nfc_llcp_local *nfc_llcp_local_get(struct nfc_llcp_local *local) { + /* Since using nfc_llcp_local may result in usage of nfc_dev, whenever + * we hold a reference to local, we also need to hold a reference to + * the device to avoid UAF. + */ + if (!nfc_get_device(local->dev->idx)) + return NULL; + kref_get(&local->ref); return local; @@ -190,10 +197,18 @@ static void local_release(struct kref *ref) int nfc_llcp_local_put(struct nfc_llcp_local *local) { + struct nfc_dev *dev; + int ret; + if (local == NULL) return 0; - return kref_put(&local->ref, local_release); + dev = local->dev; + + ret = kref_put(&local->ref, local_release); + nfc_put_device(dev); + + return ret; } static struct nfc_llcp_sock *nfc_llcp_sock_get(struct nfc_llcp_local *local, @@ -951,8 +966,17 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, } new_sock = nfc_llcp_sock(new_sk); - new_sock->dev = local->dev; + new_sock->local = nfc_llcp_local_get(local); + if (!new_sock->local) { + reason = LLCP_DM_REJ; + sock_put(&new_sock->sk); + release_sock(&sock->sk); + sock_put(&sock->sk); + goto fail; + } + + new_sock->dev = local->dev; new_sock->rw = sock->rw; new_sock->miux = sock->miux; new_sock->nfc_protocol = sock->nfc_protocol; @@ -1584,7 +1608,16 @@ int nfc_llcp_register_device(struct nfc_dev *ndev) if (local == NULL) return -ENOMEM; - local->dev = ndev; + /* As we are going to initialize local's refcount, we need to get the + * nfc_dev to avoid UAF, otherwise there is no point in continuing. + * See nfc_llcp_local_get(). + */ + local->dev = nfc_get_device(ndev->idx); + if (!local->dev) { + kfree(local); + return -ENODEV; + } + INIT_LIST_HEAD(&local->list); kref_init(&local->ref); mutex_init(&local->sdp_lock); From e4f367ae64ff2ab90407e21ce0928b749a0c0c50 Mon Sep 17 00:00:00 2001 From: Sudheer Mogilappagari Date: Wed, 29 Nov 2023 11:23:11 +0100 Subject: [PATCH 02/26] i40e: Fix filter input checks to prevent config with invalid values [ Upstream commit 3e48041d9820c17e0a51599d12e66c6e12a8d08d ] Prevent VF from configuring filters with unsupported actions or use REDIRECT action with invalid tc number. Current checks could cause out of bounds access on PF side. Fixes: e284fc280473 ("i40e: Add and delete cloud filter") Reviewed-by: Andrii Staikov Signed-off-by: Sudheer Mogilappagari Signed-off-by: Aleksandr Loktionov Reviewed-by: Simon Horman Tested-by: Bharathi Sreenivas Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 1527c67b487b2..32b19c4c581b6 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -3143,16 +3143,16 @@ static int i40e_validate_cloud_filter(struct i40e_vf *vf, bool found = false; int bkt; - if (!tc_filter->action) { + if (tc_filter->action != VIRTCHNL_ACTION_TC_REDIRECT) { dev_info(&pf->pdev->dev, - "VF %d: Currently ADq doesn't support Drop Action\n", - vf->vf_id); + "VF %d: ADQ doesn't support this action (%d)\n", + vf->vf_id, tc_filter->action); goto err; } /* action_meta is TC number here to which the filter is applied */ if (!tc_filter->action_meta || - tc_filter->action_meta > I40E_MAX_VF_VSI) { + tc_filter->action_meta > vf->num_tc) { dev_info(&pf->pdev->dev, "VF %d: Invalid TC number %u\n", vf->vf_id, tc_filter->action_meta); goto err; From 67188c52c06ad8bf3890d9a2dc8cedc9a707cb61 Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Thu, 21 Dec 2023 10:25:31 +0800 Subject: [PATCH 03/26] net: sched: em_text: fix possible memory leak in em_text_destroy() [ Upstream commit 8fcb0382af6f1ef50936f1be05b8149eb2f88496 ] m->data needs to be freed when em_text_destroy is called. Fixes: d675c989ed2d ("[PKT_SCHED]: Packet classification based on textsearch (ematch)") Acked-by: Jamal Hadi Salim Signed-off-by: Hangyu Hua Reviewed-by: Simon Horman Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/sched/em_text.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sched/em_text.c b/net/sched/em_text.c index 73e2ed576ceb3..cbf44783024f3 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -101,8 +101,10 @@ static int em_text_change(struct net *net, void *data, int len, static void em_text_destroy(struct tcf_ematch *m) { - if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config) + if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config) { textsearch_destroy(EM_TEXT_PRIV(m)->config); + kfree(EM_TEXT_PRIV(m)); + } } static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m) From ee49874f862ffa7d155fa4b2ef12a4ad1504c184 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Thu, 28 Dec 2023 20:39:02 +0100 Subject: [PATCH 04/26] ARM: sun9i: smp: Fix array-index-out-of-bounds read in sunxi_mc_smp_init [ Upstream commit 72ad3b772b6d393701df58ba1359b0bb346a19ed ] Running a multi-arch kernel (multi_v7_defconfig) on a Raspberry Pi 3B+ with enabled CONFIG_UBSAN triggers the following warning: UBSAN: array-index-out-of-bounds in arch/arm/mach-sunxi/mc_smp.c:810:29 index 2 is out of range for type 'sunxi_mc_smp_data [2]' CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.7.0-rc6-00248-g5254c0cbc92d Hardware name: BCM2835 unwind_backtrace from show_stack+0x10/0x14 show_stack from dump_stack_lvl+0x40/0x4c dump_stack_lvl from ubsan_epilogue+0x8/0x34 ubsan_epilogue from __ubsan_handle_out_of_bounds+0x78/0x80 __ubsan_handle_out_of_bounds from sunxi_mc_smp_init+0xe4/0x4cc sunxi_mc_smp_init from do_one_initcall+0xa0/0x2fc do_one_initcall from kernel_init_freeable+0xf4/0x2f4 kernel_init_freeable from kernel_init+0x18/0x158 kernel_init from ret_from_fork+0x14/0x28 Since the enabled method couldn't match with any entry from sunxi_mc_smp_data, the value of the index shouldn't be used right after the loop. So move it after the check of ret in order to have a valid index. Fixes: 1631090e34f5 ("ARM: sun9i: smp: Add is_a83t field") Signed-off-by: Stefan Wahren Link: https://lore.kernel.org/r/20231228193903.9078-1-wahrenst@gmx.net Reviewed-by: Chen-Yu Tsai Signed-off-by: Arnd Bergmann Signed-off-by: Sasha Levin --- arch/arm/mach-sunxi/mc_smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/mach-sunxi/mc_smp.c b/arch/arm/mach-sunxi/mc_smp.c index ff173e67eed21..527bb82072d9a 100644 --- a/arch/arm/mach-sunxi/mc_smp.c +++ b/arch/arm/mach-sunxi/mc_smp.c @@ -805,12 +805,12 @@ static int __init sunxi_mc_smp_init(void) break; } - is_a83t = sunxi_mc_smp_data[i].is_a83t; - of_node_put(node); if (ret) return -ENODEV; + is_a83t = sunxi_mc_smp_data[i].is_a83t; + if (!sunxi_mc_smp_cpu_table_init()) return -EINVAL; From fe7f1685f44bf8c24e3d9d9bb5196c668513aedb Mon Sep 17 00:00:00 2001 From: Adrian Cinal Date: Thu, 28 Dec 2023 14:56:38 +0100 Subject: [PATCH 05/26] net: bcmgenet: Fix FCS generation for fragmented skbuffs [ Upstream commit e584f2ff1e6cc9b1d99e8a6b0f3415940d1b3eb3 ] The flag DMA_TX_APPEND_CRC was only written to the first DMA descriptor in the TX path, where each descriptor corresponds to a single skbuff fragment (or the skbuff head). This led to packets with no FCS appearing on the wire if the kernel allocated the packet in fragments, which would always happen when using PACKET_MMAP/TPACKET (cf. tpacket_fill_skb() in net/af_packet.c). Fixes: 1c1008c793fa ("net: bcmgenet: add main driver file") Signed-off-by: Adrian Cinal Acked-by: Doug Berger Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20231228135638.1339245-1-adriancinal1@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index d518577313145..8bbc5dcf8cb43 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -1650,8 +1650,10 @@ static netdev_tx_t bcmgenet_xmit(struct sk_buff *skb, struct net_device *dev) /* Note: if we ever change from DMA_TX_APPEND_CRC below we * will need to restore software padding of "runt" packets */ + len_stat |= DMA_TX_APPEND_CRC; + if (!i) { - len_stat |= DMA_TX_APPEND_CRC | DMA_SOP; + len_stat |= DMA_SOP; if (skb->ip_summed == CHECKSUM_PARTIAL) len_stat |= DMA_TX_DO_CSUM; } From 0c220aefdb064f4fff02732ecd302f0901b15dc0 Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Thu, 21 Dec 2023 09:12:30 -0400 Subject: [PATCH 06/26] net: Save and restore msg_namelen in sock_sendmsg [ Upstream commit 01b2885d9415152bcb12ff1f7788f500a74ea0ed ] Commit 86a7e0b69bd5 ("net: prevent rewrite of msg_name in sock_sendmsg()") made sock_sendmsg save the incoming msg_name pointer and restore it before returning, to insulate the caller against msg_name being changed by the called code. If the address length was also changed however, we may return with an inconsistent structure where the length doesn't match the address, and attempts to reuse it may lead to lost packets. For example, a kernel that doesn't have commit 1c5950fc6fe9 ("udp6: fix potential access to stale information") will replace a v4 mapped address with its ipv4 equivalent, and shorten namelen accordingly from 28 to 16. If the caller attempts to reuse the resulting msg structure, it will have the original ipv6 (v4 mapped) address but an incorrect v4 length. Fixes: 86a7e0b69bd5 ("net: prevent rewrite of msg_name in sock_sendmsg()") Signed-off-by: Marc Dionne Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/socket.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/socket.c b/net/socket.c index 328f584345fb3..49ac98cfda423 100644 --- a/net/socket.c +++ b/net/socket.c @@ -675,6 +675,7 @@ int sock_sendmsg(struct socket *sock, struct msghdr *msg) { struct sockaddr_storage *save_addr = (struct sockaddr_storage *)msg->msg_name; struct sockaddr_storage address; + int save_len = msg->msg_namelen; int ret; if (msg->msg_name) { @@ -684,6 +685,7 @@ int sock_sendmsg(struct socket *sock, struct msghdr *msg) ret = __sock_sendmsg(sock, msg); msg->msg_name = save_addr; + msg->msg_namelen = save_len; return ret; } From af9ec305b68635ae6d8eb957e91e723bfee0d336 Mon Sep 17 00:00:00 2001 From: Ke Xiao Date: Mon, 18 Dec 2023 15:08:50 +0800 Subject: [PATCH 07/26] i40e: fix use-after-free in i40e_aqc_add_filters() [ Upstream commit 6a15584e99db8918b60e507539c7446375dcf366 ] Commit 3116f59c12bd ("i40e: fix use-after-free in i40e_sync_filters_subtask()") avoided use-after-free issues, by increasing refcount during update the VSI filter list to the HW. However, it missed the unicast situation. When deleting an unicast FDB entry, the i40e driver will release the mac_filter, and i40e_service_task will concurrently request firmware to add the mac_filter, which will lead to the following use-after-free issue. Fix again for both netdev->uc and netdev->mc. BUG: KASAN: use-after-free in i40e_aqc_add_filters+0x55c/0x5b0 [i40e] Read of size 2 at addr ffff888eb3452d60 by task kworker/8:7/6379 CPU: 8 PID: 6379 Comm: kworker/8:7 Kdump: loaded Tainted: G Workqueue: i40e i40e_service_task [i40e] Call Trace: dump_stack+0x71/0xab print_address_description+0x6b/0x290 kasan_report+0x14a/0x2b0 i40e_aqc_add_filters+0x55c/0x5b0 [i40e] i40e_sync_vsi_filters+0x1676/0x39c0 [i40e] i40e_service_task+0x1397/0x2bb0 [i40e] process_one_work+0x56a/0x11f0 worker_thread+0x8f/0xf40 kthread+0x2a0/0x390 ret_from_fork+0x1f/0x40 Allocated by task 21948: kasan_kmalloc+0xa6/0xd0 kmem_cache_alloc_trace+0xdb/0x1c0 i40e_add_filter+0x11e/0x520 [i40e] i40e_addr_sync+0x37/0x60 [i40e] __hw_addr_sync_dev+0x1f5/0x2f0 i40e_set_rx_mode+0x61/0x1e0 [i40e] dev_uc_add_excl+0x137/0x190 i40e_ndo_fdb_add+0x161/0x260 [i40e] rtnl_fdb_add+0x567/0x950 rtnetlink_rcv_msg+0x5db/0x880 netlink_rcv_skb+0x254/0x380 netlink_unicast+0x454/0x610 netlink_sendmsg+0x747/0xb00 sock_sendmsg+0xe2/0x120 __sys_sendto+0x1ae/0x290 __x64_sys_sendto+0xdd/0x1b0 do_syscall_64+0xa0/0x370 entry_SYSCALL_64_after_hwframe+0x65/0xca Freed by task 21948: __kasan_slab_free+0x137/0x190 kfree+0x8b/0x1b0 __i40e_del_filter+0x116/0x1e0 [i40e] i40e_del_mac_filter+0x16c/0x300 [i40e] i40e_addr_unsync+0x134/0x1b0 [i40e] __hw_addr_sync_dev+0xff/0x2f0 i40e_set_rx_mode+0x61/0x1e0 [i40e] dev_uc_del+0x77/0x90 rtnl_fdb_del+0x6a5/0x860 rtnetlink_rcv_msg+0x5db/0x880 netlink_rcv_skb+0x254/0x380 netlink_unicast+0x454/0x610 netlink_sendmsg+0x747/0xb00 sock_sendmsg+0xe2/0x120 __sys_sendto+0x1ae/0x290 __x64_sys_sendto+0xdd/0x1b0 do_syscall_64+0xa0/0x370 entry_SYSCALL_64_after_hwframe+0x65/0xca Fixes: 3116f59c12bd ("i40e: fix use-after-free in i40e_sync_filters_subtask()") Fixes: 41c445ff0f48 ("i40e: main driver core") Signed-off-by: Ke Xiao Signed-off-by: Ding Hui Cc: Di Zhu Reviewed-by: Jan Sokolowski Reviewed-by: Simon Horman Reviewed-by: Jacob Keller Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/i40e/i40e_main.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 75a553f4e26f6..552f5025d265b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -99,12 +99,18 @@ static struct workqueue_struct *i40e_wq; static void netdev_hw_addr_refcnt(struct i40e_mac_filter *f, struct net_device *netdev, int delta) { + struct netdev_hw_addr_list *ha_list; struct netdev_hw_addr *ha; if (!f || !netdev) return; - netdev_for_each_mc_addr(ha, netdev) { + if (is_unicast_ether_addr(f->macaddr) || is_link_local_ether_addr(f->macaddr)) + ha_list = &netdev->uc; + else + ha_list = &netdev->mc; + + netdev_hw_addr_list_for_each(ha, ha_list) { if (ether_addr_equal(ha->addr, f->macaddr)) { ha->refcount += delta; if (ha->refcount <= 0) From d94b35454b1ff91f3d1cb12198b00b71ec3341aa Mon Sep 17 00:00:00 2001 From: Andrii Staikov Date: Thu, 21 Dec 2023 14:27:35 +0100 Subject: [PATCH 08/26] i40e: Restore VF MSI-X state during PCI reset [ Upstream commit 371e576ff3e8580d91d49026e5d5faebf5565558 ] During a PCI FLR the MSI-X Enable flag in the VF PCI MSI-X capability register will be cleared. This can lead to issues when a VF is assigned to a VM because in these cases the VF driver receives no indication of the PF PCI error/reset and additionally it is incapable of restoring the cleared flag in the hypervisor configuration space without fully reinitializing the driver interrupt functionality. Since the VF driver is unable to easily resolve this condition on its own, restore the VF MSI-X flag during the PF PCI reset handling. Fixes: 19b7960b2da1 ("i40e: implement split PCI error reset handler") Co-developed-by: Karen Ostrowska Signed-off-by: Karen Ostrowska Co-developed-by: Mateusz Palczewski Signed-off-by: Mateusz Palczewski Reviewed-by: Wojciech Drewek Reviewed-by: Przemek Kitszel Signed-off-by: Andrii Staikov Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/i40e/i40e_main.c | 3 +++ .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 26 +++++++++++++++++++ .../ethernet/intel/i40e/i40e_virtchnl_pf.h | 3 +++ 3 files changed, 32 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 552f5025d265b..97cf144a4ff99 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -14476,6 +14476,9 @@ static void i40e_pci_error_reset_done(struct pci_dev *pdev) struct i40e_pf *pf = pci_get_drvdata(pdev); i40e_reset_and_rebuild(pf, false, false); +#ifdef CONFIG_PCI_IOV + i40e_restore_all_vfs_msi_state(pdev); +#endif /* CONFIG_PCI_IOV */ } /** diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 32b19c4c581b6..412f8002f918b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -99,6 +99,32 @@ void i40e_vc_notify_reset(struct i40e_pf *pf) (u8 *)&pfe, sizeof(struct virtchnl_pf_event)); } +#ifdef CONFIG_PCI_IOV +void i40e_restore_all_vfs_msi_state(struct pci_dev *pdev) +{ + u16 vf_id; + u16 pos; + + /* Continue only if this is a PF */ + if (!pdev->is_physfn) + return; + + if (!pci_num_vf(pdev)) + return; + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV); + if (pos) { + struct pci_dev *vf_dev = NULL; + + pci_read_config_word(pdev, pos + PCI_SRIOV_VF_DID, &vf_id); + while ((vf_dev = pci_get_device(pdev->vendor, vf_id, vf_dev))) { + if (vf_dev->is_virtfn && vf_dev->physfn == pdev) + pci_restore_msi_state(vf_dev); + } + } +} +#endif /* CONFIG_PCI_IOV */ + /** * i40e_vc_notify_vf_reset * @vf: pointer to the VF structure diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h index 1e001b2bd761b..c9e0a591a344d 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h @@ -137,5 +137,8 @@ int i40e_ndo_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool enable); void i40e_vc_notify_link_state(struct i40e_pf *pf); void i40e_vc_notify_reset(struct i40e_pf *pf); +#ifdef CONFIG_PCI_IOV +void i40e_restore_all_vfs_msi_state(struct pci_dev *pdev); +#endif /* CONFIG_PCI_IOV */ #endif /* _I40E_VIRTCHNL_PF_H_ */ From 4c731d37cdd8bb66e4a6cff0ce69eb6ecf4e7620 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 17 Jan 2021 09:15:42 +0100 Subject: [PATCH 09/26] net/qla3xxx: switch from 'pci_' to 'dma_' API [ Upstream commit 41fb4c1ba7478fe34c7e094e124e4ee4513b9763 ] The wrappers in include/linux/pci-dma-compat.h should go away. The patch has been generated with the coccinelle script below and has been hand modified to replace GFP_ with a correct flag. It has been compile tested. When memory is allocated in 'ql_alloc_net_req_rsp_queues()' GFP_KERNEL can be used because it is only called from 'ql_alloc_mem_resources()' which already calls 'ql_alloc_buffer_queues()' which uses GFP_KERNEL. (see below) When memory is allocated in 'ql_alloc_buffer_queues()' GFP_KERNEL can be used because this flag is already used just a few line above. When memory is allocated in 'ql_alloc_small_buffers()' GFP_KERNEL can be used because it is only called from 'ql_alloc_mem_resources()' which already calls 'ql_alloc_buffer_queues()' which uses GFP_KERNEL. (see above) When memory is allocated in 'ql_alloc_mem_resources()' GFP_KERNEL can be used because this function already calls 'ql_alloc_buffer_queues()' which uses GFP_KERNEL. (see above) While at it, use 'dma_set_mask_and_coherent()' instead of 'dma_set_mask()/ dma_set_coherent_mask()' in order to slightly simplify code. @@ @@ - PCI_DMA_BIDIRECTIONAL + DMA_BIDIRECTIONAL @@ @@ - PCI_DMA_TODEVICE + DMA_TO_DEVICE @@ @@ - PCI_DMA_FROMDEVICE + DMA_FROM_DEVICE @@ @@ - PCI_DMA_NONE + DMA_NONE @@ expression e1, e2, e3; @@ - pci_alloc_consistent(e1, e2, e3) + dma_alloc_coherent(&e1->dev, e2, e3, GFP_) @@ expression e1, e2, e3; @@ - pci_zalloc_consistent(e1, e2, e3) + dma_alloc_coherent(&e1->dev, e2, e3, GFP_) @@ expression e1, e2, e3, e4; @@ - pci_free_consistent(e1, e2, e3, e4) + dma_free_coherent(&e1->dev, e2, e3, e4) @@ expression e1, e2, e3, e4; @@ - pci_map_single(e1, e2, e3, e4) + dma_map_single(&e1->dev, e2, e3, e4) @@ expression e1, e2, e3, e4; @@ - pci_unmap_single(e1, e2, e3, e4) + dma_unmap_single(&e1->dev, e2, e3, e4) @@ expression e1, e2, e3, e4, e5; @@ - pci_map_page(e1, e2, e3, e4, e5) + dma_map_page(&e1->dev, e2, e3, e4, e5) @@ expression e1, e2, e3, e4; @@ - pci_unmap_page(e1, e2, e3, e4) + dma_unmap_page(&e1->dev, e2, e3, e4) @@ expression e1, e2, e3, e4; @@ - pci_map_sg(e1, e2, e3, e4) + dma_map_sg(&e1->dev, e2, e3, e4) @@ expression e1, e2, e3, e4; @@ - pci_unmap_sg(e1, e2, e3, e4) + dma_unmap_sg(&e1->dev, e2, e3, e4) @@ expression e1, e2, e3, e4; @@ - pci_dma_sync_single_for_cpu(e1, e2, e3, e4) + dma_sync_single_for_cpu(&e1->dev, e2, e3, e4) @@ expression e1, e2, e3, e4; @@ - pci_dma_sync_single_for_device(e1, e2, e3, e4) + dma_sync_single_for_device(&e1->dev, e2, e3, e4) @@ expression e1, e2, e3, e4; @@ - pci_dma_sync_sg_for_cpu(e1, e2, e3, e4) + dma_sync_sg_for_cpu(&e1->dev, e2, e3, e4) @@ expression e1, e2, e3, e4; @@ - pci_dma_sync_sg_for_device(e1, e2, e3, e4) + dma_sync_sg_for_device(&e1->dev, e2, e3, e4) @@ expression e1, e2; @@ - pci_dma_mapping_error(e1, e2) + dma_mapping_error(&e1->dev, e2) @@ expression e1, e2; @@ - pci_set_dma_mask(e1, e2) + dma_set_mask(&e1->dev, e2) @@ expression e1, e2; @@ - pci_set_consistent_dma_mask(e1, e2) + dma_set_coherent_mask(&e1->dev, e2) Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/20210117081542.560021-1-christophe.jaillet@wanadoo.fr Signed-off-by: Jakub Kicinski Stable-dep-of: 89f45c30172c ("net/qla3xxx: fix potential memleak in ql_alloc_buffer_queues") Signed-off-by: Sasha Levin --- drivers/net/ethernet/qlogic/qla3xxx.c | 196 ++++++++++++-------------- 1 file changed, 87 insertions(+), 109 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qla3xxx.c b/drivers/net/ethernet/qlogic/qla3xxx.c index d545593354c63..6350872fd5a52 100644 --- a/drivers/net/ethernet/qlogic/qla3xxx.c +++ b/drivers/net/ethernet/qlogic/qla3xxx.c @@ -316,12 +316,11 @@ static void ql_release_to_lrg_buf_free_list(struct ql3_adapter *qdev, * buffer */ skb_reserve(lrg_buf_cb->skb, QL_HEADER_SPACE); - map = pci_map_single(qdev->pdev, + map = dma_map_single(&qdev->pdev->dev, lrg_buf_cb->skb->data, - qdev->lrg_buffer_len - - QL_HEADER_SPACE, - PCI_DMA_FROMDEVICE); - err = pci_dma_mapping_error(qdev->pdev, map); + qdev->lrg_buffer_len - QL_HEADER_SPACE, + DMA_FROM_DEVICE); + err = dma_mapping_error(&qdev->pdev->dev, map); if (err) { netdev_err(qdev->ndev, "PCI mapping failed with error: %d\n", @@ -1803,13 +1802,12 @@ static int ql_populate_free_queue(struct ql3_adapter *qdev) * first buffer */ skb_reserve(lrg_buf_cb->skb, QL_HEADER_SPACE); - map = pci_map_single(qdev->pdev, + map = dma_map_single(&qdev->pdev->dev, lrg_buf_cb->skb->data, - qdev->lrg_buffer_len - - QL_HEADER_SPACE, - PCI_DMA_FROMDEVICE); + qdev->lrg_buffer_len - QL_HEADER_SPACE, + DMA_FROM_DEVICE); - err = pci_dma_mapping_error(qdev->pdev, map); + err = dma_mapping_error(&qdev->pdev->dev, map); if (err) { netdev_err(qdev->ndev, "PCI mapping failed with error: %d\n", @@ -1945,18 +1943,16 @@ static void ql_process_mac_tx_intr(struct ql3_adapter *qdev, goto invalid_seg_count; } - pci_unmap_single(qdev->pdev, + dma_unmap_single(&qdev->pdev->dev, dma_unmap_addr(&tx_cb->map[0], mapaddr), - dma_unmap_len(&tx_cb->map[0], maplen), - PCI_DMA_TODEVICE); + dma_unmap_len(&tx_cb->map[0], maplen), DMA_TO_DEVICE); tx_cb->seg_count--; if (tx_cb->seg_count) { for (i = 1; i < tx_cb->seg_count; i++) { - pci_unmap_page(qdev->pdev, - dma_unmap_addr(&tx_cb->map[i], - mapaddr), + dma_unmap_page(&qdev->pdev->dev, + dma_unmap_addr(&tx_cb->map[i], mapaddr), dma_unmap_len(&tx_cb->map[i], maplen), - PCI_DMA_TODEVICE); + DMA_TO_DEVICE); } } qdev->ndev->stats.tx_packets++; @@ -2023,10 +2019,9 @@ static void ql_process_mac_rx_intr(struct ql3_adapter *qdev, qdev->ndev->stats.rx_bytes += length; skb_put(skb, length); - pci_unmap_single(qdev->pdev, + dma_unmap_single(&qdev->pdev->dev, dma_unmap_addr(lrg_buf_cb2, mapaddr), - dma_unmap_len(lrg_buf_cb2, maplen), - PCI_DMA_FROMDEVICE); + dma_unmap_len(lrg_buf_cb2, maplen), DMA_FROM_DEVICE); prefetch(skb->data); skb_checksum_none_assert(skb); skb->protocol = eth_type_trans(skb, qdev->ndev); @@ -2069,10 +2064,9 @@ static void ql_process_macip_rx_intr(struct ql3_adapter *qdev, skb2 = lrg_buf_cb2->skb; skb_put(skb2, length); /* Just the second buffer length here. */ - pci_unmap_single(qdev->pdev, + dma_unmap_single(&qdev->pdev->dev, dma_unmap_addr(lrg_buf_cb2, mapaddr), - dma_unmap_len(lrg_buf_cb2, maplen), - PCI_DMA_FROMDEVICE); + dma_unmap_len(lrg_buf_cb2, maplen), DMA_FROM_DEVICE); prefetch(skb2->data); skb_checksum_none_assert(skb2); @@ -2321,9 +2315,9 @@ static int ql_send_map(struct ql3_adapter *qdev, /* * Map the skb buffer first. */ - map = pci_map_single(qdev->pdev, skb->data, len, PCI_DMA_TODEVICE); + map = dma_map_single(&qdev->pdev->dev, skb->data, len, DMA_TO_DEVICE); - err = pci_dma_mapping_error(qdev->pdev, map); + err = dma_mapping_error(&qdev->pdev->dev, map); if (err) { netdev_err(qdev->ndev, "PCI mapping failed with error: %d\n", err); @@ -2359,11 +2353,11 @@ static int ql_send_map(struct ql3_adapter *qdev, (seg == 7 && seg_cnt > 8) || (seg == 12 && seg_cnt > 13) || (seg == 17 && seg_cnt > 18)) { - map = pci_map_single(qdev->pdev, oal, + map = dma_map_single(&qdev->pdev->dev, oal, sizeof(struct oal), - PCI_DMA_TODEVICE); + DMA_TO_DEVICE); - err = pci_dma_mapping_error(qdev->pdev, map); + err = dma_mapping_error(&qdev->pdev->dev, map); if (err) { netdev_err(qdev->ndev, "PCI mapping outbound address list with error: %d\n", @@ -2425,24 +2419,24 @@ static int ql_send_map(struct ql3_adapter *qdev, (seg == 7 && seg_cnt > 8) || (seg == 12 && seg_cnt > 13) || (seg == 17 && seg_cnt > 18)) { - pci_unmap_single(qdev->pdev, - dma_unmap_addr(&tx_cb->map[seg], mapaddr), - dma_unmap_len(&tx_cb->map[seg], maplen), - PCI_DMA_TODEVICE); + dma_unmap_single(&qdev->pdev->dev, + dma_unmap_addr(&tx_cb->map[seg], mapaddr), + dma_unmap_len(&tx_cb->map[seg], maplen), + DMA_TO_DEVICE); oal++; seg++; } - pci_unmap_page(qdev->pdev, + dma_unmap_page(&qdev->pdev->dev, dma_unmap_addr(&tx_cb->map[seg], mapaddr), dma_unmap_len(&tx_cb->map[seg], maplen), - PCI_DMA_TODEVICE); + DMA_TO_DEVICE); } - pci_unmap_single(qdev->pdev, + dma_unmap_single(&qdev->pdev->dev, dma_unmap_addr(&tx_cb->map[0], mapaddr), dma_unmap_addr(&tx_cb->map[0], maplen), - PCI_DMA_TODEVICE); + DMA_TO_DEVICE); return NETDEV_TX_BUSY; @@ -2528,9 +2522,8 @@ static int ql_alloc_net_req_rsp_queues(struct ql3_adapter *qdev) wmb(); qdev->req_q_virt_addr = - pci_alloc_consistent(qdev->pdev, - (size_t) qdev->req_q_size, - &qdev->req_q_phy_addr); + dma_alloc_coherent(&qdev->pdev->dev, (size_t)qdev->req_q_size, + &qdev->req_q_phy_addr, GFP_KERNEL); if ((qdev->req_q_virt_addr == NULL) || LS_64BITS(qdev->req_q_phy_addr) & (qdev->req_q_size - 1)) { @@ -2539,16 +2532,14 @@ static int ql_alloc_net_req_rsp_queues(struct ql3_adapter *qdev) } qdev->rsp_q_virt_addr = - pci_alloc_consistent(qdev->pdev, - (size_t) qdev->rsp_q_size, - &qdev->rsp_q_phy_addr); + dma_alloc_coherent(&qdev->pdev->dev, (size_t)qdev->rsp_q_size, + &qdev->rsp_q_phy_addr, GFP_KERNEL); if ((qdev->rsp_q_virt_addr == NULL) || LS_64BITS(qdev->rsp_q_phy_addr) & (qdev->rsp_q_size - 1)) { netdev_err(qdev->ndev, "rspQ allocation failed\n"); - pci_free_consistent(qdev->pdev, (size_t) qdev->req_q_size, - qdev->req_q_virt_addr, - qdev->req_q_phy_addr); + dma_free_coherent(&qdev->pdev->dev, (size_t)qdev->req_q_size, + qdev->req_q_virt_addr, qdev->req_q_phy_addr); return -ENOMEM; } @@ -2564,15 +2555,13 @@ static void ql_free_net_req_rsp_queues(struct ql3_adapter *qdev) return; } - pci_free_consistent(qdev->pdev, - qdev->req_q_size, - qdev->req_q_virt_addr, qdev->req_q_phy_addr); + dma_free_coherent(&qdev->pdev->dev, qdev->req_q_size, + qdev->req_q_virt_addr, qdev->req_q_phy_addr); qdev->req_q_virt_addr = NULL; - pci_free_consistent(qdev->pdev, - qdev->rsp_q_size, - qdev->rsp_q_virt_addr, qdev->rsp_q_phy_addr); + dma_free_coherent(&qdev->pdev->dev, qdev->rsp_q_size, + qdev->rsp_q_virt_addr, qdev->rsp_q_phy_addr); qdev->rsp_q_virt_addr = NULL; @@ -2596,9 +2585,9 @@ static int ql_alloc_buffer_queues(struct ql3_adapter *qdev) return -ENOMEM; qdev->lrg_buf_q_alloc_virt_addr = - pci_alloc_consistent(qdev->pdev, - qdev->lrg_buf_q_alloc_size, - &qdev->lrg_buf_q_alloc_phy_addr); + dma_alloc_coherent(&qdev->pdev->dev, + qdev->lrg_buf_q_alloc_size, + &qdev->lrg_buf_q_alloc_phy_addr, GFP_KERNEL); if (qdev->lrg_buf_q_alloc_virt_addr == NULL) { netdev_err(qdev->ndev, "lBufQ failed\n"); @@ -2616,15 +2605,16 @@ static int ql_alloc_buffer_queues(struct ql3_adapter *qdev) qdev->small_buf_q_alloc_size = qdev->small_buf_q_size * 2; qdev->small_buf_q_alloc_virt_addr = - pci_alloc_consistent(qdev->pdev, - qdev->small_buf_q_alloc_size, - &qdev->small_buf_q_alloc_phy_addr); + dma_alloc_coherent(&qdev->pdev->dev, + qdev->small_buf_q_alloc_size, + &qdev->small_buf_q_alloc_phy_addr, GFP_KERNEL); if (qdev->small_buf_q_alloc_virt_addr == NULL) { netdev_err(qdev->ndev, "Small Buffer Queue allocation failed\n"); - pci_free_consistent(qdev->pdev, qdev->lrg_buf_q_alloc_size, - qdev->lrg_buf_q_alloc_virt_addr, - qdev->lrg_buf_q_alloc_phy_addr); + dma_free_coherent(&qdev->pdev->dev, + qdev->lrg_buf_q_alloc_size, + qdev->lrg_buf_q_alloc_virt_addr, + qdev->lrg_buf_q_alloc_phy_addr); return -ENOMEM; } @@ -2641,17 +2631,15 @@ static void ql_free_buffer_queues(struct ql3_adapter *qdev) return; } kfree(qdev->lrg_buf); - pci_free_consistent(qdev->pdev, - qdev->lrg_buf_q_alloc_size, - qdev->lrg_buf_q_alloc_virt_addr, - qdev->lrg_buf_q_alloc_phy_addr); + dma_free_coherent(&qdev->pdev->dev, qdev->lrg_buf_q_alloc_size, + qdev->lrg_buf_q_alloc_virt_addr, + qdev->lrg_buf_q_alloc_phy_addr); qdev->lrg_buf_q_virt_addr = NULL; - pci_free_consistent(qdev->pdev, - qdev->small_buf_q_alloc_size, - qdev->small_buf_q_alloc_virt_addr, - qdev->small_buf_q_alloc_phy_addr); + dma_free_coherent(&qdev->pdev->dev, qdev->small_buf_q_alloc_size, + qdev->small_buf_q_alloc_virt_addr, + qdev->small_buf_q_alloc_phy_addr); qdev->small_buf_q_virt_addr = NULL; @@ -2669,9 +2657,9 @@ static int ql_alloc_small_buffers(struct ql3_adapter *qdev) QL_SMALL_BUFFER_SIZE); qdev->small_buf_virt_addr = - pci_alloc_consistent(qdev->pdev, - qdev->small_buf_total_size, - &qdev->small_buf_phy_addr); + dma_alloc_coherent(&qdev->pdev->dev, + qdev->small_buf_total_size, + &qdev->small_buf_phy_addr, GFP_KERNEL); if (qdev->small_buf_virt_addr == NULL) { netdev_err(qdev->ndev, "Failed to get small buffer memory\n"); @@ -2704,10 +2692,10 @@ static void ql_free_small_buffers(struct ql3_adapter *qdev) return; } if (qdev->small_buf_virt_addr != NULL) { - pci_free_consistent(qdev->pdev, - qdev->small_buf_total_size, - qdev->small_buf_virt_addr, - qdev->small_buf_phy_addr); + dma_free_coherent(&qdev->pdev->dev, + qdev->small_buf_total_size, + qdev->small_buf_virt_addr, + qdev->small_buf_phy_addr); qdev->small_buf_virt_addr = NULL; } @@ -2722,10 +2710,10 @@ static void ql_free_large_buffers(struct ql3_adapter *qdev) lrg_buf_cb = &qdev->lrg_buf[i]; if (lrg_buf_cb->skb) { dev_kfree_skb(lrg_buf_cb->skb); - pci_unmap_single(qdev->pdev, + dma_unmap_single(&qdev->pdev->dev, dma_unmap_addr(lrg_buf_cb, mapaddr), dma_unmap_len(lrg_buf_cb, maplen), - PCI_DMA_FROMDEVICE); + DMA_FROM_DEVICE); memset(lrg_buf_cb, 0, sizeof(struct ql_rcv_buf_cb)); } else { break; @@ -2777,13 +2765,11 @@ static int ql_alloc_large_buffers(struct ql3_adapter *qdev) * buffer */ skb_reserve(skb, QL_HEADER_SPACE); - map = pci_map_single(qdev->pdev, - skb->data, - qdev->lrg_buffer_len - - QL_HEADER_SPACE, - PCI_DMA_FROMDEVICE); + map = dma_map_single(&qdev->pdev->dev, skb->data, + qdev->lrg_buffer_len - QL_HEADER_SPACE, + DMA_FROM_DEVICE); - err = pci_dma_mapping_error(qdev->pdev, map); + err = dma_mapping_error(&qdev->pdev->dev, map); if (err) { netdev_err(qdev->ndev, "PCI mapping failed with error: %d\n", @@ -2868,8 +2854,8 @@ static int ql_alloc_mem_resources(struct ql3_adapter *qdev) * Network Completion Queue Producer Index Register */ qdev->shadow_reg_virt_addr = - pci_alloc_consistent(qdev->pdev, - PAGE_SIZE, &qdev->shadow_reg_phy_addr); + dma_alloc_coherent(&qdev->pdev->dev, PAGE_SIZE, + &qdev->shadow_reg_phy_addr, GFP_KERNEL); if (qdev->shadow_reg_virt_addr != NULL) { qdev->preq_consumer_index = qdev->shadow_reg_virt_addr; @@ -2924,10 +2910,9 @@ static int ql_alloc_mem_resources(struct ql3_adapter *qdev) err_buffer_queues: ql_free_net_req_rsp_queues(qdev); err_req_rsp: - pci_free_consistent(qdev->pdev, - PAGE_SIZE, - qdev->shadow_reg_virt_addr, - qdev->shadow_reg_phy_addr); + dma_free_coherent(&qdev->pdev->dev, PAGE_SIZE, + qdev->shadow_reg_virt_addr, + qdev->shadow_reg_phy_addr); return -ENOMEM; } @@ -2940,10 +2925,9 @@ static void ql_free_mem_resources(struct ql3_adapter *qdev) ql_free_buffer_queues(qdev); ql_free_net_req_rsp_queues(qdev); if (qdev->shadow_reg_virt_addr != NULL) { - pci_free_consistent(qdev->pdev, - PAGE_SIZE, - qdev->shadow_reg_virt_addr, - qdev->shadow_reg_phy_addr); + dma_free_coherent(&qdev->pdev->dev, PAGE_SIZE, + qdev->shadow_reg_virt_addr, + qdev->shadow_reg_phy_addr); qdev->shadow_reg_virt_addr = NULL; } } @@ -3644,18 +3628,15 @@ static void ql_reset_work(struct work_struct *work) if (tx_cb->skb) { netdev_printk(KERN_DEBUG, ndev, "Freeing lost SKB\n"); - pci_unmap_single(qdev->pdev, - dma_unmap_addr(&tx_cb->map[0], - mapaddr), - dma_unmap_len(&tx_cb->map[0], maplen), - PCI_DMA_TODEVICE); + dma_unmap_single(&qdev->pdev->dev, + dma_unmap_addr(&tx_cb->map[0], mapaddr), + dma_unmap_len(&tx_cb->map[0], maplen), + DMA_TO_DEVICE); for (j = 1; j < tx_cb->seg_count; j++) { - pci_unmap_page(qdev->pdev, - dma_unmap_addr(&tx_cb->map[j], - mapaddr), - dma_unmap_len(&tx_cb->map[j], - maplen), - PCI_DMA_TODEVICE); + dma_unmap_page(&qdev->pdev->dev, + dma_unmap_addr(&tx_cb->map[j], mapaddr), + dma_unmap_len(&tx_cb->map[j], maplen), + DMA_TO_DEVICE); } dev_kfree_skb(tx_cb->skb); tx_cb->skb = NULL; @@ -3787,13 +3768,10 @@ static int ql3xxx_probe(struct pci_dev *pdev, pci_set_master(pdev); - if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { + if (!dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64))) pci_using_dac = 1; - err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); - } else if (!(err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)))) { + else if (!(err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)))) pci_using_dac = 0; - err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); - } if (err) { pr_err("%s no usable DMA configuration\n", pci_name(pdev)); From 39437c89a3074b3e869dab2d0e65148923e327a5 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Wed, 27 Dec 2023 15:02:27 +0800 Subject: [PATCH 10/26] net/qla3xxx: fix potential memleak in ql_alloc_buffer_queues [ Upstream commit 89f45c30172c80e55c887f32f1af8e184124577b ] When dma_alloc_coherent() fails, we should free qdev->lrg_buf to prevent potential memleak. Fixes: 1357bfcf7106 ("qla3xxx: Dynamically size the rx buffer queue based on the MTU.") Signed-off-by: Dinghao Liu Link: https://lore.kernel.org/r/20231227070227.10527-1-dinghao.liu@zju.edu.cn Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/qlogic/qla3xxx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qla3xxx.c b/drivers/net/ethernet/qlogic/qla3xxx.c index 6350872fd5a52..99949140c2e70 100644 --- a/drivers/net/ethernet/qlogic/qla3xxx.c +++ b/drivers/net/ethernet/qlogic/qla3xxx.c @@ -2591,6 +2591,7 @@ static int ql_alloc_buffer_queues(struct ql3_adapter *qdev) if (qdev->lrg_buf_q_alloc_virt_addr == NULL) { netdev_err(qdev->ndev, "lBufQ failed\n"); + kfree(qdev->lrg_buf); return -ENOMEM; } qdev->lrg_buf_q_virt_addr = qdev->lrg_buf_q_alloc_virt_addr; @@ -2615,6 +2616,7 @@ static int ql_alloc_buffer_queues(struct ql3_adapter *qdev) qdev->lrg_buf_q_alloc_size, qdev->lrg_buf_q_alloc_virt_addr, qdev->lrg_buf_q_alloc_phy_addr); + kfree(qdev->lrg_buf); return -ENOMEM; } From 1d267835dadc8cec3bd3da252171bb335f507cd3 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Wed, 3 Jan 2024 03:35:34 +0000 Subject: [PATCH 11/26] asix: Add check for usbnet_get_endpoints [ Upstream commit eaac6a2d26b65511e164772bec6918fcbc61938e ] Add check for usbnet_get_endpoints() and return the error if it fails in order to transfer the error. Fixes: 16626b0cc3d5 ("asix: Add a new driver for the AX88172A") Signed-off-by: Chen Ni Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/usb/ax88172a.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/ax88172a.c b/drivers/net/usb/ax88172a.c index 909755ef71ac3..5881620e44365 100644 --- a/drivers/net/usb/ax88172a.c +++ b/drivers/net/usb/ax88172a.c @@ -198,7 +198,9 @@ static int ax88172a_bind(struct usbnet *dev, struct usb_interface *intf) u8 buf[ETH_ALEN]; struct ax88172a_private *priv; - usbnet_get_endpoints(dev, intf); + ret = usbnet_get_endpoints(dev, intf); + if (ret) + return ret; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) From b39d97468acf9651ff68e44f0e0c7ebac4ecdca7 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 3 Jan 2024 16:59:24 -0800 Subject: [PATCH 12/26] bnxt_en: Remove mis-applied code from bnxt_cfg_ntp_filters() [ Upstream commit e009b2efb7a8850498796b360043ac25c8d3d28f ] The 2 lines to check for the BNXT_HWRM_PF_UNLOAD_SP_EVENT bit was mis-applied to bnxt_cfg_ntp_filters() and should have been applied to bnxt_sp_task(). Fixes: 19241368443f ("bnxt_en: Send PF driver unload notification to all VFs.") Reviewed-by: Andy Gospodarek Signed-off-by: Michael Chan Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index d79281c6d915d..ca817b251ef58 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7847,6 +7847,8 @@ static void bnxt_sp_task(struct work_struct *work) bnxt_cfg_ntp_filters(bp); if (test_and_clear_bit(BNXT_HWRM_EXEC_FWD_REQ_SP_EVENT, &bp->sp_event)) bnxt_hwrm_exec_fwd_req(bp); + if (test_and_clear_bit(BNXT_HWRM_PF_UNLOAD_SP_EVENT, &bp->sp_event)) + netdev_info(bp->dev, "Receive PF driver unload event!\n"); if (test_and_clear_bit(BNXT_VXLAN_ADD_PORT_SP_EVENT, &bp->sp_event)) { bnxt_hwrm_tunnel_dst_port_alloc( bp, bp->vxlan_port, @@ -8407,8 +8409,6 @@ static void bnxt_cfg_ntp_filters(struct bnxt *bp) } } } - if (test_and_clear_bit(BNXT_HWRM_PF_UNLOAD_SP_EVENT, &bp->sp_event)) - netdev_info(bp->dev, "Receive PF driver unload event!"); } #else From c6f50413f2aacc919b5de443aa080b94f5ebb21d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 18 Dec 2023 13:58:36 +0000 Subject: [PATCH 13/26] mm/memory-failure: check the mapcount of the precise page [ Upstream commit c79c5a0a00a9457718056b588f312baadf44e471 ] A process may map only some of the pages in a folio, and might be missed if it maps the poisoned page but not the head page. Or it might be unnecessarily hit if it maps the head page, but not the poisoned page. Link: https://lkml.kernel.org/r/20231218135837.3310403-3-willy@infradead.org Fixes: 7af446a841a2 ("HWPOISON, hugetlb: enable error handling path for hugepage") Signed-off-by: Matthew Wilcox (Oracle) Cc: Dan Williams Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton Signed-off-by: Sasha Levin --- mm/memory-failure.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3da3c63dccd1a..c971d5e11f932 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -989,7 +989,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * This check implies we don't kill processes if their pages * are in the swap cache early. Those are always late kills. */ - if (!page_mapped(hpage)) + if (!page_mapped(p)) return true; if (PageKsm(p)) { @@ -1033,10 +1033,10 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - unmap_success = try_to_unmap(hpage, ttu); + unmap_success = try_to_unmap(p, ttu); if (!unmap_success) pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(hpage)); + pfn, page_mapcount(p)); /* * try_to_unmap() might put mlocked page in lru cache, so call From 50f8b0a96c88f78c2e40cdb93557385c2ce5f5e8 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 2 Jan 2024 20:01:50 +0900 Subject: [PATCH 14/26] firewire: ohci: suppress unexpected system reboot in AMD Ryzen machines and ASM108x/VT630x PCIe cards commit ac9184fbb8478dab4a0724b279f94956b69be827 upstream. VIA VT6306/6307/6308 provides PCI interface compliant to 1394 OHCI. When the hardware is combined with Asmedia ASM1083/1085 PCIe-to-PCI bus bridge, it appears that accesses to its 'Isochronous Cycle Timer' register (offset 0xf0 on PCI memory space) often causes unexpected system reboot in any type of AMD Ryzen machine (both 0x17 and 0x19 families). It does not appears in the other type of machine (AMD pre-Ryzen machine, Intel machine, at least), or in the other OHCI 1394 hardware (e.g. Texas Instruments). The issue explicitly appears at a commit dcadfd7f7c74 ("firewire: core: use union for callback of transaction completion") added to v6.5 kernel. It changed 1394 OHCI driver to access to the register every time to dispatch local asynchronous transaction. However, the issue exists in older version of kernel as long as it runs in AMD Ryzen machine, since the access to the register is required to maintain bus time. It is not hard to imagine that users experience the unexpected system reboot when generating bus reset by plugging any devices in, or reading the register by time-aware application programs; e.g. audio sample processing. This commit suppresses the unexpected system reboot in the combination of hardware. It avoids the access itself. As a result, the software stack can not provide the hardware time anymore to unit drivers, userspace applications, and nodes in the same IEEE 1394 bus. It brings apparent disadvantage since time-aware application programs require it, while time-unaware applications are available again; e.g. sbp2. Cc: stable@vger.kernel.org Reported-by: Jiri Slaby Closes: https://bugzilla.suse.com/show_bug.cgi?id=1215436 Reported-by: Mario Limonciello Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217994 Reported-by: Tobias Gruetzmacher Closes: https://sourceforge.net/p/linux1394/mailman/message/58711901/ Closes: https://bugzilla.redhat.com/show_bug.cgi?id=2240973 Closes: https://bugs.launchpad.net/linux/+bug/2043905 Link: https://lore.kernel.org/r/20240102110150.244475-1-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto Signed-off-by: Greg Kroah-Hartman --- drivers/firewire/ohci.c | 51 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 9e7abc86de8c2..9807a885e698c 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -292,6 +292,51 @@ static char ohci_driver_name[] = KBUILD_MODNAME; #define QUIRK_TI_SLLZ059 0x20 #define QUIRK_IR_WAKE 0x40 +// On PCI Express Root Complex in any type of AMD Ryzen machine, VIA VT6306/6307/6308 with Asmedia +// ASM1083/1085 brings an inconvenience that the read accesses to 'Isochronous Cycle Timer' register +// (at offset 0xf0 in PCI I/O space) often causes unexpected system reboot. The mechanism is not +// clear, since the read access to the other registers is enough safe; e.g. 'Node ID' register, +// while it is probable due to detection of any type of PCIe error. +#define QUIRK_REBOOT_BY_CYCLE_TIMER_READ 0x80000000 + +#if IS_ENABLED(CONFIG_X86) + +static bool has_reboot_by_cycle_timer_read_quirk(const struct fw_ohci *ohci) +{ + return !!(ohci->quirks & QUIRK_REBOOT_BY_CYCLE_TIMER_READ); +} + +#define PCI_DEVICE_ID_ASMEDIA_ASM108X 0x1080 + +static bool detect_vt630x_with_asm1083_on_amd_ryzen_machine(const struct pci_dev *pdev) +{ + const struct pci_dev *pcie_to_pci_bridge; + + // Detect any type of AMD Ryzen machine. + if (!static_cpu_has(X86_FEATURE_ZEN)) + return false; + + // Detect VIA VT6306/6307/6308. + if (pdev->vendor != PCI_VENDOR_ID_VIA) + return false; + if (pdev->device != PCI_DEVICE_ID_VIA_VT630X) + return false; + + // Detect Asmedia ASM1083/1085. + pcie_to_pci_bridge = pdev->bus->self; + if (pcie_to_pci_bridge->vendor != PCI_VENDOR_ID_ASMEDIA) + return false; + if (pcie_to_pci_bridge->device != PCI_DEVICE_ID_ASMEDIA_ASM108X) + return false; + + return true; +} + +#else +#define has_reboot_by_cycle_timer_read_quirk(ohci) false +#define detect_vt630x_with_asm1083_on_amd_ryzen_machine(pdev) false +#endif + /* In case of multiple matches in ohci_quirks[], only the first one is used. */ static const struct { unsigned short vendor, device, revision, flags; @@ -1730,6 +1775,9 @@ static u32 get_cycle_time(struct fw_ohci *ohci) s32 diff01, diff12; int i; + if (has_reboot_by_cycle_timer_read_quirk(ohci)) + return 0; + c2 = reg_read(ohci, OHCI1394_IsochronousCycleTimer); if (ohci->quirks & QUIRK_CYCLE_TIMER) { @@ -3633,6 +3681,9 @@ static int pci_probe(struct pci_dev *dev, if (param_quirks) ohci->quirks = param_quirks; + if (detect_vt630x_with_asm1083_on_amd_ryzen_machine(dev)) + ohci->quirks |= QUIRK_REBOOT_BY_CYCLE_TIMER_READ; + /* * Because dma_alloc_coherent() allocates at least one page, * we save space by using a common buffer for the AR request/ From 2db1c46c3913b8bc92fed235a344de2671fe9d8d Mon Sep 17 00:00:00 2001 From: Jiajun Xie Date: Wed, 20 Dec 2023 13:28:39 +0800 Subject: [PATCH 15/26] mm: fix unmap_mapping_range high bits shift bug commit 9eab0421fa94a3dde0d1f7e36ab3294fc306c99d upstream. The bug happens when highest bit of holebegin is 1, suppose holebegin is 0x8000000111111000, after shift, hba would be 0xfff8000000111111, then vma_interval_tree_foreach would look it up fail or leads to the wrong result. error call seq e.g.: - mmap(..., offset=0x8000000111111000) |- syscall(mmap, ... unsigned long, off): |- ksys_mmap_pgoff( ... , off >> PAGE_SHIFT); here pgoff is correctly shifted to 0x8000000111111, but pass 0x8000000111111000 as holebegin to unmap would then cause terrible result, as shown below: - unmap_mapping_range(..., loff_t const holebegin) |- pgoff_t hba = holebegin >> PAGE_SHIFT; /* hba = 0xfff8000000111111 unexpectedly */ The issue happens in Heterogeneous computing, where the device(e.g. gpu) and host share the same virtual address space. A simple workflow pattern which hit the issue is: /* host */ 1. userspace first mmap a file backed VA range with specified offset. e.g. (offset=0x800..., mmap return: va_a) 2. write some data to the corresponding sys page e.g. (va_a = 0xAABB) /* device */ 3. gpu workload touches VA, triggers gpu fault and notify the host. /* host */ 4. reviced gpu fault notification, then it will: 4.1 unmap host pages and also takes care of cpu tlb (use unmap_mapping_range with offset=0x800...) 4.2 migrate sys page to device 4.3 setup device page table and resolve device fault. /* device */ 5. gpu workload continued, it accessed va_a and got 0xAABB. 6. gpu workload continued, it wrote 0xBBCC to va_a. /* host */ 7. userspace access va_a, as expected, it will: 7.1 trigger cpu vm fault. 7.2 driver handling fault to migrate gpu local page to host. 8. userspace then could correctly get 0xBBCC from va_a 9. done But in step 4.1, if we hit the bug this patch mentioned, then userspace would never trigger cpu fault, and still get the old value: 0xAABB. Making holebegin unsigned first fixes the bug. Link: https://lkml.kernel.org/r/20231220052839.26970-1-jiajun.xie.sh@gmail.com Signed-off-by: Jiajun Xie Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index b80ce6b3c8f49..1e108db4405c7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3042,8 +3042,8 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { - pgoff_t hba = holebegin >> PAGE_SHIFT; - pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; + pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT; + pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT; /* Check for overflow. */ if (sizeof(holelen) > sizeof(hlen)) { From 4a2d650b8fb83b0ceb37279dcc7ceafd10b578a3 Mon Sep 17 00:00:00 2001 From: Jorge Ramirez-Ortiz Date: Fri, 1 Dec 2023 16:31:43 +0100 Subject: [PATCH 16/26] mmc: rpmb: fixes pause retune on all RPMB partitions. commit e7794c14fd73e5eb4a3e0ecaa5334d5a17377c50 upstream. When RPMB was converted to a character device, it added support for multiple RPMB partitions (Commit 97548575bef3 ("mmc: block: Convert RPMB to a character device"). One of the changes in this commit was transforming the variable target_part defined in __mmc_blk_ioctl_cmd into a bitmask. This inadvertently regressed the validation check done in mmc_blk_part_switch_pre() and mmc_blk_part_switch_post(), so let's fix it. Fixes: 97548575bef3 ("mmc: block: Convert RPMB to a character device") Signed-off-by: Jorge Ramirez-Ortiz Reviewed-by: Linus Walleij Cc: Link: https://lore.kernel.org/r/20231201153143.1449753-1-jorge@foundries.io Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/core/block.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index a5d986c75276d..ef9422917e1ca 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -851,9 +851,10 @@ static const struct block_device_operations mmc_bdops = { static int mmc_blk_part_switch_pre(struct mmc_card *card, unsigned int part_type) { + const unsigned int mask = EXT_CSD_PART_CONFIG_ACC_RPMB; int ret = 0; - if (part_type == EXT_CSD_PART_CONFIG_ACC_RPMB) { + if ((part_type & mask) == mask) { if (card->ext_csd.cmdq_en) { ret = mmc_cmdq_disable(card); if (ret) @@ -868,9 +869,10 @@ static int mmc_blk_part_switch_pre(struct mmc_card *card, static int mmc_blk_part_switch_post(struct mmc_card *card, unsigned int part_type) { + const unsigned int mask = EXT_CSD_PART_CONFIG_ACC_RPMB; int ret = 0; - if (part_type == EXT_CSD_PART_CONFIG_ACC_RPMB) { + if ((part_type & mask) == mask) { mmc_retune_unpause(card->host); if (card->reenable_cmdq && !card->ext_csd.cmdq_en) ret = mmc_cmdq_enable(card); @@ -3102,4 +3104,3 @@ module_exit(mmc_blk_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Multimedia Card (MMC) block device driver"); - From 74f6bca1532e013ff5745414288088486b4e9251 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 4 Dec 2023 12:29:53 +0100 Subject: [PATCH 17/26] mmc: core: Cancel delayed work before releasing host commit 1036f69e251380573e256568cf814506e3fb9988 upstream. On RZ/Five SMARC EVK, where probing of SDHI is deferred due to probe deferral of the vqmmc-supply regulator: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at kernel/time/timer.c:1738 __run_timers.part.0+0x1d0/0x1e8 Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 6.7.0-rc4 #101 Hardware name: Renesas SMARC EVK based on r9a07g043f01 (DT) epc : __run_timers.part.0+0x1d0/0x1e8 ra : __run_timers.part.0+0x134/0x1e8 epc : ffffffff800771a4 ra : ffffffff80077108 sp : ffffffc800003e60 gp : ffffffff814f5028 tp : ffffffff8140c5c0 t0 : ffffffc800000000 t1 : 0000000000000001 t2 : ffffffff81201300 s0 : ffffffc800003f20 s1 : ffffffd8023bc4a0 a0 : 00000000fffee6b0 a1 : 0004010000400000 a2 : ffffffffc0000016 a3 : ffffffff81488640 a4 : ffffffc800003e60 a5 : 0000000000000000 a6 : 0000000004000000 a7 : ffffffc800003e68 s2 : 0000000000000122 s3 : 0000000000200000 s4 : 0000000000000000 s5 : ffffffffffffffff s6 : ffffffff81488678 s7 : ffffffff814886c0 s8 : ffffffff814f49c0 s9 : ffffffff81488640 s10: 0000000000000000 s11: ffffffc800003e60 t3 : 0000000000000240 t4 : 0000000000000a52 t5 : ffffffd8024ae018 t6 : ffffffd8024ae038 status: 0000000200000100 badaddr: 0000000000000000 cause: 0000000000000003 [] __run_timers.part.0+0x1d0/0x1e8 [] run_timer_softirq+0x24/0x4a [] __do_softirq+0xc6/0x1fa [] irq_exit_rcu+0x66/0x84 [] handle_riscv_irq+0x40/0x4e [] call_on_irq_stack+0x1c/0x28 ---[ end trace 0000000000000000 ]--- What happens? renesas_sdhi_probe() { tmio_mmc_host_alloc() mmc_alloc_host() INIT_DELAYED_WORK(&host->detect, mmc_rescan); devm_request_irq(tmio_mmc_irq); /* * After this, the interrupt handler may be invoked at any time * * tmio_mmc_irq() * { * __tmio_mmc_card_detect_irq() * mmc_detect_change() * _mmc_detect_change() * mmc_schedule_delayed_work(&host->detect, delay); * } */ tmio_mmc_host_probe() tmio_mmc_init_ocr() -EPROBE_DEFER tmio_mmc_host_free() mmc_free_host() } When expire_timers() runs later, it warns because the MMC host structure containing the delayed work was freed, and now contains an invalid work function pointer. Fix this by cancelling any pending delayed work before releasing the MMC host structure. Signed-off-by: Geert Uytterhoeven Tested-by: Lad Prabhakar Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/205dc4c91b47e31b64392fe2498c7a449e717b4b.1701689330.git.geert+renesas@glider.be Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/core/host.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 9de8a3553d424..3e94401c0eb38 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -489,6 +489,7 @@ EXPORT_SYMBOL(mmc_remove_host); */ void mmc_free_host(struct mmc_host *host) { + cancel_delayed_work_sync(&host->detect); mmc_pwrseq_free(host); put_device(&host->class_dev); } From 437d8898fbf40b92516aef697b5638acccb7a6f9 Mon Sep 17 00:00:00 2001 From: ruanmeisi Date: Tue, 25 Apr 2023 19:13:54 +0800 Subject: [PATCH 18/26] fuse: nlookup missing decrement in fuse_direntplus_link commit b8bd342d50cbf606666488488f9fea374aceb2d5 upstream. During our debugging of glusterfs, we found an Assertion failed error: inode_lookup >= nlookup, which was caused by the nlookup value in the kernel being greater than that in the FUSE file system. The issue was introduced by fuse_direntplus_link, where in the function, fuse_iget increments nlookup, and if d_splice_alias returns failure, fuse_direntplus_link returns failure without decrementing nlookup https://github.com/gluster/glusterfs/pull/4081 Signed-off-by: ruanmeisi Fixes: 0b05b18381ee ("fuse: implement NFS-like readdirplus support") Cc: # v3.9 Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dir.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 0e03adbcf9423..16252727ec2ee 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1314,8 +1314,16 @@ static int fuse_direntplus_link(struct file *file, dput(dentry); dentry = alias; } - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + if (!IS_ERR(inode)) { + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->nlookup--; + spin_unlock(&fc->lock); + } return PTR_ERR(dentry); + } } if (fc->readdirplus_auto) set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); From 087d38ae0fd5a9a41b949e97601b4b0d09336f19 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 16 Feb 2022 15:55:38 +0100 Subject: [PATCH 19/26] netfilter: nf_tables: Reject tables of unsupported family commit f1082dd31fe461d482d69da2a8eccfeb7bf07ac2 upstream. An nftables family is merely a hollow container, its family just a number and such not reliant on compile-time options other than nftables support itself. Add an artificial check so attempts at using a family the kernel can't support fail as early as possible. This helps user space detect kernels which lack e.g. NFPROTO_INET. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nf_tables_api.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3e30441162896..e0c224dea316e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -893,6 +893,30 @@ static int nft_chain_hash_cmp(struct rhashtable_compare_arg *arg, return strcmp(chain->name, name); } +static bool nft_supported_family(u8 family) +{ + return false +#ifdef CONFIG_NF_TABLES_INET + || family == NFPROTO_INET +#endif +#ifdef CONFIG_NF_TABLES_IPV4 + || family == NFPROTO_IPV4 +#endif +#ifdef CONFIG_NF_TABLES_ARP + || family == NFPROTO_ARP +#endif +#ifdef CONFIG_NF_TABLES_NETDEV + || family == NFPROTO_NETDEV +#endif +#if IS_ENABLED(CONFIG_NF_TABLES_BRIDGE) + || family == NFPROTO_BRIDGE +#endif +#ifdef CONFIG_NF_TABLES_IPV6 + || family == NFPROTO_IPV6 +#endif + ; +} + static int nf_tables_newtable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -908,6 +932,9 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, struct nft_ctx ctx; int err; + if (!nft_supported_family(family)) + return -EOPNOTSUPP; + lockdep_assert_held(&nft_net->commit_mutex); attr = nla[NFTA_TABLE_NAME]; table = nft_table_lookup(net, attr, family, genmask); From 5e2cf4c8777cb3fdafecdbce263ba5d975910f03 Mon Sep 17 00:00:00 2001 From: Bartosz Pawlowski Date: Fri, 8 Sep 2023 14:36:05 +0000 Subject: [PATCH 20/26] PCI: Extract ATS disabling to a helper function commit f18b1137d38c091cc8c16365219f0a1d4a30b3d1 upstream. Introduce quirk_no_ats() helper function to provide a standard way to disable ATS capability in PCI quirks. Suggested-by: Andy Shevchenko Link: https://lore.kernel.org/r/20230908143606.685930-2-bartosz.pawlowski@intel.com Signed-off-by: Bartosz Pawlowski Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko Signed-off-by: Greg Kroah-Hartman --- drivers/pci/quirks.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 3a165710fbb86..20180894f82d9 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -5182,6 +5182,12 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, 0x0420, quirk_no_ext_tags); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, 0x0422, quirk_no_ext_tags); #ifdef CONFIG_PCI_ATS +static void quirk_no_ats(struct pci_dev *pdev) +{ + pci_info(pdev, "disabling ATS\n"); + pdev->ats_cap = 0; +} + /* * Some devices require additional driver setup to enable ATS. Don't use * ATS for those devices as ATS will be enabled before the driver has had a @@ -5194,8 +5200,7 @@ static void quirk_amd_harvest_no_ats(struct pci_dev *pdev) (pdev->device == 0x7341 && pdev->revision != 0x00)) return; - pci_info(pdev, "disabling ATS\n"); - pdev->ats_cap = 0; + quirk_no_ats(pdev); } /* AMD Stoney platform GPU */ From 610003d742341dddca03a83beb7c61f1f079b8c5 Mon Sep 17 00:00:00 2001 From: Bartosz Pawlowski Date: Fri, 8 Sep 2023 14:36:06 +0000 Subject: [PATCH 21/26] PCI: Disable ATS for specific Intel IPU E2000 devices commit a18615b1cfc04f00548c60eb9a77e0ce56e848fd upstream. Due to a hardware issue in A and B steppings of Intel IPU E2000, it expects wrong endianness in ATS invalidation message body. This problem can lead to outdated translations being returned as valid and finally cause system instability. To prevent such issues, add quirk_intel_e2000_no_ats() to disable ATS for vulnerable IPU E2000 devices. Link: https://lore.kernel.org/r/20230908143606.685930-3-bartosz.pawlowski@intel.com Signed-off-by: Bartosz Pawlowski Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko Reviewed-by: Alexander Lobakin Signed-off-by: Greg Kroah-Hartman --- drivers/pci/quirks.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 20180894f82d9..eb507751c115b 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -5212,6 +5212,25 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7312, quirk_amd_harvest_no_ats); /* AMD Navi14 dGPU */ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7340, quirk_amd_harvest_no_ats); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7341, quirk_amd_harvest_no_ats); + +/* + * Intel IPU E2000 revisions before C0 implement incorrect endianness + * in ATS Invalidate Request message body. Disable ATS for those devices. + */ +static void quirk_intel_e2000_no_ats(struct pci_dev *pdev) +{ + if (pdev->revision < 0x20) + quirk_no_ats(pdev); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x1451, quirk_intel_e2000_no_ats); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x1452, quirk_intel_e2000_no_ats); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x1453, quirk_intel_e2000_no_ats); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x1454, quirk_intel_e2000_no_ats); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x1455, quirk_intel_e2000_no_ats); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x1457, quirk_intel_e2000_no_ats); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x1459, quirk_intel_e2000_no_ats); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x145a, quirk_intel_e2000_no_ats); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x145c, quirk_intel_e2000_no_ats); #endif /* CONFIG_PCI_ATS */ /* Freescale PCIe doesn't support MSI in RC mode */ From 47468fae2704151503214f4d4327c164118247fd Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Fri, 12 Jan 2024 16:53:05 -0800 Subject: [PATCH 22/26] net: add a route cache full diagnostic message commit 22c2ad616b74f3de2256b242572ab449d031d941 upstream. In some testing scenarios, dst/route cache can fill up so quickly that even an explicit GC call occasionally fails to clean it up. This leads to sporadically failing calls to dst_alloc and "network unreachable" errors to the user, which is confusing. This patch adds a diagnostic message to make the cause of the failure easier to determine. Signed-off-by: Peter Oskolkov Signed-off-by: David S. Miller Signed-off-by: Suraj Jitindar Singh Cc: # 4.19.x Signed-off-by: Greg Kroah-Hartman --- net/core/dst.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/core/dst.c b/net/core/dst.c index 81ccf20e28265..a263309df115e 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -98,8 +98,12 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, struct dst_entry *dst; if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { - if (ops->gc(ops)) + if (ops->gc(ops)) { + printk_ratelimited(KERN_NOTICE "Route cache is full: " + "consider increasing sysctl " + "net.ipv[4|6].route.max_size.\n"); return NULL; + } } dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); From 9635bd0a5296e2e725c6b33e530d0ef582e2f64e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Jan 2024 16:53:06 -0800 Subject: [PATCH 23/26] net/dst: use a smaller percpu_counter batch for dst entries accounting commit cf86a086a18095e33e0637cb78cda1fcf5280852 upstream. percpu_counter_add() uses a default batch size which is quite big on platforms with 256 cpus. (2*256 -> 512) This means dst_entries_get_fast() can be off by +/- 2*(nr_cpus^2) (131072 on servers with 256 cpus) Reduce the batch size to something more reasonable, and add logic to ip6_dst_gc() to call dst_entries_get_slow() before calling the _very_ expensive fib6_run_gc() function. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski Signed-off-by: Suraj Jitindar Singh Cc: # 4.19.x Signed-off-by: Greg Kroah-Hartman --- include/net/dst_ops.h | 4 +++- net/core/dst.c | 8 ++++---- net/ipv6/route.c | 3 +++ 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 443863c7b8da3..88ff7bb2bb9bd 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -53,9 +53,11 @@ static inline int dst_entries_get_slow(struct dst_ops *dst) return percpu_counter_sum_positive(&dst->pcpuc_entries); } +#define DST_PERCPU_COUNTER_BATCH 32 static inline void dst_entries_add(struct dst_ops *dst, int val) { - percpu_counter_add(&dst->pcpuc_entries, val); + percpu_counter_add_batch(&dst->pcpuc_entries, val, + DST_PERCPU_COUNTER_BATCH); } static inline int dst_entries_init(struct dst_ops *dst) diff --git a/net/core/dst.c b/net/core/dst.c index a263309df115e..1a9f84f8cde16 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -97,11 +97,11 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, { struct dst_entry *dst; - if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { + if (ops->gc && + !(flags & DST_NOCOUNT) && + dst_entries_get_fast(ops) > ops->gc_thresh) { if (ops->gc(ops)) { - printk_ratelimited(KERN_NOTICE "Route cache is full: " - "consider increasing sysctl " - "net.ipv[4|6].route.max_size.\n"); + pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n"); return NULL; } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7b41d5d3575fd..d8944ae0171a9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2778,6 +2778,9 @@ static int ip6_dst_gc(struct dst_ops *ops) int entries; entries = dst_entries_get_fast(ops); + if (entries > rt_max_size) + entries = dst_entries_get_slow(ops); + if (time_after(rt_last_gc + rt_min_interval, jiffies) && entries <= rt_max_size) goto out; From b4cfbeaebeb355dbaefb218470055de2e8a73020 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Jan 2024 16:53:07 -0800 Subject: [PATCH 24/26] ipv6: make ip6_rt_gc_expire an atomic_t commit 9cb7c013420f98fa6fd12fc6a5dc055170c108db upstream. Reads and Writes to ip6_rt_gc_expire always have been racy, as syzbot reported lately [1] There is a possible risk of under-flow, leading to unexpected high value passed to fib6_run_gc(), although I have not observed this in the field. Hosts hitting ip6_dst_gc() very hard are under pretty bad state anyway. [1] BUG: KCSAN: data-race in ip6_dst_gc / ip6_dst_gc read-write to 0xffff888102110744 of 4 bytes by task 13165 on cpu 1: ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311 dst_alloc+0x9b/0x160 net/core/dst.c:86 ip6_dst_alloc net/ipv6/route.c:344 [inline] icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807 mld_send_cr net/ipv6/mcast.c:2119 [inline] mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 read-write to 0xffff888102110744 of 4 bytes by task 11607 on cpu 0: ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311 dst_alloc+0x9b/0x160 net/core/dst.c:86 ip6_dst_alloc net/ipv6/route.c:344 [inline] icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807 mld_send_cr net/ipv6/mcast.c:2119 [inline] mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 value changed: 0x00000bb3 -> 0x00000ba9 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 11607 Comm: kworker/0:21 Not tainted 5.18.0-rc1-syzkaller-00037-g42e7a03d3bad-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: mld mld_ifc_work Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220413181333.649424-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski [ 4.19: context adjustment in include/net/netns/ipv6.h ] Signed-off-by: Suraj Jitindar Singh Cc: # 4.19.x Signed-off-by: Greg Kroah-Hartman --- include/net/netns/ipv6.h | 4 ++-- net/ipv6/route.c | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index f0e396ab9bec5..8f4d013fa05fc 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -72,8 +72,8 @@ struct netns_ipv6 { struct dst_ops ip6_dst_ops; rwlock_t fib6_walker_lock; spinlock_t fib6_gc_lock; - unsigned int ip6_rt_gc_expire; - unsigned long ip6_rt_last_gc; + atomic_t ip6_rt_gc_expire; + unsigned long ip6_rt_last_gc; #ifdef CONFIG_IPV6_MULTIPLE_TABLES unsigned int fib6_rules_require_fldissect; bool fib6_has_custom_rules; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d8944ae0171a9..0f8d7786e8e87 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2775,6 +2775,7 @@ static int ip6_dst_gc(struct dst_ops *ops) int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; + unsigned int val; int entries; entries = dst_entries_get_fast(ops); @@ -2785,13 +2786,13 @@ static int ip6_dst_gc(struct dst_ops *ops) entries <= rt_max_size) goto out; - net->ipv6.ip6_rt_gc_expire++; - fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); + fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); entries = dst_entries_get_slow(ops); if (entries < ops->gc_thresh) - net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; + atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1); out: - net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; + val = atomic_read(&net->ipv6.ip6_rt_gc_expire); + atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); return entries > rt_max_size; } @@ -5343,7 +5344,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; - net->ipv6.ip6_rt_gc_expire = 30*HZ; + atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ); ret = 0; out: From 95372b040ae689293c6863b90049f1af68410c8b Mon Sep 17 00:00:00 2001 From: Jon Maxwell Date: Fri, 12 Jan 2024 16:53:08 -0800 Subject: [PATCH 25/26] ipv6: remove max_size check inline with ipv4 commit af6d10345ca76670c1b7c37799f0d5576ccef277 upstream. In ip6_dst_gc() replace: if (entries > gc_thresh) With: if (entries > ops->gc_thresh) Sending Ipv6 packets in a loop via a raw socket triggers an issue where a route is cloned by ip6_rt_cache_alloc() for each packet sent. This quickly consumes the Ipv6 max_size threshold which defaults to 4096 resulting in these warnings: [1] 99.187805] dst_alloc: 7728 callbacks suppressed [2] Route cache is full: consider increasing sysctl net.ipv6.route.max_size. . . [300] Route cache is full: consider increasing sysctl net.ipv6.route.max_size. When this happens the packet is dropped and sendto() gets a network is unreachable error: remaining pkt 200557 errno 101 remaining pkt 196462 errno 101 . . remaining pkt 126821 errno 101 Implement David Aherns suggestion to remove max_size check seeing that Ipv6 has a GC to manage memory usage. Ipv4 already does not check max_size. Here are some memory comparisons for Ipv4 vs Ipv6 with the patch: Test by running 5 instances of a program that sends UDP packets to a raw socket 5000000 times. Compare Ipv4 and Ipv6 performance with a similar program. Ipv4: Before test: MemFree: 29427108 kB Slab: 237612 kB ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 ip_dst_cache 2881 3990 192 42 2 : tunables 0 0 0 During test: MemFree: 29417608 kB Slab: 247712 kB ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 ip_dst_cache 44394 44394 192 42 2 : tunables 0 0 0 After test: MemFree: 29422308 kB Slab: 238104 kB ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0 Ipv6 with patch: Errno 101 errors are not observed anymore with the patch. Before test: MemFree: 29422308 kB Slab: 238104 kB ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0 During Test: MemFree: 29431516 kB Slab: 240940 kB ip6_dst_cache 11980 12064 256 32 2 : tunables 0 0 0 xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0 After Test: MemFree: 29441816 kB Slab: 238132 kB ip6_dst_cache 1902 2432 256 32 2 : tunables 0 0 0 xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0 Tested-by: Andrea Mayer Signed-off-by: Jon Maxwell Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230112012532.311021-1-jmaxwell37@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Suraj Jitindar Singh Cc: # 4.19.x Signed-off-by: Greg Kroah-Hartman --- include/net/dst_ops.h | 2 +- net/core/dst.c | 8 ++------ net/ipv6/route.c | 13 +++++-------- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 88ff7bb2bb9bd..632086b2f644a 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -16,7 +16,7 @@ struct dst_ops { unsigned short family; unsigned int gc_thresh; - int (*gc)(struct dst_ops *ops); + void (*gc)(struct dst_ops *ops); struct dst_entry * (*check)(struct dst_entry *, __u32 cookie); unsigned int (*default_advmss)(const struct dst_entry *); unsigned int (*mtu)(const struct dst_entry *); diff --git a/net/core/dst.c b/net/core/dst.c index 1a9f84f8cde16..1b1677683b97e 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -99,12 +99,8 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, if (ops->gc && !(flags & DST_NOCOUNT) && - dst_entries_get_fast(ops) > ops->gc_thresh) { - if (ops->gc(ops)) { - pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n"); - return NULL; - } - } + dst_entries_get_fast(ops) > ops->gc_thresh) + ops->gc(ops); dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); if (!dst) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0f8d7786e8e87..9dbc9c0cbc5a3 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -92,7 +92,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, struct net_device *dev, int how); -static int ip6_dst_gc(struct dst_ops *ops); +static void ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); @@ -2767,11 +2767,10 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, return dst; } -static int ip6_dst_gc(struct dst_ops *ops) +static void ip6_dst_gc(struct dst_ops *ops) { struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; - int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; @@ -2779,11 +2778,10 @@ static int ip6_dst_gc(struct dst_ops *ops) int entries; entries = dst_entries_get_fast(ops); - if (entries > rt_max_size) + if (entries > ops->gc_thresh) entries = dst_entries_get_slow(ops); - if (time_after(rt_last_gc + rt_min_interval, jiffies) && - entries <= rt_max_size) + if (time_after(rt_last_gc + rt_min_interval, jiffies)) goto out; fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); @@ -2793,7 +2791,6 @@ static int ip6_dst_gc(struct dst_ops *ops) out: val = atomic_read(&net->ipv6.ip6_rt_gc_expire); atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); - return entries > rt_max_size; } static int ip6_convert_metrics(struct net *net, struct fib6_info *rt, @@ -5336,7 +5333,7 @@ static int __net_init ip6_route_net_init(struct net *net) #endif net->ipv6.sysctl.flush_delay = 0; - net->ipv6.sysctl.ip6_rt_max_size = 4096; + net->ipv6.sysctl.ip6_rt_max_size = INT_MAX; net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; From abc2dd6a248d443c27888aee13cfefd94c3f7407 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 15 Jan 2024 18:23:44 +0100 Subject: [PATCH 26/26] Linux 4.19.305 Link: https://lore.kernel.org/r/20240113094205.025407355@linuxfoundation.org Tested-by: Pavel Machek (CIP) Tested-by: Linux Kernel Functional Testing Tested-by: Jon Hunter Tested-by: Harshit Mogalapalli Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index fc7bc81130dc6..8887412d62c86 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 4 PATCHLEVEL = 19 -SUBLEVEL = 304 +SUBLEVEL = 305 EXTRAVERSION = NAME = "People's Front"