From eb396669779280d08b7bb5ddb9e3411a3bb5f193 Mon Sep 17 00:00:00 2001
From: Jonathan Maple
Date: Tue, 22 Oct 2024 19:08:16 -0400
Subject: [PATCH 01/23] selftests/mm temporary fix of hmm infinite loop

jira SECO-170

In Rocky 9, running ./run_vmtests.sh -t hmm fails and causes an infinite
loop on ASSERTs in FIXTURE_TEARDOWN().

This temporary fix is based on the discussion here:
https://patchwork.kernel.org/project/linux-kselftest/patch/26017fe3-5ad7-6946-57db-e5ec48063ceb@suse.cz/#25046055

We will investigate further kselftest updates that will resolve the root
causes of this.

Signed-off-by: Jonathan Maple
Signed-off-by: Shreeya Patel
Signed-off-by: Jonathan Maple
---
 tools/testing/selftests/mm/hmm-tests.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c
index d2cfc9b494a0e..6f75c54564176 100644
--- a/tools/testing/selftests/mm/hmm-tests.c
+++ b/tools/testing/selftests/mm/hmm-tests.c
@@ -159,6 +159,10 @@ FIXTURE_TEARDOWN(hmm)
 {
 	int ret = close(self->fd);
 
+	if (ret != 0) {
+		fprintf(stderr, "close returned (%d) fd is (%d)\n", ret, self->fd);
+		exit(1);
+	}
 	ASSERT_EQ(ret, 0);
 	self->fd = -1;
 }

From cd52e48662bb7c0aa1fa2f260914bb4e5678431b Mon Sep 17 00:00:00 2001
From: Jonathan Maple
Date: Mon, 9 Jun 2025 15:49:43 -0400
Subject: [PATCH 02/23] tools: hv: Enable debug logs for hv_kvp_daemon

jira LE-3207
feature tools_hv
commit-author Shradha Gupta
commit a9c0b33ef2306327dd2db02c6274107065ff9307

Allow the KVP daemon to log the KVP updates triggered in the VM with a
new debug flag (-d). When the daemon is started with this flag, it logs
updates and debug information in syslog with loglevel LOG_DEBUG. This
information comes in handy for debugging issues where the key-value
pairs for certain pools show mismatched or incorrect values. Distro
vendors can further consume these changes and modify the respective
service files to redirect the logs to specific files as needed.
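As an illustrative usage sketch (not part of the patch itself; the installed
binary path and syslog routing vary by distro), the new flag can be combined
with the existing foreground option while reproducing a KVP issue, and the
LOG_DEBUG records described above can then be reviewed in syslog:

  # run the daemon built from tools/hv in the foreground with debug logging
  ./hv_kvp_daemon -n -d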
Signed-off-by: Shradha Gupta Reviewed-by: Naman Jain Reviewed-by: Dexuan Cui Link: https://lore.kernel.org/r/1744715978-8185-1-git-send-email-shradhagupta@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1744715978-8185-1-git-send-email-shradhagupta@linux.microsoft.com> (cherry picked from commit a9c0b33ef2306327dd2db02c6274107065ff9307) Signed-off-by: Jonathan Maple Signed-off-by: Jonathan Maple Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple --- tools/hv/hv_kvp_daemon.c | 64 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 5 deletions(-) diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index 1e6fd6ca513bd..0e0c997134ec6 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -77,6 +77,7 @@ enum { }; static int in_hand_shake; +static int debug; static char *os_name = ""; static char *os_major = ""; @@ -172,6 +173,20 @@ static void kvp_update_file(int pool) kvp_release_lock(pool); } +static void kvp_dump_initial_pools(int pool) +{ + int i; + + syslog(LOG_DEBUG, "===Start dumping the contents of pool %d ===\n", + pool); + + for (i = 0; i < kvp_file_info[pool].num_records; i++) + syslog(LOG_DEBUG, "pool: %d, %d/%d key=%s val=%s\n", + pool, i + 1, kvp_file_info[pool].num_records, + kvp_file_info[pool].records[i].key, + kvp_file_info[pool].records[i].value); +} + static void kvp_update_mem_state(int pool) { FILE *filep; @@ -259,6 +274,8 @@ static int kvp_file_init(void) return 1; kvp_file_info[i].num_records = 0; kvp_update_mem_state(i); + if (debug) + kvp_dump_initial_pools(i); } return 0; @@ -286,6 +303,9 @@ static int kvp_key_delete(int pool, const __u8 *key, int key_size) * Found a match; just move the remaining * entries up. */ + if (debug) + syslog(LOG_DEBUG, "%s: deleting the KVP: pool=%d key=%s val=%s", + __func__, pool, record[i].key, record[i].value); if (i == (num_records - 1)) { kvp_file_info[pool].num_records--; kvp_update_file(pool); @@ -304,20 +324,36 @@ static int kvp_key_delete(int pool, const __u8 *key, int key_size) kvp_update_file(pool); return 0; } + + if (debug) + syslog(LOG_DEBUG, "%s: could not delete KVP: pool=%d key=%s. Record not found", + __func__, pool, key); + return 1; } static int kvp_key_add_or_modify(int pool, const __u8 *key, int key_size, const __u8 *value, int value_size) { - int i; - int num_records; struct kvp_record *record; + int num_records; int num_blocks; + int i; + + if (debug) + syslog(LOG_DEBUG, "%s: got a KVP: pool=%d key=%s val=%s", + __func__, pool, key, value); if ((key_size > HV_KVP_EXCHANGE_MAX_KEY_SIZE) || - (value_size > HV_KVP_EXCHANGE_MAX_VALUE_SIZE)) + (value_size > HV_KVP_EXCHANGE_MAX_VALUE_SIZE)) { + syslog(LOG_ERR, "%s: Too long key or value: key=%s, val=%s", + __func__, key, value); + + if (debug) + syslog(LOG_DEBUG, "%s: Too long key or value: pool=%d, key=%s, val=%s", + __func__, pool, key, value); return 1; + } /* * First update the in-memory state. 
@@ -337,6 +373,9 @@ static int kvp_key_add_or_modify(int pool, const __u8 *key, int key_size, */ memcpy(record[i].value, value, value_size); kvp_update_file(pool); + if (debug) + syslog(LOG_DEBUG, "%s: updated: pool=%d key=%s val=%s", + __func__, pool, key, value); return 0; } @@ -348,8 +387,10 @@ static int kvp_key_add_or_modify(int pool, const __u8 *key, int key_size, record = realloc(record, sizeof(struct kvp_record) * ENTRIES_PER_BLOCK * (num_blocks + 1)); - if (record == NULL) + if (!record) { + syslog(LOG_ERR, "%s: Memory alloc failure", __func__); return 1; + } kvp_file_info[pool].num_blocks++; } @@ -357,6 +398,11 @@ static int kvp_key_add_or_modify(int pool, const __u8 *key, int key_size, memcpy(record[i].key, key, key_size); kvp_file_info[pool].records = record; kvp_file_info[pool].num_records++; + + if (debug) + syslog(LOG_DEBUG, "%s: added: pool=%d key=%s val=%s", + __func__, pool, key, value); + kvp_update_file(pool); return 0; } @@ -1355,6 +1401,7 @@ void print_usage(char *argv[]) fprintf(stderr, "Usage: %s [options]\n" "Options are:\n" " -n, --no-daemon stay in foreground, don't daemonize\n" + " -d, --debug Enable debug logs(syslog debug by default)\n" " -h, --help print this help\n", argv[0]); } @@ -1376,10 +1423,11 @@ int main(int argc, char *argv[]) static struct option long_options[] = { {"help", no_argument, 0, 'h' }, {"no-daemon", no_argument, 0, 'n' }, + {"debug", no_argument, 0, 'd' }, {0, 0, 0, 0 } }; - while ((opt = getopt_long(argc, argv, "hn", long_options, + while ((opt = getopt_long(argc, argv, "hnd", long_options, &long_index)) != -1) { switch (opt) { case 'n': @@ -1388,6 +1436,9 @@ int main(int argc, char *argv[]) case 'h': print_usage(argv); exit(0); + case 'd': + debug = 1; + break; default: print_usage(argv); exit(EXIT_FAILURE); @@ -1410,6 +1461,9 @@ int main(int argc, char *argv[]) */ kvp_get_domain_name(full_domain_name, sizeof(full_domain_name)); + if (debug) + syslog(LOG_INFO, "Logging debug info in syslog(debug)"); + if (kvp_file_init()) { syslog(LOG_ERR, "Failed to initialize the pools"); exit(EXIT_FAILURE); From c796640c222460961440e9b3af09bbd7bbf58707 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 13 Aug 2025 16:43:17 -0700 Subject: [PATCH 03/23] scsi: storvsc: Increase the timeouts to storvsc_timeout jira LE-3545 commit-author Dexuan Cui commit b2f966568faaad326de97481096d0f3dc0971c43 Currently storvsc_timeout is only used in storvsc_sdev_configure(), and 5s and 10s are used elsewhere. It turns out that rarely the 5s is not enough on Azure, so let's use storvsc_timeout everywhere. In case a timeout happens and storvsc_channel_init() returns an error, close the VMBus channel so that any host-to-guest messages in the channel's ringbuffer, which might come late, can be safely ignored. Add a "const" to storvsc_timeout. Cc: stable@kernel.org Signed-off-by: Dexuan Cui Link: https://lore.kernel.org/r/1749243459-10419-1-git-send-email-decui@microsoft.com Reviewed-by: Long Li Signed-off-by: Martin K. 
Petersen (cherry picked from commit b2f966568faaad326de97481096d0f3dc0971c43) Signed-off-by: Sultan Alsawaf Signed-off-by: Jonathan Maple Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple --- drivers/scsi/storvsc_drv.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 9ede8e35b19ab..239f096a32288 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -356,7 +356,7 @@ MODULE_PARM_DESC(ring_avail_percent_lowater, /* * Timeout in seconds for all devices managed by this driver. */ -static int storvsc_timeout = 180; +static const int storvsc_timeout = 180; #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS) static struct scsi_transport_template *fc_transport_template; @@ -762,7 +762,7 @@ static void handle_multichannel_storage(struct hv_device *device, int max_chns) return; } - t = wait_for_completion_timeout(&request->wait_event, 10*HZ); + t = wait_for_completion_timeout(&request->wait_event, storvsc_timeout * HZ); if (t == 0) { dev_err(dev, "Failed to create sub-channel: timed out\n"); return; @@ -827,7 +827,7 @@ static int storvsc_execute_vstor_op(struct hv_device *device, if (ret != 0) return ret; - t = wait_for_completion_timeout(&request->wait_event, 5*HZ); + t = wait_for_completion_timeout(&request->wait_event, storvsc_timeout * HZ); if (t == 0) return -ETIMEDOUT; @@ -1345,6 +1345,8 @@ static int storvsc_connect_to_vsp(struct hv_device *device, u32 ring_size, return ret; ret = storvsc_channel_init(device, is_fc); + if (ret) + vmbus_close(device->channel); return ret; } @@ -1662,7 +1664,7 @@ static int storvsc_host_reset_handler(struct scsi_cmnd *scmnd) if (ret != 0) return FAILED; - t = wait_for_completion_timeout(&request->wait_event, 5*HZ); + t = wait_for_completion_timeout(&request->wait_event, storvsc_timeout * HZ); if (t == 0) return TIMEOUT_ERROR; From 1a5ba69628a632c82556d5997f1b17213c5995d0 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Mon, 25 Aug 2025 18:10:26 +0000 Subject: [PATCH 04/23] Drivers: hv: Allow vmbus_sendpacket_mpb_desc() to create multiple ranges jira LE-3554 commit-author Michael Kelley commit 380b75d3078626aadd0817de61f3143f5db6e393 vmbus_sendpacket_mpb_desc() is currently used only by the storvsc driver and is hardcoded to create a single GPA range. To allow it to also be used by the netvsc driver to create multiple GPA ranges, no longer hardcode as having a single GPA range. Allow the calling driver to specify the rangecount in the supplied descriptor. Update the storvsc driver to reflect this new approach. Cc: # 6.1.x Signed-off-by: Michael Kelley Link: https://patch.msgid.link/20250513000604.1396-2-mhklinux@outlook.com Signed-off-by: Jakub Kicinski (cherry picked from commit 380b75d3078626aadd0817de61f3143f5db6e393) Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple --- drivers/hv/channel.c | 6 +++--- drivers/scsi/storvsc_drv.c | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index fb8cd8469328e..4ffd5eaa78172 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -1136,9 +1136,10 @@ int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer); /* - * vmbus_sendpacket_multipagebuffer - Send a multi-page buffer packet + * vmbus_sendpacket_mpb_desc - Send one or more multi-page buffer packets * using a GPADL Direct packet type. 
- * The buffer includes the vmbus descriptor. + * The desc argument must include space for the VMBus descriptor. The + * rangecount field must already be set. */ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, struct vmbus_packet_mpb_array *desc, @@ -1160,7 +1161,6 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, desc->length8 = (u16)(packetlen_aligned >> 3); desc->transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */ desc->reserved = 0; - desc->rangecount = 1; bufferlist[0].iov_base = desc; bufferlist[0].iov_len = desc_size; diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 239f096a32288..c12240753d376 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1815,6 +1815,7 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd) return SCSI_MLQUEUE_DEVICE_BUSY; } + payload->rangecount = 1; payload->range.len = length; payload->range.offset = offset_in_hvpg; From f24a715284ed131330da8a271b63d6f4c1f3d24c Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Mon, 25 Aug 2025 18:11:12 +0000 Subject: [PATCH 05/23] hv_netvsc: Use vmbus_sendpacket_mpb_desc() to send VMBus messages jira LE-3554 commit-author Michael Kelley commit 4f98616b855cb0e3b5917918bb07b44728eb96ea netvsc currently uses vmbus_sendpacket_pagebuffer() to send VMBus messages. This function creates a series of GPA ranges, each of which contains a single PFN. However, if the rndis header in the VMBus message crosses a page boundary, the netvsc protocol with the host requires that both PFNs for the rndis header must be in a single "GPA range" data structure, which isn't possible with vmbus_sendpacket_pagebuffer(). As the first step in fixing this, add a new function netvsc_build_mpb_array() to build a VMBus message with multiple GPA ranges, each of which may contain multiple PFNs. Use vmbus_sendpacket_mpb_desc() to send this VMBus message to the host. There's no functional change since higher levels of netvsc don't maintain or propagate knowledge of contiguous PFNs. Based on its input, netvsc_build_mpb_array() still produces a separate GPA range for each PFN and the behavior is the same as with vmbus_sendpacket_pagebuffer(). But the groundwork is laid for a subsequent patch to provide the necessary grouping. Cc: # 6.1.x Signed-off-by: Michael Kelley Link: https://patch.msgid.link/20250513000604.1396-3-mhklinux@outlook.com Signed-off-by: Jakub Kicinski (cherry picked from commit 4f98616b855cb0e3b5917918bb07b44728eb96ea) Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple --- drivers/net/hyperv/netvsc.c | 50 +++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 9afb08dbc350a..74a84523c6849 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1054,6 +1054,42 @@ static int netvsc_dma_map(struct hv_device *hv_dev, return 0; } +/* Build an "array" of mpb entries describing the data to be transferred + * over VMBus. After the desc header fields, each "array" entry is variable + * size, and each entry starts after the end of the previous entry. The + * "offset" and "len" fields for each entry imply the size of the entry. + * + * The pfns are in HV_HYP_PAGE_SIZE, because all communication with Hyper-V + * uses that granularity, even if the system page size of the guest is larger. 
+ * Each entry in the input "pb" array must describe a contiguous range of + * guest physical memory so that the pfns are sequential if the range crosses + * a page boundary. The offset field must be < HV_HYP_PAGE_SIZE. + */ +static inline void netvsc_build_mpb_array(struct hv_page_buffer *pb, + u32 page_buffer_count, + struct vmbus_packet_mpb_array *desc, + u32 *desc_size) +{ + struct hv_mpb_array *mpb_entry = &desc->range; + int i, j; + + for (i = 0; i < page_buffer_count; i++) { + u32 offset = pb[i].offset; + u32 len = pb[i].len; + + mpb_entry->offset = offset; + mpb_entry->len = len; + + for (j = 0; j < HVPFN_UP(offset + len); j++) + mpb_entry->pfn_array[j] = pb[i].pfn + j; + + mpb_entry = (struct hv_mpb_array *)&mpb_entry->pfn_array[j]; + } + + desc->rangecount = page_buffer_count; + *desc_size = (char *)mpb_entry - (char *)desc; +} + static inline int netvsc_send_pkt( struct hv_device *device, struct hv_netvsc_packet *packet, @@ -1096,6 +1132,9 @@ static inline int netvsc_send_pkt( packet->dma_range = NULL; if (packet->page_buf_cnt) { + struct vmbus_channel_packet_page_buffer desc; + u32 desc_size; + if (packet->cp_partial) pb += packet->rmsg_pgcnt; @@ -1105,11 +1144,12 @@ static inline int netvsc_send_pkt( goto exit; } - ret = vmbus_sendpacket_pagebuffer(out_channel, - pb, packet->page_buf_cnt, - &nvmsg, sizeof(nvmsg), - req_id); - + netvsc_build_mpb_array(pb, packet->page_buf_cnt, + (struct vmbus_packet_mpb_array *)&desc, + &desc_size); + ret = vmbus_sendpacket_mpb_desc(out_channel, + (struct vmbus_packet_mpb_array *)&desc, + desc_size, &nvmsg, sizeof(nvmsg), req_id); if (ret) netvsc_dma_unmap(ndev_ctx->device_ctx, packet); } else { From 73eadb4a261ba2be6b5d0cc9bf56c65d943c2ccc Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Mon, 25 Aug 2025 18:11:37 +0000 Subject: [PATCH 06/23] hv_netvsc: Preserve contiguous PFN grouping in the page buffer array jira LE-3554 commit-author Michael Kelley commit 41a6328b2c55276f89ea3812069fd7521e348bbf Starting with commit dca5161f9bd0 ("hv_netvsc: Check status in SEND_RNDIS_PKT completion message") in the 6.3 kernel, the Linux driver for Hyper-V synthetic networking (netvsc) occasionally reports "nvsp_rndis_pkt_complete error status: 2".[1] This error indicates that Hyper-V has rejected a network packet transmit request from the guest, and the outgoing network packet is dropped. Higher level network protocols presumably recover and resend the packet so there is no functional error, but performance is slightly impacted. Commit dca5161f9bd0 is not the cause of the error -- it only added reporting of an error that was already happening without any notice. The error has presumably been present since the netvsc driver was originally introduced into Linux. The root cause of the problem is that the netvsc driver in Linux may send an incorrectly formatted VMBus message to Hyper-V when transmitting the network packet. The incorrect formatting occurs when the rndis header of the VMBus message crosses a page boundary due to how the Linux skb head memory is aligned. In such a case, two PFNs are required to describe the location of the rndis header, even though they are contiguous in guest physical address (GPA) space. Hyper-V requires that two rndis header PFNs be in a single "GPA range" data struture, but current netvsc code puts each PFN in its own GPA range, which Hyper-V rejects as an error. The incorrect formatting occurs only for larger packets that netvsc must transmit via a VMBus "GPA Direct" message. 
There's no problem when netvsc transmits a smaller packet by copying it
into a pre-allocated send buffer slot because the pre-allocated slots
don't have page crossing issues.

After commit 14ad6ed30a10 ("net: allow small head cache usage with large
MAX_SKB_FRAGS values") in the 6.14-rc4 kernel, the error occurs much more
frequently in VMs with 16 or more vCPUs. It may occur every few seconds,
or even more frequently, in an ssh session that outputs a lot of text.
Commit 14ad6ed30a10 subtly changes how skb head memory is allocated,
making it much more likely that the rndis header will cross a page
boundary when the vCPU count is 16 or more. The changes in commit
14ad6ed30a10 are perfectly valid -- they just had the side effect of
making the netvsc bug more prominent.

Current code in init_page_array() creates a separate page buffer array
entry for each PFN required to identify the data to be transmitted.
Contiguous PFNs get separate entries in the page buffer array, and any
information about contiguity is lost.

Fix the core issue by having init_page_array() construct the page buffer
array to represent contiguous ranges rather than individual pages. When
these ranges are subsequently passed to netvsc_build_mpb_array(), it can
build GPA ranges that contain multiple PFNs, as required to avoid the
error "nvsp_rndis_pkt_complete error status: 2". If instead the network
packet is sent by copying into a pre-allocated send buffer slot, the copy
proceeds using the contiguous ranges rather than individual pages, but
the result of the copying is the same.

Also fix rndis_filter_send_request() to construct a contiguous range,
since it has its own page buffer array.

This change has a side benefit in CoCo VMs in that netvsc_dma_map()
calls dma_map_single() on each contiguous range instead of on each page.
This results in fewer calls to dma_map_single() but on larger chunks of
memory, which should reduce contention on the swiotlb.

Since the page buffer array now contains one entry for each contiguous
range instead of for each individual page, the number of entries in the
array can be reduced, saving 208 bytes of stack space in netvsc_xmit()
when MAX_SKB_FRAGS has the default value of 17.

[1] https://bugzilla.kernel.org/show_bug.cgi?id=217503

Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217503
Cc: # 6.1.x
Signed-off-by: Michael Kelley
Link: https://patch.msgid.link/20250513000604.1396-4-mhklinux@outlook.com
Signed-off-by: Jakub Kicinski
(cherry picked from commit 41a6328b2c55276f89ea3812069fd7521e348bbf)
Signed-off-by: Shreeya Patel
Signed-off-by: Jonathan Maple
Signed-off-by: Shreeya Patel
Signed-off-by: Jonathan Maple
---
 drivers/net/hyperv/hyperv_net.h   | 12 ++++++
 drivers/net/hyperv/netvsc_drv.c   | 63 ++++++++-----------------------
 drivers/net/hyperv/rndis_filter.c | 24 +++---------
 3 files changed, 32 insertions(+), 67 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index fd23530e65b11..7fca7b5a5fb38 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -892,6 +892,18 @@ struct nvsp_message {
 				 sizeof(struct nvsp_message))
 #define NETVSC_MIN_IN_MSG_SIZE sizeof(struct vmpacket_descriptor)
+/* Maximum # of contiguous data ranges that can make up a trasmitted packet.
+ * Typically it's the max SKB fragments plus 2 for the rndis packet and the
+ * linear portion of the SKB.
But if MAX_SKB_FRAGS is large, the value may + * need to be limited to MAX_PAGE_BUFFER_COUNT, which is the max # of entries + * in a GPA direct packet sent to netvsp over VMBus. + */ +#if MAX_SKB_FRAGS + 2 < MAX_PAGE_BUFFER_COUNT +#define MAX_DATA_RANGES (MAX_SKB_FRAGS + 2) +#else +#define MAX_DATA_RANGES MAX_PAGE_BUFFER_COUNT +#endif + /* Estimated requestor size: * out_ring_size/min_out_msg_size + in_ring_size/min_in_msg_size */ diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 86c3bdad0f1b5..edd02c8713de6 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -325,43 +325,10 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb, return txq; } -static u32 fill_pg_buf(unsigned long hvpfn, u32 offset, u32 len, - struct hv_page_buffer *pb) -{ - int j = 0; - - hvpfn += offset >> HV_HYP_PAGE_SHIFT; - offset = offset & ~HV_HYP_PAGE_MASK; - - while (len > 0) { - unsigned long bytes; - - bytes = HV_HYP_PAGE_SIZE - offset; - if (bytes > len) - bytes = len; - pb[j].pfn = hvpfn; - pb[j].offset = offset; - pb[j].len = bytes; - - offset += bytes; - len -= bytes; - - if (offset == HV_HYP_PAGE_SIZE && len) { - hvpfn++; - offset = 0; - j++; - } - } - - return j + 1; -} - static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb, struct hv_netvsc_packet *packet, struct hv_page_buffer *pb) { - u32 slots_used = 0; - char *data = skb->data; int frags = skb_shinfo(skb)->nr_frags; int i; @@ -370,28 +337,28 @@ static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb, * 2. skb linear data * 3. skb fragment data */ - slots_used += fill_pg_buf(virt_to_hvpfn(hdr), - offset_in_hvpage(hdr), - len, - &pb[slots_used]); + pb[0].offset = offset_in_hvpage(hdr); + pb[0].len = len; + pb[0].pfn = virt_to_hvpfn(hdr); packet->rmsg_size = len; - packet->rmsg_pgcnt = slots_used; + packet->rmsg_pgcnt = 1; - slots_used += fill_pg_buf(virt_to_hvpfn(data), - offset_in_hvpage(data), - skb_headlen(skb), - &pb[slots_used]); + pb[1].offset = offset_in_hvpage(skb->data); + pb[1].len = skb_headlen(skb); + pb[1].pfn = virt_to_hvpfn(skb->data); for (i = 0; i < frags; i++) { skb_frag_t *frag = skb_shinfo(skb)->frags + i; + struct hv_page_buffer *cur_pb = &pb[i + 2]; + u64 pfn = page_to_hvpfn(skb_frag_page(frag)); + u32 offset = skb_frag_off(frag); - slots_used += fill_pg_buf(page_to_hvpfn(skb_frag_page(frag)), - skb_frag_off(frag), - skb_frag_size(frag), - &pb[slots_used]); + cur_pb->offset = offset_in_hvpage(offset); + cur_pb->len = skb_frag_size(frag); + cur_pb->pfn = pfn + (offset >> HV_HYP_PAGE_SHIFT); } - return slots_used; + return frags + 2; } static int count_skb_frag_slots(struct sk_buff *skb) @@ -482,7 +449,7 @@ static int netvsc_xmit(struct sk_buff *skb, struct net_device *net, bool xdp_tx) struct net_device *vf_netdev; u32 rndis_msg_size; u32 hash; - struct hv_page_buffer pb[MAX_PAGE_BUFFER_COUNT]; + struct hv_page_buffer pb[MAX_DATA_RANGES]; /* If VF is present and up then redirect packets to it. * Skip the VF if it is marked down or has no carrier. 
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 82747dfacd70f..9e73959e61ee0 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -225,8 +225,7 @@ static int rndis_filter_send_request(struct rndis_device *dev, struct rndis_request *req) { struct hv_netvsc_packet *packet; - struct hv_page_buffer page_buf[2]; - struct hv_page_buffer *pb = page_buf; + struct hv_page_buffer pb; int ret; /* Setup the packet to send it */ @@ -235,27 +234,14 @@ static int rndis_filter_send_request(struct rndis_device *dev, packet->total_data_buflen = req->request_msg.msg_len; packet->page_buf_cnt = 1; - pb[0].pfn = virt_to_phys(&req->request_msg) >> - HV_HYP_PAGE_SHIFT; - pb[0].len = req->request_msg.msg_len; - pb[0].offset = offset_in_hvpage(&req->request_msg); - - /* Add one page_buf when request_msg crossing page boundary */ - if (pb[0].offset + pb[0].len > HV_HYP_PAGE_SIZE) { - packet->page_buf_cnt++; - pb[0].len = HV_HYP_PAGE_SIZE - - pb[0].offset; - pb[1].pfn = virt_to_phys((void *)&req->request_msg - + pb[0].len) >> HV_HYP_PAGE_SHIFT; - pb[1].offset = 0; - pb[1].len = req->request_msg.msg_len - - pb[0].len; - } + pb.pfn = virt_to_phys(&req->request_msg) >> HV_HYP_PAGE_SHIFT; + pb.len = req->request_msg.msg_len; + pb.offset = offset_in_hvpage(&req->request_msg); trace_rndis_send(dev->ndev, 0, &req->request_msg); rcu_read_lock_bh(); - ret = netvsc_send(dev->ndev, packet, NULL, pb, NULL, false); + ret = netvsc_send(dev->ndev, packet, NULL, &pb, NULL, false); rcu_read_unlock_bh(); return ret; From 6ad56aab6384a1d7b3bae013b109ef1f711392d8 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Mon, 25 Aug 2025 18:12:34 +0000 Subject: [PATCH 07/23] hv_netvsc: Remove rmsg_pgcnt jira LE-3554 commit-author Michael Kelley commit 5bbc644bbf4e97a05bc0cb052189004588ff8a09 init_page_array() now always creates a single page buffer array entry for the rndis message, even if the rndis message crosses a page boundary. As such, the number of page buffer array entries used for the rndis message must no longer be tracked -- it is always just 1. Remove the rmsg_pgcnt field and use "1" where the value is needed. Cc: # 6.1.x Signed-off-by: Michael Kelley Link: https://patch.msgid.link/20250513000604.1396-5-mhklinux@outlook.com Signed-off-by: Jakub Kicinski (cherry picked from commit 5bbc644bbf4e97a05bc0cb052189004588ff8a09) Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple --- drivers/net/hyperv/hyperv_net.h | 1 - drivers/net/hyperv/netvsc.c | 7 +++---- drivers/net/hyperv/netvsc_drv.c | 1 - 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index 7fca7b5a5fb38..763e19d73810f 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -157,7 +157,6 @@ struct hv_netvsc_packet { u8 cp_partial; /* partial copy into send buffer */ u8 rmsg_size; /* RNDIS header and PPI size */ - u8 rmsg_pgcnt; /* page count of RNDIS header and PPI */ u8 page_buf_cnt; u16 q_idx; diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 74a84523c6849..87ac2a5f18091 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -952,8 +952,7 @@ static void netvsc_copy_to_send_buf(struct netvsc_device *net_device, + pend_size; int i; u32 padding = 0; - u32 page_count = packet->cp_partial ? packet->rmsg_pgcnt : - packet->page_buf_cnt; + u32 page_count = packet->cp_partial ? 
1 : packet->page_buf_cnt; u32 remain; /* Add padding */ @@ -1136,7 +1135,7 @@ static inline int netvsc_send_pkt( u32 desc_size; if (packet->cp_partial) - pb += packet->rmsg_pgcnt; + pb++; ret = netvsc_dma_map(ndev_ctx->device_ctx, packet, pb); if (ret) { @@ -1298,7 +1297,7 @@ int netvsc_send(struct net_device *ndev, packet->send_buf_index = section_index; if (packet->cp_partial) { - packet->page_buf_cnt -= packet->rmsg_pgcnt; + packet->page_buf_cnt--; packet->total_data_buflen = msd_len + packet->rmsg_size; } else { packet->page_buf_cnt = 0; diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index edd02c8713de6..0fe75c9ae4c22 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -342,7 +342,6 @@ static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb, pb[0].len = len; pb[0].pfn = virt_to_hvpfn(hdr); packet->rmsg_size = len; - packet->rmsg_pgcnt = 1; pb[1].offset = offset_in_hvpage(skb->data); pb[1].len = skb_headlen(skb); From 66d96cdc0fcd95207173696528d991015bc76c3d Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Mon, 25 Aug 2025 18:13:08 +0000 Subject: [PATCH 08/23] Drivers: hv: vmbus: Remove vmbus_sendpacket_pagebuffer() jira LE-3554 commit-author Michael Kelley commit 45a442fe369e6c4e0b4aa9f63b31c3f2f9e2090e With the netvsc driver changed to use vmbus_sendpacket_mpb_desc() instead of vmbus_sendpacket_pagebuffer(), the latter has no remaining callers. Remove it. Cc: # 6.1.x Signed-off-by: Michael Kelley Link: https://patch.msgid.link/20250513000604.1396-6-mhklinux@outlook.com Signed-off-by: Jakub Kicinski (cherry picked from commit 45a442fe369e6c4e0b4aa9f63b31c3f2f9e2090e) Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple Signed-off-by: Shreeya Patel Signed-off-by: Jonathan Maple --- drivers/hv/channel.c | 59 ------------------------------------------ include/linux/hyperv.h | 7 ----- 2 files changed, 66 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 4ffd5eaa78172..35f26fa1ffe76 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -1076,65 +1076,6 @@ int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, } EXPORT_SYMBOL(vmbus_sendpacket); -/* - * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer - * packets using a GPADL Direct packet type. This interface allows you - * to control notifying the host. This will be useful for sending - * batched data. Also the sender can control the send flags - * explicitly. 
- */ -int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, - struct hv_page_buffer pagebuffers[], - u32 pagecount, void *buffer, u32 bufferlen, - u64 requestid) -{ - int i; - struct vmbus_channel_packet_page_buffer desc; - u32 descsize; - u32 packetlen; - u32 packetlen_aligned; - struct kvec bufferlist[3]; - u64 aligned_data = 0; - - if (pagecount > MAX_PAGE_BUFFER_COUNT) - return -EINVAL; - - /* - * Adjust the size down since vmbus_channel_packet_page_buffer is the - * largest size we support - */ - descsize = sizeof(struct vmbus_channel_packet_page_buffer) - - ((MAX_PAGE_BUFFER_COUNT - pagecount) * - sizeof(struct hv_page_buffer)); - packetlen = descsize + bufferlen; - packetlen_aligned = ALIGN(packetlen, sizeof(u64)); - - /* Setup the descriptor */ - desc.type = VM_PKT_DATA_USING_GPA_DIRECT; - desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; - desc.dataoffset8 = descsize >> 3; /* in 8-bytes granularity */ - desc.length8 = (u16)(packetlen_aligned >> 3); - desc.transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */ - desc.reserved = 0; - desc.rangecount = pagecount; - - for (i = 0; i < pagecount; i++) { - desc.range[i].len = pagebuffers[i].len; - desc.range[i].offset = pagebuffers[i].offset; - desc.range[i].pfn = pagebuffers[i].pfn; - } - - bufferlist[0].iov_base = &desc; - bufferlist[0].iov_len = descsize; - bufferlist[1].iov_base = buffer; - bufferlist[1].iov_len = bufferlen; - bufferlist[2].iov_base = &aligned_data; - bufferlist[2].iov_len = (packetlen_aligned - packetlen); - - return hv_ringbuffer_write(channel, bufferlist, 3, requestid, NULL); -} -EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer); - /* * vmbus_sendpacket_mpb_desc - Send one or more multi-page buffer packets * using a GPADL Direct packet type. diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 550ad87c38b6c..ccb2ef2fa1e93 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1226,13 +1226,6 @@ extern int vmbus_sendpacket(struct vmbus_channel *channel, enum vmbus_packet_type type, u32 flags); -extern int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, - struct hv_page_buffer pagebuffers[], - u32 pagecount, - void *buffer, - u32 bufferlen, - u64 requestid); - extern int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, struct vmbus_packet_mpb_array *mpb, u32 desc_size, From 02645e3dd5f2594f765d1f6d3ee4ebb72c647a63 Mon Sep 17 00:00:00 2001 From: Jeremy Allison Date: Thu, 29 Aug 2024 16:58:53 -0700 Subject: [PATCH 09/23] SUSE: patch: crypto-ecdh-implement-FIPS-PCT.patch Signed-off-by: Jeremy Allison Signed-off-by: Jonathan Maple --- crypto/ecdh.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/crypto/ecdh.c b/crypto/ecdh.c index fe8966511e9d7..af702cfefd22f 100644 --- a/crypto/ecdh.c +++ b/crypto/ecdh.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "ecc.h" struct ecdh_ctx { @@ -94,6 +95,36 @@ static int ecdh_compute_value(struct kpp_request *req) ctx->private_key, public_key); buf = public_key; nbytes = public_key_sz; + + /* + * SP800-56Arev3, 5.6.2.1.4: ("Owner Assurance of + * Pair-wise Consistency"): recompute the public key + * and check if the results match. 
+ */ + if (fips_enabled) { + u64 *public_key_pct; + + if (ret < 0) + goto free_all; + + public_key_pct = kmalloc(public_key_sz, GFP_KERNEL); + if (!public_key_pct) { + ret = -ENOMEM; + goto free_all; + } + + ret = ecc_make_pub_key(ctx->curve_id, ctx->ndigits, + ctx->private_key, + public_key_pct); + if (ret < 0) { + kfree(public_key_pct); + goto free_all; + } + + if (memcmp(public_key, public_key_pct, public_key_sz)) + panic("ECDH PCT failed in FIPS mode"); + kfree(public_key_pct); + } } if (ret < 0) From 307c0d61612e4f4a8e390a2d12845b2a3f854331 Mon Sep 17 00:00:00 2001 From: Jason Rodriguez Date: Mon, 30 Sep 2024 12:57:14 -0400 Subject: [PATCH 10/23] crypto: essiv - Zeroize keys on exit in essiv_aead_setkey() In essiv_aead_setkey(), use the same logic as crypto_authenc_esn_setkey() to zeroize keys on exit. [Sultan: touched up commit message] Signed-off-by: Jason Rodriguez Signed-off-by: Sultan Alsawaf Signed-off-by: Jonathan Maple --- crypto/essiv.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/crypto/essiv.c b/crypto/essiv.c index 8bcc5bdcb2a95..ec81bdea25631 100644 --- a/crypto/essiv.c +++ b/crypto/essiv.c @@ -114,13 +114,16 @@ static int essiv_aead_setkey(struct crypto_aead *tfm, const u8 *key, crypto_shash_update(desc, keys.enckey, keys.enckeylen) ?: crypto_shash_finup(desc, keys.authkey, keys.authkeylen, salt); if (err) - return err; + goto out; crypto_cipher_clear_flags(tctx->essiv_cipher, CRYPTO_TFM_REQ_MASK); crypto_cipher_set_flags(tctx->essiv_cipher, crypto_aead_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); - return crypto_cipher_setkey(tctx->essiv_cipher, salt, - crypto_shash_digestsize(tctx->hash)); + err = crypto_cipher_setkey(tctx->essiv_cipher, salt, + crypto_shash_digestsize(tctx->hash)); +out: + memzero_explicit(&keys, sizeof(keys)); + return err; } static int essiv_aead_setauthsize(struct crypto_aead *tfm, From 14e675ec288a7e9ddab5aef659e10c91afeb538c Mon Sep 17 00:00:00 2001 From: Jeremy Allison Date: Mon, 16 Jun 2025 13:34:27 -0700 Subject: [PATCH 11/23] crypto: jitter - replace LFSR with SHA3-256 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using the kernel crypto API, the SHA3-256 algorithm is used as conditioning element to replace the LFSR in the Jitter RNG. All other parts of the Jitter RNG are unchanged. The application and use of the SHA-3 conditioning operation is identical to the user space Jitter RNG 3.4.0 by applying the following concept: - the Jitter RNG initializes a SHA-3 state which acts as the "entropy pool" when the Jitter RNG is allocated. - When a new time delta is obtained, it is inserted into the "entropy pool" with a SHA-3 update operation. Note, this operation in most of the cases is a simple memcpy() onto the SHA-3 stack. - To cause a true SHA-3 operation for each time delta operation, a second SHA-3 operation is performed hashing Jitter RNG status information. The final message digest is also inserted into the "entropy pool" with a SHA-3 update operation. Yet, this data is not considered to provide any entropy, but it shall stir the entropy pool. - To generate a random number, a SHA-3 final operation is performed to calculate a message digest followed by an immediate SHA-3 init to re-initialize the "entropy pool". The obtained message digest is one block of the Jitter RNG that is returned to the caller. 
Mathematically speaking, the random number generated by the Jitter RNG is: aux_t = SHA-3(Jitter RNG state data) Jitter RNG block = SHA-3(time_i || aux_i || time_(i-1) || aux_(i-1) || ... || time_(i-255) || aux_(i-255)) when assuming that the OSR = 1, i.e. the default value. This operation implies that the Jitter RNG has an output-blocksize of 256 bits instead of the 64 bits of the LFSR-based Jitter RNG that is replaced with this patch. The patch also replaces the varying number of invocations of the conditioning function with one fixed number of invocations. The use of the conditioning function consistent with the userspace Jitter RNG library version 3.4.0. The code is tested with a system that exhibited the least amount of entropy generated by the Jitter RNG: the SiFive Unmatched RISC-V system. The measured entropy rate is well above the heuristically implied entropy value of 1 bit of entropy per time delta. On all other tested systems, the measured entropy rate is even higher by orders of magnitude. The measurement was performed using updated tooling provided with the user space Jitter RNG library test framework. The performance of the Jitter RNG with this patch is about en par with the performance of the Jitter RNG without the patch. Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu Back-port of commit bb897c55042e9330bcf88b4b13cbdd6f9fabdd5e Author: Stephan Müller Date: Fri Apr 21 08:08:04 2023 +0200 Signed-off-by: Jeremy Allison Signed-off-by: Jonathan Maple --- crypto/Kconfig | 1 + crypto/jitterentropy-kcapi.c | 183 +++++++++++++++++++++++++++++++---- crypto/jitterentropy.c | 143 +++++++++------------------ crypto/jitterentropy.h | 10 +- 4 files changed, 218 insertions(+), 119 deletions(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index c0054b9f23cbb..17c113cd5fe52 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -2019,6 +2019,7 @@ config CRYPTO_ANSI_CPRNG tristate "Pseudo Random Number Generation for Cryptographic modules" select CRYPTO_AES select CRYPTO_RNG + select CRYPTO_SHA3 help This option enables the generic pseudo random number generator for cryptographic modules. Uses the Algorithm specified in diff --git a/crypto/jitterentropy-kcapi.c b/crypto/jitterentropy-kcapi.c index b9edfaa51b273..4b50cbc8a2faf 100644 --- a/crypto/jitterentropy-kcapi.c +++ b/crypto/jitterentropy-kcapi.c @@ -2,7 +2,7 @@ * Non-physical true random number generator based on timing jitter -- * Linux Kernel Crypto API specific code * - * Copyright Stephan Mueller , 2015 + * Copyright Stephan Mueller , 2015 - 2023 * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -37,6 +37,8 @@ * DAMAGE. */ +#include +#include #include #include #include @@ -46,6 +48,8 @@ #include "jitterentropy.h" +#define JENT_CONDITIONING_HASH "sha3-256-generic" + /*************************************************************************** * Helper function ***************************************************************************/ @@ -60,11 +64,6 @@ void jent_zfree(void *ptr) kfree_sensitive(ptr); } -void jent_memcpy(void *dest, const void *src, unsigned int n) -{ - memcpy(dest, src, n); -} - /* * Obtain a high-resolution time stamp value. The time stamp is used to measure * the execution time of a given code path and its variations. 
Hence, the time @@ -91,6 +90,91 @@ void jent_get_nstime(__u64 *out) *out = tmp; } +int jent_hash_time(void *hash_state, __u64 time, u8 *addtl, + unsigned int addtl_len, __u64 hash_loop_cnt, + unsigned int stuck) +{ + struct shash_desc *hash_state_desc = (struct shash_desc *)hash_state; + SHASH_DESC_ON_STACK(desc, hash_state_desc->tfm); + u8 intermediary[SHA3_256_DIGEST_SIZE]; + __u64 j = 0; + int ret; + + desc->tfm = hash_state_desc->tfm; + + if (sizeof(intermediary) != crypto_shash_digestsize(desc->tfm)) { + pr_warn_ratelimited("Unexpected digest size\n"); + return -EINVAL; + } + + /* + * This loop fills a buffer which is injected into the entropy pool. + * The main reason for this loop is to execute something over which we + * can perform a timing measurement. The injection of the resulting + * data into the pool is performed to ensure the result is used and + * the compiler cannot optimize the loop away in case the result is not + * used at all. Yet that data is considered "additional information" + * considering the terminology from SP800-90A without any entropy. + * + * Note, it does not matter which or how much data you inject, we are + * interested in one Keccack1600 compression operation performed with + * the crypto_shash_final. + */ + for (j = 0; j < hash_loop_cnt; j++) { + ret = crypto_shash_init(desc) ?: + crypto_shash_update(desc, intermediary, + sizeof(intermediary)) ?: + crypto_shash_finup(desc, addtl, addtl_len, intermediary); + if (ret) + goto err; + } + + /* + * Inject the data from the previous loop into the pool. This data is + * not considered to contain any entropy, but it stirs the pool a bit. + */ + ret = crypto_shash_update(desc, intermediary, sizeof(intermediary)); + if (ret) + goto err; + + /* + * Insert the time stamp into the hash context representing the pool. + * + * If the time stamp is stuck, do not finally insert the value into the + * entropy pool. Although this operation should not do any harm even + * when the time stamp has no entropy, SP800-90B requires that any + * conditioning operation to have an identical amount of input data + * according to section 3.1.5. 
+ */ + if (!stuck) { + ret = crypto_shash_update(hash_state_desc, (u8 *)&time, + sizeof(__u64)); + } + +err: + shash_desc_zero(desc); + memzero_explicit(intermediary, sizeof(intermediary)); + + return ret; +} + +int jent_read_random_block(void *hash_state, char *dst, unsigned int dst_len) +{ + struct shash_desc *hash_state_desc = (struct shash_desc *)hash_state; + u8 jent_block[SHA3_256_DIGEST_SIZE]; + /* Obtain data from entropy pool and re-initialize it */ + int ret = crypto_shash_final(hash_state_desc, jent_block) ?: + crypto_shash_init(hash_state_desc) ?: + crypto_shash_update(hash_state_desc, jent_block, + sizeof(jent_block)); + + if (!ret && dst_len) + memcpy(dst, jent_block, dst_len); + + memzero_explicit(jent_block, sizeof(jent_block)); + return ret; +} + /*************************************************************************** * Kernel crypto API interface ***************************************************************************/ @@ -98,32 +182,82 @@ void jent_get_nstime(__u64 *out) struct jitterentropy { spinlock_t jent_lock; struct rand_data *entropy_collector; + struct crypto_shash *tfm; + struct shash_desc *sdesc; }; -static int jent_kcapi_init(struct crypto_tfm *tfm) +static void jent_kcapi_cleanup(struct crypto_tfm *tfm) { struct jitterentropy *rng = crypto_tfm_ctx(tfm); - int ret = 0; - rng->entropy_collector = jent_entropy_collector_alloc(1, 0); - if (!rng->entropy_collector) - ret = -ENOMEM; + spin_lock(&rng->jent_lock); - spin_lock_init(&rng->jent_lock); - return ret; -} + if (rng->sdesc) { + shash_desc_zero(rng->sdesc); + kfree(rng->sdesc); + } + rng->sdesc = NULL; -static void jent_kcapi_cleanup(struct crypto_tfm *tfm) -{ - struct jitterentropy *rng = crypto_tfm_ctx(tfm); + if (rng->tfm) + crypto_free_shash(rng->tfm); + rng->tfm = NULL; - spin_lock(&rng->jent_lock); if (rng->entropy_collector) jent_entropy_collector_free(rng->entropy_collector); rng->entropy_collector = NULL; spin_unlock(&rng->jent_lock); } +static int jent_kcapi_init(struct crypto_tfm *tfm) +{ + struct jitterentropy *rng = crypto_tfm_ctx(tfm); + struct crypto_shash *hash; + struct shash_desc *sdesc; + int size, ret = 0; + + spin_lock_init(&rng->jent_lock); + + /* + * Use SHA3-256 as conditioner. We allocate only the generic + * implementation as we are not interested in high-performance. The + * execution time of the SHA3 operation is measured and adds to the + * Jitter RNG's unpredictable behavior. If we have a slower hash + * implementation, the execution timing variations are larger. When + * using a fast implementation, we would need to call it more often + * as its variations are lower. 
+ */ + hash = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0); + if (IS_ERR(hash)) { + pr_err("Cannot allocate conditioning digest\n"); + return PTR_ERR(hash); + } + rng->tfm = hash; + + size = sizeof(struct shash_desc) + crypto_shash_descsize(hash); + sdesc = kmalloc(size, GFP_KERNEL); + if (!sdesc) { + ret = -ENOMEM; + goto err; + } + + sdesc->tfm = hash; + crypto_shash_init(sdesc); + rng->sdesc = sdesc; + + rng->entropy_collector = jent_entropy_collector_alloc(1, 0, sdesc); + if (!rng->entropy_collector) { + ret = -ENOMEM; + goto err; + } + + spin_lock_init(&rng->jent_lock); + return 0; + +err: + jent_kcapi_cleanup(tfm); + return ret; +} + static int jent_kcapi_random(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *rdata, unsigned int dlen) @@ -180,15 +314,24 @@ static struct rng_alg jent_alg = { .cra_module = THIS_MODULE, .cra_init = jent_kcapi_init, .cra_exit = jent_kcapi_cleanup, - } }; static int __init jent_mod_init(void) { + SHASH_DESC_ON_STACK(desc, tfm); + struct crypto_shash *tfm; int ret = 0; - ret = jent_entropy_init(); + tfm = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + desc->tfm = tfm; + crypto_shash_init(desc); + ret = jent_entropy_init(desc); + shash_desc_zero(desc); + crypto_free_shash(tfm); if (ret) { /* Handle permanent health test error */ if (fips_enabled) diff --git a/crypto/jitterentropy.c b/crypto/jitterentropy.c index 227cedfa4f0ae..5b224d3d7442e 100644 --- a/crypto/jitterentropy.c +++ b/crypto/jitterentropy.c @@ -2,7 +2,7 @@ * Non-physical true random number generator based on timing jitter -- * Jitter RNG standalone code. * - * Copyright Stephan Mueller , 2015 - 2020 + * Copyright Stephan Mueller , 2015 - 2023 * * Design * ====== @@ -57,21 +57,22 @@ typedef unsigned long long __u64; typedef long long __s64; typedef unsigned int __u32; +typedef unsigned char u8; #define NULL ((void *) 0) /* The entropy pool */ struct rand_data { + /* SHA3-256 is used as conditioner */ +#define DATA_SIZE_BITS 256 /* all data values that are vital to maintain the security * of the RNG are marked as SENSITIVE. A user must not * access that information while the RNG executes its loops to * calculate the next random value. */ - __u64 data; /* SENSITIVE Actual random number */ - __u64 old_data; /* SENSITIVE Previous random number */ - __u64 prev_time; /* SENSITIVE Previous time stamp */ -#define DATA_SIZE_BITS ((sizeof(__u64)) * 8) - __u64 last_delta; /* SENSITIVE stuck test */ - __s64 last_delta2; /* SENSITIVE stuck test */ - unsigned int osr; /* Oversample rate */ + void *hash_state; /* SENSITIVE hash state entropy pool */ + __u64 prev_time; /* SENSITIVE Previous time stamp */ + __u64 last_delta; /* SENSITIVE stuck test */ + __s64 last_delta2; /* SENSITIVE stuck test */ + unsigned int osr; /* Oversample rate */ #define JENT_MEMORY_BLOCKS 64 #define JENT_MEMORY_BLOCKSIZE 32 #define JENT_MEMORY_ACCESSLOOPS 128 @@ -301,15 +302,13 @@ static int jent_permanent_health_failure(struct rand_data *ec) * an entropy collection. 
* * Input: - * @ec entropy collector struct -- may be NULL * @bits is the number of low bits of the timer to consider * @min is the number of bits we shift the timer value to the right at * the end to make sure we have a guaranteed minimum value * * @return Newly calculated loop counter */ -static __u64 jent_loop_shuffle(struct rand_data *ec, - unsigned int bits, unsigned int min) +static __u64 jent_loop_shuffle(unsigned int bits, unsigned int min) { __u64 time = 0; __u64 shuffle = 0; @@ -317,12 +316,7 @@ static __u64 jent_loop_shuffle(struct rand_data *ec, unsigned int mask = (1<data; + /* * We fold the time value as much as possible to ensure that as many * bits of the time stamp are included as possible. @@ -344,81 +338,32 @@ static __u64 jent_loop_shuffle(struct rand_data *ec, * execution time jitter * * This function injects the individual bits of the time value into the - * entropy pool using an LFSR. + * entropy pool using a hash. * - * The code is deliberately inefficient with respect to the bit shifting - * and shall stay that way. This function is the root cause why the code - * shall be compiled without optimization. This function not only acts as - * folding operation, but this function's execution is used to measure - * the CPU execution time jitter. Any change to the loop in this function - * implies that careful retesting must be done. - * - * @ec [in] entropy collector struct - * @time [in] time stamp to be injected - * @loop_cnt [in] if a value not equal to 0 is set, use the given value as - * number of loops to perform the folding - * @stuck [in] Is the time stamp identified as stuck? + * ec [in] entropy collector + * time [in] time stamp to be injected + * stuck [in] Is the time stamp identified as stuck? * * Output: - * updated ec->data - * - * @return Number of loops the folding operation is performed + * updated hash context in the entropy collector or error code */ -static void jent_lfsr_time(struct rand_data *ec, __u64 time, __u64 loop_cnt, - int stuck) +static int jent_condition_data(struct rand_data *ec, __u64 time, int stuck) { - unsigned int i; - __u64 j = 0; - __u64 new = 0; -#define MAX_FOLD_LOOP_BIT 4 -#define MIN_FOLD_LOOP_BIT 0 - __u64 fold_loop_cnt = - jent_loop_shuffle(ec, MAX_FOLD_LOOP_BIT, MIN_FOLD_LOOP_BIT); - - /* - * testing purposes -- allow test app to set the counter, not - * needed during runtime - */ - if (loop_cnt) - fold_loop_cnt = loop_cnt; - for (j = 0; j < fold_loop_cnt; j++) { - new = ec->data; - for (i = 1; (DATA_SIZE_BITS) >= i; i++) { - __u64 tmp = time << (DATA_SIZE_BITS - i); - - tmp = tmp >> (DATA_SIZE_BITS - 1); - - /* - * Fibonacci LSFR with polynomial of - * x^64 + x^61 + x^56 + x^31 + x^28 + x^23 + 1 which is - * primitive according to - * http://poincare.matf.bg.ac.rs/~ezivkovm/publications/primpol1.pdf - * (the shift values are the polynomial values minus one - * due to counting bits from 0 to 63). As the current - * position is always the LSB, the polynomial only needs - * to shift data in from the left without wrap. - */ - tmp ^= ((new >> 63) & 1); - tmp ^= ((new >> 60) & 1); - tmp ^= ((new >> 55) & 1); - tmp ^= ((new >> 30) & 1); - tmp ^= ((new >> 27) & 1); - tmp ^= ((new >> 22) & 1); - new <<= 1; - new ^= tmp; - } - } - - /* - * If the time stamp is stuck, do not finally insert the value into - * the entropy pool. 
Although this operation should not do any harm - * even when the time stamp has no entropy, SP800-90B requires that - * any conditioning operation (SP800-90B considers the LFSR to be a - * conditioning operation) to have an identical amount of input - * data according to section 3.1.5. - */ - if (!stuck) - ec->data = new; +#define SHA3_HASH_LOOP (1<<3) + struct { + int rct_count; + unsigned int apt_observations; + unsigned int apt_count; + unsigned int apt_base; + } addtl = { + ec->rct_count, + ec->apt_observations, + ec->apt_count, + ec->apt_base + }; + + return jent_hash_time(ec->hash_state, time, (u8 *)&addtl, sizeof(addtl), + SHA3_HASH_LOOP, stuck); } /* @@ -452,7 +397,7 @@ static void jent_memaccess(struct rand_data *ec, __u64 loop_cnt) #define MAX_ACC_LOOP_BIT 7 #define MIN_ACC_LOOP_BIT 0 __u64 acc_loop_cnt = - jent_loop_shuffle(ec, MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT); + jent_loop_shuffle(MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT); if (NULL == ec || NULL == ec->mem) return; @@ -520,14 +465,15 @@ static int jent_measure_jitter(struct rand_data *ec) stuck = jent_stuck(ec, current_delta); /* Now call the next noise sources which also injects the data */ - jent_lfsr_time(ec, current_delta, 0, stuck); + if (jent_condition_data(ec, current_delta, stuck)) + stuck = 1; return stuck; } /* * Generator of one 64 bit random number - * Function fills rand_data->data + * Function fills rand_data->hash_state * * @ec [in] Reference to entropy collector */ @@ -574,7 +520,7 @@ static void jent_gen_entropy(struct rand_data *ec) * @return 0 when request is fulfilled or an error * * The following error codes can occur: - * -1 entropy_collector is NULL + * -1 entropy_collector is NULL or the generation failed * -2 Intermittent health failure * -3 Permanent health failure */ @@ -604,7 +550,7 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, * Perform startup health tests and return permanent * error if it fails. */ - if (jent_entropy_init()) + if (jent_entropy_init(ec->hash_state)) return -3; return -2; @@ -614,7 +560,8 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, tocopy = (DATA_SIZE_BITS / 8); else tocopy = len; - jent_memcpy(p, &ec->data, tocopy); + if (jent_read_random_block(ec->hash_state, p, tocopy)) + return -1; len -= tocopy; p += tocopy; @@ -628,7 +575,8 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, ***************************************************************************/ struct rand_data *jent_entropy_collector_alloc(unsigned int osr, - unsigned int flags) + unsigned int flags, + void *hash_state) { struct rand_data *entropy_collector; @@ -655,6 +603,8 @@ struct rand_data *jent_entropy_collector_alloc(unsigned int osr, osr = 1; /* minimum sampling rate is 1 */ entropy_collector->osr = osr; + entropy_collector->hash_state = hash_state; + /* fill the data pad with non-zero values */ jent_gen_entropy(entropy_collector); @@ -668,7 +618,7 @@ void jent_entropy_collector_free(struct rand_data *entropy_collector) jent_zfree(entropy_collector); } -int jent_entropy_init(void) +int jent_entropy_init(void *hash_state) { int i; __u64 delta_sum = 0; @@ -681,6 +631,7 @@ int jent_entropy_init(void) /* Required for RCT */ ec.osr = 1; + ec.hash_state = hash_state; /* We could perform statistical tests here, but the problem is * that we only have a few loop counts to do testing. 
These @@ -718,7 +669,7 @@ int jent_entropy_init(void) /* Invoke core entropy collection logic */ jent_get_nstime(&time); ec.prev_time = time; - jent_lfsr_time(&ec, time, 0, 0); + jent_condition_data(&ec, time, 0); jent_get_nstime(&time2); /* test whether timer works */ diff --git a/crypto/jitterentropy.h b/crypto/jitterentropy.h index 5cc583f6bc6b8..b3890ff26a023 100644 --- a/crypto/jitterentropy.h +++ b/crypto/jitterentropy.h @@ -2,14 +2,18 @@ extern void *jent_zalloc(unsigned int len); extern void jent_zfree(void *ptr); -extern void jent_memcpy(void *dest, const void *src, unsigned int n); extern void jent_get_nstime(__u64 *out); +extern int jent_hash_time(void *hash_state, __u64 time, u8 *addtl, + unsigned int addtl_len, __u64 hash_loop_cnt, + unsigned int stuck); +int jent_read_random_block(void *hash_state, char *dst, unsigned int dst_len); struct rand_data; -extern int jent_entropy_init(void); +extern int jent_entropy_init(void *hash_state); extern int jent_read_entropy(struct rand_data *ec, unsigned char *data, unsigned int len); extern struct rand_data *jent_entropy_collector_alloc(unsigned int osr, - unsigned int flags); + unsigned int flags, + void *hash_state); extern void jent_entropy_collector_free(struct rand_data *entropy_collector); From 0eb3b02ae62fb8e47c30295ef4e8340d51400738 Mon Sep 17 00:00:00 2001 From: Jeremy Allison Date: Wed, 4 Sep 2024 10:24:07 -0700 Subject: [PATCH 12/23] crypto: aead,cipher - zeroize key buffer after use I.G 9.7.B for FIPS 140-3 specifies that variables temporarily holding cryptographic information should be zeroized once they are no longer needed. Accomplish this by using kfree_sensitive for buffers that previously held the private key. Signed-off-by: Hailey Mothershead Signed-off-by: Herbert Xu Back-ported from commit 23e4099bdc3c8381992f9eb975c79196d6755210 Author: Hailey Mothershead Date: Mon Apr 15 22:19:15 2024 +0000 Signed-off-by: Jeremy Allison Signed-off-by: Jonathan Maple --- crypto/aead.c | 3 +-- crypto/cipher.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/crypto/aead.c b/crypto/aead.c index 16991095270d2..c4ece86c45bc4 100644 --- a/crypto/aead.c +++ b/crypto/aead.c @@ -35,8 +35,7 @@ static int setkey_unaligned(struct crypto_aead *tfm, const u8 *key, alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = crypto_aead_alg(tfm)->setkey(tfm, alignbuffer, keylen); - memset(alignbuffer, 0, keylen); - kfree(buffer); + kfree_sensitive(buffer); return ret; } diff --git a/crypto/cipher.c b/crypto/cipher.c index b47141ed4a9f3..395f0c2fbb9ff 100644 --- a/crypto/cipher.c +++ b/crypto/cipher.c @@ -34,8 +34,7 @@ static int setkey_unaligned(struct crypto_cipher *tfm, const u8 *key, alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = cia->cia_setkey(crypto_cipher_tfm(tfm), alignbuffer, keylen); - memset(alignbuffer, 0, keylen); - kfree(buffer); + kfree_sensitive(buffer); return ret; } From 02fe271a7c44fcbece24158858c883ebd700a727 Mon Sep 17 00:00:00 2001 From: Joachim Vandersmissen Date: Thu, 28 Mar 2024 11:24:30 -0500 Subject: [PATCH 13/23] crypto: ecdh - explicitly zeroize private_key private_key is overwritten with the key parameter passed in by the caller (if present), or alternatively a newly generated private key. However, it is possible that the caller provides a key (or the newly generated key) which is shorter than the previous key. In that scenario, some key material from the previous key would not be overwritten. 
The easiest solution is to explicitly zeroize the entire private_key array first. Note that this patch slightly changes the behavior of this function: previously, if the ecc_gen_privkey failed, the old private_key would remain. Now, the private_key is always zeroized. This behavior is consistent with the case where params.key is set and ecc_is_key_valid fails. Signed-off-by: Joachim Vandersmissen Signed-off-by: Herbert Xu Signed-off-by: Jonathan Maple --- crypto/ecdh.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crypto/ecdh.c b/crypto/ecdh.c index af702cfefd22f..85c64f1a40df2 100644 --- a/crypto/ecdh.c +++ b/crypto/ecdh.c @@ -34,6 +34,8 @@ static int ecdh_set_secret(struct crypto_kpp *tfm, const void *buf, params.key_size > sizeof(u64) * ctx->ndigits) return -EINVAL; + memset(ctx->private_key, 0, sizeof(ctx->private_key)); + if (!params.key || !params.key_size) return ecc_gen_privkey(ctx->curve_id, ctx->ndigits, ctx->private_key); From 852f3d35dd31be736d7ae23e0040c7284f08fff9 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 14 Dec 2023 11:08:34 +0800 Subject: [PATCH 14/23] crypto: lib/mpi - Fix unexpected pointer access in mpi_ec_init [ Upstream commit ba3c5574203034781ac4231acf117da917efcd2a ] When the mpi_ec_ctx structure is initialized, some fields are not cleared, causing a crash when referencing the field when the structure was released. Initially, this issue was ignored because memory for mpi_ec_ctx is allocated with the __GFP_ZERO flag. For example, this error will be triggered when calculating the Za value for SM2 separately. Fixes: d58bb7e55a8a ("lib/mpi: Introduce ec implementation to MPI library") Cc: stable@vger.kernel.org # v6.5 Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin Signed-off-by: Jonathan Maple --- lib/mpi/ec.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/mpi/ec.c b/lib/mpi/ec.c index 40f5908e57a4f..e16dca1e23d52 100644 --- a/lib/mpi/ec.c +++ b/lib/mpi/ec.c @@ -584,6 +584,9 @@ void mpi_ec_init(struct mpi_ec_ctx *ctx, enum gcry_mpi_ec_models model, ctx->a = mpi_copy(a); ctx->b = mpi_copy(b); + ctx->d = NULL; + ctx->t.two_inv_p = NULL; + ctx->t.p_barrett = use_barrett > 0 ? mpi_barrett_init(ctx->p, 0) : NULL; mpi_ec_get_reset(ctx); From 2a3f6189a5390d06ddf432f1d786567b35dc8e7f Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Tue, 22 Jul 2025 15:47:52 -0700 Subject: [PATCH 15/23] crypto: Kconfig - Make CRYPTO_FIPS depend on the DRBG being built-in When FIPS mode is enabled (via fips=1), there is an absolute need for the DRBG to be available. This is at odds with the fact that the DRBG can be built as a module when in FIPS mode, leaving critical RNG functionality at the whims of userspace. Userspace could simply rmmod the DRBG module, or not provide it at all and thus a different stdrng algorithm could be used without anyone noticing. Additionally, when running a FIPS-enabled userspace, modprobe itself may perform a getrandom() syscall _before_ loading a given module. As a result, there's a possible deadlock scenario where the RNG core (crypto/rng.c) initializes _before_ the DRBG, thereby installing its getrandom() override without having an stdrng algorithm available. Then, when userspace calls getrandom() which redirects to the override in crypto/rng.c, crypto_alloc_rng("stdrng") invokes the UMH (modprobe) to load the DRBG (which is aliased to stdrng). And *then* that modprobe invocation gets stuck at getrandom() because there's no stdrng algorithm available! 
There are too many risks that come with allowing the DRBG and RNG core to be modular for FIPS mode. Therefore, make CRYPTO_FIPS require the DRBG to be built-in, which in turn makes the DRBG require the RNG core to be built-in. That way, it's guaranteed for these drivers to be built-in when running in FIPS mode. Also clean up the CRYPTO_FIPS option name and remove the CRYPTO_ANSI_CPRNG dependency since it's obsolete for FIPS now. Signed-off-by: Sultan Alsawaf Signed-off-by: Jonathan Maple --- crypto/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index 17c113cd5fe52..c5ef8cac60efa 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -23,12 +23,12 @@ if CRYPTO comment "Crypto core or helper" config CRYPTO_FIPS - bool "FIPS 200 compliance" - depends on (CRYPTO_ANSI_CPRNG || CRYPTO_DRBG) && !CRYPTO_MANAGER_DISABLE_TESTS + bool "FIPS compliance" + depends on CRYPTO_DRBG=y && !CRYPTO_MANAGER_DISABLE_TESTS depends on (MODULE_SIG || !MODULES) help This option enables the fips boot option which is - required if you want the system to operate in a FIPS 200 + required if you want the system to operate in a FIPS certification. You should say no unless you know what this is. From 1b32de3ec7d69177cb9f67604e748390998bf333 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Fri, 1 Aug 2025 15:19:15 -0700 Subject: [PATCH 16/23] random: Restrict extrng registration to init time It is technically a risk to permit extrng registration by modules after kernel init completes. Since there is only one user of the extrng interface and it is imperative that it is the _only_ registered extrng for FIPS compliance, restrict the extrng registration interface to only permit registration during kernel init and only from built-in drivers. This also eliminates the risks associated with the extrng interface itself being designed to solely accommodate a single registration, which would therefore permit the registered extrng to be overridden or even removed by an unrelated module. Signed-off-by: Sultan Alsawaf Signed-off-by: Jonathan Maple --- crypto/rng.c | 9 +----- drivers/char/random.c | 70 ++++++++---------------------------------- include/linux/random.h | 16 +++++----- 3 files changed, 20 insertions(+), 75 deletions(-) diff --git a/crypto/rng.c b/crypto/rng.c index c650678106a7f..a076f0878eb37 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -322,8 +322,7 @@ static ssize_t crypto_devrandom_read_iter(struct iov_iter *iter, bool reseed) } static const struct random_extrng crypto_devrandom_rng = { - .extrng_read_iter = crypto_devrandom_read_iter, - .owner = THIS_MODULE, + .extrng_read_iter = crypto_devrandom_read_iter }; static int __init crypto_rng_init(void) @@ -333,13 +332,7 @@ static int __init crypto_rng_init(void) return 0; } -static void __exit crypto_rng_exit(void) -{ - random_unregister_extrng(); -} - late_initcall(crypto_rng_init); -module_exit(crypto_rng_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Random Number Generator"); diff --git a/drivers/char/random.c b/drivers/char/random.c index 317a0b15dc34c..5fe3118a3c278 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -51,7 +51,6 @@ #include #include #include -#include #include #include #include @@ -314,7 +313,7 @@ static void crng_fast_key_erasure(u8 key[CHACHA_KEY_SIZE], /* * Hook for external RNG. 
*/ -static const struct random_extrng __rcu *extrng; +static const struct random_extrng *extrng __ro_after_init; /* * This function returns a ChaCha state that you may use for generating @@ -966,18 +965,12 @@ void __init add_bootloader_randomness(const void *buf, size_t len) credit_init_bits(len * 8); } -void random_register_extrng(const struct random_extrng *rng) +void __init random_register_extrng(const struct random_extrng *rng) { - rcu_assign_pointer(extrng, rng); + /* Don't allow the registered extrng to be overridden */ + BUG_ON(extrng); + extrng = rng; } -EXPORT_SYMBOL_GPL(random_register_extrng); - -void random_unregister_extrng(void) -{ - RCU_INIT_POINTER(extrng, NULL); - synchronize_rcu(); -} -EXPORT_SYMBOL_GPL(random_unregister_extrng); #if IS_ENABLED(CONFIG_VMGENID) static BLOCKING_NOTIFIER_HEAD(vmfork_chain); @@ -1386,7 +1379,6 @@ static void __cold try_to_generate_entropy(void) SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags) { - const struct random_extrng *rng; struct iov_iter iter; struct iovec iov; int ret; @@ -1404,19 +1396,11 @@ SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags if (len > INT_MAX) len = INT_MAX; - rcu_read_lock(); - rng = rcu_dereference(extrng); - if (rng && !try_module_get(rng->owner)) - rng = NULL; - rcu_read_unlock(); - - if (rng) { + if (extrng) { ret = import_single_range(READ, ubuf, len, &iov, &iter); if (unlikely(ret)) return ret; - ret = rng->extrng_read_iter(&iter, !!(flags & GRND_RANDOM)); - module_put(rng->owner); - return ret; + return extrng->extrng_read_iter(&iter, !!(flags & GRND_RANDOM)); } if (!crng_ready() && !(flags & GRND_INSECURE)) { @@ -1589,52 +1573,24 @@ static int random_fasync(int fd, struct file *filp, int on) static int random_open(struct inode *inode, struct file *filp) { - const struct random_extrng *rng; - - rcu_read_lock(); - rng = rcu_dereference(extrng); - if (rng && !try_module_get(rng->owner)) - rng = NULL; - rcu_read_unlock(); - - if (!rng) - return 0; - - filp->f_op = &extrng_random_fops; - filp->private_data = rng->owner; + if (extrng) + filp->f_op = &extrng_random_fops; return 0; } static int urandom_open(struct inode *inode, struct file *filp) { - const struct random_extrng *rng; + if (extrng) + filp->f_op = &extrng_urandom_fops; - rcu_read_lock(); - rng = rcu_dereference(extrng); - if (rng && !try_module_get(rng->owner)) - rng = NULL; - rcu_read_unlock(); - - if (!rng) - return 0; - - filp->f_op = &extrng_urandom_fops; - filp->private_data = rng->owner; - - return 0; -} - -static int extrng_release(struct inode *inode, struct file *filp) -{ - module_put(filp->private_data); return 0; } static ssize_t extrng_read_iter(struct kiocb *kiocb, struct iov_iter *iter) { - return rcu_dereference_raw(extrng)->extrng_read_iter(iter, false); + return extrng->extrng_read_iter(iter, false); } const struct file_operations random_fops = { @@ -1670,7 +1626,6 @@ static const struct file_operations extrng_random_fops = { .unlocked_ioctl = random_ioctl, .fasync = random_fasync, .llseek = noop_llseek, - .release = extrng_release, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, }; @@ -1682,7 +1637,6 @@ static const struct file_operations extrng_urandom_fops = { .unlocked_ioctl = random_ioctl, .fasync = random_fasync, .llseek = noop_llseek, - .release = extrng_release, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, }; diff --git a/include/linux/random.h b/include/linux/random.h index 
d4cabe51e9434..9bde794ec8d93 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -9,12 +9,6 @@ #include -struct iov_iter; -struct random_extrng { - ssize_t (*extrng_read_iter)(struct iov_iter *, bool reseed); - struct module *owner; -}; - struct notifier_block; void add_device_randomness(const void *buf, size_t len); @@ -42,9 +36,6 @@ static inline int register_random_vmfork_notifier(struct notifier_block *nb) { r static inline int unregister_random_vmfork_notifier(struct notifier_block *nb) { return 0; } #endif -void random_register_extrng(const struct random_extrng *rng); -void random_unregister_extrng(void); - void get_random_bytes(void *buf, size_t len); u8 get_random_u8(void); u16 get_random_u16(void); @@ -173,6 +164,13 @@ int random_online_cpu(unsigned int cpu); #ifndef MODULE extern const struct file_operations random_fops, urandom_fops; + +struct iov_iter; +struct random_extrng { + ssize_t (*extrng_read_iter)(struct iov_iter *iter, bool reseed); +}; + +void __init random_register_extrng(const struct random_extrng *rng); #endif #endif /* _LINUX_RANDOM_H */ From 35f544c1c633a0d4612e6f99ba809a6f34b5095e Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Tue, 24 Jun 2025 15:16:34 -0700 Subject: [PATCH 17/23] crypto: rng - Convert crypto_default_rng_refcnt into an unsigned int There is no reason this refcount should be a signed int. Convert it to an unsigned int, thereby also making it less likely to ever overflow. Signed-off-by: Sultan Alsawaf Signed-off-by: Jonathan Maple --- crypto/rng.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crypto/rng.c b/crypto/rng.c index a076f0878eb37..108404df25be5 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -31,7 +31,7 @@ static struct crypto_rng *crypto_reseed_rng; static ____cacheline_aligned_in_smp DEFINE_MUTEX(crypto_default_rng_lock); struct crypto_rng *crypto_default_rng; EXPORT_SYMBOL_GPL(crypto_default_rng); -static int crypto_default_rng_refcnt; +static unsigned int crypto_default_rng_refcnt; int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen) { @@ -164,7 +164,7 @@ void crypto_put_default_rng(void) EXPORT_SYMBOL_GPL(crypto_put_default_rng); #if defined(CONFIG_CRYPTO_RNG) || defined(CONFIG_CRYPTO_RNG_MODULE) -static int crypto_del_rng(struct crypto_rng **rngp, int *refcntp, +static int crypto_del_rng(struct crypto_rng **rngp, unsigned int *refcntp, struct mutex *lock) { int err = -EBUSY; From fd75e5d3bd5ab660893439a795fe956e51309a9c Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Fri, 25 Jul 2025 14:32:37 -0700 Subject: [PATCH 18/23] crypto: rng - Only allow the DRBG to register as "stdrng" in FIPS mode In FIPS mode, the DRBG must take precedence over all stdrng algorithms. The only problem standing in the way of this is that a different stdrng algorithm could get registered and utilized before the DRBG is registered, and since crypto_alloc_rng() only allocates an stdrng algorithm when there's no existing allocation, this means that it's possible for the wrong stdrng algorithm to remain in use indefinitely. This issue is also often impossible to observe from userspace; an RNG other than the DRBG could be used somewhere in the kernel and userspace would be none the wiser. To ensure this can never happen, only allow stdrng instances from the DRBG to be registered when running in FIPS mode. This works since the previous commit forces the DRBG to be built into the kernel when CONFIG_CRYPTO_FIPS is enabled, so the DRBG's presence is guaranteed when fips_enabled is true. 
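The gate described above reduces to a simple predicate on the algorithm names; the following is only a user-space sketch (hypothetical helper name) mirroring the "drbg_" driver-name prefix check, not the kernel code:

#include <assert.h>
#include <stdbool.h>
#include <string.h>

/* Hypothetical stand-in for the FIPS-mode "stdrng" registration gate. */
static bool stdrng_registration_allowed(const char *cra_name,
                                        const char *cra_driver_name,
                                        bool fips, bool drbg_registered)
{
        if (!fips || strcmp(cra_name, "stdrng"))
                return true;    /* the gate only applies to stdrng in FIPS mode */
        if (drbg_registered)
                return false;   /* the DRBG already owns "stdrng" */
        return !strncmp(cra_driver_name, "drbg_", 5);
}

int main(void)
{
        /* A DRBG instance may register first; anything else is rejected. */
        assert(stdrng_registration_allowed("stdrng", "drbg_nopr_sha256", true, false));
        assert(!stdrng_registration_allowed("stdrng", "ansi_cprng", true, false));
        assert(!stdrng_registration_allowed("stdrng", "drbg_nopr_sha256", true, true));
        return 0;
}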
Signed-off-by: Sultan Alsawaf Signed-off-by: Jonathan Maple --- crypto/rng.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/crypto/rng.c b/crypto/rng.c index 108404df25be5..3bb27a8ec7241 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -32,6 +32,7 @@ static ____cacheline_aligned_in_smp DEFINE_MUTEX(crypto_default_rng_lock); struct crypto_rng *crypto_default_rng; EXPORT_SYMBOL_GPL(crypto_default_rng); static unsigned int crypto_default_rng_refcnt; +static bool drbg_registered __ro_after_init; int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen) { @@ -201,6 +202,19 @@ int crypto_register_rng(struct rng_alg *alg) if (alg->seedsize > PAGE_SIZE / 8) return -EINVAL; + /* + * In FIPS mode, the DRBG must take precedence over all other "stdrng" + * algorithms. Therefore, forbid registration of a non-DRBG stdrng in + * FIPS mode. All of the DRBG's driver names are prefixed with "drbg_". + * This also stops new stdrng instances from getting registered after it + * is known that the DRBG is registered, so a new module can't come in + * and pretend to be the DRBG. And when CONFIG_CRYPTO_FIPS is enabled, + * the DRBG is built into the kernel directly; it can't be a module. + */ + if (fips_enabled && !strcmp(base->cra_name, "stdrng") && + (drbg_registered || strncmp(base->cra_driver_name, "drbg_", 5))) + return -EINVAL; + base->cra_type = &crypto_rng_type; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; base->cra_flags |= CRYPTO_ALG_TYPE_RNG; @@ -225,6 +239,18 @@ int crypto_register_rngs(struct rng_alg *algs, int count) goto err; } + /* + * Track when the DRBG is registered in FIPS mode. The DRBG calls + * crypto_register_rngs() to register its stdrng instances, and since + * crypto_register_rng() only allows stdrng instances from the DRBG in + * FIPS mode, a successful stdrng registration means it was the DRBG. + * Just check the first alg in the array to see if it's called "stdrng", + * since all of the DRBG's algorithms are named "stdrng". Once + * drbg_registered is set to true, this if-statement is always false. + */ + if (fips_enabled && !strcmp(algs->base.cra_name, "stdrng")) + drbg_registered = true; + return 0; err: From 2127d419261f9007b4e9086821b3f95e84d54839 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 11 Jun 2025 14:16:35 -0700 Subject: [PATCH 19/23] crypto: drbg - Align buffers to at least a cache line None of the ciphers used by the DRBG have an alignment requirement; thus, they all return 0 from .crypto_init, resulting in inconsistent alignment across all buffers. Align all buffers to at least a cache line to improve performance. This is especially useful when multiple DRBG instances are used, since it prevents false sharing of cache lines between the different instances. Signed-off-by: Sultan Alsawaf Signed-off-by: Jonathan Maple --- crypto/drbg.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/crypto/drbg.c b/crypto/drbg.c index accf425de57f7..d14cc09b5d399 100644 --- a/crypto/drbg.c +++ b/crypto/drbg.c @@ -1283,6 +1283,12 @@ static inline int drbg_alloc_state(struct drbg_state *drbg) if (ret < 0) goto err; + /* + * Align to at least a cache line for better performance. This also + * prevents false sharing of cache lines between different instances. 
+ */ + ret = max(ret, L1_CACHE_BYTES - 1); + drbg->Vbuf = kmalloc(drbg_statelen(drbg) + ret, GFP_KERNEL); if (!drbg->Vbuf) { ret = -ENOMEM; From 570e91c888481bd724b86ebe35244edb7f86251a Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Tue, 24 Jun 2025 15:31:00 -0700 Subject: [PATCH 20/23] crypto: rng - Fix priority inversions due to mutex locks Since crypto_devrandom_read_iter() is invoked directly by user tasks and is accessible by every task in the system, there are glaring priority inversions on crypto_reseed_rng_lock and crypto_default_rng_lock. Tasks of arbitrary scheduling priority access crypto_devrandom_read_iter(). When a low-priority task owns one of the mutex locks, higher-priority tasks waiting on that mutex lock are stalled until the low-priority task is done. Fix the priority inversions by converting the mutex locks into rt_mutex locks which have PI support. Signed-off-by: Sultan Alsawaf Signed-off-by: Jonathan Maple --- crypto/rng.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/crypto/rng.c b/crypto/rng.c index 3bb27a8ec7241..879a3f42a9d49 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -14,8 +14,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -26,9 +26,9 @@ #include "internal.h" -static ____cacheline_aligned_in_smp DEFINE_MUTEX(crypto_reseed_rng_lock); +static ____cacheline_aligned_in_smp DEFINE_RT_MUTEX(crypto_reseed_rng_lock); static struct crypto_rng *crypto_reseed_rng; -static ____cacheline_aligned_in_smp DEFINE_MUTEX(crypto_default_rng_lock); +static ____cacheline_aligned_in_smp DEFINE_RT_MUTEX(crypto_default_rng_lock); struct crypto_rng *crypto_default_rng; EXPORT_SYMBOL_GPL(crypto_default_rng); static unsigned int crypto_default_rng_refcnt; @@ -146,11 +146,11 @@ int crypto_get_default_rng(void) { int err; - mutex_lock(&crypto_default_rng_lock); + rt_mutex_lock(&crypto_default_rng_lock); err = crypto_get_rng(&crypto_default_rng); if (!err) crypto_default_rng_refcnt++; - mutex_unlock(&crypto_default_rng_lock); + rt_mutex_unlock(&crypto_default_rng_lock); return err; } @@ -158,19 +158,19 @@ EXPORT_SYMBOL_GPL(crypto_get_default_rng); void crypto_put_default_rng(void) { - mutex_lock(&crypto_default_rng_lock); + rt_mutex_lock(&crypto_default_rng_lock); crypto_default_rng_refcnt--; - mutex_unlock(&crypto_default_rng_lock); + rt_mutex_unlock(&crypto_default_rng_lock); } EXPORT_SYMBOL_GPL(crypto_put_default_rng); #if defined(CONFIG_CRYPTO_RNG) || defined(CONFIG_CRYPTO_RNG_MODULE) static int crypto_del_rng(struct crypto_rng **rngp, unsigned int *refcntp, - struct mutex *lock) + struct rt_mutex *lock) { int err = -EBUSY; - mutex_lock(lock); + rt_mutex_lock(lock); if (refcntp && *refcntp) goto out; @@ -180,7 +180,7 @@ static int crypto_del_rng(struct crypto_rng **rngp, unsigned int *refcntp, err = 0; out: - mutex_unlock(lock); + rt_mutex_unlock(lock); return err; } @@ -290,7 +290,7 @@ static ssize_t crypto_devrandom_read_iter(struct iov_iter *iter, bool reseed) * a separate mutex (drbg->drbg_mutex) around the * reseed-and-generate operation. 
*/ - mutex_lock(&crypto_reseed_rng_lock); + rt_mutex_lock(&crypto_reseed_rng_lock); /* If crypto_default_rng is not set, it will be seeded * at creation in __crypto_get_default_rng and thus no @@ -301,7 +301,7 @@ static ssize_t crypto_devrandom_read_iter(struct iov_iter *iter, bool reseed) ret = crypto_get_rng(&crypto_reseed_rng); if (ret) { - mutex_unlock(&crypto_reseed_rng_lock); + rt_mutex_unlock(&crypto_reseed_rng_lock); return ret; } @@ -340,7 +340,7 @@ static ssize_t crypto_devrandom_read_iter(struct iov_iter *iter, bool reseed) } if (reseed) - mutex_unlock(&crypto_reseed_rng_lock); + rt_mutex_unlock(&crypto_reseed_rng_lock); else crypto_put_default_rng(); memzero_explicit(tmp, sizeof(tmp)); From 7c551fb802f3829638218d778eae30d9418b41b4 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 18 Jun 2025 23:42:08 -0700 Subject: [PATCH 21/23] mm/gup: reintroduce pin_user_pages_fast_only() Like pin_user_pages_fast(), but with the internal-only FOLL_FAST_ONLY flag. This complements the get_user_pages*() API, which already has get_user_pages_fast_only(). Note that pin_user_pages_fast_only() used to exist but was removed in upstream commit edad1bb1fbf7 ("mm/gup: remove pin_user_pages_fast_only()") due to it not having any users. Signed-off-by: Sultan Alsawaf Signed-off-by: Jonathan Maple --- include/linux/mm.h | 2 ++ mm/gup.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index ab100f6bd25ad..3f1772eb24261 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2571,6 +2571,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); +int pin_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); void folio_add_pin(struct folio *folio); int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); diff --git a/mm/gup.c b/mm/gup.c index 16cdddef91585..40b8a69fc312f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3359,6 +3359,20 @@ int pin_user_pages_fast(unsigned long start, int nr_pages, } EXPORT_SYMBOL_GPL(pin_user_pages_fast); +/* + * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior is + * the same, except that this one sets FOLL_PIN instead of FOLL_GET. + */ +int pin_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) +{ + if (!is_valid_gup_args(pages, NULL, &gup_flags, + FOLL_PIN | FOLL_FAST_ONLY)) + return -EINVAL; + return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); +} +EXPORT_SYMBOL_GPL(pin_user_pages_fast_only); + /** * pin_user_pages_remote() - pin pages of a remote process * From 5023d245bd9cbe2e939b58670f94f613ee202b52 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Tue, 19 Aug 2025 11:30:03 -0700 Subject: [PATCH 22/23] crypto: rng - Implement fast per-CPU DRBG instances When the kernel is booted with fips=1, the RNG exposed to userspace is hijacked away from the CRNG and redirects to crypto_devrandom_read_iter(), which utilizes the DRBG. Notably, crypto_devrandom_read_iter() maintains just two global DRBG instances _for the entire system_, and the two instances serve separate request types: one instance for GRND_RANDOM requests (crypto_reseed_rng), and one instance for non-GRND_RANDOM requests (crypto_default_rng). 
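For orientation, the two request types correspond to the getrandom(2) GRND_RANDOM flag; a minimal user-space illustration (assuming a glibc that provides <sys/random.h> and a kernel booted with fips=1 so the override is active) is:

#include <stdio.h>
#include <sys/random.h>

int main(void)
{
        unsigned char buf[64];

        /* Non-GRND_RANDOM request: served by the crypto_default_rng instance. */
        if (getrandom(buf, sizeof(buf), 0) < 0)
                perror("getrandom");

        /* GRND_RANDOM request: served by crypto_reseed_rng, which reseeds first. */
        if (getrandom(buf, sizeof(buf), GRND_RANDOM) < 0)
                perror("getrandom");

        return 0;
}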
So in essence, for requests of a single type, there is just one global RNG for all CPUs in the entire system, which scales _very_ poorly. To make matters worse, the temporary buffer used to ferry data between the DRBG and userspace is woefully small at only 256 bytes, which doesn't do a good job of maximizing throughput from the DRBG. This results in lost performance when userspace requests >256 bytes; it is observed that DRBG throughput improves by 70% on an i9-13900H when the buffer size is increased to 4096 bytes (one page). Going beyond the size of one page up to the DRBG maximum request limit of 65536 bytes produces diminishing returns of only 3% improved throughput in comparison. And going below the size of one page produces progressively less throughput at each power of 2: there's a 5% loss going from 4096 bytes to 2048 bytes and a 9% loss going from 2048 bytes to 1024 bytes. Thus, this implements per-CPU DRBG instances utilizing a page-sized buffer for each CPU to utilize the DRBG itself more effectively. On top of that, for non-GRND_RANDOM requests, the DRBG's operations now occur under a local lock that disables preemption on non-PREEMPT_RT kernels, which not only keeps each CPU's DRBG instance isolated from another, but also improves temporal cache locality while the DRBG actively generates a new string of random bytes. Prefaulting one user destination page at a time is also employed to prevent a DRBG instance from getting blocked on page faults, thereby maximizing the use of the DRBG so that the only bottleneck is the DRBG itself. Signed-off-by: Sultan Alsawaf Signed-off-by: Jonathan Maple --- crypto/rng.c | 452 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 373 insertions(+), 79 deletions(-) diff --git a/crypto/rng.c b/crypto/rng.c index 879a3f42a9d49..011a510492580 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -6,6 +6,9 @@ * * Copyright (c) 2008 Neil Horman * Copyright (c) 2015 Herbert Xu + * + * Copyright (C) 2025 Ctrl IQ, Inc. + * Author: Sultan Alsawaf */ #include @@ -17,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -26,14 +28,40 @@ #include "internal.h" -static ____cacheline_aligned_in_smp DEFINE_RT_MUTEX(crypto_reseed_rng_lock); -static struct crypto_rng *crypto_reseed_rng; static ____cacheline_aligned_in_smp DEFINE_RT_MUTEX(crypto_default_rng_lock); struct crypto_rng *crypto_default_rng; EXPORT_SYMBOL_GPL(crypto_default_rng); static unsigned int crypto_default_rng_refcnt; static bool drbg_registered __ro_after_init; +/* + * Per-CPU RNG instances are only used by crypto_devrandom_rng. The global RNG, + * crypto_default_rng, is only used directly by other drivers. + * + * Per-CPU instances of the DRBG are efficient because the DRBG itself supports + * an arbitrary number of instances and can be seeded on a per-CPU basis. + * + * Specifically, the DRBG is seeded by the CRNG and the Jitter RNG. The CRNG is + * globally accessible and is already per-CPU. And while the Jitter RNG _isn't_ + * per-CPU, creating a DRBG instance also creates a Jitter RNG instance; + * therefore, per-CPU DRBG instances implies per-CPU Jitter RNG instances. 
+ */ +struct cpu_rng_inst { + local_lock_t lock; + struct rt_mutex mlock; + struct crypto_rng *rng; + void *page; +}; + +static DEFINE_PER_CPU_ALIGNED(struct cpu_rng_inst, pcpu_default_rng) = { + .lock = INIT_LOCAL_LOCK(pcpu_default_rng.lock), + .mlock = __RT_MUTEX_INITIALIZER(pcpu_default_rng.mlock) +}; +static DEFINE_PER_CPU_ALIGNED(struct cpu_rng_inst, pcpu_reseed_rng) = { + /* The reseed instances don't use the local lock */ + .mlock = __RT_MUTEX_INITIALIZER(pcpu_reseed_rng.mlock) +}; + int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen) { struct crypto_alg *alg = tfm->base.__crt_alg; @@ -165,32 +193,18 @@ void crypto_put_default_rng(void) EXPORT_SYMBOL_GPL(crypto_put_default_rng); #if defined(CONFIG_CRYPTO_RNG) || defined(CONFIG_CRYPTO_RNG_MODULE) -static int crypto_del_rng(struct crypto_rng **rngp, unsigned int *refcntp, - struct rt_mutex *lock) +int crypto_del_default_rng(void) { - int err = -EBUSY; + bool busy; - rt_mutex_lock(lock); - if (refcntp && *refcntp) - goto out; - - crypto_free_rng(*rngp); - *rngp = NULL; - - err = 0; - -out: - rt_mutex_unlock(lock); - - return err; -} + rt_mutex_lock(&crypto_default_rng_lock); + if (!(busy = crypto_default_rng_refcnt)) { + crypto_free_rng(crypto_default_rng); + crypto_default_rng = NULL; + } + rt_mutex_unlock(&crypto_default_rng_lock); -int crypto_del_default_rng(void) -{ - return crypto_del_rng(&crypto_default_rng, &crypto_default_rng_refcnt, - &crypto_default_rng_lock) ?: - crypto_del_rng(&crypto_reseed_rng, NULL, - &crypto_reseed_rng_lock); + return busy ? -EBUSY : 0; } EXPORT_SYMBOL_GPL(crypto_del_default_rng); #endif @@ -270,80 +284,338 @@ void crypto_unregister_rngs(struct rng_alg *algs, int count) } EXPORT_SYMBOL_GPL(crypto_unregister_rngs); -static ssize_t crypto_devrandom_read_iter(struct iov_iter *iter, bool reseed) +/* + * On non-PREEMPT_RT kernels, local locks disable preemption. When there's no + * rng allocated, one must be allocated by calling crypto_get_rng(), which can + * sleep. Therefore, crypto_get_rng() cannot be called under local_lock(), so if + * our CPU's RNG instance doesn't have an rng allocated, we drop the local lock + * and take a mutex lock instead. After the local lock is dropped, the current + * task can be freely migrated to another CPU, which means that calling + * local_lock() again might not result in the same instance getting locked as + * before. That's why this function exists: to loop on calling local_lock() and + * allocating an rng as needed with crypto_get_rng() until the current CPU's + * instance is found to have an rng allocated. If crypto_get_rng() ever fails, + * this function returns an error even if there are instances for other CPUs + * which _do_ have an rng allocated. + */ +static __always_inline struct cpu_rng_inst * +lock_default_rng(struct crypto_rng **rng) __acquires(&cri->lock) { - struct crypto_rng *rng; - u8 tmp[256]; - ssize_t ret; + struct cpu_rng_inst __percpu *pcri = &pcpu_default_rng; + struct cpu_rng_inst *cri; + int ret; + + while (1) { + local_lock(&pcri->lock); + cri = this_cpu_ptr(pcri); + /* + * cri->rng can only transition from NULL to non-NULL. This may + * occur on a different CPU, thus cri->rng must be read + * atomically to prevent data races; this elides mlock by + * pairing with the WRITE_ONCE() in the slow path below. + * + * And if cri->rng is non-NULL, then it is good to go. To avoid + * data races due to load speculation on torn cri->rng loads + * _after_ the NULL check, one of the following is required: + * 1. 
smp_acquire__after_ctrl_dep() in the if-statement + * 2. All cri->rng reads are performed with READ_ONCE() + * 3. cri->rng is never read again outside this function + * + * Option #3 yields the best performance, so this function + * provides the rng pointer as an output for the caller to use. + */ + *rng = READ_ONCE(cri->rng); + if (likely(*rng)) + return cri; + + /* + * Slow path: there's no rng currently allocated to this instance. + * Release the local lock and acquire this instance's mlock to + * perform the allocation. + * + * Note that this task may be migrated to a different CPU now! + */ + local_unlock(&cri->lock); + rt_mutex_lock(&cri->mlock); + if (!cri->rng) { + struct crypto_rng *new_rng = NULL; + + ret = crypto_get_rng(&new_rng); + if (ret) { + rt_mutex_unlock(&cri->mlock); + break; + } - if (unlikely(!iov_iter_count(iter))) - return 0; + /* + * Pairs with READ_ONCE() above, because we might not be + * on the same CPU anymore as when we first got `cri`. + */ + WRITE_ONCE(cri->rng, new_rng); + } + rt_mutex_unlock(&cri->mlock); + } - if (reseed) { - u32 flags = 0; + /* + * Even if this task got migrated to another CPU that _does_ have an rng + * allocated, just bail out if crypto_get_rng() ever fails in order to + * avoid looping forever. + */ + return ERR_PTR(ret); +} - /* If reseeding is requested, acquire a lock on - * crypto_reseed_rng so it is not swapped out until - * the initial random bytes are generated. - * - * The algorithm implementation is also protected with - * a separate mutex (drbg->drbg_mutex) around the - * reseed-and-generate operation. - */ - rt_mutex_lock(&crypto_reseed_rng_lock); +static __always_inline struct cpu_rng_inst * +lock_reseed_rng(struct crypto_rng **rng) __acquires(&cri->mlock) +{ + struct cpu_rng_inst __percpu *pcri = &pcpu_reseed_rng; + struct cpu_rng_inst *cri; + int ret; - /* If crypto_default_rng is not set, it will be seeded - * at creation in __crypto_get_default_rng and thus no - * reseeding is needed. + /* + * Use whichever CPU this task is currently running on, knowing full + * well that the task can freely migrate to other CPUs. The reseed RNG + * requires holding a lock across the entire devrandom read, so that + * another task cannot extract entropy from the same seed. In other + * words, when reseeding is requested, reseeding must be done every time + * mlock is acquired. + */ + cri = raw_cpu_ptr(pcri); + rt_mutex_lock(&cri->mlock); + if (likely(cri->rng)) { + /* + * Since this rng instance wasn't just allocated, it needs to be + * explicitly reseeded. New rng instances are seeded on creation + * in crypto_get_rng() and thus don't need explicit reseeding. */ - if (crypto_reseed_rng) - flags |= CRYPTO_TFM_REQ_NEED_RESEED; - - ret = crypto_get_rng(&crypto_reseed_rng); + crypto_tfm_set_flags(crypto_rng_tfm(cri->rng), + CRYPTO_TFM_REQ_NEED_RESEED); + } else { + ret = crypto_get_rng(&cri->rng); if (ret) { - rt_mutex_unlock(&crypto_reseed_rng_lock); - return ret; + rt_mutex_unlock(&cri->mlock); + return ERR_PTR(ret); } } - rng = crypto_reseed_rng; - crypto_tfm_set_flags(crypto_rng_tfm(rng), flags); - } else { - ret = crypto_get_default_rng(); - if (ret) - return ret; - rng = crypto_default_rng; + *rng = cri->rng; + return cri; +} + +#define lock_local_rng(rng, reseed) \ + ({ (reseed) ?
lock_reseed_rng(rng) : lock_default_rng(rng); }) + +#define unlock_local_rng(cri, reseed) \ +do { \ + if (reseed) \ + rt_mutex_unlock(&(cri)->mlock); \ + else \ + local_unlock(&(cri)->lock); \ +} while (0) + +static __always_inline void +clear_rng_page(struct cpu_rng_inst *cri, size_t count) +{ + /* For zeroing a whole page, clear_page() is faster than memset() */ + count < PAGE_SIZE ? memset(cri->page, 0, count) : clear_page(cri->page); +} + +static ssize_t crypto_devrandom_read_iter(struct iov_iter *iter, bool reseed) +{ + /* lock_local_rng() puts us in atomic context for !reseed on non-RT */ + const bool atomic = !reseed && !IS_ENABLED(CONFIG_PREEMPT_RT); + const bool user_no_reseed = !reseed && user_backed_iter(iter); + size_t ulen, page_dirty_len = 0; + struct cpu_rng_inst *cri; + struct crypto_rng *rng; + void __user *uaddr; + struct page *upage; + ssize_t ret = 0; + + if (unlikely(!iov_iter_count(iter))) + return 0; + + /* Set up the starting user destination address and length */ + if (user_no_reseed) { + if (iter_is_ubuf(iter)) { + uaddr = iter->ubuf + iter->iov_offset; + ulen = iov_iter_count(iter); + } else if (iter_is_iovec(iter)) { + uaddr = iter_iov_addr(iter); + ulen = iter_iov_len(iter); + } else { + /* + * ITER_UBUF and ITER_IOVEC are the only user-backed + * iters. Bug out if a new user-backed iter appears. + */ + BUG(); + } } - for (;;) { - size_t i, copied; +restart: + /* + * Pin the user page backing the current user destination address, + * potentially prefaulting to allocate a page for the destination. By + * prefaulting without the RNG lock held, the DRBG won't be blocked by + * time spent on page faults for this task, and thus the DRBG can still + * be used by other tasks. + */ + if (user_no_reseed && pin_user_pages_fast((unsigned long)uaddr, 1, + FOLL_WRITE, &upage) != 1) + goto exit; + + cri = lock_local_rng(&rng, reseed); + if (IS_ERR(cri)) { + if (!ret) + ret = PTR_ERR(cri); + goto unpin_upage; + } + + while (1) { + size_t copied, i = min(iov_iter_count(iter), PAGE_SIZE); + bool resched_without_lock = false; int err; - i = min_t(size_t, iov_iter_count(iter), sizeof(tmp)); - err = crypto_rng_get_bytes(rng, tmp, i); + /* + * Generate up to one page at a time, and align to a page + * boundary so we only need to pin one user page at a time. + */ + if (user_no_reseed) + i = min3(i, PAGE_SIZE - offset_in_page(uaddr), ulen); + + /* + * On non-PREEMPT_RT kernels, local locks disable preemption. + * The DRBG's generate() function has a mutex lock, which could + * mean that we'll schedule while atomic if the mutex lock + * sleeps. However, that will never happen if we ensure that + * there's never any contention on the DRBG's mutex lock while + * we're atomic! Our local lock ensures calls to the DRBG are + * always serialized, so there's no contention from here. And + * the DRBG only uses its mutex lock from one other path, when + * an instance of the DRBG is freshly allocated, which we only + * do from crypto_get_rng(). So the DRBG's mutex lock is + * guaranteed to not have contention when we call generate() and + * thus it'll never sleep here. And of course, nothing else in + * generate() ever sleeps. 
+ */ + err = crypto_rng_get_bytes(rng, cri->page, i); if (err) { - ret = ret ?: err; + if (!ret) + ret = err; break; } - copied = copy_to_iter(tmp, i, iter); - ret += copied; + /* + * Record the number of bytes used in cri->page and either copy + * directly to the user address without faulting, or copy to the + * iter which is always backed by kernel memory when !reseed && + * !user_backed_iter(). When reseed == true, the iter may be + * backed by user memory, but we copy to it with the possibility + * of page faults anyway because we need to hold the lock across + * the entire call; this is why a mutex is used instead of a + * local lock for the reseed RNG, to permit sleeping without + * yielding the DRBG instance. + */ + page_dirty_len = max(i, page_dirty_len); + if (user_no_reseed) { + err = copy_to_user_nofault(uaddr, cri->page, i); + if (err >= 0) { + iov_iter_advance(iter, i - err); + ret += i - err; + } + if (err) + break; + } else { + /* + * We know that copying from cri->page is safe, so use + * _copy_to_iter() directly to skip check_copy_size(). + */ + copied = _copy_to_iter(cri->page, i, iter); + ret += copied; + if (copied != i) + break; + } - if (!iov_iter_count(iter) || copied != i) + /* + * Quit when either the requested number of bytes have been + * generated or there is a pending signal. + */ + if (!iov_iter_count(iter) || signal_pending(current)) break; - BUILD_BUG_ON(PAGE_SIZE % sizeof(tmp) != 0); - if (ret % PAGE_SIZE == 0) { - if (signal_pending(current)) - break; - cond_resched(); + /* Compute the next user destination address and length */ + if (user_no_reseed) { + ulen -= i; + if (likely(ulen)) { + uaddr += i; + } else { + /* + * This path is only reachable by ITER_IOVEC + * because ulen is initialized to the request + * size for ITER_UBUF, and therefore ITER_UBUF + * will always quit at the iov_iter_count() + * check above before ulen can become zero. + * + * iter->iov_offset is guaranteed to be zero + * here, so iter_iov_{addr|len}() isn't needed. + */ + uaddr = iter_iov(iter)->iov_base; + ulen = iter_iov(iter)->iov_len; + } + + unpin_user_page(upage); + } + + /* + * Reschedule right now if needed and we're not atomic. If we're + * atomic, then we must first drop the lock to reschedule. + */ + if (need_resched()) { + if (atomic) + resched_without_lock = true; + else + cond_resched(); + } + + /* + * Optimistically try to pin the next user page without + * faulting, so we don't need to clear cri->page and drop the + * lock on every iteration. If this fails, we fall back to + * pinning with the option to prefault. + */ + if (user_no_reseed && !resched_without_lock && + pin_user_pages_fast_only((unsigned long)uaddr, 1, + FOLL_WRITE, &upage) == 1) + continue; + + /* + * Restart if either rescheduling is needed (and requires + * dropping the lock since we're atomic) or the optimistic page + * pinning attempt failed. + * + * This always implies `reseed == false`, so unlock_local_rng() + * can just be passed `false` for reseed to eliminate a branch. + */ + if (resched_without_lock || user_no_reseed) { + /* + * Clear the buffer of our latest random bytes before + * unlocking and potentially migrating CPUs, in which + * case we wouldn't have the same `cri` anymore. 
+ */ + clear_rng_page(cri, page_dirty_len); + unlock_local_rng(cri, false); + page_dirty_len = 0; + if (resched_without_lock) + cond_resched(); + goto restart; } } - if (reseed) - rt_mutex_unlock(&crypto_reseed_rng_lock); - else - crypto_put_default_rng(); - memzero_explicit(tmp, sizeof(tmp)); + if (page_dirty_len) + clear_rng_page(cri, page_dirty_len); + unlock_local_rng(cri, reseed); +unpin_upage: + if (user_no_reseed) + unpin_user_page(upage); +exit: return ret ? ret : -EFAULT; } @@ -351,10 +623,32 @@ static const struct random_extrng crypto_devrandom_rng = { .extrng_read_iter = crypto_devrandom_read_iter }; +static void __init alloc_pcpu_inst(struct cpu_rng_inst __percpu *pcri) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct cpu_rng_inst *cri = per_cpu_ptr(pcri, cpu); + + cri->page = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); + local_lock_init(&cri->lock); + } +} + static int __init crypto_rng_init(void) { - if (fips_enabled) - random_register_extrng(&crypto_devrandom_rng); + if (!fips_enabled) + return 0; + + /* + * Never fail to register the RNG override in FIPS mode because failure + * would result in the system quietly booting without the FIPS-mandated + * RNG installed. This would be catastrophic for FIPS compliance, hence + * the RNG override setup is *not* allowed to fail. + */ + alloc_pcpu_inst(&pcpu_default_rng); + alloc_pcpu_inst(&pcpu_reseed_rng); + random_register_extrng(&crypto_devrandom_rng); return 0; } From 1165b486bc2339f5c15874de145478cb031fcf3f Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Tue, 29 Jul 2025 12:12:38 -0400 Subject: [PATCH 23/23] configs: Ensure FIPS settings defined We want to hard-set the x86_64 FIPS-required configs rather than rely on the kernel's default settings; should those defaults ever change without our knowledge, nothing would have actively checked for it. The configs are a limited set that is expanded out at build time using `make olddefconfig`, a common practice in kernel building. Note that the following had to be added manually since it is normally set by the RPM build process.
CONFIG_CRYPTO_FIPS_NAME="Rocky Linux 9 Kernel Cryptographic API" Signed-off-by: Jonathan Maple --- configs/kernel-x86_64-debug-rhel.config | 11 +++++++++++ configs/kernel-x86_64-rhel.config | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/configs/kernel-x86_64-debug-rhel.config b/configs/kernel-x86_64-debug-rhel.config index 936a8a88f0dea..a9072995d48e8 100644 --- a/configs/kernel-x86_64-debug-rhel.config +++ b/configs/kernel-x86_64-debug-rhel.config @@ -7194,3 +7194,14 @@ CONFIG_ZSWAP=y # CONFIG_ZSWAP_ZPOOL_DEFAULT_Z3FOLD is not set CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y # CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set + +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS7_MESSAGE_PARSER=y +CONFIG_FIPS_SIGNATURE_SELFTEST=y +CONFIG_FIPS_SIGNATURE_SELFTEST_RSA=y +CONFIG_FIPS_SIGNATURE_SELFTEST_ECDSA=y +CONFIG_CRYPTO_DRBG=y +CONFIG_CRYPTO_FIPS=y +CONFIG_CRYPTO_FIPS_CUSTOM_VERSION=y +CONFIG_CRYPTO_FIPS_VERSION="rocky9.20250725" +CONFIG_CRYPTO_FIPS_NAME="Rocky Linux 9 Kernel Cryptographic API" diff --git a/configs/kernel-x86_64-rhel.config b/configs/kernel-x86_64-rhel.config index 9e3e16ae4db02..eb94c7c18f569 100644 --- a/configs/kernel-x86_64-rhel.config +++ b/configs/kernel-x86_64-rhel.config @@ -7171,3 +7171,14 @@ CONFIG_ZSWAP=y # CONFIG_ZSWAP_ZPOOL_DEFAULT_Z3FOLD is not set CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y # CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set + +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS7_MESSAGE_PARSER=y +CONFIG_FIPS_SIGNATURE_SELFTEST=y +CONFIG_FIPS_SIGNATURE_SELFTEST_RSA=y +CONFIG_FIPS_SIGNATURE_SELFTEST_ECDSA=y +CONFIG_CRYPTO_DRBG=y +CONFIG_CRYPTO_FIPS=y +CONFIG_CRYPTO_FIPS_CUSTOM_VERSION=y +CONFIG_CRYPTO_FIPS_VERSION="rocky9.20250725" +CONFIG_CRYPTO_FIPS_NAME="Rocky Linux 9 Kernel Cryptographic API"