From 3b0a2ef8b62d4d9d325725b31831484d3e4993f0 Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Tue, 2 Dec 2025 14:43:49 +0000 Subject: [PATCH 1/2] secret-hiding: Allow to have cover letter Since the cover letter does not include any code change, the git apply command denies it by default. But the cover letter would be helpful for readers since it gives much information (including background, motivation, design decision, etc.). To allow the cover letter, add the --allow-empty option to the git apply command. Signed-off-by: Takahiro Itazuri --- resources/hiding_ci/build_and_install_kernel.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/hiding_ci/build_and_install_kernel.sh b/resources/hiding_ci/build_and_install_kernel.sh index 4b35ad08a7d..ea87cc7450a 100755 --- a/resources/hiding_ci/build_and_install_kernel.sh +++ b/resources/hiding_ci/build_and_install_kernel.sh @@ -79,7 +79,7 @@ confirm() { apply_patch_file() { echo "Applying patch:" $(basename $1) - git apply $1 + git apply --allow-empty $1 } apply_patch_or_series() { From f0886caf1271a2f9601e344674eee1e802b4c59d Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Tue, 2 Dec 2025 14:41:06 +0000 Subject: [PATCH 2/2] secret-hiding: Update kernel patches for kvm-clock We used a very ad-hoc solution for kvm-clock. The new kernel patches make gfn_to_pfn_cache (that kvm-clock is based on) work for guest_memfd without the direct map. 
Signed-off-by: Takahiro Itazuri --- .../11-kvm-clock/0000-cover-letter.patch | 70 +++++++++++ ...-kvm_gmem_get_pfn-for-guest_memfd-ba.patch | 99 +++++++++++++++ ...-for-kvm-clock-if-kvm_gpc_refresh-fa.patch | 103 ---------------- ...-vmap-for-guest_memfd-pages-without-.patch | 115 ++++++++++++++++++ 4 files changed, 284 insertions(+), 103 deletions(-) create mode 100644 resources/hiding_ci/linux_patches/11-kvm-clock/0000-cover-letter.patch create mode 100644 resources/hiding_ci/linux_patches/11-kvm-clock/0001-KVM-pfncache-Use-kvm_gmem_get_pfn-for-guest_memfd-ba.patch delete mode 100644 resources/hiding_ci/linux_patches/11-kvm-clock/0001-KVM-x86-use-uhva-for-kvm-clock-if-kvm_gpc_refresh-fa.patch create mode 100644 resources/hiding_ci/linux_patches/11-kvm-clock/0002-KVM-pfncache-Use-vmap-for-guest_memfd-pages-without-.patch diff --git a/resources/hiding_ci/linux_patches/11-kvm-clock/0000-cover-letter.patch b/resources/hiding_ci/linux_patches/11-kvm-clock/0000-cover-letter.patch new file mode 100644 index 00000000000..08d85b55501 --- /dev/null +++ b/resources/hiding_ci/linux_patches/11-kvm-clock/0000-cover-letter.patch @@ -0,0 +1,70 @@ +From 363385a3c2cd4f7fe445ed71329e55d190cb14d5 Mon Sep 17 00:00:00 2001 +From: Takahiro Itazuri +Date: Tue, 2 Dec 2025 12:15:49 +0000 +Subject: [RFC PATCH 0/2] KVM: pfncache: Support guest_memfd without direct map + +[ based on kvm/next with [1] ] + +Recent work on guest_memfd [1] is introducing support for removing guest +memory from the kernel direct map (Note that it hasn't been merged yet, +and that is why this patch series is labelled RFC). The feature is +useful for non-CoCo VMs to prevent the host kernel from accidentally or +speculatively accessing guest memory as a general safety improvement. +Pages for guest_memfd created with GUEST_MEMFD_FLAG_NO_DIRECT_MAP have +their direct-map PTEs explicitly disabled, and thus cannot rely on the +direct map. + +This breaks the facilities that use gfn_to_pfn_cache, including +kvm-clock. 
gfn_to_pfn_cache caches the pfn and kernel host virtual +address (khva) for a given gfn so that KVM can repeatedly read or write +the corresponding guest page. The cached khva may be later dereferenced +from atomic contexts in some cases. Such contexts cannot tolerate +sleeping or page faults, and therefore cannot use the userspace mapping +(uhva), as those mappings may fault at any time. As a result, +gfn_to_pfn_cache requires a stable, fault-free kernel virtual address +for the backing pages, independent of the userspace page. + +This small patch series enables gfn_to_pfn_cache to work correctly when +a memslot is backed by guest_memfd with GUEST_MEMFD_FLAG_NO_DIRECT_MAP. +The first patch teaches gfn_to_pfn_cache to obtain pfn for guest_memfd- +backed memslots via kvm_gmem_get_pfn() instead of GUP (hva_to_pfn()). +The second patch makes gfn_to_pfn_cache use vmap()/vunmap() to create a +fault-free kernel address for such pages. We believe that establishing +such mapping for paravirtual guest/host communication is acceptable +since such pages do not contain sensitive data. + +Another considered idea was to use memremap() instead of vmap(), since +gpc_map() already falls back to memremap() if pfn_valid() is false. +However, vmap() was chosen for the following reason. memremap() with +MEMREMAP_WB first attempts to use the direct map via try_ram_remap(), +and then falls back to arch_memremap_wb(), which explicitly refuses to +map system RAM. It would be possible to relax this restriction, but the +side effects are unclear because memremap() is widely used throughout +the kernel. Changing memremap() to support system RAM without the +direct map solely for gfn_to_pfn_cache feels disproportionate. If +additional users appear that need to map system RAM without the direct +map, revisiting and generalizing memremap() might make sense. For now, +vmap()/vunmap() provides a contained and predictable solution. 
+ +A possible approach in the future is to use the "ephmap" (or proclocal) +proposed in [2], but it is not yet clear when that work will be merged. +In contrast, the changes in this patch series are small and +self-contained, yet immediately allow gfn_to_pfn_cache (including +kvm-clock) to operate correctly with direct map-removed guest_memfd. +Once ephmap eventually is merged, gfn_to_pfn_cache can be updated to +make use of it as appropriate. + +[1]: https://lore.kernel.org/all/20250924151101.2225820-1-patrick.roy@campus.lmu.de/ +[2]: https://lore.kernel.org/all/20250812173109.295750-1-jackmanb@google.com/ + +Takahiro Itazuri (2): + KVM: pfncache: Use kvm_gmem_get_pfn() for guest_memfd-backed memslots + KVM: pfncache: Use vmap() for guest_memfd pages without direct map + + include/linux/kvm_host.h | 7 ++++++ + virt/kvm/pfncache.c | 52 +++++++++++++++++++++++++++++----------- + 2 files changed, 45 insertions(+), 14 deletions(-) + +-- +2.50.1 + diff --git a/resources/hiding_ci/linux_patches/11-kvm-clock/0001-KVM-pfncache-Use-kvm_gmem_get_pfn-for-guest_memfd-ba.patch b/resources/hiding_ci/linux_patches/11-kvm-clock/0001-KVM-pfncache-Use-kvm_gmem_get_pfn-for-guest_memfd-ba.patch new file mode 100644 index 00000000000..cec20a80cc4 --- /dev/null +++ b/resources/hiding_ci/linux_patches/11-kvm-clock/0001-KVM-pfncache-Use-kvm_gmem_get_pfn-for-guest_memfd-ba.patch @@ -0,0 +1,99 @@ +From bebfd5914aae9d2eaec6b90b6408875f2aa40610 Mon Sep 17 00:00:00 2001 +From: Takahiro Itazuri +Date: Mon, 1 Dec 2025 14:58:44 +0000 +Subject: [RFC PATCH 1/2] KVM: pfncache: Use kvm_gmem_get_pfn() for guest_memfd-backed memslots + +gfn_to_pfn_cache currently relies on hva_to_pfn(), which resolves PFNs +through GUP. GUP assumes that the page has a valid direct-map PTE, +which is not true for guest_memfd created with +GUEST_MEMFD_FLAG_NO_DIRECT_MAP, because their direct-map PTEs are +explicitly removed via set_direct_map_valid_noflush(). 
+ +Introduce a helper function, gpc_to_pfn(), that routes PFN lookup to +kvm_gmem_get_pfn() for guest_memfd-backed memslots (regardless of +whether GUEST_MEMFD_FLAG_NO_DIRECT_MAP is set), and otherwise falls +back to the existing hva_to_pfn() path. Rename hva_to_pfn_retry() to +gpc_to_pfn_retry() accordingly. + +Signed-off-by: Takahiro Itazuri +--- + virt/kvm/pfncache.c | 34 +++++++++++++++++++++++----------- + 1 file changed, 23 insertions(+), 11 deletions(-) + +diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c +index 728d2c1b488a..bf8d6090e283 100644 +--- a/virt/kvm/pfncache.c ++++ b/virt/kvm/pfncache.c +@@ -152,22 +152,34 @@ static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_s + return kvm->mmu_invalidate_seq != mmu_seq; + } + +-static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) ++static kvm_pfn_t gpc_to_pfn(struct gfn_to_pfn_cache *gpc, struct page **page) + { +- /* Note, the new page offset may be different than the old! */ +- void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva); +- kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT; +- void *new_khva = NULL; +- unsigned long mmu_seq; +- struct page *page; ++ if (kvm_slot_has_gmem(gpc->memslot)) { ++ kvm_pfn_t pfn; ++ ++ kvm_gmem_get_pfn(gpc->kvm, gpc->memslot, gpa_to_gfn(gpc->gpa), ++ &pfn, page, NULL); ++ return pfn; ++ } + + struct kvm_follow_pfn kfp = { + .slot = gpc->memslot, + .gfn = gpa_to_gfn(gpc->gpa), + .flags = FOLL_WRITE, + .hva = gpc->uhva, +- .refcounted_page = &page, ++ .refcounted_page = page, + }; ++ return hva_to_pfn(&kfp); ++} ++ ++static kvm_pfn_t gpc_to_pfn_retry(struct gfn_to_pfn_cache *gpc) ++{ ++ /* Note, the new page offset may be different than the old! 
*/ ++ void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva); ++ kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT; ++ void *new_khva = NULL; ++ unsigned long mmu_seq; ++ struct page *page; + + lockdep_assert_held(&gpc->refresh_lock); + +@@ -206,7 +218,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) + cond_resched(); + } + +- new_pfn = hva_to_pfn(&kfp); ++ new_pfn = gpc_to_pfn(gpc, &page); + if (is_error_noslot_pfn(new_pfn)) + goto out_error; + +@@ -319,7 +331,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l + } + } + +- /* Note: the offset must be correct before calling hva_to_pfn_retry() */ ++ /* Note: the offset must be correct before calling gpc_to_pfn_retry() */ + gpc->uhva += page_offset; + + /* +@@ -327,7 +339,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l + * drop the lock and do the HVA to PFN lookup again. + */ + if (!gpc->valid || hva_change) { +- ret = hva_to_pfn_retry(gpc); ++ ret = gpc_to_pfn_retry(gpc); + } else { + /* + * If the HVA→PFN mapping was already valid, don't unmap it. +-- +2.50.1 + diff --git a/resources/hiding_ci/linux_patches/11-kvm-clock/0001-KVM-x86-use-uhva-for-kvm-clock-if-kvm_gpc_refresh-fa.patch b/resources/hiding_ci/linux_patches/11-kvm-clock/0001-KVM-x86-use-uhva-for-kvm-clock-if-kvm_gpc_refresh-fa.patch deleted file mode 100644 index 755f1c0c73c..00000000000 --- a/resources/hiding_ci/linux_patches/11-kvm-clock/0001-KVM-x86-use-uhva-for-kvm-clock-if-kvm_gpc_refresh-fa.patch +++ /dev/null @@ -1,103 +0,0 @@ -From 0a04094c8b7e292fcb7bdf8528d70baddbfff379 Mon Sep 17 00:00:00 2001 -From: Patrick Roy -Date: Fri, 18 Jul 2025 15:59:39 +0100 -Subject: [PATCH 01/15] KVM: x86: use uhva for kvm-clock if kvm_gpc_refresh() - fails - -kvm-clock uses a gfn_to_pfn_cache to avoid repeated gpa->pfn -computations, relying on mmu notifiers to determine when the translation -needs to be redone. 
- -If the guest places the kvm-clock for some vcpu into memory that is -backed by a KVM_MEMSLOT_GMEM_ONLY memslot, and the guest_memfd instance -has GUEST_MEMFD_FLAG_NO_DIRECT_MAP set, this does not work: -gfn_to_pfn_cache internally uses GUP to resolve uhva->pfn, which -returned -EFAULT for direct map removed memory. But even if this pfn -computation were to work, the subsequent attempts to access guest memory -through the direct map would obviously fail. - -For this scenario, all other parts of kvm fall back to instead accessing -guest memory through userspace mapping of guest_memfd, which is stored -in the memslots userspace_addr. Have kvm-clock do the same by handling -failures in kvm_gpc_refresh() with a fallback to a pvclock update -routine that operates on userspace mappings. This looses the -optimization of gfn_to_pfn_cache for these VMs, but on modern hardawre -kvm-clock update requests should be rare enough for this to not matter -(and guest_memfd is not support for Xen VMs, where speed of pvclock -accesses is more relevant). - -Alternatively, it would be possible to team gfn_to_pfn_cache about -(direct map removed) guest_memfd, however the combination of on-demand -direct map reinsertion (and its induced ref-counting) and hooking -gfn_to_pfn_caches up to gmem invalidations has proven significantly more -complex [1], and hence simply falling back to userspace mappings was -suggested by Sean at one of the guest_memfd upstream calls. 
- -[1]: https://lore.kernel.org/kvm/20240910163038.1298452-9-roypat@amazon.co.uk/ - https://lore.kernel.org/kvm/20240910163038.1298452-10-roypat@amazon.co.uk/ - -Signed-off-by: Patrick Roy ---- - arch/x86/kvm/x86.c | 38 +++++++++++++++++++++++++++++++++++++- - 1 file changed, 37 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c -index 33fba801b205..c8fd35c1bbda 100644 ---- a/arch/x86/kvm/x86.c -+++ b/arch/x86/kvm/x86.c -@@ -3149,6 +3149,40 @@ u64 get_kvmclock_ns(struct kvm *kvm) - return data.clock; - } - -+static void kvm_setup_guest_pvclock_slow(struct pvclock_vcpu_time_info *ref_hv_clock, -+ struct kvm_vcpu *vcpu, -+ gpa_t gpa) -+{ -+ struct pvclock_vcpu_time_info guest_hv_clock; -+ struct pvclock_vcpu_time_info hv_clock; -+ -+ memcpy(&hv_clock, ref_hv_clock, sizeof(hv_clock)); -+ -+ kvm_read_guest(vcpu->kvm, gpa, &guest_hv_clock, sizeof(struct pvclock_vcpu_time_info)); -+ -+ /* -+ * This VCPU is paused, but it's legal for a guest to read another -+ * VCPU's kvmclock, so we really have to follow the specification where -+ * it says that version is odd if data is being modified, and even after -+ * it is consistent. 
-+ */ -+ -+ guest_hv_clock.version = hv_clock.version = (guest_hv_clock.version + 1) | 1; -+ smp_wmb(); -+ -+ /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ -+ hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); -+ -+ kvm_write_guest(vcpu->kvm, gpa, &hv_clock, sizeof(struct pvclock_vcpu_time_info)); -+ -+ smp_wmb(); -+ -+ ++hv_clock.version; -+ kvm_write_guest(vcpu->kvm, gpa + offsetof(struct pvclock_vcpu_time_info, version), &hv_clock.version, sizeof(hv_clock.version)); -+ -+ trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock); -+} -+ - static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock, - struct kvm_vcpu *vcpu, - struct gfn_to_pfn_cache *gpc, -@@ -3164,8 +3198,10 @@ static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock, - while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) { - read_unlock_irqrestore(&gpc->lock, flags); - -- if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock))) -+ if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock))) { -+ kvm_setup_guest_pvclock_slow(ref_hv_clock, vcpu, gpc->gpa + offset); - return; -+ } - - read_lock_irqsave(&gpc->lock, flags); - } --- -2.51.0 - diff --git a/resources/hiding_ci/linux_patches/11-kvm-clock/0002-KVM-pfncache-Use-vmap-for-guest_memfd-pages-without-.patch b/resources/hiding_ci/linux_patches/11-kvm-clock/0002-KVM-pfncache-Use-vmap-for-guest_memfd-pages-without-.patch new file mode 100644 index 00000000000..bc08c508086 --- /dev/null +++ b/resources/hiding_ci/linux_patches/11-kvm-clock/0002-KVM-pfncache-Use-vmap-for-guest_memfd-pages-without-.patch @@ -0,0 +1,115 @@ +From 6c90c75b15ba48cc5e0a1e74224c041c3c45668b Mon Sep 17 00:00:00 2001 +From: Takahiro Itazuri +Date: Mon, 1 Dec 2025 16:47:05 +0000 +Subject: [RFC PATCH 2/2] KVM: pfncache: Use vmap() for guest_memfd pages without direct map + +gfn_to_pfn_cache currently maps RAM PFNs with kmap(), which relies on +the direct map. 
guest_memfd created with GUEST_MEMFD_FLAG_NO_DIRECT_MAP +disable their direct-map PTEs via set_direct_map_valid_noflush(), so the +linear address returned by kmap()/page_address() will fault if +dereferenced. + +In some cases, gfn_to_pfn_cache dereferences the cached kernel address +(khva) from atomic contexts where page faults cannot be tolerated. +Therefore khva must always refer to a fault-free kernel mapping. Since +mapping and unmapping happen exclusively in the refresh path, which may +sleep, using vmap()/vunmap() for these pages is safe and sufficient. + +Introduce kvm_slot_no_direct_map() to detect guest_memfd slots without +the direct map, and make gpc_map()/gpc_unmap() use vmap()/vunmap() for +such pages. + +This allows the facilities based on gfn_to_pfn_cache (e.g. kvm-clock) to +work correctly with guest_memfd regardless of whether its direct-map +PTEs are valid. + +Signed-off-by: Takahiro Itazuri +--- + include/linux/kvm_host.h | 7 +++++++ + virt/kvm/pfncache.c | 26 ++++++++++++++++++++------ + 2 files changed, 27 insertions(+), 6 deletions(-) + +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index 70e6a5210ceb..793d98f97928 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -628,6 +629,12 @@ static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *sl + return slot->flags & KVM_MEM_LOG_DIRTY_PAGES; + } + ++static inline bool kvm_slot_no_direct_map(const struct kvm_memory_slot *slot) ++{ ++ return slot && kvm_slot_has_gmem(slot) && ++ mapping_no_direct_map(slot->gmem.file->f_mapping); ++} ++ + static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot) + { + return ALIGN(memslot->npages, BITS_PER_LONG) / 8; +diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c +index bf8d6090e283..ae6d8699e536 100644 +--- a/virt/kvm/pfncache.c ++++ b/virt/kvm/pfncache.c +@@ -96,10 +96,16 
@@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len) + return true; + } + +-static void *gpc_map(kvm_pfn_t pfn) ++static void *gpc_map(struct gfn_to_pfn_cache *gpc, kvm_pfn_t pfn) + { +- if (pfn_valid(pfn)) +- return kmap(pfn_to_page(pfn)); ++ if (pfn_valid(pfn)) { ++ struct page *page = pfn_to_page(pfn); ++ ++ if (kvm_slot_no_direct_map(gpc->memslot)) ++ return vmap(&page, 1, VM_MAP, PAGE_KERNEL); ++ ++ return kmap(page); ++ } + + #ifdef CONFIG_HAS_IOMEM + return memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); +@@ -115,6 +121,11 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva) + return; + + if (pfn_valid(pfn)) { ++ if (is_vmalloc_addr(khva)) { ++ vunmap(khva); ++ return; ++ } ++ + kunmap(pfn_to_page(pfn)); + return; + } +@@ -224,13 +235,16 @@ static kvm_pfn_t gpc_to_pfn_retry(struct gfn_to_pfn_cache *gpc) + + /* + * Obtain a new kernel mapping if KVM itself will access the +- * pfn. Note, kmap() and memremap() can both sleep, so this +- * too must be done outside of gpc->lock! ++ * pfn. Note, kmap(), vmap() and memremap() can sleep, so this ++ * too must be done outside of gpc->lock! Note that even though ++ * the rwlock is dropped, it's still fine to access gpc->pfn and ++ * other fields because gpc->refresh_lock mutex prevents those ++ * from being changed. + */ + if (new_pfn == gpc->pfn) + new_khva = old_khva; + else +- new_khva = gpc_map(new_pfn); ++ new_khva = gpc_map(gpc, new_pfn); + + if (!new_khva) { + kvm_release_page_unused(page); +-- +2.50.1 +