diff --git a/resources/hiding_ci/build_and_install_kernel.sh b/resources/hiding_ci/build_and_install_kernel.sh
index 4b35ad08a7d..ea87cc7450a 100755
--- a/resources/hiding_ci/build_and_install_kernel.sh
+++ b/resources/hiding_ci/build_and_install_kernel.sh
@@ -79,7 +79,7 @@ confirm() {
 
 apply_patch_file() {
   echo "Applying patch:" $(basename $1)
-  git apply $1
+  git apply --allow-empty $1
 }
 
 apply_patch_or_series() {
diff --git a/resources/hiding_ci/linux_patches/11-kvm-clock/0000-cover-letter.patch b/resources/hiding_ci/linux_patches/11-kvm-clock/0000-cover-letter.patch
new file mode 100644
index 00000000000..08d85b55501
--- /dev/null
+++ b/resources/hiding_ci/linux_patches/11-kvm-clock/0000-cover-letter.patch
@@ -0,0 +1,70 @@
+From 363385a3c2cd4f7fe445ed71329e55d190cb14d5 Mon Sep 17 00:00:00 2001
+From: Takahiro Itazuri
+Date: Tue, 2 Dec 2025 12:15:49 +0000
+Subject: [RFC PATCH 0/2] KVM: pfncache: Support guest_memfd without direct map
+
+[ based on kvm/next with [1] ]
+
+Recent work on guest_memfd [1] is introducing support for removing guest
+memory from the kernel direct map (note that it has not been merged yet,
+which is why this patch series is labelled RFC). The feature is
+useful for non-CoCo VMs to prevent the host kernel from accidentally or
+speculatively accessing guest memory, as a general safety improvement.
+Pages of a guest_memfd created with GUEST_MEMFD_FLAG_NO_DIRECT_MAP have
+their direct-map PTEs explicitly disabled, and thus cannot rely on the
+direct map.
+
+This breaks the facilities that use gfn_to_pfn_cache, including
+kvm-clock. gfn_to_pfn_cache caches the pfn and kernel host virtual
+address (khva) for a given gfn so that KVM can repeatedly read or write
+the corresponding guest page. The cached khva may later be dereferenced
+from atomic contexts in some cases. Such contexts cannot tolerate
+sleeping or page faults, and therefore cannot use the userspace mapping
+(uhva), as that mapping may fault at any time. As a result,
+gfn_to_pfn_cache requires a stable, fault-free kernel virtual address
+for the backing pages, independent of the userspace mapping.
+
+This small patch series enables gfn_to_pfn_cache to work correctly when
+a memslot is backed by guest_memfd with GUEST_MEMFD_FLAG_NO_DIRECT_MAP.
+The first patch teaches gfn_to_pfn_cache to obtain pfns for guest_memfd-
+backed memslots via kvm_gmem_get_pfn() instead of GUP (hva_to_pfn()).
+The second patch makes gfn_to_pfn_cache use vmap()/vunmap() to create a
+fault-free kernel address for such pages. We believe that establishing
+such a mapping for paravirtual guest/host communication is acceptable,
+since such pages do not contain sensitive data.
+
+Another idea we considered was to use memremap() instead of vmap(),
+since gpc_map() already falls back to memremap() if pfn_valid() is false.
+However, vmap() was chosen for the following reason. memremap() with
+MEMREMAP_WB first attempts to use the direct map via try_ram_remap(),
+and then falls back to arch_memremap_wb(), which explicitly refuses to
+map system RAM. It would be possible to relax this restriction, but the
+side effects are unclear because memremap() is widely used throughout
+the kernel. Changing memremap() to support system RAM without the
+direct map solely for gfn_to_pfn_cache feels disproportionate. If
+additional users appear that need to map system RAM without the direct
+map, revisiting and generalizing memremap() might make sense. For now,
+vmap()/vunmap() provides a contained and predictable solution.
+
+A possible approach in the future is to use the "ephmap" (or proclocal)
+proposed in [2], but it is not yet clear when that work will be merged.
+In contrast, the changes in this patch series are small and
+self-contained, yet immediately allow gfn_to_pfn_cache (including
+kvm-clock) to operate correctly with direct-map-removed guest_memfd.
+Once ephmap is eventually merged, gfn_to_pfn_cache can be updated to
+make use of it as appropriate.
+
+[1]: https://lore.kernel.org/all/20250924151101.2225820-1-patrick.roy@campus.lmu.de/
+[2]: https://lore.kernel.org/all/20250812173109.295750-1-jackmanb@google.com/
+
+Takahiro Itazuri (2):
+  KVM: pfncache: Use kvm_gmem_get_pfn() for guest_memfd-backed memslots
+  KVM: pfncache: Use vmap() for guest_memfd pages without direct map
+
+ include/linux/kvm_host.h |  7 ++++++
+ virt/kvm/pfncache.c      | 52 +++++++++++++++++++++++++++++-----------
+ 2 files changed, 45 insertions(+), 14 deletions(-)
+
+-- 
+2.50.1
+
diff --git a/resources/hiding_ci/linux_patches/11-kvm-clock/0001-KVM-pfncache-Use-kvm_gmem_get_pfn-for-guest_memfd-ba.patch b/resources/hiding_ci/linux_patches/11-kvm-clock/0001-KVM-pfncache-Use-kvm_gmem_get_pfn-for-guest_memfd-ba.patch
new file mode 100644
index 00000000000..cec20a80cc4
--- /dev/null
+++ b/resources/hiding_ci/linux_patches/11-kvm-clock/0001-KVM-pfncache-Use-kvm_gmem_get_pfn-for-guest_memfd-ba.patch
@@ -0,0 +1,99 @@
+From bebfd5914aae9d2eaec6b90b6408875f2aa40610 Mon Sep 17 00:00:00 2001
+From: Takahiro Itazuri
+Date: Mon, 1 Dec 2025 14:58:44 +0000
+Subject: [RFC PATCH 1/2] KVM: pfncache: Use kvm_gmem_get_pfn() for guest_memfd-backed memslots
+
+gfn_to_pfn_cache currently relies on hva_to_pfn(), which resolves PFNs
+through GUP. GUP assumes that the page has a valid direct-map PTE,
+which is not true for guest_memfd instances created with
+GUEST_MEMFD_FLAG_NO_DIRECT_MAP, because their direct-map PTEs are
+explicitly removed via set_direct_map_valid_noflush().
+
+Introduce a helper function, gpc_to_pfn(), that routes PFN lookup to
+kvm_gmem_get_pfn() for guest_memfd-backed memslots (regardless of
+whether GUEST_MEMFD_FLAG_NO_DIRECT_MAP is set), and otherwise falls
+back to the existing hva_to_pfn() path. Rename hva_to_pfn_retry() to
+gpc_to_pfn_retry() accordingly.
+
+Signed-off-by: Takahiro Itazuri
+---
+ virt/kvm/pfncache.c | 34 +++++++++++++++++++++++-----------
+ 1 file changed, 23 insertions(+), 11 deletions(-)
+
+diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
+index 728d2c1b488a..bf8d6090e283 100644
+--- a/virt/kvm/pfncache.c
++++ b/virt/kvm/pfncache.c
+@@ -152,22 +152,34 @@ static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_s
+ 	return kvm->mmu_invalidate_seq != mmu_seq;
+ }
+ 
+-static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
++static kvm_pfn_t gpc_to_pfn(struct gfn_to_pfn_cache *gpc, struct page **page)
+ {
+-	/* Note, the new page offset may be different than the old! */
+-	void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
+-	kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
+-	void *new_khva = NULL;
+-	unsigned long mmu_seq;
+-	struct page *page;
++	if (kvm_slot_has_gmem(gpc->memslot)) {
++		kvm_pfn_t pfn;
++
++		kvm_gmem_get_pfn(gpc->kvm, gpc->memslot, gpa_to_gfn(gpc->gpa),
++				 &pfn, page, NULL);
++		return pfn;
++	}
+ 
+ 	struct kvm_follow_pfn kfp = {
+ 		.slot = gpc->memslot,
+ 		.gfn = gpa_to_gfn(gpc->gpa),
+ 		.flags = FOLL_WRITE,
+ 		.hva = gpc->uhva,
+-		.refcounted_page = &page,
++		.refcounted_page = page,
+ 	};
++	return hva_to_pfn(&kfp);
++}
++
++static kvm_pfn_t gpc_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
++{
++	/* Note, the new page offset may be different than the old! */
++	void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
++	kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
++	void *new_khva = NULL;
++	unsigned long mmu_seq;
++	struct page *page;
+ 
+ 	lockdep_assert_held(&gpc->refresh_lock);
+ 
+@@ -206,7 +218,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
+ 		cond_resched();
+ 	}
+ 
+-	new_pfn = hva_to_pfn(&kfp);
++	new_pfn = gpc_to_pfn(gpc, &page);
+ 	if (is_error_noslot_pfn(new_pfn))
+ 		goto out_error;
+ 
+@@ -319,7 +331,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l
+ 		}
+ 	}
+ 
+-	/* Note: the offset must be correct before calling hva_to_pfn_retry() */
++	/* Note: the offset must be correct before calling gpc_to_pfn_retry() */
+ 	gpc->uhva += page_offset;
+ 
+ 	/*
+@@ -327,7 +339,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l
+ 	 * drop the lock and do the HVA to PFN lookup again.
+ 	 */
+ 	if (!gpc->valid || hva_change) {
+-		ret = hva_to_pfn_retry(gpc);
++		ret = gpc_to_pfn_retry(gpc);
+ 	} else {
+ 		/*
+ 		 * If the HVA→PFN mapping was already valid, don't unmap it.
+-- 
+2.50.1
+
diff --git a/resources/hiding_ci/linux_patches/11-kvm-clock/0001-KVM-x86-use-uhva-for-kvm-clock-if-kvm_gpc_refresh-fa.patch b/resources/hiding_ci/linux_patches/11-kvm-clock/0001-KVM-x86-use-uhva-for-kvm-clock-if-kvm_gpc_refresh-fa.patch
deleted file mode 100644
index 755f1c0c73c..00000000000
--- a/resources/hiding_ci/linux_patches/11-kvm-clock/0001-KVM-x86-use-uhva-for-kvm-clock-if-kvm_gpc_refresh-fa.patch
+++ /dev/null
@@ -1,103 +0,0 @@
-From 0a04094c8b7e292fcb7bdf8528d70baddbfff379 Mon Sep 17 00:00:00 2001
-From: Patrick Roy
-Date: Fri, 18 Jul 2025 15:59:39 +0100
-Subject: [PATCH 01/15] KVM: x86: use uhva for kvm-clock if kvm_gpc_refresh()
- fails
-
-kvm-clock uses a gfn_to_pfn_cache to avoid repeated gpa->pfn
-computations, relying on mmu notifiers to determine when the translation
-needs to be redone.
-
-If the guest places the kvm-clock for some vcpu into memory that is
-backed by a KVM_MEMSLOT_GMEM_ONLY memslot, and the guest_memfd instance
-has GUEST_MEMFD_FLAG_NO_DIRECT_MAP set, this does not work:
-gfn_to_pfn_cache internally uses GUP to resolve uhva->pfn, which
-returned -EFAULT for direct map removed memory. But even if this pfn
-computation were to work, the subsequent attempts to access guest memory
-through the direct map would obviously fail.
-
-For this scenario, all other parts of kvm fall back to instead accessing
-guest memory through userspace mapping of guest_memfd, which is stored
-in the memslots userspace_addr. Have kvm-clock do the same by handling
-failures in kvm_gpc_refresh() with a fallback to a pvclock update
-routine that operates on userspace mappings. This looses the
-optimization of gfn_to_pfn_cache for these VMs, but on modern hardawre
-kvm-clock update requests should be rare enough for this to not matter
-(and guest_memfd is not support for Xen VMs, where speed of pvclock
-accesses is more relevant).
-
-Alternatively, it would be possible to team gfn_to_pfn_cache about
-(direct map removed) guest_memfd, however the combination of on-demand
-direct map reinsertion (and its induced ref-counting) and hooking
-gfn_to_pfn_caches up to gmem invalidations has proven significantly more
-complex [1], and hence simply falling back to userspace mappings was
-suggested by Sean at one of the guest_memfd upstream calls.
-
-[1]: https://lore.kernel.org/kvm/20240910163038.1298452-9-roypat@amazon.co.uk/
-     https://lore.kernel.org/kvm/20240910163038.1298452-10-roypat@amazon.co.uk/
-
-Signed-off-by: Patrick Roy
----
- arch/x86/kvm/x86.c | 38 +++++++++++++++++++++++++++++++++++++-
- 1 file changed, 37 insertions(+), 1 deletion(-)
-
-diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
-index 33fba801b205..c8fd35c1bbda 100644
---- a/arch/x86/kvm/x86.c
-+++ b/arch/x86/kvm/x86.c
-@@ -3149,6 +3149,40 @@ u64 get_kvmclock_ns(struct kvm *kvm)
- 	return data.clock;
- }
- 
-+static void kvm_setup_guest_pvclock_slow(struct pvclock_vcpu_time_info *ref_hv_clock,
-+					 struct kvm_vcpu *vcpu,
-+					 gpa_t gpa)
-+{
-+	struct pvclock_vcpu_time_info guest_hv_clock;
-+	struct pvclock_vcpu_time_info hv_clock;
-+
-+	memcpy(&hv_clock, ref_hv_clock, sizeof(hv_clock));
-+
-+	kvm_read_guest(vcpu->kvm, gpa, &guest_hv_clock, sizeof(struct pvclock_vcpu_time_info));
-+
-+	/*
-+	 * This VCPU is paused, but it's legal for a guest to read another
-+	 * VCPU's kvmclock, so we really have to follow the specification where
-+	 * it says that version is odd if data is being modified, and even after
-+	 * it is consistent.
-+	 */
-+
-+	guest_hv_clock.version = hv_clock.version = (guest_hv_clock.version + 1) | 1;
-+	smp_wmb();
-+
-+	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
-+	hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
-+
-+	kvm_write_guest(vcpu->kvm, gpa, &hv_clock, sizeof(struct pvclock_vcpu_time_info));
-+
-+	smp_wmb();
-+
-+	++hv_clock.version;
-+	kvm_write_guest(vcpu->kvm, gpa + offsetof(struct pvclock_vcpu_time_info, version), &hv_clock.version, sizeof(hv_clock.version));
-+
-+	trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock);
-+}
-+
- static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock,
- 				    struct kvm_vcpu *vcpu,
- 				    struct gfn_to_pfn_cache *gpc,
-@@ -3164,8 +3198,10 @@ static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock,
- 	while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
- 		read_unlock_irqrestore(&gpc->lock, flags);
- 
--		if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock)))
-+		if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock))) {
-+			kvm_setup_guest_pvclock_slow(ref_hv_clock, vcpu, gpc->gpa + offset);
- 			return;
-+		}
- 
- 		read_lock_irqsave(&gpc->lock, flags);
- 	}
--- 
-2.51.0
-
diff --git a/resources/hiding_ci/linux_patches/11-kvm-clock/0002-KVM-pfncache-Use-vmap-for-guest_memfd-pages-without-.patch b/resources/hiding_ci/linux_patches/11-kvm-clock/0002-KVM-pfncache-Use-vmap-for-guest_memfd-pages-without-.patch
new file mode 100644
index 00000000000..bc08c508086
--- /dev/null
+++ b/resources/hiding_ci/linux_patches/11-kvm-clock/0002-KVM-pfncache-Use-vmap-for-guest_memfd-pages-without-.patch
@@ -0,0 +1,115 @@
+From 6c90c75b15ba48cc5e0a1e74224c041c3c45668b Mon Sep 17 00:00:00 2001
+From: Takahiro Itazuri
+Date: Mon, 1 Dec 2025 16:47:05 +0000
+Subject: [RFC PATCH 2/2] KVM: pfncache: Use vmap() for guest_memfd pages without direct map
+
+gfn_to_pfn_cache currently maps RAM PFNs with kmap(), which relies on
+the direct map. guest_memfd instances created with
+GUEST_MEMFD_FLAG_NO_DIRECT_MAP disable their direct-map PTEs via
+set_direct_map_valid_noflush(), so the linear address returned by
+kmap()/page_address() will fault if dereferenced.
+
+In some cases, gfn_to_pfn_cache dereferences the cached kernel address
+(khva) from atomic contexts where page faults cannot be tolerated.
+Therefore, khva must always refer to a fault-free kernel mapping. Since
+mapping and unmapping happen exclusively in the refresh path, which may
+sleep, using vmap()/vunmap() for these pages is safe and sufficient.
+
+Introduce kvm_slot_no_direct_map() to detect guest_memfd slots without
+the direct map, and make gpc_map()/gpc_unmap() use vmap()/vunmap() for
+such pages.
+
+This allows the facilities based on gfn_to_pfn_cache (e.g. kvm-clock) to
+work correctly with guest_memfd regardless of whether its direct-map
+PTEs are valid.
+
+Signed-off-by: Takahiro Itazuri
+---
+ include/linux/kvm_host.h |  7 +++++++
+ virt/kvm/pfncache.c      | 26 ++++++++++++++++++++------
+ 2 files changed, 27 insertions(+), 6 deletions(-)
+
+diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
+index 70e6a5210ceb..793d98f97928 100644
+--- a/include/linux/kvm_host.h
++++ b/include/linux/kvm_host.h
+@@ -15,6 +15,7 @@
+ #include 
+ #include 
+ #include 
++#include 
+ #include 
+ #include 
+ #include 
+@@ -628,6 +629,12 @@ static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *sl
+ 	return slot->flags & KVM_MEM_LOG_DIRTY_PAGES;
+ }
+ 
++static inline bool kvm_slot_no_direct_map(const struct kvm_memory_slot *slot)
++{
++	return slot && kvm_slot_has_gmem(slot) &&
++	       mapping_no_direct_map(slot->gmem.file->f_mapping);
++}
++
+ static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot)
+ {
+ 	return ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
+index bf8d6090e283..ae6d8699e536 100644
+--- a/virt/kvm/pfncache.c
++++ b/virt/kvm/pfncache.c
+@@ -96,10 +96,16 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len)
+ 	return true;
+ }
+ 
+-static void *gpc_map(kvm_pfn_t pfn)
++static void *gpc_map(struct gfn_to_pfn_cache *gpc, kvm_pfn_t pfn)
+ {
+-	if (pfn_valid(pfn))
+-		return kmap(pfn_to_page(pfn));
++	if (pfn_valid(pfn)) {
++		struct page *page = pfn_to_page(pfn);
++
++		if (kvm_slot_no_direct_map(gpc->memslot))
++			return vmap(&page, 1, VM_MAP, PAGE_KERNEL);
++
++		return kmap(page);
++	}
+ 
+ #ifdef CONFIG_HAS_IOMEM
+ 	return memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
+@@ -115,6 +121,11 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva)
+ 		return;
+ 
+ 	if (pfn_valid(pfn)) {
++		if (is_vmalloc_addr(khva)) {
++			vunmap(khva);
++			return;
++		}
++
+ 		kunmap(pfn_to_page(pfn));
+ 		return;
+ 	}
+@@ -224,13 +235,16 @@ static kvm_pfn_t gpc_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
+ 
+ 	/*
+ 	 * Obtain a new kernel mapping if KVM itself will access the
+-	 * pfn. Note, kmap() and memremap() can both sleep, so this
+-	 * too must be done outside of gpc->lock!
++	 * pfn. Note, kmap(), vmap() and memremap() can sleep, so this
++	 * too must be done outside of gpc->lock! Note that even though
++	 * the rwlock is dropped, it's still fine to access gpc->pfn and
++	 * other fields because the gpc->refresh_lock mutex prevents those
++	 * from being changed.
+ 	 */
+ 	if (new_pfn == gpc->pfn)
+ 		new_khva = old_khva;
+ 	else
+-		new_khva = gpc_map(new_pfn);
++		new_khva = gpc_map(gpc, new_pfn);
+ 
+ 	if (!new_khva) {
+ 		kvm_release_page_unused(page);
+-- 
+2.50.1
+
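
Reviewer note (not part of the diff above): the hunks interleave a rename and three
mapping paths, so here is a consolidated sketch of gpc_map()/gpc_unmap() as they
would read with both RFC patches applied. It is illustrative only: lines not visible
in the diff context (gpc_unmap()'s error-pfn checks and the memunmap() tail) are
reconstructed from upstream virt/kvm/pfncache.c, and kvm_slot_no_direct_map() /
mapping_no_direct_map() come from patch 2 and the prerequisite series [1],
respectively.

static void *gpc_map(struct gfn_to_pfn_cache *gpc, kvm_pfn_t pfn)
{
	if (pfn_valid(pfn)) {
		struct page *page = pfn_to_page(pfn);

		/*
		 * Slot backed by guest_memfd with its direct-map PTEs
		 * invalidated: kmap() would hand back a faulting linear
		 * address, so build a dedicated fault-free alias instead.
		 * vmap() may sleep, but this runs on the refresh path,
		 * which is allowed to sleep.
		 */
		if (kvm_slot_no_direct_map(gpc->memslot))
			return vmap(&page, 1, VM_MAP, PAGE_KERNEL);

		/* Direct map intact: kmap() returns the linear address. */
		return kmap(page);
	}

#ifdef CONFIG_HAS_IOMEM
	/* Non-RAM pfn (pfn_valid() is false), e.g. device memory. */
	return memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
#else
	return NULL;
#endif
}

static void gpc_unmap(kvm_pfn_t pfn, void *khva)
{
	/* Unmap the old pfn/page if it was mapped before. */
	if (is_error_noslot_pfn(pfn) || !khva)
		return;

	if (pfn_valid(pfn)) {
		/* vmap()ed addresses live in the vmalloc range. */
		if (is_vmalloc_addr(khva)) {
			vunmap(khva);
			return;
		}

		kunmap(pfn_to_page(pfn));
		return;
	}

#ifdef CONFIG_HAS_IOMEM
	memunmap(khva);
#endif
}

The design hinges on is_vmalloc_addr(): a vmap()ed khva is distinguishable from a
kmap()/direct-map address by its address range alone, so gpc_unmap() needs no extra
state in the cache and its signature stays unchanged.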