Skip to content

Commit 4b5bc2e

Browse files
dwmw2 authored and Ingo Molnar committed
x86/kexec: Allocate PGD for x86_64 transition page tables separately
Now that the following fix:

  d0ceea6 ("x86/mm: Add _PAGE_NOPTISHADOW bit to avoid updating userspace page tables")

stops kernel_ident_mapping_init() from scribbling over the end of a 4KiB PGD by assuming the following 4KiB will be a userspace PGD, there's no good reason for the kexec PGD to be part of a single 8KiB allocation with the control_code_page.

(It's not clear that that was the reason for x86_64 kexec doing it that way in the first place either; there were no comments to that effect and it seems to have been the case even before PTI came along. It looks like it was just a happy accident which prevented memory corruption on kexec.)

Either way, it definitely isn't needed now. Just allocate the PGD separately on x86_64, like i386 already does.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: https://lore.kernel.org/r/20241205153343.3275139-6-dwmw2@infradead.org
1 parent 9e5683e commit 4b5bc2e

File tree

2 files changed

+38
-25
lines changed

2 files changed

+38
-25
lines changed

arch/x86/include/asm/kexec.h

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
# define PAGES_NR 4
1717
#endif
1818

19+
# define KEXEC_CONTROL_PAGE_SIZE 4096
1920
# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
2021

2122
#ifndef __ASSEMBLY__
@@ -43,7 +44,6 @@ struct kimage;
4344
/* Maximum address we can use for the control code buffer */
4445
# define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
4546

46-
# define KEXEC_CONTROL_PAGE_SIZE 4096
4747

4848
/* The native architecture */
4949
# define KEXEC_ARCH KEXEC_ARCH_386
@@ -58,9 +58,6 @@ struct kimage;
5858
/* Maximum address we can use for the control pages */
5959
# define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1)
6060

61-
/* Allocate one page for the pdp and the second for the code */
62-
# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL)
63-
6461
/* The native architecture */
6562
# define KEXEC_ARCH KEXEC_ARCH_X86_64
6663
#endif
@@ -145,6 +142,19 @@ struct kimage_arch {
145142
};
146143
#else
147144
struct kimage_arch {
145+
/*
146+
* This is a kimage control page, as it must not overlap with either
147+
* source or destination address ranges.
148+
*/
149+
pgd_t *pgd;
150+
/*
151+
* The virtual mapping of the control code page itself is used only
152+
* during the transition, while the current kernel's pages are all
153+
* in place. Thus the intermediate page table pages used to map it
154+
* are not control pages, but instead just normal pages obtained
155+
* with get_zeroed_page(). And have to be tracked (below) so that
156+
* they can be freed.
157+
*/
148158
p4d_t *p4d;
149159
pud_t *pud;
150160
pmd_t *pmd;

arch/x86/kernel/machine_kexec_64.c

Lines changed: 24 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,8 @@ static void free_transition_pgtable(struct kimage *image)
146146
image->arch.pte = NULL;
147147
}
148148

149-
static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
149+
static int init_transition_pgtable(struct kimage *image, pgd_t *pgd,
150+
unsigned long control_page)
150151
{
151152
pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
152153
unsigned long vaddr, paddr;
@@ -157,7 +158,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
157158
pte_t *pte;
158159

159160
vaddr = (unsigned long)relocate_kernel;
160-
paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
161+
paddr = control_page;
161162
pgd += pgd_index(vaddr);
162163
if (!pgd_present(*pgd)) {
163164
p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
@@ -216,7 +217,7 @@ static void *alloc_pgt_page(void *data)
216217
return p;
217218
}
218219

219-
static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
220+
static int init_pgtable(struct kimage *image, unsigned long control_page)
220221
{
221222
struct x86_mapping_info info = {
222223
.alloc_pgt_page = alloc_pgt_page,
@@ -225,12 +226,12 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
225226
.kernpg_flag = _KERNPG_TABLE_NOENC,
226227
};
227228
unsigned long mstart, mend;
228-
pgd_t *level4p;
229229
int result;
230230
int i;
231231

232-
level4p = (pgd_t *)__va(start_pgtable);
233-
clear_page(level4p);
232+
image->arch.pgd = alloc_pgt_page(image);
233+
if (!image->arch.pgd)
234+
return -ENOMEM;
234235

235236
if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
236237
info.page_flag |= _PAGE_ENC;
@@ -244,8 +245,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
244245
mstart = pfn_mapped[i].start << PAGE_SHIFT;
245246
mend = pfn_mapped[i].end << PAGE_SHIFT;
246247

247-
result = kernel_ident_mapping_init(&info,
248-
level4p, mstart, mend);
248+
result = kernel_ident_mapping_init(&info, image->arch.pgd,
249+
mstart, mend);
249250
if (result)
250251
return result;
251252
}
@@ -260,8 +261,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
260261
mstart = image->segment[i].mem;
261262
mend = mstart + image->segment[i].memsz;
262263

263-
result = kernel_ident_mapping_init(&info,
264-
level4p, mstart, mend);
264+
result = kernel_ident_mapping_init(&info, image->arch.pgd,
265+
mstart, mend);
265266

266267
if (result)
267268
return result;
@@ -271,15 +272,19 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
271272
* Prepare EFI systab and ACPI tables for kexec kernel since they are
272273
* not covered by pfn_mapped.
273274
*/
274-
result = map_efi_systab(&info, level4p);
275+
result = map_efi_systab(&info, image->arch.pgd);
275276
if (result)
276277
return result;
277278

278-
result = map_acpi_tables(&info, level4p);
279+
result = map_acpi_tables(&info, image->arch.pgd);
279280
if (result)
280281
return result;
281282

282-
return init_transition_pgtable(image, level4p);
283+
/*
284+
* This must be last because the intermediate page table pages it
285+
* allocates will not be control pages and may overlap the image.
286+
*/
287+
return init_transition_pgtable(image, image->arch.pgd, control_page);
283288
}
284289

285290
static void load_segments(void)
@@ -296,14 +301,14 @@ static void load_segments(void)
296301

297302
int machine_kexec_prepare(struct kimage *image)
298303
{
299-
unsigned long start_pgtable;
304+
unsigned long control_page;
300305
int result;
301306

302307
/* Calculate the offsets */
303-
start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
308+
control_page = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
304309

305310
/* Setup the identity mapped 64bit page table */
306-
result = init_pgtable(image, start_pgtable);
311+
result = init_pgtable(image, control_page);
307312
if (result)
308313
return result;
309314

@@ -357,13 +362,12 @@ void machine_kexec(struct kimage *image)
357362
#endif
358363
}
359364

360-
control_page = page_address(image->control_code_page) + PAGE_SIZE;
365+
control_page = page_address(image->control_code_page);
361366
__memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
362367

363368
page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
364369
page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
365-
page_list[PA_TABLE_PAGE] =
366-
(unsigned long)__pa(page_address(image->control_code_page));
370+
page_list[PA_TABLE_PAGE] = (unsigned long)__pa(image->arch.pgd);
367371

368372
if (image->type == KEXEC_TYPE_DEFAULT)
369373
page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
@@ -573,8 +577,7 @@ static void kexec_mark_crashkres(bool protect)
573577

574578
/* Don't touch the control code page used in crash_kexec().*/
575579
control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
576-
/* Control code page is located in the 2nd page. */
577-
kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect);
580+
kexec_mark_range(crashk_res.start, control - 1, protect);
578581
control += KEXEC_CONTROL_PAGE_SIZE;
579582
kexec_mark_range(control, crashk_res.end, protect);
580583
}

0 commit comments

Comments
 (0)