Commit c0213b0

Daniel Bristot de Oliveira authored, Ingo Molnar committed
x86/alternative: Batch of patch operations
Currently, the patch of an address is done in three steps:

-- Pseudo-code #1 - Current implementation ---
        1) add an int3 trap to the address that will be patched
            sync cores (send IPI to all other CPUs)
        2) update all but the first byte of the patched range
            sync cores (send IPI to all other CPUs)
        3) replace the first byte (int3) by the first byte of replacing opcode
            sync cores (send IPI to all other CPUs)
-- Pseudo-code #1 ---

When a static key has more than one entry, these steps are called once for
each entry. The number of IPIs then is linear with regard to the number 'n'
of entries of a key: O(n*3), which is O(n).

This algorithm works fine for the update of a single key. But we think
it is possible to optimize the case in which a static key has more than
one entry. For instance, the sched_schedstats jump label has 56 entries
in my (updated) fedora kernel, resulting in 168 IPIs for each CPU in
which the thread that is enabling the key is _not_ running.

With this patch, rather than receiving a single patch to be processed,
a vector of patches is passed, enabling the rewrite of pseudo-code #1
in this way:

-- Pseudo-code #2 - This patch ---
1)  for each patch in the vector:
        add an int3 trap to the address that will be patched

    sync cores (send IPI to all other CPUs)

2)  for each patch in the vector:
        update all but the first byte of the patched range

    sync cores (send IPI to all other CPUs)

3)  for each patch in the vector:
        replace the first byte (int3) by the first byte of replacing opcode

    sync cores (send IPI to all other CPUs)
-- Pseudo-code #2 - This patch ---

Doing the update in this way, the number of IPIs becomes O(3) with regard
to the number of keys, which is O(1).

The batch mode is done with the function text_poke_bp_batch(), which
receives two arguments: a vector of "struct text_poke_loc", and the
number of entries in the vector.

The vector must be sorted by the addr field of the text_poke_loc
structure, enabling the binary search for a handler in the
poke_int3_handler() function (a fast path).

Signed-off-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris von Recklinghausen <crecklin@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott Wood <swood@redhat.com>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/ca506ed52584c80f64de23f6f55ca288e5d079de.1560325897.git.bristot@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
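To make the calling convention concrete, here is a minimal caller-side sketch (not part of the commit): build an array of struct text_poke_loc, sort it by the addr field so poke_int3_handler() can bsearch() it, and invoke text_poke_bp_batch() under text_mutex, which the function asserts is held. The poke_loc_cmp() and apply_batch() helpers are hypothetical names.

	#include <linux/memory.h>	/* text_mutex */
	#include <linux/mutex.h>
	#include <linux/sort.h>
	#include <asm/text-patching.h>

	/* hypothetical comparator: order entries by patch address */
	static int poke_loc_cmp(const void *a, const void *b)
	{
		const struct text_poke_loc *tpa = a, *tpb = b;

		if (tpa->addr < tpb->addr)
			return -1;
		if (tpa->addr > tpb->addr)
			return 1;
		return 0;
	}

	/* hypothetical wrapper: sort the vector, then fire one batched update */
	static void apply_batch(struct text_poke_loc *vec, unsigned int nr)
	{
		sort(vec, nr, sizeof(*vec), poke_loc_cmp, NULL);

		mutex_lock(&text_mutex);
		text_poke_bp_batch(vec, nr);
		mutex_unlock(&text_mutex);
	}

However many entries the vector holds, this costs three rounds of core syncs rather than three per entry.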
1 parent 0f13302 commit c0213b0

2 files changed: +135 −34 lines

arch/x86/include/asm/text-patching.h

Lines changed: 15 additions & 0 deletions
@@ -18,6 +18,20 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
 #define __parainstructions_end	NULL
 #endif
 
+/*
+ * Currently, the max observed size in the kernel code is
+ * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5.
+ * Raise it if needed.
+ */
+#define POKE_MAX_OPCODE_SIZE	5
+
+struct text_poke_loc {
+	void *detour;
+	void *addr;
+	size_t len;
+	const char opcode[POKE_MAX_OPCODE_SIZE];
+};
+
 extern void text_poke_early(void *addr, const void *opcode, size_t len);
 
 /*
@@ -38,6 +52,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
 extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
 extern int poke_int3_handler(struct pt_regs *regs);
 extern void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
+extern void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries);
 extern int after_bootmem;
 extern __ro_after_init struct mm_struct *poking_mm;
 extern __ro_after_init unsigned long poking_addr;
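As a worked example of what fits in POKE_MAX_OPCODE_SIZE (a sketch, not part of the commit): a 5-byte JMP rel32, the RELATIVEJUMP_SIZE case the comment refers to. The emit_jump() helper and the raw 0xe9 opcode byte are illustrative assumptions; the cast that discards the const qualifier on opcode mirrors what text_poke_bp() itself does.

	#include <linux/string.h>
	#include <linux/types.h>
	#include <asm/text-patching.h>

	/* illustrative: build one text_poke_loc that rewrites addr into JMP target */
	static void emit_jump(struct text_poke_loc *tp, void *addr, void *target)
	{
		/*
		 * JMP rel32 (opcode 0xe9): the displacement is relative to the
		 * end of the 5-byte instruction. Assumes target is within the
		 * ±2 GiB reach of a rel32.
		 */
		s32 rel = (s32)((long)target - ((long)addr + 5));

		tp->addr = addr;
		tp->detour = target;	/* CPUs hitting the int3 are diverted here */
		tp->len = 5;

		((char *)tp->opcode)[0] = 0xe9;	/* cast away const, as text_poke_bp() does */
		memcpy((char *)tp->opcode + 1, &rel, sizeof(rel));
	}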

arch/x86/kernel/alternative.c

Lines changed: 120 additions & 34 deletions
@@ -14,6 +14,7 @@
 #include <linux/kdebug.h>
 #include <linux/kprobes.h>
 #include <linux/mmu_context.h>
+#include <linux/bsearch.h>
 #include <asm/text-patching.h>
 #include <asm/alternative.h>
 #include <asm/sections.h>
@@ -848,81 +849,133 @@ static void do_sync_core(void *info)
 	sync_core();
 }
 
-static bool bp_patching_in_progress;
-static void *bp_int3_handler, *bp_int3_addr;
+static struct bp_patching_desc {
+	struct text_poke_loc *vec;
+	int nr_entries;
+} bp_patching;
+
+static int patch_cmp(const void *key, const void *elt)
+{
+	struct text_poke_loc *tp = (struct text_poke_loc *) elt;
+
+	if (key < tp->addr)
+		return -1;
+	if (key > tp->addr)
+		return 1;
+	return 0;
+}
+NOKPROBE_SYMBOL(patch_cmp);
 
 int poke_int3_handler(struct pt_regs *regs)
 {
+	struct text_poke_loc *tp;
+	unsigned char int3 = 0xcc;
+	void *ip;
+
 	/*
 	 * Having observed our INT3 instruction, we now must observe
-	 * bp_patching_in_progress.
+	 * bp_patching.nr_entries.
 	 *
-	 * 	in_progress = TRUE		INT3
+	 * 	nr_entries != 0			INT3
 	 * 	WMB				RMB
-	 * 	write INT3			if (in_progress)
+	 * 	write INT3			if (nr_entries)
 	 *
-	 * Idem for bp_int3_handler.
+	 * Idem for other elements in bp_patching.
 	 */
 	smp_rmb();
 
-	if (likely(!bp_patching_in_progress))
+	if (likely(!bp_patching.nr_entries))
 		return 0;
 
-	if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
+	if (user_mode(regs))
 		return 0;
 
-	/* set up the specified breakpoint handler */
-	regs->ip = (unsigned long) bp_int3_handler;
+	/*
+	 * Discount the sizeof(int3). See text_poke_bp_batch().
+	 */
+	ip = (void *) regs->ip - sizeof(int3);
+
+	/*
+	 * Skip the binary search if there is a single member in the vector.
+	 */
+	if (unlikely(bp_patching.nr_entries > 1)) {
+		tp = bsearch(ip, bp_patching.vec, bp_patching.nr_entries,
+			     sizeof(struct text_poke_loc),
+			     patch_cmp);
+		if (!tp)
+			return 0;
+	} else {
+		tp = bp_patching.vec;
+		if (tp->addr != ip)
+			return 0;
+	}
+
+	/* set up the specified breakpoint detour */
+	regs->ip = (unsigned long) tp->detour;
 
 	return 1;
 }
 NOKPROBE_SYMBOL(poke_int3_handler);
 
 /**
- * text_poke_bp() -- update instructions on live kernel on SMP
- * @addr:	address to patch
- * @opcode:	opcode of new instruction
- * @len:	length to copy
- * @handler:	address to jump to when the temporary breakpoint is hit
+ * text_poke_bp_batch() -- update instructions on live kernel on SMP
+ * @tp:			vector of instructions to patch
+ * @nr_entries:		number of entries in the vector
 *
 * Modify multi-byte instruction by using int3 breakpoint on SMP.
 * We completely avoid stop_machine() here, and achieve the
 * synchronization using int3 breakpoint.
 *
 * The way it is done:
- *	- add a int3 trap to the address that will be patched
+ *	- For each entry in the vector:
+ *		- add a int3 trap to the address that will be patched
 *	- sync cores
- *	- update all but the first byte of the patched range
+ *	- For each entry in the vector:
+ *		- update all but the first byte of the patched range
 *	- sync cores
- *	- replace the first byte (int3) by the first byte of
- *	  replacing opcode
+ *	- For each entry in the vector:
+ *		- replace the first byte (int3) by the first byte of
+ *		  replacing opcode
 *	- sync cores
 */
-void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
+void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
 {
+	int patched_all_but_first = 0;
 	unsigned char int3 = 0xcc;
-
-	bp_int3_handler = handler;
-	bp_int3_addr = (u8 *)addr + sizeof(int3);
-	bp_patching_in_progress = true;
+	unsigned int i;
 
 	lockdep_assert_held(&text_mutex);
 
+	bp_patching.vec = tp;
+	bp_patching.nr_entries = nr_entries;
+
 	/*
 	 * Corresponding read barrier in int3 notifier for making sure the
-	 * in_progress and handler are correctly ordered wrt. patching.
+	 * nr_entries and handler are correctly ordered wrt. patching.
 	 */
 	smp_wmb();
 
-	text_poke(addr, &int3, sizeof(int3));
+	/*
	 * First step: add a int3 trap to the address that will be patched.
+	 */
+	for (i = 0; i < nr_entries; i++)
+		text_poke(tp[i].addr, &int3, sizeof(int3));
 
 	on_each_cpu(do_sync_core, NULL, 1);
 
-	if (len - sizeof(int3) > 0) {
-		/* patch all but the first byte */
-		text_poke((char *)addr + sizeof(int3),
-			  (const char *) opcode + sizeof(int3),
-			  len - sizeof(int3));
+	/*
+	 * Second step: update all but the first byte of the patched range.
+	 */
+	for (i = 0; i < nr_entries; i++) {
+		if (tp[i].len - sizeof(int3) > 0) {
+			text_poke((char *)tp[i].addr + sizeof(int3),
+				  (const char *)tp[i].opcode + sizeof(int3),
+				  tp[i].len - sizeof(int3));
+			patched_all_but_first++;
+		}
+	}
+
+	if (patched_all_but_first) {
 		/*
 		 * According to Intel, this core syncing is very likely
 		 * not necessary and we'd be safe even without it. But
@@ -931,14 +984,47 @@ void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
 		on_each_cpu(do_sync_core, NULL, 1);
 	}
 
-	/* patch the first byte */
-	text_poke(addr, opcode, sizeof(int3));
+	/*
+	 * Third step: replace the first byte (int3) by the first byte of
+	 * replacing opcode.
+	 */
+	for (i = 0; i < nr_entries; i++)
+		text_poke(tp[i].addr, tp[i].opcode, sizeof(int3));
 
 	on_each_cpu(do_sync_core, NULL, 1);
 	/*
 	 * sync_core() implies an smp_mb() and orders this store against
 	 * the writing of the new instruction.
 	 */
-	bp_patching_in_progress = false;
+	bp_patching.vec = NULL;
+	bp_patching.nr_entries = 0;
 }
 
+/**
+ * text_poke_bp() -- update instructions on live kernel on SMP
+ * @addr:	address to patch
+ * @opcode:	opcode of new instruction
+ * @len:	length to copy
+ * @handler:	address to jump to when the temporary breakpoint is hit
+ *
+ * Update a single instruction with the vector in the stack, avoiding
+ * dynamically allocated memory. This function should be used when it is
+ * not possible to allocate memory.
+ */
+void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
+{
+	struct text_poke_loc tp = {
+		.detour = handler,
+		.addr = addr,
+		.len = len,
+	};
+
+	if (len > POKE_MAX_OPCODE_SIZE) {
+		WARN_ONCE(1, "len is larger than %d\n", POKE_MAX_OPCODE_SIZE);
+		return;
+	}
+
+	memcpy((void *)tp.opcode, opcode, len);
+
+	text_poke_bp_batch(&tp, 1);
+}
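For the static-key use case the changelog describes, a consumer would accumulate patch sites and flush them in one batch. Below is a rough sketch under stated assumptions: TP_VEC_MAX, tp_vec_queue() and tp_vec_flush() are hypothetical names (a similar scheme was the intended consumer for batched jump label updates), and callers are assumed to queue sites in ascending ->addr order so each flushed vector stays sorted for the handler's binary search.

	#include <linux/mm.h>		/* PAGE_SIZE */
	#include <linux/string.h>
	#include <asm/text-patching.h>

	/* assumed capacity: one page worth of patch entries */
	#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))

	static struct text_poke_loc tp_vec[TP_VEC_MAX];
	static unsigned int tp_vec_nr;

	/* flush: one batched update, i.e. three rounds of syncs for all queued sites */
	static void tp_vec_flush(void)
	{
		if (tp_vec_nr) {
			text_poke_bp_batch(tp_vec, tp_vec_nr);
			tp_vec_nr = 0;
		}
	}

	/* queue one site; memcpy() because text_poke_loc has a const member */
	static void tp_vec_queue(const struct text_poke_loc *tp)
	{
		if (tp_vec_nr == TP_VEC_MAX)
			tp_vec_flush();

		memcpy(&tp_vec[tp_vec_nr++], tp, sizeof(*tp));
	}

With the changelog's sched_schedstats example (56 entries), this turns 56 * 3 = 168 sync IPIs per remote CPU into just 3.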
