From dc43978aa57835b1d9cab54dafdf28b02a9440ea Mon Sep 17 00:00:00 2001
From: Konstantin Belousov <kib@FreeBSD.org>
Date: Tue, 14 Jul 2020 20:37:50 +0000
Subject: [PATCH] amd64: allow parallel shootdown IPIs

Stop using smp_ipi_mtx to protect global shootdown state, and
move/multiply the global state into pcpu.  Now each CPU can initiate
shootdown IPI independently from other CPUs.  Initiator enters
critical section, then fills its local PCPU shootdown info
(pc_smp_tlb_XXX), then clears scoreboard generation at location (cpu,
my_cpuid) for each target cpu.  After that IPI is sent to all targets
which scan for zeroed scoreboard generation words.  Upon finding such
word the shootdown data is read from corresponding cpu's pcpu, and
generation is set.  Meantime initiator loops waiting for all zeroed
generations in scoreboard to update.

Initiator does not disable interrupts, which should prevent
non-invalidation IPIs from deadlocking; it only needs to disable
preemption to pin itself to the instance of the pcpu smp_tlb data.

The generation is set before the actual invalidation is performed in
handler. It is safe because target CPU cannot return to userspace
before handler finishes. In principle only NMI can preempt the
handler, but NMI would see the kernel handler frame and not touch
not-invalidated user page table.

Handlers loop until they do not see zeroed scoreboard generations.
This, together with hardware keeping one pending IPI in LAPIC IRR
should prevent lost shootdowns.

Notes.
1. The code does not protect writes to LAPIC ICR with exclusion. I
   believe this is fine because we in fact do not send IPIs from
   interrupt handlers. Moreover, for !x2APIC mode, where an ICR write
   requires two register writes, we disable interrupts around it. If
   considered incorrect, I can add per-cpu spinlock around ipi_send().
2. Scoreboard lines owned by given target CPU can be padded to the
   cache line, to reduce ping-pong.

Reviewed by:	markj (previous version)
Discussed with:	alc
Tested by:	pho
Sponsored by:	The FreeBSD Foundation
MFC after:	3 weeks
Differential revision:	https://reviews.freebsd.org/D25510
---
 sys/amd64/amd64/apic_vector.S  |  60 +---
 sys/amd64/amd64/db_interface.c |   1 -
 sys/amd64/amd64/machdep.c      |   1 +
 sys/amd64/amd64/mp_machdep.c   | 490 ++++++++++++++++++++++++++++-----
 sys/amd64/include/pcpu.h       |   9 +-
 sys/amd64/include/smp.h        |  26 +-
 sys/i386/i386/mp_machdep.c     | 220 +++++++++++++++
 sys/i386/include/smp.h         |  11 +
 sys/x86/include/apicvar.h      |   3 +-
 sys/x86/include/x86_smp.h      |   8 -
 sys/x86/x86/mp_x86.c           | 235 ----------------
 sys/x86/xen/xen_apic.c         |  82 +-----
 12 files changed, 689 insertions(+), 457 deletions(-)

diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index 114d52e5a8f213..8d73717b03b33b 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -171,63 +171,13 @@ IDTVEC(spuriousint)
 	.text
 
 	SUPERALIGN_TEXT
-invltlb_ret:
-	call	as_lapic_eoi
-	jmp	ld_regs
-
-	SUPERALIGN_TEXT
-	INTR_HANDLER invltlb
-	call	invltlb_handler
-	jmp	invltlb_ret
-
-	INTR_HANDLER invltlb_pcid
-	call	invltlb_pcid_handler
-	jmp	invltlb_ret
-
-	INTR_HANDLER invltlb_invpcid_nopti
-	call	invltlb_invpcid_handler
-	jmp	invltlb_ret
-
-	INTR_HANDLER invltlb_invpcid_pti
-	call	invltlb_invpcid_pti_handler
-	jmp	invltlb_ret
-
-/*
- * Single page TLB shootdown
- */
-	INTR_HANDLER invlpg
-	call	invlpg_handler
-	jmp	invltlb_ret
-
-	INTR_HANDLER invlpg_invpcid
-	call	invlpg_invpcid_handler
-	jmp	invltlb_ret
-
-	INTR_HANDLER invlpg_pcid
-	call	invlpg_pcid_handler
-	jmp	invltlb_ret
-
-/*
- * Page range TLB shootdown.
- */
-	INTR_HANDLER invlrng
-	call	invlrng_handler
-	jmp	invltlb_ret
-
-	INTR_HANDLER invlrng_invpcid
-	call	invlrng_invpcid_handler
-	jmp	invltlb_ret
-
-	INTR_HANDLER invlrng_pcid
-	call	invlrng_pcid_handler
-	jmp	invltlb_ret
-
 /*
- * Invalidate cache.
+ * IPI handler for cache and TLB shootdown
  */
-	INTR_HANDLER invlcache
-	call	invlcache_handler
-	jmp	invltlb_ret
+	INTR_HANDLER invlop
+	call	invlop_handler
+	call	as_lapic_eoi
+	jmp	ld_regs
 
 /*
  * Handler for IPIs sent via the per-cpu IPI bitmap.
diff --git a/sys/amd64/amd64/db_interface.c b/sys/amd64/amd64/db_interface.c
index 4645169af56208..e35248b0766356 100644
--- a/sys/amd64/amd64/db_interface.c
+++ b/sys/amd64/amd64/db_interface.c
@@ -107,5 +107,4 @@ db_show_mdpcpu(struct pcpu *pc)
 	db_printf("gs32p        = %p\n", pc->pc_gs32p);
 	db_printf("ldt          = %p\n", pc->pc_ldt);
 	db_printf("tss          = %p\n", pc->pc_tss);
-	db_printf("tlb gen      = %u\n", pc->pc_smp_tlb_done);
 }
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 4fb846d759485d..1a07080c5dafea 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -1562,6 +1562,7 @@ amd64_bsp_pcpu_init1(struct pcpu *pc)
 	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
 	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
 	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
+	PCPU_SET(smp_tlb_gen, 1);
 }
 
 void
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index 7085a9b1c54041..f5de90484384c3 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
 #ifdef GPROF 
 #include <sys/gmon.h>
 #endif
+#include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
@@ -202,36 +203,8 @@ cpu_mp_start(void)
 		cpu_apic_ids[i] = -1;
 	}
 
-	/* Install an inter-CPU IPI for TLB invalidation */
-	if (pmap_pcid_enabled) {
-		if (invpcid_works) {
-			setidt(IPI_INVLTLB, pti ?
-			    IDTVEC(invltlb_invpcid_pti_pti) :
-			    IDTVEC(invltlb_invpcid_nopti), SDT_SYSIGT,
-			    SEL_KPL, 0);
-			setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_invpcid_pti) :
-			    IDTVEC(invlpg_invpcid), SDT_SYSIGT, SEL_KPL, 0);
-			setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_invpcid_pti) :
-			    IDTVEC(invlrng_invpcid), SDT_SYSIGT, SEL_KPL, 0);
-		} else {
-			setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_pcid_pti) :
-			    IDTVEC(invltlb_pcid), SDT_SYSIGT, SEL_KPL, 0);
-			setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_pcid_pti) :
-			    IDTVEC(invlpg_pcid), SDT_SYSIGT, SEL_KPL, 0);
-			setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_pcid_pti) :
-			    IDTVEC(invlrng_pcid), SDT_SYSIGT, SEL_KPL, 0);
-		}
-	} else {
-		setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_pti) : IDTVEC(invltlb),
-		    SDT_SYSIGT, SEL_KPL, 0);
-		setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_pti) : IDTVEC(invlpg),
-		    SDT_SYSIGT, SEL_KPL, 0);
-		setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_pti) : IDTVEC(invlrng),
-		    SDT_SYSIGT, SEL_KPL, 0);
-	}
-
-	/* Install an inter-CPU IPI for cache invalidation. */
-	setidt(IPI_INVLCACHE, pti ? IDTVEC(invlcache_pti) : IDTVEC(invlcache),
+	/* Install an inter-CPU IPI for cache and TLB invalidations. */
+	setidt(IPI_INVLOP, pti ? IDTVEC(invlop_pti) : IDTVEC(invlop),
 	    SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Install an inter-CPU IPI for all-CPU rendezvous */
@@ -314,6 +287,8 @@ init_secondary(void)
 	pc->pc_pcid_next = PMAP_PCID_KERN + 2;
 	pc->pc_pcid_gen = 1;
 
+	pc->pc_smp_tlb_gen = 1;
+
 	/* Init tss */
 	pc->pc_common_tss = __pcpu[0].pc_common_tss;
 	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
@@ -542,11 +517,270 @@ start_ap(int apic_id)
 	return 0;		/* return FAILURE */
 }
 
+/*
+ * Flush the TLB on other CPUs
+ */
+
+/*
+ * Invalidation request.  PCPU pc_smp_tlb_op uses u_int instead of the
+ * enum to avoid both namespace and ABI issues (with enums).
+ */
+enum invl_op_codes {
+      INVL_OP_TLB		= 1,
+      INVL_OP_TLB_INVPCID	= 2,
+      INVL_OP_TLB_INVPCID_PTI	= 3,
+      INVL_OP_TLB_PCID		= 4,
+      INVL_OP_PGRNG		= 5,
+      INVL_OP_PGRNG_INVPCID	= 6,
+      INVL_OP_PGRNG_PCID	= 7,
+      INVL_OP_PG		= 8,
+      INVL_OP_PG_INVPCID	= 9,
+      INVL_OP_PG_PCID		= 10,
+      INVL_OP_CACHE		= 11,
+};
+
+/*
+ * These variables are initialized at startup to reflect how each of
+ * the different kinds of invalidations should be performed on the
+ * current machine and environment.
+ */
+static enum invl_op_codes invl_op_tlb;
+static enum invl_op_codes invl_op_pgrng;
+static enum invl_op_codes invl_op_pg;
+
+/*
+ * Scoreboard of IPI completion notifications from target to IPI initiator.
+ *
+ * Each CPU can initiate shootdown IPI independently from other CPUs.
+ * Initiator enters critical section, then fills its local PCPU
+ * shootdown info (pc_smp_tlb_ vars), then clears scoreboard generation
+ * at location (cpu, my_cpuid) for each target cpu.  After that IPI is
+ * sent to all targets which scan for zeroed scoreboard generation
+ * words.  Upon finding such word the shootdown data is read from
+ * corresponding cpu's pcpu, and generation is set.  Meantime initiator
+ * loops waiting for all zeroed generations in scoreboard to update.
+ */
+static uint32_t *invl_scoreboard;
+
+static void
+invl_scoreboard_init(void *arg __unused)
+{
+	u_int i;
+
+	invl_scoreboard = malloc(sizeof(uint32_t) * (mp_maxid + 1) *
+	    (mp_maxid + 1), M_DEVBUF, M_WAITOK);
+	for (i = 0; i < (mp_maxid + 1) * (mp_maxid + 1); i++)
+		invl_scoreboard[i] = 1;
+
+	if (pmap_pcid_enabled) {
+		if (invpcid_works) {
+			if (pti)
+				invl_op_tlb = INVL_OP_TLB_INVPCID_PTI;
+			else
+				invl_op_tlb = INVL_OP_TLB_INVPCID;
+			invl_op_pgrng = INVL_OP_PGRNG_INVPCID;
+			invl_op_pg = INVL_OP_PG_INVPCID;
+		} else {
+			invl_op_tlb = INVL_OP_TLB_PCID;
+			invl_op_pgrng = INVL_OP_PGRNG_PCID;
+			invl_op_pg = INVL_OP_PG_PCID;
+		}
+	} else {
+		invl_op_tlb = INVL_OP_TLB;
+		invl_op_pgrng = INVL_OP_PGRNG;
+		invl_op_pg = INVL_OP_PG;
+	}
+}
+SYSINIT(invl_ops, SI_SUB_SMP, SI_ORDER_FIRST, invl_scoreboard_init, NULL);
+
+static uint32_t *
+invl_scoreboard_getcpu(u_int cpu)
+{
+	return (invl_scoreboard + cpu * (mp_maxid + 1));
+}
+
+static uint32_t *
+invl_scoreboard_slot(u_int cpu)
+{
+	return (invl_scoreboard_getcpu(cpu) + PCPU_GET(cpuid));
+}
+
+/*
+ * Used by pmap to request cache or TLB invalidation on local and
+ * remote processors.  Mask provides the set of remote CPUs which are
+ * to be signalled with the invalidation IPI.  As an optimization, the
+ * curcpu_cb callback is invoked on the calling CPU while waiting for
+ * remote CPUs to complete the operation.
+ *
+ * The callback function is called unconditionally on the caller's
+ * underlying processor, even when this processor is not set in the
+ * mask.  So, the callback function must be prepared to handle such
+ * spurious invocations.
+ *
+ * Interrupts must be enabled when calling the function with smp
+ * started, to avoid deadlock with other IPIs that are protected with
+ * smp_ipi_mtx spinlock at the initiator side.
+ */
+static void
+smp_targeted_tlb_shootdown(cpuset_t mask, pmap_t pmap, vm_offset_t addr1,
+    vm_offset_t addr2, smp_invl_cb_t curcpu_cb, enum invl_op_codes op)
+{
+	cpuset_t other_cpus, mask1;
+	uint32_t generation, *p_cpudone;
+	int cpu;
+
+	/*
+	 * It is not necessary to signal other CPUs while booting or
+	 * when in the debugger.
+	 */
+	if (kdb_active || KERNEL_PANICKED() || !smp_started) {
+		curcpu_cb(pmap, addr1, addr2);
+		return;
+	}
+
+	sched_pin();
+
+	/*
+	 * Check for other cpus.  Return if none.
+	 */
+	if (CPU_ISFULLSET(&mask)) {
+		if (mp_ncpus <= 1)
+			goto nospinexit;
+	} else {
+		CPU_CLR(PCPU_GET(cpuid), &mask);
+		if (CPU_EMPTY(&mask))
+			goto nospinexit;
+	}
+
+	/*
+	 * Initiator must have interrupts enabled, which prevents
+	 * non-invalidation IPIs, that take smp_ipi_mtx spinlock,
+	 * from deadlocking with us.  On the other hand, preemption
+	 * must be disabled to pin initiator to the instance of the
+	 * pcpu pc_smp_tlb data and scoreboard line.
+	 */
+	KASSERT((read_rflags() & PSL_I) != 0,
+	    ("smp_targeted_tlb_shootdown: interrupts disabled"));
+	critical_enter();
+
+	PCPU_SET(smp_tlb_addr1, addr1);
+	PCPU_SET(smp_tlb_addr2, addr2);
+	PCPU_SET(smp_tlb_pmap, pmap);
+	generation = PCPU_GET(smp_tlb_gen);
+	if (++generation == 0)
+		generation = 1;
+	PCPU_SET(smp_tlb_gen, generation);
+	PCPU_SET(smp_tlb_op, op);
+	/* Fence between filling smp_tlb fields and clearing scoreboard. */
+	atomic_thread_fence_rel();
+
+	mask1 = mask;
+	while ((cpu = CPU_FFS(&mask1)) != 0) {
+		cpu--;
+		CPU_CLR(cpu, &mask1);
+		KASSERT(*invl_scoreboard_slot(cpu) != 0,
+		    ("IPI scoreboard is zero, initiator %d target %d",
+		    PCPU_GET(cpuid), cpu));
+		*invl_scoreboard_slot(cpu) = 0;
+	}
+
+	/*
+	 * IPI acts as a fence between writing to the scoreboard above
+	 * (zeroing slot) and reading from it below (wait for
+	 * acknowledge).
+	 */
+	if (CPU_ISFULLSET(&mask)) {
+		ipi_all_but_self(IPI_INVLOP);
+		other_cpus = all_cpus;
+		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+	} else {
+		other_cpus = mask;
+		while ((cpu = CPU_FFS(&mask)) != 0) {
+			cpu--;
+			CPU_CLR(cpu, &mask);
+			CTR3(KTR_SMP, "%s: cpu: %d invl ipi op: %x", __func__,
+			    cpu, op);
+			ipi_send_cpu(cpu, IPI_INVLOP);
+		}
+	}
+	curcpu_cb(pmap, addr1, addr2);
+	while ((cpu = CPU_FFS(&other_cpus)) != 0) {
+		cpu--;
+		CPU_CLR(cpu, &other_cpus);
+		p_cpudone = invl_scoreboard_slot(cpu);
+		while (atomic_load_int(p_cpudone) != generation)
+			ia32_pause();
+	}
+	critical_exit();
+	sched_unpin();
+	return;
+
+nospinexit:
+	curcpu_cb(pmap, addr1, addr2);
+	sched_unpin();
+}
+
+void
+smp_masked_invltlb(cpuset_t mask, pmap_t pmap, smp_invl_cb_t curcpu_cb)
+{
+	smp_targeted_tlb_shootdown(mask, pmap, 0, 0, curcpu_cb, invl_op_tlb);
+#ifdef COUNT_XINVLTLB_HITS
+	ipi_global++;
+#endif
+}
+
+void
+smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap,
+    smp_invl_cb_t curcpu_cb)
+{
+	smp_targeted_tlb_shootdown(mask, pmap, addr, 0, curcpu_cb, invl_op_pg);
+#ifdef COUNT_XINVLTLB_HITS
+	ipi_page++;
+#endif
+}
+
+void
+smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
+    pmap_t pmap, smp_invl_cb_t curcpu_cb)
+{
+	smp_targeted_tlb_shootdown(mask, pmap, addr1, addr2, curcpu_cb,
+	    invl_op_pgrng);
+#ifdef COUNT_XINVLTLB_HITS
+	ipi_range++;
+	ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+}
+
 void
-invltlb_invpcid_handler(void)
+smp_cache_flush(smp_invl_cb_t curcpu_cb)
+{
+	smp_targeted_tlb_shootdown(all_cpus, NULL, 0, 0, curcpu_cb,
+	    INVL_OP_CACHE);
+}
+
+/*
+ * Handlers for TLB related IPIs
+ */
+static void
+invltlb_handler(pmap_t smp_tlb_pmap)
+{
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_gbl[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	if (smp_tlb_pmap == kernel_pmap)
+		invltlb_glob();
+	else
+		invltlb();
+}
+
+static void
+invltlb_invpcid_handler(pmap_t smp_tlb_pmap)
 {
 	struct invpcid_descr d;
-	uint32_t generation;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
@@ -555,20 +789,17 @@ invltlb_invpcid_handler(void)
 	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
-	generation = smp_tlb_generation;
 	d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 	d.pad = 0;
 	d.addr = 0;
 	invpcid(&d, smp_tlb_pmap == kernel_pmap ? INVPCID_CTXGLOB :
 	    INVPCID_CTX);
-	PCPU_SET(smp_tlb_done, generation);
 }
 
-void
-invltlb_invpcid_pti_handler(void)
+static void
+invltlb_invpcid_pti_handler(pmap_t smp_tlb_pmap)
 {
 	struct invpcid_descr d;
-	uint32_t generation;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
@@ -577,7 +808,6 @@ invltlb_invpcid_pti_handler(void)
 	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
-	generation = smp_tlb_generation;
 	d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 	d.pad = 0;
 	d.addr = 0;
@@ -594,14 +824,13 @@ invltlb_invpcid_pti_handler(void)
 		d.pcid |= PMAP_PCID_USER_PT;
 		invpcid(&d, INVPCID_CTX);
 	}
-	PCPU_SET(smp_tlb_done, generation);
 }
 
-void
-invltlb_pcid_handler(void)
+static void
+invltlb_pcid_handler(pmap_t smp_tlb_pmap)
 {
 	uint64_t kcr3, ucr3;
-	uint32_t generation, pcid;
+	uint32_t pcid;
   
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
@@ -610,7 +839,6 @@ invltlb_pcid_handler(void)
 	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
-	generation = smp_tlb_generation;	/* Overlap with serialization */
 	if (smp_tlb_pmap == kernel_pmap) {
 		invltlb_glob();
 	} else {
@@ -632,14 +860,25 @@ invltlb_pcid_handler(void)
 				load_cr3(kcr3);
 		}
 	}
-	PCPU_SET(smp_tlb_done, generation);
 }
 
-void
-invlpg_invpcid_handler(void)
+static void
+invlpg_handler(vm_offset_t smp_tlb_addr1)
+{
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_pg[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	invlpg(smp_tlb_addr1);
+}
+
+static void
+invlpg_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1)
 {
 	struct invpcid_descr d;
-	uint32_t generation;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_pg[PCPU_GET(cpuid)]++;
@@ -648,7 +887,6 @@ invlpg_invpcid_handler(void)
 	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
-	generation = smp_tlb_generation;	/* Overlap with serialization */
 	invlpg(smp_tlb_addr1);
 	if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3) {
 		d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid |
@@ -657,14 +895,12 @@ invlpg_invpcid_handler(void)
 		d.addr = smp_tlb_addr1;
 		invpcid(&d, INVPCID_ADDR);
 	}
-	PCPU_SET(smp_tlb_done, generation);
 }
 
-void
-invlpg_pcid_handler(void)
+static void
+invlpg_pcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1)
 {
 	uint64_t kcr3, ucr3;
-	uint32_t generation;
 	uint32_t pcid;
 
 #ifdef COUNT_XINVLTLB_HITS
@@ -674,7 +910,6 @@ invlpg_pcid_handler(void)
 	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
-	generation = smp_tlb_generation;	/* Overlap with serialization */
 	invlpg(smp_tlb_addr1);
 	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
 	    (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3) {
@@ -683,15 +918,34 @@ invlpg_pcid_handler(void)
 		ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 		pmap_pti_pcid_invlpg(ucr3, kcr3, smp_tlb_addr1);
 	}
-	PCPU_SET(smp_tlb_done, generation);
 }
 
-void
-invlrng_invpcid_handler(void)
+static void
+invlrng_handler(vm_offset_t smp_tlb_addr1, vm_offset_t smp_tlb_addr2)
+{
+	vm_offset_t addr, addr2;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_rng[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	addr = smp_tlb_addr1;
+	addr2 = smp_tlb_addr2;
+	do {
+		invlpg(addr);
+		addr += PAGE_SIZE;
+	} while (addr < addr2);
+}
+
+static void
+invlrng_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1,
+    vm_offset_t smp_tlb_addr2)
 {
 	struct invpcid_descr d;
 	vm_offset_t addr, addr2;
-	uint32_t generation;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_rng[PCPU_GET(cpuid)]++;
@@ -702,7 +956,6 @@ invlrng_invpcid_handler(void)
 
 	addr = smp_tlb_addr1;
 	addr2 = smp_tlb_addr2;
-	generation = smp_tlb_generation;	/* Overlap with serialization */
 	do {
 		invlpg(addr);
 		addr += PAGE_SIZE;
@@ -717,15 +970,14 @@ invlrng_invpcid_handler(void)
 			d.addr += PAGE_SIZE;
 		} while (d.addr < addr2);
 	}
-	PCPU_SET(smp_tlb_done, generation);
 }
 
-void
-invlrng_pcid_handler(void)
+static void
+invlrng_pcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1,
+    vm_offset_t smp_tlb_addr2)
 {
 	vm_offset_t addr, addr2;
 	uint64_t kcr3, ucr3;
-	uint32_t generation;
 	uint32_t pcid;
 
 #ifdef COUNT_XINVLTLB_HITS
@@ -737,7 +989,6 @@ invlrng_pcid_handler(void)
 
 	addr = smp_tlb_addr1;
 	addr2 = smp_tlb_addr2;
-	generation = smp_tlb_generation;	/* Overlap with serialization */
 	do {
 		invlpg(addr);
 		addr += PAGE_SIZE;
@@ -749,5 +1000,116 @@ invlrng_pcid_handler(void)
 		ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 		pmap_pti_pcid_invlrng(ucr3, kcr3, smp_tlb_addr1, addr2);
 	}
-	PCPU_SET(smp_tlb_done, generation);
+}
+
+static void
+invlcache_handler(void)
+{
+#ifdef COUNT_IPIS
+	(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+	wbinvd();
+}
+
+static void
+invlop_handler_one_req(enum invl_op_codes smp_tlb_op, pmap_t smp_tlb_pmap,
+    vm_offset_t smp_tlb_addr1, vm_offset_t smp_tlb_addr2)
+{
+	switch (smp_tlb_op) {
+	case INVL_OP_TLB:
+		invltlb_handler(smp_tlb_pmap);
+		break;
+	case INVL_OP_TLB_INVPCID:
+		invltlb_invpcid_handler(smp_tlb_pmap);
+		break;
+	case INVL_OP_TLB_INVPCID_PTI:
+		invltlb_invpcid_pti_handler(smp_tlb_pmap);
+		break;
+	case INVL_OP_TLB_PCID:
+		invltlb_pcid_handler(smp_tlb_pmap);
+		break;
+	case INVL_OP_PGRNG:
+		invlrng_handler(smp_tlb_addr1, smp_tlb_addr2);
+		break;
+	case INVL_OP_PGRNG_INVPCID:
+		invlrng_invpcid_handler(smp_tlb_pmap, smp_tlb_addr1,
+		    smp_tlb_addr2);
+		break;
+	case INVL_OP_PGRNG_PCID:
+		invlrng_pcid_handler(smp_tlb_pmap, smp_tlb_addr1,
+		    smp_tlb_addr2);
+		break;
+	case INVL_OP_PG:
+		invlpg_handler(smp_tlb_addr1);
+		break;
+	case INVL_OP_PG_INVPCID:
+		invlpg_invpcid_handler(smp_tlb_pmap, smp_tlb_addr1);
+		break;
+	case INVL_OP_PG_PCID:
+		invlpg_pcid_handler(smp_tlb_pmap, smp_tlb_addr1);
+		break;
+	case INVL_OP_CACHE:
+		invlcache_handler();
+		break;
+	default:
+		__assert_unreachable();
+		break;
+	}
+}
+
+void
+invlop_handler(void)
+{
+	struct pcpu *initiator_pc;
+	pmap_t smp_tlb_pmap;
+	vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
+	u_int initiator_cpu_id;
+	enum invl_op_codes smp_tlb_op;
+	uint32_t *scoreboard, smp_tlb_gen;
+
+	scoreboard = invl_scoreboard_getcpu(PCPU_GET(cpuid));
+	for (;;) {
+		for (initiator_cpu_id = 0; initiator_cpu_id <= mp_maxid;
+		    initiator_cpu_id++) {
+			if (scoreboard[initiator_cpu_id] == 0)
+				break;
+		}
+		if (initiator_cpu_id > mp_maxid)
+			break;
+		initiator_pc = cpuid_to_pcpu[initiator_cpu_id];
+
+		/*
+		 * This acquire fence, and its corresponding release
+		 * fence in smp_targeted_tlb_shootdown(), sit between
+		 * reading zero scoreboard slot and accessing PCPU of
+		 * initiator for pc_smp_tlb values.
+		 */
+		atomic_thread_fence_acq();
+		smp_tlb_pmap = initiator_pc->pc_smp_tlb_pmap;
+		smp_tlb_addr1 = initiator_pc->pc_smp_tlb_addr1;
+		smp_tlb_addr2 = initiator_pc->pc_smp_tlb_addr2;
+		smp_tlb_op = initiator_pc->pc_smp_tlb_op;
+		smp_tlb_gen = initiator_pc->pc_smp_tlb_gen;
+
+		/*
+		 * Ensure that we do not make our scoreboard
+		 * notification visible to the initiator until the
+		 * pc_smp_tlb values are read.  The corresponding
+		 * fence is implicitly provided by the barrier in the
+		 * IPI send operation before the APIC ICR register
+		 * write.
+		 *
+		 * As an optimization, the request is acknowledged
+		 * before the actual invalidation is performed.  It is
+		 * safe because target CPU cannot return to userspace
+		 * before handler finishes. Only NMI can preempt the
+		 * handler, but NMI would see the kernel handler frame
+		 * and not touch not-invalidated user page table.
+		 */
+		atomic_thread_fence_acq();
+		atomic_store_int(&scoreboard[initiator_cpu_id], smp_tlb_gen);
+
+		invlop_handler_one_req(smp_tlb_op, smp_tlb_pmap, smp_tlb_addr1,
+		    smp_tlb_addr2);
+	}
 }
diff --git a/sys/amd64/include/pcpu.h b/sys/amd64/include/pcpu.h
index b7b546ed2b6de0..22c6ed40aa20e8 100644
--- a/sys/amd64/include/pcpu.h
+++ b/sys/amd64/include/pcpu.h
@@ -85,7 +85,7 @@ _Static_assert(sizeof(struct monitorbuf) == 128, "2x cache line");
 	u_int	pc_vcpu_id;		/* Xen vCPU ID */		\
 	uint32_t pc_pcid_next;						\
 	uint32_t pc_pcid_gen;						\
-	uint32_t pc_smp_tlb_done;	/* TLB op acknowledgement */	\
+	uint32_t pc_unused;						\
 	uint32_t pc_ibpb_set;						\
 	void	*pc_mds_buf;						\
 	void	*pc_mds_buf64;						\
@@ -94,7 +94,12 @@ _Static_assert(sizeof(struct monitorbuf) == 128, "2x cache line");
 	u_int 	pc_ipi_bitmap;						\
 	struct amd64tss pc_common_tss;					\
 	struct user_segment_descriptor pc_gdt[NGDT];			\
-	char	__pad[2956]		/* pad to UMA_PCPU_ALLOC_SIZE */
+	void	*pc_smp_tlb_pmap;					\
+	uint64_t pc_smp_tlb_addr1;					\
+	uint64_t pc_smp_tlb_addr2;					\
+	uint32_t pc_smp_tlb_gen;					\
+	u_int	pc_smp_tlb_op;						\
+	char	__pad[2924]		/* pad to UMA_PCPU_ALLOC_SIZE */
 
 #define	PC_DBREG_CMD_NONE	0
 #define	PC_DBREG_CMD_LOAD	1
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
index 2ecfe62cf9fbf7..d5b5fa9c5b815a 100644
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -29,34 +29,14 @@ extern u_int32_t		mptramp_pagetables;
 inthand_t
 	IDTVEC(justreturn),	/* interrupt CPU with minimum overhead */
 	IDTVEC(justreturn1_pti),
-	IDTVEC(invltlb_pti),
-	IDTVEC(invltlb_pcid_pti),
-	IDTVEC(invltlb_pcid),	/* TLB shootdowns - global, pcid */
-	IDTVEC(invltlb_invpcid_pti_pti),
-	IDTVEC(invltlb_invpcid_nopti),
-	IDTVEC(invlpg_pti),
-	IDTVEC(invlpg_invpcid_pti),
-	IDTVEC(invlpg_invpcid),
-	IDTVEC(invlpg_pcid_pti),
-	IDTVEC(invlpg_pcid),
-	IDTVEC(invlrng_pti),
-	IDTVEC(invlrng_invpcid_pti),
-	IDTVEC(invlrng_invpcid),
-	IDTVEC(invlrng_pcid_pti),
-	IDTVEC(invlrng_pcid),
-	IDTVEC(invlcache_pti),
+	IDTVEC(invlop_pti),
+	IDTVEC(invlop),
 	IDTVEC(ipi_intr_bitmap_handler_pti),
 	IDTVEC(cpustop_pti),
 	IDTVEC(cpususpend_pti),
 	IDTVEC(rendezvous_pti);
 
-void	invltlb_pcid_handler(void);
-void	invltlb_invpcid_handler(void);
-void	invltlb_invpcid_pti_handler(void);
-void	invlpg_invpcid_handler(void);
-void	invlpg_pcid_handler(void);
-void	invlrng_invpcid_handler(void);
-void	invlrng_pcid_handler(void);
+void	invlop_handler(void);
 int	native_start_all_aps(void);
 void	mp_bootaddress(vm_paddr_t *, unsigned int *);
 
diff --git a/sys/i386/i386/mp_machdep.c b/sys/i386/i386/mp_machdep.c
index 953e34d2962c53..663fd98e5c8aeb 100644
--- a/sys/i386/i386/mp_machdep.c
+++ b/sys/i386/i386/mp_machdep.c
@@ -54,6 +54,7 @@ __FBSDID("$FreeBSD$");
 #ifdef GPROF 
 #include <sys/gmon.h>
 #endif
+#include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
@@ -467,3 +468,222 @@ start_ap(int apic_id)
 	}
 	return 0;		/* return FAILURE */
 }
+
+/*
+ * Flush the TLB on other CPUs
+ */
+
+/* Variables needed for SMP tlb shootdown. */
+vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
+pmap_t smp_tlb_pmap;
+volatile uint32_t smp_tlb_generation;
+
+/*
+ * Used by pmap to request cache or TLB invalidation on local and
+ * remote processors.  Mask provides the set of remote CPUs which are
+ * to be signalled with the invalidation IPI, specified by vector.  As
+ * an optimization, the curcpu_cb callback is invoked on the calling
+ * CPU while waiting for remote CPUs to complete the operation.
+ *
+ * The callback function is called unconditionally on the caller's
+ * underlying processor, even when this processor is not set in the
+ * mask.  So, the callback function must be prepared to handle such
+ * spurious invocations.
+ */
+static void
+smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
+    vm_offset_t addr1, vm_offset_t addr2, smp_invl_cb_t curcpu_cb)
+{
+	cpuset_t other_cpus;
+	volatile uint32_t *p_cpudone;
+	uint32_t generation;
+	int cpu;
+
+	/*
+	 * It is not necessary to signal other CPUs while booting or
+	 * when in the debugger.
+	 */
+	if (kdb_active || KERNEL_PANICKED() || !smp_started) {
+		curcpu_cb(pmap, addr1, addr2);
+		return;
+	}
+
+	sched_pin();
+
+	/*
+	 * Check for other cpus.  Return if none.
+	 */
+	if (CPU_ISFULLSET(&mask)) {
+		if (mp_ncpus <= 1)
+			goto nospinexit;
+	} else {
+		CPU_CLR(PCPU_GET(cpuid), &mask);
+		if (CPU_EMPTY(&mask))
+			goto nospinexit;
+	}
+
+	KASSERT((read_eflags() & PSL_I) != 0,
+	    ("smp_targeted_tlb_shootdown: interrupts disabled"));
+	mtx_lock_spin(&smp_ipi_mtx);
+	smp_tlb_addr1 = addr1;
+	smp_tlb_addr2 = addr2;
+	smp_tlb_pmap = pmap;
+	generation = ++smp_tlb_generation;
+	if (CPU_ISFULLSET(&mask)) {
+		ipi_all_but_self(vector);
+		other_cpus = all_cpus;
+		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+	} else {
+		other_cpus = mask;
+		while ((cpu = CPU_FFS(&mask)) != 0) {
+			cpu--;
+			CPU_CLR(cpu, &mask);
+			CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
+			    cpu, vector);
+			ipi_send_cpu(cpu, vector);
+		}
+	}
+	curcpu_cb(pmap, addr1, addr2);
+	while ((cpu = CPU_FFS(&other_cpus)) != 0) {
+		cpu--;
+		CPU_CLR(cpu, &other_cpus);
+		p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
+		while (*p_cpudone != generation)
+			ia32_pause();
+	}
+	mtx_unlock_spin(&smp_ipi_mtx);
+	sched_unpin();
+	return;
+
+nospinexit:
+	curcpu_cb(pmap, addr1, addr2);
+	sched_unpin();
+}
+
+void
+smp_masked_invltlb(cpuset_t mask, pmap_t pmap, smp_invl_cb_t curcpu_cb)
+{
+	smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0, curcpu_cb);
+#ifdef COUNT_XINVLTLB_HITS
+	ipi_global++;
+#endif
+}
+
+void
+smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap,
+    smp_invl_cb_t curcpu_cb)
+{
+	smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0, curcpu_cb);
+#ifdef COUNT_XINVLTLB_HITS
+	ipi_page++;
+#endif
+}
+
+void
+smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
+    pmap_t pmap, smp_invl_cb_t curcpu_cb)
+{
+	smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, addr1, addr2,
+	    curcpu_cb);
+#ifdef COUNT_XINVLTLB_HITS
+	ipi_range++;
+	ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+}
+
+void
+smp_cache_flush(smp_invl_cb_t curcpu_cb)
+{
+	smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL, 0, 0,
+	    curcpu_cb);
+}
+
+/*
+ * Handlers for TLB related IPIs
+ */
+void
+invltlb_handler(void)
+{
+	uint32_t generation;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_gbl[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	/*
+	 * Reading the generation here allows greater parallelism
+	 * since invalidating the TLB is a serializing operation.
+	 */
+	generation = smp_tlb_generation;
+	if (smp_tlb_pmap == kernel_pmap)
+		invltlb_glob();
+	PCPU_SET(smp_tlb_done, generation);
+}
+
+void
+invlpg_handler(void)
+{
+	uint32_t generation;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_pg[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	generation = smp_tlb_generation;	/* Overlap with serialization */
+	if (smp_tlb_pmap == kernel_pmap)
+		invlpg(smp_tlb_addr1);
+	PCPU_SET(smp_tlb_done, generation);
+}
+
+void
+invlrng_handler(void)
+{
+	vm_offset_t addr, addr2;
+	uint32_t generation;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_rng[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	addr = smp_tlb_addr1;
+	addr2 = smp_tlb_addr2;
+	generation = smp_tlb_generation;	/* Overlap with serialization */
+	if (smp_tlb_pmap == kernel_pmap) {
+		do {
+			invlpg(addr);
+			addr += PAGE_SIZE;
+		} while (addr < addr2);
+	}
+
+	PCPU_SET(smp_tlb_done, generation);
+}
+
+void
+invlcache_handler(void)
+{
+	uint32_t generation;
+
+#ifdef COUNT_IPIS
+	(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	/*
+	 * Reading the generation here allows greater parallelism
+	 * since wbinvd is a serializing instruction.  Without the
+	 * temporary, we'd wait for wbinvd to complete, then the read
+	 * would execute, then the dependent write, which must then
+	 * complete before return from interrupt.
+	 */
+	generation = smp_tlb_generation;
+	wbinvd();
+	PCPU_SET(smp_tlb_done, generation);
+}
diff --git a/sys/i386/include/smp.h b/sys/i386/include/smp.h
index 4fcb55a41996cb..395695d3ecd203 100644
--- a/sys/i386/include/smp.h
+++ b/sys/i386/include/smp.h
@@ -27,9 +27,20 @@
 #include <x86/apicvar.h>
 #include <machine/pcb.h>
 
+inthand_t
+	IDTVEC(invltlb),	/* TLB shootdowns - global */
+	IDTVEC(invlpg),		/* TLB shootdowns - 1 page */
+	IDTVEC(invlrng),	/* TLB shootdowns - page range */
+	IDTVEC(invlcache);	/* Write back and invalidate cache */
+
 /* functions in mpboot.s */
 void bootMP(void);
 
+void	invltlb_handler(void);
+void	invlpg_handler(void);
+void	invlrng_handler(void);
+void	invlcache_handler(void);
+
 #endif /* !LOCORE */
 #endif /* SMP */
 
diff --git a/sys/x86/include/apicvar.h b/sys/x86/include/apicvar.h
index de85cf9198fd0b..866dafe6dca484 100644
--- a/sys/x86/include/apicvar.h
+++ b/sys/x86/include/apicvar.h
@@ -112,7 +112,8 @@
 #define	APIC_IPI_INTS	(APIC_LOCAL_INTS + 3)
 
 #define	IPI_RENDEZVOUS	(APIC_IPI_INTS)		/* Inter-CPU rendezvous. */
-#define	IPI_INVLTLB	(APIC_IPI_INTS + 1)	/* TLB Shootdown IPIs */
+#define	IPI_INVLOP	(APIC_IPI_INTS + 1)	/* TLB Shootdown IPIs, amd64 */
+#define	IPI_INVLTLB	(APIC_IPI_INTS + 1)	/* TLB Shootdown IPIs, i386 */
 #define	IPI_INVLPG	(APIC_IPI_INTS + 2)
 #define	IPI_INVLRNG	(APIC_IPI_INTS + 3)
 #define	IPI_INVLCACHE	(APIC_IPI_INTS + 4)
diff --git a/sys/x86/include/x86_smp.h b/sys/x86/include/x86_smp.h
index 1a0ef8fbcf78b3..d5535a602bcb33 100644
--- a/sys/x86/include/x86_smp.h
+++ b/sys/x86/include/x86_smp.h
@@ -75,10 +75,6 @@ extern u_long *ipi_rendezvous_counts[MAXCPU];
 
 /* IPI handlers */
 inthand_t
-	IDTVEC(invltlb),	/* TLB shootdowns - global */
-	IDTVEC(invlpg),		/* TLB shootdowns - 1 page */
-	IDTVEC(invlrng),	/* TLB shootdowns - page range */
-	IDTVEC(invlcache),	/* Write back and invalidate cache */
 	IDTVEC(ipi_intr_bitmap_handler), /* Bitmap based IPIs */ 
 	IDTVEC(cpustop),	/* CPU stops & waits to be restarted */
 	IDTVEC(cpususpend),	/* CPU suspends & waits to be resumed */
@@ -94,10 +90,6 @@ void	cpustop_handler(void);
 void	cpususpend_handler(void);
 void	alloc_ap_trampoline(vm_paddr_t *physmap, unsigned int *physmap_idx);
 void	init_secondary_tail(void);
-void	invltlb_handler(void);
-void	invlpg_handler(void);
-void	invlrng_handler(void);
-void	invlcache_handler(void);
 void	init_secondary(void);
 void	ipi_startup(int apic_id, int vector);
 void	ipi_all_but_self(u_int ipi);
diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c
index 85f27d639b69cc..bc1d211a27fd0f 100644
--- a/sys/x86/x86/mp_x86.c
+++ b/sys/x86/x86/mp_x86.c
@@ -1593,28 +1593,6 @@ cpususpend_handler(void)
 	CPU_CLR_ATOMIC(cpu, &toresume_cpus);
 }
 
-
-void
-invlcache_handler(void)
-{
-	uint32_t generation;
-
-#ifdef COUNT_IPIS
-	(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
-#endif /* COUNT_IPIS */
-
-	/*
-	 * Reading the generation here allows greater parallelism
-	 * since wbinvd is a serializing instruction.  Without the
-	 * temporary, we'd wait for wbinvd to complete, then the read
-	 * would execute, then the dependent write, which must then
-	 * complete before return from interrupt.
-	 */
-	generation = smp_tlb_generation;
-	wbinvd();
-	PCPU_SET(smp_tlb_done, generation);
-}
-
 /*
  * This is called once the rest of the system is up and running and we're
  * ready to let the AP's out of the pen.
@@ -1662,216 +1640,3 @@ mp_ipi_intrcnt(void *dummy)
 }
 SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
 #endif
-
-/*
- * Flush the TLB on other CPU's
- */
-
-/* Variables needed for SMP tlb shootdown. */
-vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
-pmap_t smp_tlb_pmap;
-volatile uint32_t smp_tlb_generation;
-
-#ifdef __amd64__
-#define	read_eflags() read_rflags()
-#endif
-
-/*
- * Used by pmap to request invalidation of TLB or cache on local and
- * remote processors.  Mask provides the set of remote CPUs which are
- * to be signalled with the IPI specified by vector.  The curcpu_cb
- * callback is invoked on the calling CPU while waiting for remote
- * CPUs to complete the operation.
- *
- * The callback function is called unconditionally on the caller's
- * underlying processor, even when this processor is not set in the
- * mask.  So, the callback function must be prepared to handle such
- * spurious invocations.
- */
-static void
-smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
-    vm_offset_t addr1, vm_offset_t addr2, smp_invl_cb_t curcpu_cb)
-{
-	cpuset_t other_cpus;
-	volatile uint32_t *p_cpudone;
-	uint32_t generation;
-	int cpu;
-
-	/*
-	 * It is not necessary to signal other CPUs while booting or
-	 * when in the debugger.
-	 */
-	if (kdb_active || KERNEL_PANICKED() || !smp_started) {
-		curcpu_cb(pmap, addr1, addr2);
-		return;
-	}
-
-	sched_pin();
-
-	/*
-	 * Check for other cpus.  Return if none.
-	 */
-	if (CPU_ISFULLSET(&mask)) {
-		if (mp_ncpus <= 1)
-			goto nospinexit;
-	} else {
-		CPU_CLR(PCPU_GET(cpuid), &mask);
-		if (CPU_EMPTY(&mask))
-			goto nospinexit;
-	}
-
-	if (!(read_eflags() & PSL_I))
-		panic("%s: interrupts disabled", __func__);
-	mtx_lock_spin(&smp_ipi_mtx);
-	smp_tlb_addr1 = addr1;
-	smp_tlb_addr2 = addr2;
-	smp_tlb_pmap = pmap;
-	generation = ++smp_tlb_generation;
-	if (CPU_ISFULLSET(&mask)) {
-		ipi_all_but_self(vector);
-		other_cpus = all_cpus;
-		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
-	} else {
-		other_cpus = mask;
-		while ((cpu = CPU_FFS(&mask)) != 0) {
-			cpu--;
-			CPU_CLR(cpu, &mask);
-			CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
-			    cpu, vector);
-			ipi_send_cpu(cpu, vector);
-		}
-	}
-	curcpu_cb(pmap, addr1, addr2);
-	while ((cpu = CPU_FFS(&other_cpus)) != 0) {
-		cpu--;
-		CPU_CLR(cpu, &other_cpus);
-		p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
-		while (*p_cpudone != generation)
-			ia32_pause();
-	}
-	mtx_unlock_spin(&smp_ipi_mtx);
-	sched_unpin();
-	return;
-
-nospinexit:
-	curcpu_cb(pmap, addr1, addr2);
-	sched_unpin();
-}
-
-void
-smp_masked_invltlb(cpuset_t mask, pmap_t pmap, smp_invl_cb_t curcpu_cb)
-{
-
-	smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0, curcpu_cb);
-#ifdef COUNT_XINVLTLB_HITS
-	ipi_global++;
-#endif
-}
-
-void
-smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap,
-    smp_invl_cb_t curcpu_cb)
-{
-
-	smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0, curcpu_cb);
-#ifdef COUNT_XINVLTLB_HITS
-	ipi_page++;
-#endif
-}
-
-void
-smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
-    pmap_t pmap, smp_invl_cb_t curcpu_cb)
-{
-
-	smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, addr1, addr2,
-	    curcpu_cb);
-#ifdef COUNT_XINVLTLB_HITS
-	ipi_range++;
-	ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
-#endif
-}
-
-void
-smp_cache_flush(smp_invl_cb_t curcpu_cb)
-{
-
-	smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL, 0, 0,
-	    curcpu_cb);
-}
-
-/*
- * Handlers for TLB related IPIs
- */
-void
-invltlb_handler(void)
-{
-	uint32_t generation;
-  
-#ifdef COUNT_XINVLTLB_HITS
-	xhits_gbl[PCPU_GET(cpuid)]++;
-#endif /* COUNT_XINVLTLB_HITS */
-#ifdef COUNT_IPIS
-	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
-#endif /* COUNT_IPIS */
-
-	/*
-	 * Reading the generation here allows greater parallelism
-	 * since invalidating the TLB is a serializing operation.
-	 */
-	generation = smp_tlb_generation;
-	if (smp_tlb_pmap == kernel_pmap)
-		invltlb_glob();
-#ifdef __amd64__
-	else
-		invltlb();
-#endif
-	PCPU_SET(smp_tlb_done, generation);
-}
-
-void
-invlpg_handler(void)
-{
-	uint32_t generation;
-
-#ifdef COUNT_XINVLTLB_HITS
-	xhits_pg[PCPU_GET(cpuid)]++;
-#endif /* COUNT_XINVLTLB_HITS */
-#ifdef COUNT_IPIS
-	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
-#endif /* COUNT_IPIS */
-
-	generation = smp_tlb_generation;	/* Overlap with serialization */
-#ifdef __i386__
-	if (smp_tlb_pmap == kernel_pmap)
-#endif
-		invlpg(smp_tlb_addr1);
-	PCPU_SET(smp_tlb_done, generation);
-}
-
-void
-invlrng_handler(void)
-{
-	vm_offset_t addr, addr2;
-	uint32_t generation;
-
-#ifdef COUNT_XINVLTLB_HITS
-	xhits_rng[PCPU_GET(cpuid)]++;
-#endif /* COUNT_XINVLTLB_HITS */
-#ifdef COUNT_IPIS
-	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
-#endif /* COUNT_IPIS */
-
-	addr = smp_tlb_addr1;
-	addr2 = smp_tlb_addr2;
-	generation = smp_tlb_generation;	/* Overlap with serialization */
-#ifdef __i386__
-	if (smp_tlb_pmap == kernel_pmap)
-#endif
-		do {
-			invlpg(addr);
-			addr += PAGE_SIZE;
-		} while (addr < addr2);
-
-	PCPU_SET(smp_tlb_done, generation);
-}
diff --git a/sys/x86/xen/xen_apic.c b/sys/x86/xen/xen_apic.c
index 8bf2158dbec272..7d23f0a504173d 100644
--- a/sys/x86/xen/xen_apic.c
+++ b/sys/x86/xen/xen_apic.c
@@ -65,10 +65,14 @@ __FBSDID("$FreeBSD$");
 /*--------------------------- Forward Declarations ---------------------------*/
 #ifdef SMP
 static driver_filter_t xen_smp_rendezvous_action;
+#ifdef __amd64__
+static driver_filter_t xen_invlop;
+#else
 static driver_filter_t xen_invltlb;
 static driver_filter_t xen_invlpg;
 static driver_filter_t xen_invlrng;
 static driver_filter_t xen_invlcache;
+#endif
 static driver_filter_t xen_ipi_bitmap_handler;
 static driver_filter_t xen_cpustop_handler;
 static driver_filter_t xen_cpususpend_handler;
@@ -88,10 +92,14 @@ struct xen_ipi_handler
 static struct xen_ipi_handler xen_ipis[] = 
 {
 	[IPI_TO_IDX(IPI_RENDEZVOUS)]	= { xen_smp_rendezvous_action,	"r"   },
+#ifdef __amd64__
+	[IPI_TO_IDX(IPI_INVLOP)]	= { xen_invlop,			"itlb"},
+#else
 	[IPI_TO_IDX(IPI_INVLTLB)]	= { xen_invltlb,		"itlb"},
 	[IPI_TO_IDX(IPI_INVLPG)]	= { xen_invlpg,			"ipg" },
 	[IPI_TO_IDX(IPI_INVLRNG)]	= { xen_invlrng,		"irg" },
 	[IPI_TO_IDX(IPI_INVLCACHE)]	= { xen_invlcache,		"ic"  },
+#endif
 	[IPI_TO_IDX(IPI_BITMAP_VECTOR)] = { xen_ipi_bitmap_handler,	"b"   },
 	[IPI_TO_IDX(IPI_STOP)]		= { xen_cpustop_handler,	"st"  },
 	[IPI_TO_IDX(IPI_SUSPEND)]	= { xen_cpususpend_handler,	"sp"  },
@@ -454,71 +462,24 @@ xen_smp_rendezvous_action(void *arg)
 	return (FILTER_HANDLED);
 }
 
-static int
-xen_invltlb(void *arg)
-{
-
-	invltlb_handler();
-	return (FILTER_HANDLED);
-}
-
 #ifdef __amd64__
 static int
-xen_invltlb_invpcid(void *arg)
-{
-
-	invltlb_invpcid_handler();
-	return (FILTER_HANDLED);
-}
-
-static int
-xen_invltlb_pcid(void *arg)
+xen_invlop(void *arg)
 {
 
-	invltlb_pcid_handler();
+	invlop_handler();
 	return (FILTER_HANDLED);
 }
 
-static int
-xen_invltlb_invpcid_pti(void *arg)
-{
-
-	invltlb_invpcid_pti_handler();
-	return (FILTER_HANDLED);
-}
+#else /* __i386__ */
 
 static int
-xen_invlpg_invpcid_handler(void *arg)
-{
-
-	invlpg_invpcid_handler();
-	return (FILTER_HANDLED);
-}
-
-static int
-xen_invlpg_pcid_handler(void *arg)
-{
-
-	invlpg_pcid_handler();
-	return (FILTER_HANDLED);
-}
-
-static int
-xen_invlrng_invpcid_handler(void *arg)
-{
-
-	invlrng_invpcid_handler();
-	return (FILTER_HANDLED);
-}
-
-static int
-xen_invlrng_pcid_handler(void *arg)
+xen_invltlb(void *arg)
 {
 
-	invlrng_pcid_handler();
+	invltlb_handler();
 	return (FILTER_HANDLED);
 }
-#endif
 
 static int
 xen_invlpg(void *arg)
@@ -543,6 +504,7 @@ xen_invlcache(void *arg)
 	invlcache_handler();
 	return (FILTER_HANDLED);
 }
+#endif /* __amd64__ */
 
 static int
 xen_cpustop_handler(void *arg)
@@ -598,22 +560,6 @@ xen_setup_cpus(void)
 	if (!xen_vector_callback_enabled)
 		return;
 
-#ifdef __amd64__
-	if (pmap_pcid_enabled) {
-		if (pti)
-			xen_ipis[IPI_TO_IDX(IPI_INVLTLB)].filter =
-			    invpcid_works ? xen_invltlb_invpcid_pti :
-			    xen_invltlb_pcid;
-		else
-			xen_ipis[IPI_TO_IDX(IPI_INVLTLB)].filter =
-			    invpcid_works ? xen_invltlb_invpcid :
-			    xen_invltlb_pcid;
-		xen_ipis[IPI_TO_IDX(IPI_INVLPG)].filter = invpcid_works ?
-		    xen_invlpg_invpcid_handler : xen_invlpg_pcid_handler;
-		xen_ipis[IPI_TO_IDX(IPI_INVLRNG)].filter = invpcid_works ?
-		    xen_invlrng_invpcid_handler : xen_invlrng_pcid_handler;
-	}
-#endif
 	CPU_FOREACH(i)
 		xen_cpu_ipi_init(i);