From 29d443765e9b184cd5de7c275c983f0ef254d912 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Strehovsk=C3=BD?= Date: Tue, 31 Mar 2026 13:49:11 +0900 Subject: [PATCH] Fix ARM64 interface dispatch cache torn read On ARM64, the CHECK_CACHE_ENTRY macro read m_pInstanceType and m_pTargetCode from a cache entry using two separate ldr instructions separated by a control dependency (cmp/bne). ARM64's weak memory model does not order loads across control dependencies, so the hardware can speculatively satisfy the second load (target) before the first (type) commits. When a concurrent thread atomically populates the entry via stlxp/casp (UpdateCacheEntryAtomically), the reader can observe the new m_pInstanceType but the old m_pTargetCode (0), then br to address 0. Fix by using ldp to load both fields in a single instruction (single-copy atomic for a 16-byte-aligned pair on FEAT_LSE2 / ARMv8.4+ hardware; the cache entries are 16-byte aligned), plus a cbz guard to catch torn reads on pre-LSE2 hardware where ldp pair atomicity is not architecturally guaranteed. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/runtime/arm64/StubDispatch.S | 25 +++++++++++++++++----- src/coreclr/runtime/arm64/StubDispatch.asm | 25 +++++++++++++++++----- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/src/coreclr/runtime/arm64/StubDispatch.S b/src/coreclr/runtime/arm64/StubDispatch.S index 750a99db42638a..6ff65e4b9b18ae 100644 --- a/src/coreclr/runtime/arm64/StubDispatch.S +++ b/src/coreclr/runtime/arm64/StubDispatch.S @@ -16,15 +16,30 @@ // Macro that generates code to check a single cache entry. .macro CHECK_CACHE_ENTRY entry // Check a single entry in the cache. - // x9 : Cache data structure. Also used for target address jump. 
+ // x9 : Cache data structure // x10 : Instance MethodTable* // x11 : Indirection cell address, preserved - // x12 : Trashed - ldr x12, [x9, #(OFFSETOF__InterfaceDispatchCache__m_rgEntries + (\entry * 16))] + // x12, x13 : Trashed + // + // Use ldp to load both m_pInstanceType and m_pTargetCode in a single instruction. + // On ARM64 two separate ldr instructions can be reordered across a control dependency, + // which means a concurrent atomic cache entry update (via stlxp) could be observed as a + // torn read (new type, old target). ldp is single-copy atomic for the pair on FEAT_LSE2 + // hardware (ARMv8.4+). The cbz guard ensures correctness on pre-LSE2 hardware too: + // a torn read can only produce a zero target (entries go from 0,0 to type,target), + // so we treat it as a cache miss. + .if (OFFSETOF__InterfaceDispatchCache__m_rgEntries + (\entry * 16)) > 504 + // ldp's signed immediate offset must be in [-512,504] for 64-bit registers. + // Use add to reach far entries in the 32/64 slot stubs. + add x12, x9, #(OFFSETOF__InterfaceDispatchCache__m_rgEntries + (\entry * 16)) + ldp x12, x13, [x12] + .else + ldp x12, x13, [x9, #(OFFSETOF__InterfaceDispatchCache__m_rgEntries + (\entry * 16))] + .endif cmp x10, x12 bne 0f - ldr x9, [x9, #(OFFSETOF__InterfaceDispatchCache__m_rgEntries + (\entry * 16) + 8)] - br x9 + cbz x13, 0f + br x13 0: .endm diff --git a/src/coreclr/runtime/arm64/StubDispatch.asm b/src/coreclr/runtime/arm64/StubDispatch.asm index 697d3a10f52e01..1609666fb0d327 100644 --- a/src/coreclr/runtime/arm64/StubDispatch.asm +++ b/src/coreclr/runtime/arm64/StubDispatch.asm @@ -13,15 +13,30 @@ MACRO CHECK_CACHE_ENTRY $entry ;; Check a single entry in the cache. - ;; x9 : Cache data structure. Also used for target address jump. 
+ ;; x9 : Cache data structure ;; x10 : Instance MethodTable* ;; x11 : Indirection cell address, preserved - ;; x12 : Trashed - ldr x12, [x9, #(OFFSETOF__InterfaceDispatchCache__m_rgEntries + ($entry * 16))] + ;; x12, x13 : Trashed + ;; + ;; Use ldp to load both m_pInstanceType and m_pTargetCode in a single instruction. + ;; On ARM64 two separate ldr instructions can be reordered across a control dependency, + ;; which means a concurrent atomic cache entry update (via stlxp) could be observed as a + ;; torn read (new type, old target). ldp is single-copy atomic for the pair on FEAT_LSE2 + ;; hardware (ARMv8.4+). The cbz guard ensures correctness on pre-LSE2 hardware too: + ;; a torn read can only produce a zero target (entries go from 0,0 to type,target), + ;; so we treat it as a cache miss. + IF (OFFSETOF__InterfaceDispatchCache__m_rgEntries + ($entry * 16)) > 504 + ;; ldp's signed immediate offset must be in [-512,504] for 64-bit registers. + ;; Use add to reach far entries in the 32/64 slot stubs. + add x12, x9, #(OFFSETOF__InterfaceDispatchCache__m_rgEntries + ($entry * 16)) + ldp x12, x13, [x12] + ELSE + ldp x12, x13, [x9, #(OFFSETOF__InterfaceDispatchCache__m_rgEntries + ($entry * 16))] + ENDIF cmp x10, x12 bne %ft0 - ldr x9, [x9, #(OFFSETOF__InterfaceDispatchCache__m_rgEntries + ($entry * 16) + 8)] - br x9 + cbz x13, %ft0 + br x13 0 MEND