From 638564e79287066f6942a1c883caa1d726ab4781 Mon Sep 17 00:00:00 2001 From: "Benjamin Thomas (Aviansie Ben)" Date: Thu, 30 Jul 2020 14:47:42 +0000 Subject: [PATCH 1/4] Clean up some unused declarations in Trampoline.cpp Co-authored-by: Julian Wang Signed-off-by: Ben Thomas --- runtime/compiler/p/codegen/Trampoline.cpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/runtime/compiler/p/codegen/Trampoline.cpp b/runtime/compiler/p/codegen/Trampoline.cpp index ae6bd40c4d7..b27fa33c122 100644 --- a/runtime/compiler/p/codegen/Trampoline.cpp +++ b/runtime/compiler/p/codegen/Trampoline.cpp @@ -34,24 +34,11 @@ namespace TR { class PersistentInfo; } -#if defined(TR_TARGET_64BIT) -#define TRAMPOLINE_SIZE 28 -#define OFFSET_IPIC_TO_CALL 36 -#else -#define TRAMPOLINE_SIZE 16 -#define OFFSET_IPIC_TO_CALL 32 -#endif - extern "C" { extern int __j9_smp_flag; - int32_t ppcTrampolineInitByCodeCache(TR_FrontEnd *, uint8_t *, uintptr_t); }; -#ifdef TR_HOST_POWER -extern void ppcCodeSync(uint8_t *, uint32_t); -#endif - void * ppcPicTrampInit(TR_FrontEnd *vm, TR::PersistentInfo * persistentInfo) { void *retVal = 0; From 756b5d2de054ad7bf1b608bcf4fe4de7addc59c9 Mon Sep 17 00:00:00 2001 From: "Benjamin Thomas (Aviansie Ben)" Date: Tue, 4 Aug 2020 17:15:04 +0000 Subject: [PATCH 2/4] Move CPU detection before code cache initialization With the introduction of POWER10, how the code cache is set up will depend on whether or not POWER10 support is enabled. However, code cache initialization is currently being run prior to CPU detection. In order to make things work correctly, the correct CPU must be detected earlier in the bootstrap process. 
Signed-off-by: Ben Thomas --- runtime/compiler/control/rossa.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime/compiler/control/rossa.cpp b/runtime/compiler/control/rossa.cpp index 7760db565af..55fa843ccd4 100644 --- a/runtime/compiler/control/rossa.cpp +++ b/runtime/compiler/control/rossa.cpp @@ -1310,6 +1310,8 @@ onLoadInternal( // Now that the options have been processed we can initialize the RuntimeAssumptionTables // If we cannot allocate various runtime assumption hash tables, fail the JVM + fe->initializeSystemProperties(); + // Allocate trampolines for z/OS 64-bit #if defined(J9ZOS390) if (TR::Options::getCmdLineOptions()->getOption(TR_EnableRMODE64) && !isQuickstart) @@ -1467,8 +1469,6 @@ onLoadInternal( jitConfig->thunkLookUpNameAndSig = &j9ThunkLookupNameAndSig; - fe->initializeSystemProperties(); - TR::CompilationInfo * compInfo = TR::CompilationInfo::get(); // Now that we have all options (and before starting the compilation thread) we From 3b66ee95c3f68a717b9d397aaabf116172fdd60b Mon Sep 17 00:00:00 2001 From: "Benjamin Thomas (Aviansie Ben)" Date: Thu, 30 Jul 2020 15:20:49 +0000 Subject: [PATCH 3/4] Rework trampolines for POWER10 Since POWER10 supports PC-relative addressing, it is no longer necessary to use the pTOC in trampolines. Instead, a PC-relative load can be used to materialize the address to jump to. This also eliminates the need for temporary trampolines, since trampolines can be atomically patched in-place. The trampoline size has also been set to be a fixed size of 32 bytes. This is done since indirect branch target prediction works based off of 32-byte blocks, meaning that multiple indirect branches in the same 32-byte block will alias each other, causing unnecessary mispredictions. 
Co-authored-by: Julian Wang Signed-off-by: Ben Thomas --- runtime/compiler/runtime/Trampoline.cpp | 203 ++++++++++++++++-------- 1 file changed, 138 insertions(+), 65 deletions(-) diff --git a/runtime/compiler/runtime/Trampoline.cpp b/runtime/compiler/runtime/Trampoline.cpp index 66d1d56d016..af81bd580ee 100644 --- a/runtime/compiler/runtime/Trampoline.cpp +++ b/runtime/compiler/runtime/Trampoline.cpp @@ -36,11 +36,14 @@ #if defined(TR_TARGET_POWER) +// Target address prediction is based on 32-byte blocks on POWER +// we adjust the trampoline size to align with this block-size +// regardless of whether the target is 32-bit or 64-bit. +#define TRAMPOLINE_SIZE 32 + #if defined(TR_TARGET_64BIT) -#define TRAMPOLINE_SIZE 28 #define OFFSET_IPIC_TO_CALL 36 #else -#define TRAMPOLINE_SIZE 16 #define OFFSET_IPIC_TO_CALL 32 #endif @@ -52,13 +55,12 @@ void ppcCodeCacheConfig(int32_t ccSizeInByte, int32_t *numTempTrampolines) { // Estimated: 2KB per method, with 10% being recompiled(multi-times) - *numTempTrampolines = ccSizeInByte>>12; + *numTempTrampolines = TR::Compiler->target.cpu.isAtLeast(OMR_PROCESSOR_PPC_P10) ? 0 : (ccSizeInByte>>13); } void ppcCreateHelperTrampolines(uint8_t *trampPtr, int32_t numHelpers) { TR::CodeCacheConfig &config = TR::CodeCacheManager::instance()->codeCacheConfig(); - static bool customP4 = feGetEnv("TR_CustomP4Trampoline") ? 
true : false; uint8_t *bufferStart = trampPtr, *buffer; for (int32_t cookie=1; cookietarget.cpu.isAtLeast(OMR_PROCESSOR_PPC_P10)) + { + if (!TR::Options::getCmdLineOptions()->getOption(TR_DisableTOC)) + { + // ld gr11, [grPTOC, 8*(cookie-1)] + *(int32_t *)buffer = 0xe9700000 | (((cookie-1)*sizeof(intptr_t)) & 0x0000ffff); + buffer += 4; + } + else + { + // only gr11 is available for helper dispatch + + // lis gr11, upper 16-bits + *(int32_t *)buffer = 0x3d600000 | ((helper>>48) & 0x0000ffff); + buffer += 4; + + // oris gr11, gr11, bits 16--31 + *(int32_t *)buffer = 0x656b0000 | ((helper>>32) & 0x0000ffff); + buffer += 4; + + // rldicr gr11, gr11, 32, 31 + *(int32_t *)buffer = 0x796b07c6; + buffer += 4; + + // oris gr11, gr11, bits 32-47 + *(int32_t *)buffer = 0x656b0000 | ((helper>>16) & 0x0000ffff); + buffer += 4; + + // ori gr11, gr11, bits 48--63 + *(int32_t *)buffer = 0x616b0000 | (helper & 0x0000ffff); + buffer += 4; + } + } + else + { + // pld gr11, [,16], 1 (PC-relative) + *(int32_t *)buffer = 0x04100000; + buffer += 4; + *(int32_t *)buffer = 0xe5600010; buffer += 4; + } #else + if (!TR::Compiler->target.cpu.isAtLeast(OMR_PROCESSOR_PPC_P10)) + { // For POWER4 which has a problem with the CTR/LR cache when the upper // bits are not 0 extended.. 
Use li/oris when the 16th bit is off - if (!(helper & 0x00008000) ) + if (!(helper & 0x00008000)) { // li r11, lower *(int32_t *)buffer = 0x39600000 | (helper & 0x0000ffff); @@ -95,16 +137,22 @@ void ppcCreateHelperTrampolines(uint8_t *trampPtr, int32_t numHelpers) *(int32_t *)buffer = 0x396b0000 | (helper & 0x0000ffff); buffer += 4; - // Now, if highest bit is on we need to clear the sign extend bits on 64bit CPUs - // ** POWER4 pref fix ** - if ((helper & 0x80000000) && (!customP4 || TR::comp()->target().cpu.is(OMR_PROCESSOR_PPC_GP))) + if (helper & 0x80000000) { // rlwinm r11,r11,sh=0,mb=0,me=31 *(int32_t *)buffer = 0x556b003e; buffer += 4; } } - + } + else + { + // plwz gr11, [,16], 1 (PC-relative) + *(int32_t *)buffer = 0x06100000; + buffer += 4; + *(int32_t *)buffer = 0x81600010; + buffer += 4; + } #endif // mtctr r11 @@ -114,46 +162,61 @@ void ppcCreateHelperTrampolines(uint8_t *trampPtr, int32_t numHelpers) // bctr *(int32_t *)buffer = 0x4e800420; buffer += 4; - } + + if (TR::Compiler->target.cpu.isAtLeast(OMR_PROCESSOR_PPC_P10)) + { + *(intptr_t *)buffer = helper; + } + } + #ifdef TR_HOST_POWER ppcCodeSync(trampPtr, config.trampolineCodeSize() * numHelpers); #endif - } void ppcCreateMethodTrampoline(void *trampPtr, void *startPC, void *method) { - static bool customP4 = feGetEnv("TR_CustomP4Trampoline") ? true : false; uint8_t *buffer = (uint8_t *)trampPtr; J9::PrivateLinkage::LinkageInfo *linkInfo = J9::PrivateLinkage::LinkageInfo::get(startPC); intptr_t dispatcher = (intptr_t)((uint8_t *)startPC + linkInfo->getReservedWord()); // Take advantage of both gr0 and gr11 ... 
#if defined(TR_TARGET_64BIT) - // lis gr0, upper 16-bits - *(int32_t *)buffer = 0x3c000000 | ((dispatcher>>48) & 0x0000ffff); - buffer += 4; + if (!TR::Compiler->target.cpu.isAtLeast(OMR_PROCESSOR_PPC_P10)) + { + // lis gr0, upper 16-bits + *(int32_t *)buffer = 0x3c000000 | ((dispatcher>>48) & 0x0000ffff); + buffer += 4; - // lis gr11, bits 32--47 - *(int32_t *)buffer = 0x3d600000 | ((dispatcher>>16) & 0x0000ffff); - buffer += 4; + // lis gr11, bits 32--47 + *(int32_t *)buffer = 0x3d600000 | ((dispatcher>>16) & 0x0000ffff); + buffer += 4; - // ori gr0, gr0, bits 16-31 - *(int32_t *)buffer = 0x60000000 | ((dispatcher>>32) & 0x0000ffff); - buffer += 4; + // ori gr0, gr0, bits 16-31 + *(int32_t *)buffer = 0x60000000 | ((dispatcher>>32) & 0x0000ffff); + buffer += 4; - // ori gr11, gr11, bits 48--63 - *(int32_t *)buffer = 0x616b0000 | (dispatcher & 0x0000ffff); - buffer += 4; + // ori gr11, gr11, bits 48--63 + *(int32_t *)buffer = 0x616b0000 | (dispatcher & 0x0000ffff); + buffer += 4; - // rldimi gr11, gr0, 32, 0 - *(int32_t *)buffer = 0x780b000e; - buffer += 4; + // rldimi gr11, gr0, 32, 0 + *(int32_t *)buffer = 0x780b000e; + buffer += 4; + } + else + { + // pld gr11, [,16], 1 (PC-relative) + *(int32_t *)buffer = 0x04100000; + buffer += 4; + *(int32_t *)buffer = 0xe5600010; + buffer += 4; + } #else - // For POWER4 which has a problem with the CTR/LR cache when the upper - // bits are not 0 extended. Use li/oris when the 16th bit is off - if (customP4) + if (!TR::Compiler->target.cpu.isAtLeast(OMR_PROCESSOR_PPC_P10)) { + // For POWER4 which has a problem with the CTR/LR cache when the upper + // bits are not 0 extended. 
Use li/oris when the 16th bit is off if (!(dispatcher & 0x00008000)) { // li r11, lower @@ -167,16 +230,15 @@ void ppcCreateMethodTrampoline(void *trampPtr, void *startPC, void *method) else { // lis gr11, upper - *(int32_t *)buffer = 0x3d600000 | (((dispatcher>>16) + (dispatcher&(1<<15)?1:0)) & 0x0000ffff); + *(int32_t *)buffer = 0x3d600000 | + (((dispatcher>>16) + (dispatcher&(1<<15)?1:0)) & 0x0000ffff); buffer += 4; // addi gr11, gr11, lower *(int32_t *)buffer = 0x396b0000 | (dispatcher & 0x0000ffff); buffer += 4; - // Now, if highest bit is on we need to clear the sign extend bits on 64bit CPUs - // ** POWER4 pref fix ** - if ((dispatcher & 0x80000000) && (customP4 && TR::comp()->target().cpu.is(OMR_PROCESSOR_PPC_GP))) + if (dispatcher & 0x80000000) { // rlwinm r11,r11,sh=0,mb=0,me=31 *(int32_t *)buffer = 0x556b003e; @@ -186,12 +248,10 @@ void ppcCreateMethodTrampoline(void *trampPtr, void *startPC, void *method) } else { - // lis gr11, upper - *(int32_t *)buffer = 0x3d600000 | (((dispatcher>>16) + (dispatcher&(1<<15)?1:0)) & 0x0000ffff); + // plwz gr11, [,16], 1 (PC-relative) + *(int32_t *)buffer = 0x06100000; buffer += 4; - - // addi gr11, gr11, lower - *(int32_t *)buffer = 0x396b0000 | (dispatcher & 0x0000ffff); + *(int32_t *)buffer = 0x81600010; buffer += 4; } #endif @@ -202,6 +262,12 @@ void ppcCreateMethodTrampoline(void *trampPtr, void *startPC, void *method) // bcctr *(int32_t *)buffer = 0x4e800420; + buffer += 4; + + if (TR::Compiler->target.cpu.isAtLeast(OMR_PROCESSOR_PPC_P10)) + { + *(intptr_t *)buffer = dispatcher; + } #if defined(TR_HOST_POWER) TR::CodeCacheConfig &config = TR::CodeCacheManager::instance()->codeCacheConfig(); @@ -322,33 +388,52 @@ bool ppcCodePatching(void *method, void *callSite, void *currentPC, void *curren } else { - void *newTramp = mcc_replaceTrampoline(reinterpret_cast(method), callSite, currentTramp, currentPC, newPC, true); + // On POWER10 or later, the trampoline can be patched in place atomically. 
A temporary trampoline is no longer needed + + void *newTramp = mcc_replaceTrampoline(reinterpret_cast(method), callSite, currentTramp, currentPC, + newPC, !TR::Compiler->target.cpu.isAtLeast(OMR_PROCESSOR_PPC_P10)); if (newTramp == NULL) { //if (currentTramp == NULL) //FIXME we need an assume for runtime as well - TR_ASSERT(0, "This is an internal error.\n"); return false; } - ppcCreateMethodTrampoline(newTramp, newPC, method); + + // currentTramp==NULL or newTramp is a temporary trampoline + if (newTramp != currentTramp) + ppcCreateMethodTrampoline(newTramp, newPC, method); + if (currentTramp == NULL) { distance = (uint8_t *)newTramp - patchAddr; } else { - if (currentDistance != ((uint8_t *)currentTramp - patchAddr)) + if (currentTramp == newTramp) + { + // this effectively is: we are on POWER10 or later, and we can patch the trampoline in place + + *(uint8_t **)((uint8_t *)currentTramp + 16) = entryAddress; + distance = (uint8_t *)currentTramp - patchAddr; + } + else { - oldBits |= ((uint8_t *)currentTramp - patchAddr) & 0x03fffffc; - *(int32_t *)patchAddr = oldBits; + // this effectively is: we are on pre-POWER10, and we need to take care of temporary trampolines + + if (currentDistance != ((uint8_t *)currentTramp - patchAddr)) + { + oldBits |= ((uint8_t *)currentTramp - patchAddr) & 0x03fffffc; + *(int32_t *)patchAddr = oldBits; #if defined(TR_HOST_POWER) - ppcCodeSync(patchAddr, 4); + ppcCodeSync(patchAddr, 4); #endif - } + } - patchAddr = (uint8_t *)currentTramp; - distance = (uint8_t *)newTramp - patchAddr; - currentDistance = 0; - oldBits = 0x48000000; + patchAddr = (uint8_t *)currentTramp; + distance = (uint8_t *)newTramp - patchAddr; + currentDistance = 0; + oldBits = 0x48000000; + } } } } @@ -459,20 +544,8 @@ bool ppcCodePatching(void *method, void *callSite, void *currentPC, void *curren void ppcCodeCacheParameters(int32_t *trampolineSize, void **callBacks, int32_t *numHelpers, int32_t* CCPreLoadedCodeSize) { - static bool customP4 = 
feGetEnv("TR_CustomP4Trampoline") ? true : false; - -#if defined(TR_TARGET_64BIT) *trampolineSize = TRAMPOLINE_SIZE; -#else - if (customP4) - { - *trampolineSize = TR::comp()->target().cpu.is(OMR_PROCESSOR_PPC_GP) ? TRAMPOLINE_SIZE + 4 : TRAMPOLINE_SIZE; - } - else - { - *trampolineSize = TRAMPOLINE_SIZE + 4; - } -#endif + //TR::CodeCacheConfig &config = TR::CodeCacheManager::instance()->codeCacheConfig(); //fprintf(stderr, "Processor Offset: %d\n", portLibCall_getProcessorType() - TR_FirstPPCProcessor); //fprintf(stderr, "Trampoline Size: %d, %d\n", *trampolineSize, config.trampolineCodeSize); From 086094d58e3d63d2700a52f088a62139145c15d4 Mon Sep 17 00:00:00 2001 From: "Benjamin Thomas (Aviansie Ben)" Date: Thu, 30 Jul 2020 14:50:15 +0000 Subject: [PATCH 4/4] Disable initTOC call when disableTOC option is enabled Previously, TR_PPCTableOfConstants::initTOC would be called even if the disableTOC option was enabled, causing memory to be allocated for the pTOC even if it was disabled. This call is now guarded by a check of this option. Co-authored-by: Julian Wang Signed-off-by: Ben Thomas --- runtime/compiler/p/codegen/Trampoline.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/runtime/compiler/p/codegen/Trampoline.cpp b/runtime/compiler/p/codegen/Trampoline.cpp index b27fa33c122..dd4b8673a8d 100644 --- a/runtime/compiler/p/codegen/Trampoline.cpp +++ b/runtime/compiler/p/codegen/Trampoline.cpp @@ -52,7 +52,8 @@ void * ppcPicTrampInit(TR_FrontEnd *vm, TR::PersistentInfo * persistentInfo) #ifdef TR_TARGET_64BIT TR_J9VMBase *fej9 = (TR_J9VMBase *)vm; - if (!fej9->isAOT_DEPRECATED_DO_NOT_USE()) // don't init TOC if it is jar2jxe AOT compile + if (!fej9->isAOT_DEPRECATED_DO_NOT_USE() && // don't init TOC if it is jar2jxe AOT compile + !TR::Options::getCmdLineOptions()->getOption(TR_DisableTOC)) { retVal = TR_PPCTableOfConstants::initTOC(fej9, persistentInfo, 0); }