diff --git a/compiler/aarch64/codegen/OMRCodeGenerator.cpp b/compiler/aarch64/codegen/OMRCodeGenerator.cpp
index d390df1d6ff..0235f1db454 100644
--- a/compiler/aarch64/codegen/OMRCodeGenerator.cpp
+++ b/compiler/aarch64/codegen/OMRCodeGenerator.cpp
@@ -812,3 +812,24 @@ TR::Instruction *OMR::ARM64::CodeGenerator::generateDebugCounterBump(TR::Instruc
    srm.reclaimScratchRegister(counterReg);
    return cursor;
    }
+
+bool
+OMR::ARM64::CodeGenerator::supportsNonHelper(TR::SymbolReferenceTable::CommonNonhelperSymbol symbol)
+   {
+   bool result = false;
+
+   switch (symbol)
+      {
+      case TR::SymbolReferenceTable::atomicAddSymbol:
+      case TR::SymbolReferenceTable::atomicFetchAndAddSymbol:
+      case TR::SymbolReferenceTable::atomicSwapSymbol:
+         {
+         result = true;
+         break;
+         }
+      default:
+         break;
+      }
+
+   return result;
+   }
diff --git a/compiler/aarch64/codegen/OMRCodeGenerator.hpp b/compiler/aarch64/codegen/OMRCodeGenerator.hpp
index c7f1aceaa0c..bc7a06e9c92 100644
--- a/compiler/aarch64/codegen/OMRCodeGenerator.hpp
+++ b/compiler/aarch64/codegen/OMRCodeGenerator.hpp
@@ -504,6 +504,24 @@ class OMR_EXTENSIBLE CodeGenerator : public OMR::CodeGenerator
     */
    static uint32_t registerBitMask(int32_t reg);
 
+   /**
+    * @brief Generates an inlined instruction sequence instead of a direct call
+    *
+    * @param[in] node: the call node
+    * @param[inout] resultReg: the register holding the result of the call
+    *
+    * @return true if an inlined instruction sequence is generated
+    */
+   bool inlineDirectCall(TR::Node *node, TR::Register *&resultReg);
+
+   /**
+    * @brief Answers whether intrinsics for the symbol are supported
+    *
+    * @param[in] symbol: the non-helper symbol to check
+    * @return true if intrinsics for the symbol are supported
+    */
+   bool supportsNonHelper(TR::SymbolReferenceTable::CommonNonhelperSymbol symbol);
+
    /**
     * @brief Answers whether bit operations are supported or not
     * @return true if supported, false otherwise
diff --git a/compiler/aarch64/codegen/OMRMemoryReference.cpp b/compiler/aarch64/codegen/OMRMemoryReference.cpp
index 98bd7f9b3b1..da473da710d 100644
--- a/compiler/aarch64/codegen/OMRMemoryReference.cpp
+++ b/compiler/aarch64/codegen/OMRMemoryReference.cpp
@@ -793,7 +793,9 @@ static bool isImm7OffsetGPRInstruction(uint32_t enc)
 static bool isExclusiveMemAccessInstruction(TR::InstOpCode::Mnemonic op)
    {
    return (op == TR::InstOpCode::ldxrx || op == TR::InstOpCode::ldxrw ||
-           op == TR::InstOpCode::stxrx || op == TR::InstOpCode::stxrw);
+           op == TR::InstOpCode::ldaxrx || op == TR::InstOpCode::ldaxrw ||
+           op == TR::InstOpCode::stxrx || op == TR::InstOpCode::stxrw ||
+           op == TR::InstOpCode::stlxrx || op == TR::InstOpCode::stlxrw);
    }
 
diff --git a/compiler/aarch64/codegen/OMRTreeEvaluator.cpp b/compiler/aarch64/codegen/OMRTreeEvaluator.cpp
index 01c73a3526b..e6d3071d46c 100644
--- a/compiler/aarch64/codegen/OMRTreeEvaluator.cpp
+++ b/compiler/aarch64/codegen/OMRTreeEvaluator.cpp
@@ -22,6 +22,7 @@
 #include "codegen/ARM64Instruction.hpp"
 #include "codegen/ARM64ShiftCode.hpp"
 #include "codegen/CodeGenerator.hpp"
+#include "codegen/CodeGenerator_inlines.hpp"
 #include "codegen/ConstantDataSnippet.hpp"
 #include "codegen/GenerateInstructions.hpp"
 #include "codegen/Linkage.hpp"
@@ -707,11 +708,16 @@ OMR::ARM64::TreeEvaluator::checkcastAndNULLCHKEvaluator(TR::Node *node, TR::Code
 TR::Register *
 OMR::ARM64::TreeEvaluator::directCallEvaluator(TR::Node *node, TR::CodeGenerator *cg)
    {
-   TR::SymbolReference *symRef = node->getSymbolReference();
-   TR::MethodSymbol *callee = symRef->getSymbol()->castToMethodSymbol();
-   TR::Linkage *linkage = cg->getLinkage(callee->getLinkageConvention());
+   TR::Register *resultReg;
+   if (!cg->inlineDirectCall(node, resultReg))
+      {
+      TR::SymbolReference *symRef = node->getSymbolReference();
+      TR::MethodSymbol *callee = symRef->getSymbol()->castToMethodSymbol();
+      TR::Linkage *linkage = cg->getLinkage(callee->getLinkageConvention());
 
-   return linkage->buildDirectDispatch(node);
+      resultReg = linkage->buildDirectDispatch(node);
+      }
+   return resultReg;
    }
 
 // handles calli, icalli, lcalli, fcalli, dcalli, acalli
@@ -1033,3 +1039,371 @@ OMR::ARM64::TreeEvaluator::generateVFTMaskInstruction(TR::CodeGenerator *cg, TR:
    // Do nothing in OMR
    return preced;
    }
+
+/**
+ * @brief Inlines an intrinsic for calls to atomicAddSymbol which are represented by a call node of the form for 32-bit (64-bit similar):
+ *
+ *    This implies `volatile` memory access mode.
+ *
+ *    @code
+ *      icall <atomicAddSymbol>
+ *        <address>
+ *        <value>
+ *    @endcode
+ *
+ *    Which performs the following operation atomically:
+ *
+ *    @code
+ *      [address] = [address] + <value>
+ *      return <value>
+ *    @endcode
+ *
+ * @param node: The respective (i|l)call node.
+ * @param cg: The code generator used to generate the instructions.
+ * @returns A register holding the <value> node.
+ */
+static TR::Register *intrinsicAtomicAdd(TR::Node *node, TR::CodeGenerator *cg)
+   {
+   TR::Node *addressNode = node->getChild(0);
+   TR::Node *valueNode = node->getChild(1);
+
+   TR::Register *addressReg = cg->evaluate(addressNode);
+   TR::Register *valueReg = cg->evaluate(valueNode);
+   const bool is64Bit = valueNode->getDataType().isInt64();
+
+   TR::Register *oldValueReg = cg->allocateRegister();
+
+   /*
+    * Generating a non-intuitive instruction sequence which uses load exclusive register
+    * and store release exclusive register followed by a full memory barrier.
+    *
+    * Because this atomic add has `volatile` semantics,
+    * no loads/stores before this sequence can be reordered after it and
+    * no loads/stores after it can be reordered before it.
+    *
+    * loop:
+    *    ldxrx   oldValueReg, [addressReg]
+    *    addx    newValueReg, oldValueReg, valueReg
+    *    stlxrx  oldValueReg, newValueReg, [addressReg]
+    *    cbnzx   oldValueReg, loop
+    *    dmb ish
+    *
+    * For the rationale behind this instruction sequence,
+    * see https://patchwork.kernel.org/project/linux-arm-kernel/patch/1391516953-14541-1-git-send-email-will.deacon@arm.com/
+    *
+    */
+
+   TR::LabelSymbol *doneLabel = TR::LabelSymbol::create(cg->trHeapMemory(), cg);
+   TR::LabelSymbol *loopLabel = TR::LabelSymbol::create(cg->trHeapMemory(), cg);
+
+   loopLabel->setStartInternalControlFlow();
+   generateLabelInstruction(cg, TR::InstOpCode::label, node, loopLabel);
+
+   auto loadop = is64Bit ? TR::InstOpCode::ldxrx : TR::InstOpCode::ldxrw;
+   generateTrg1MemInstruction(cg, loadop, node, oldValueReg, new (cg->trHeapMemory()) TR::MemoryReference(addressReg, 0, cg));
+
+   TR::Register *newValueReg = cg->allocateRegister();
+   generateTrg1Src2Instruction(cg, (is64Bit ? TR::InstOpCode::addx : TR::InstOpCode::addw), node, newValueReg, oldValueReg, valueReg);
+
+   // store release exclusive register; oldValueReg is reused to hold the store status
+   auto storeop = is64Bit ? TR::InstOpCode::stlxrx : TR::InstOpCode::stlxrw;
+   generateTrg1MemSrc1Instruction(cg, storeop, node, oldValueReg, new (cg->trHeapMemory()) TR::MemoryReference(addressReg, 0, cg), newValueReg);
+   generateCompareBranchInstruction(cg, TR::InstOpCode::cbnzx, node, oldValueReg, loopLabel);
+
+   generateSynchronizationInstruction(cg, TR::InstOpCode::dmb, node, 0xB); // dmb ish
+
+   // Set the conditions and dependencies
+   auto conditions = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(0, 4, cg->trMemory());
+
+   conditions->addPostCondition(newValueReg, TR::RealRegister::NoReg);
+   conditions->addPostCondition(oldValueReg, TR::RealRegister::NoReg);
+   conditions->addPostCondition(addressReg, TR::RealRegister::NoReg);
+   conditions->addPostCondition(valueReg, TR::RealRegister::NoReg);
+
+   doneLabel->setEndInternalControlFlow();
+   generateLabelInstruction(cg, TR::InstOpCode::label, node, doneLabel, conditions);
+
+   node->setRegister(valueReg);
+   cg->stopUsingRegister(oldValueReg);
+   cg->stopUsingRegister(newValueReg);
+
+   cg->decReferenceCount(addressNode);
+   cg->decReferenceCount(valueNode);
+
+   return valueReg;
+   }
+
+/**
+ * @brief Inlines an intrinsic for calls to atomicFetchAndAddSymbol which are represented by a call node of the form for 32-bit (64-bit similar):
+ *
+ *    This implies `volatile` memory access mode.
+ *
+ *    @code
+ *      icall <atomicFetchAndAddSymbol>
+ *        <address>
+ *        <value>
+ *    @endcode
+ *
+ *    Which performs the following operation atomically:
+ *
+ *    @code
+ *      temp = [address]
+ *      [address] = [address] + <value>
+ *      return temp
+ *    @endcode
+ *
+ * @param node: The respective (i|l)call node.
+ * @param cg: The code generator used to generate the instructions.
+ * @returns A register holding the original value in memory (before the addition) at the <address> location.
+ */
+static TR::Register *intrinsicAtomicFetchAndAdd(TR::Node *node, TR::CodeGenerator *cg)
+   {
+   TR::Node *addressNode = node->getChild(0);
+   TR::Node *valueNode = node->getChild(1);
+
+   TR::Register *addressReg = cg->evaluate(addressNode);
+   TR::Register *valueReg = NULL;
+   const bool is64Bit = valueNode->getDataType().isInt64();
+   int64_t value = 0;
+   bool negate = false;
+   bool killValueReg = false;
+
+   if (valueNode->getOpCode().isLoadConst() && valueNode->getRegister() == NULL)
+      {
+      if (is64Bit)
+         {
+         value = valueNode->getLongInt();
+         }
+      else
+         {
+         value = valueNode->getInt();
+         }
+      if (!constantIsUnsignedImm12(value))
+         {
+         if (constantIsUnsignedImm12(-value))
+            {
+            negate = true;
+            }
+         else
+            {
+            valueReg = cg->allocateRegister();
+            killValueReg = true;
+            if (is64Bit)
+               {
+               loadConstant64(cg, node, value, valueReg);
+               }
+            else
+               {
+               loadConstant32(cg, node, value, valueReg);
+               }
+            }
+         }
+      }
+   else
+      {
+      valueReg = cg->evaluate(valueNode);
+      }
+
+   TR::Register *oldValueReg = cg->allocateRegister();
+   TR::Register *tempReg = cg->allocateRegister();
+
+   /*
+    * Generating a non-intuitive instruction sequence which uses load exclusive register
+    * and store release exclusive register followed by a full memory barrier.
+    *
+    * Because this atomic add has `volatile` semantics,
+    * no loads/stores before this sequence can be reordered after it and
+    * no loads/stores after it can be reordered before it.
+    *
+    * loop:
+    *    ldxrx   oldValueReg, [addressReg]
+    *    addx    newValueReg, oldValueReg, valueReg
+    *    stlxrx  tempReg, newValueReg, [addressReg]
+    *    cbnzx   tempReg, loop
+    *    dmb ish
+    *
+    * For the rationale behind this instruction sequence,
+    * see https://patchwork.kernel.org/project/linux-arm-kernel/patch/1391516953-14541-1-git-send-email-will.deacon@arm.com/
+    *
+    */
+
+   TR::LabelSymbol *doneLabel = TR::LabelSymbol::create(cg->trHeapMemory(), cg);
+   TR::LabelSymbol *loopLabel = TR::LabelSymbol::create(cg->trHeapMemory(), cg);
+
+   loopLabel->setStartInternalControlFlow();
+   generateLabelInstruction(cg, TR::InstOpCode::label, node, loopLabel);
+
+   // load exclusive register
+   auto loadop = is64Bit ? TR::InstOpCode::ldxrx : TR::InstOpCode::ldxrw;
+   generateTrg1MemInstruction(cg, loadop, node, oldValueReg, new (cg->trHeapMemory()) TR::MemoryReference(addressReg, 0, cg));
+
+   TR::Register *newValueReg = cg->allocateRegister();
+   if (valueReg == NULL)
+      {
+      if (!negate)
+         {
+         generateTrg1Src1ImmInstruction(cg, (is64Bit ? TR::InstOpCode::addimmx : TR::InstOpCode::addimmw), node, newValueReg, oldValueReg, value);
+         }
+      else
+         {
+         generateTrg1Src1ImmInstruction(cg, (is64Bit ? TR::InstOpCode::subimmx : TR::InstOpCode::subimmw), node, newValueReg, oldValueReg, -value);
+         }
+      }
+   else
+      {
+      generateTrg1Src2Instruction(cg, (is64Bit ? TR::InstOpCode::addx : TR::InstOpCode::addw), node, newValueReg, oldValueReg, valueReg);
+      }
+   // store release exclusive register
+   auto storeop = is64Bit ? TR::InstOpCode::stlxrx : TR::InstOpCode::stlxrw;
+   generateTrg1MemSrc1Instruction(cg, storeop, node, tempReg, new (cg->trHeapMemory()) TR::MemoryReference(addressReg, 0, cg), newValueReg);
+   generateCompareBranchInstruction(cg, TR::InstOpCode::cbnzx, node, tempReg, loopLabel);
+
+   generateSynchronizationInstruction(cg, TR::InstOpCode::dmb, node, 0xB); // dmb ish
+
+   // Set the conditions and dependencies
+   const int numDeps = (valueReg != NULL) ? 5 : 4;
+   auto conditions = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(0, numDeps, cg->trMemory());
+
+   conditions->addPostCondition(newValueReg, TR::RealRegister::NoReg);
+   conditions->addPostCondition(oldValueReg, TR::RealRegister::NoReg);
+   conditions->addPostCondition(addressReg, TR::RealRegister::NoReg);
+   conditions->addPostCondition(tempReg, TR::RealRegister::NoReg);
+   if (valueReg != NULL)
+      {
+      conditions->addPostCondition(valueReg, TR::RealRegister::NoReg);
+      }
+
+   doneLabel->setEndInternalControlFlow();
+   generateLabelInstruction(cg, TR::InstOpCode::label, node, doneLabel, conditions);
+
+   node->setRegister(oldValueReg);
+   cg->stopUsingRegister(newValueReg);
+   cg->stopUsingRegister(tempReg);
+   if (killValueReg)
+      {
+      cg->stopUsingRegister(valueReg);
+      }
+   cg->decReferenceCount(addressNode);
+   cg->decReferenceCount(valueNode);
+
+   return oldValueReg;
+   }
+
+/**
+ * @brief Inlines an intrinsic for calls to atomicSwapSymbol which are represented by a call node of the form for 32-bit (64-bit similar):
+ *
+ *    This implies `volatile` memory access mode.
+ *
+ *    @code
+ *      icall <atomicSwapSymbol>
+ *        <address>
+ *        <value>
+ *    @endcode
+ *
+ *    Which performs the following operation atomically:
+ *
+ *    @code
+ *      temp = [address]
+ *      [address] = <value>
+ *      return temp
+ *    @endcode
+ *
+ * @param node: The respective (i|l)call node.
+ * @param cg: The code generator used to generate the instructions.
+ * @returns A register holding the original value in memory (before the swap) at the <address> location.
+ */
+static TR::Register *intrinsicAtomicSwap(TR::Node *node, TR::CodeGenerator *cg)
+   {
+   TR::Node *addressNode = node->getChild(0);
+   TR::Node *valueNode = node->getChild(1);
+
+   TR::Register *addressReg = cg->evaluate(addressNode);
+   TR::Register *valueReg = cg->evaluate(valueNode);
+   const bool is64Bit = valueNode->getDataType().isInt64();
+
+   TR::Register *oldValueReg = cg->allocateRegister();
+   TR::Register *tempReg = cg->allocateRegister();
+
+   /*
+    * Generating a non-intuitive instruction sequence which uses load exclusive register
+    * and store release exclusive register followed by a full memory barrier.
+    *
+    * Because this atomic swap has `volatile` semantics,
+    * no loads/stores before this sequence can be reordered after it and
+    * no loads/stores after it can be reordered before it.
+    *
+    * loop:
+    *    ldxrx   oldValueReg, [addressReg]
+    *    stlxrx  tempReg, valueReg, [addressReg]
+    *    cbnzx   tempReg, loop
+    *    dmb ish
+    *
+    * For the rationale behind this instruction sequence,
+    * see https://patchwork.kernel.org/project/linux-arm-kernel/patch/1391516953-14541-1-git-send-email-will.deacon@arm.com/
+    *
+    */
+
+   TR::LabelSymbol *doneLabel = TR::LabelSymbol::create(cg->trHeapMemory(), cg);
+   TR::LabelSymbol *loopLabel = TR::LabelSymbol::create(cg->trHeapMemory(), cg);
+
+   loopLabel->setStartInternalControlFlow();
+   generateLabelInstruction(cg, TR::InstOpCode::label, node, loopLabel);
+
+   // load exclusive register
+   auto loadop = is64Bit ? TR::InstOpCode::ldxrx : TR::InstOpCode::ldxrw;
+   generateTrg1MemInstruction(cg, loadop, node, oldValueReg, new (cg->trHeapMemory()) TR::MemoryReference(addressReg, 0, cg));
+
+   // store release exclusive register
+   auto storeop = is64Bit ? TR::InstOpCode::stlxrx : TR::InstOpCode::stlxrw;
+   generateTrg1MemSrc1Instruction(cg, storeop, node, tempReg, new (cg->trHeapMemory()) TR::MemoryReference(addressReg, 0, cg), valueReg);
+   generateCompareBranchInstruction(cg, TR::InstOpCode::cbnzx, node, tempReg, loopLabel);
+
+   generateSynchronizationInstruction(cg, TR::InstOpCode::dmb, node, 0xB); // dmb ish
+
+   // Set the conditions and dependencies
+   auto conditions = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(0, 4, cg->trMemory());
+
+   conditions->addPostCondition(oldValueReg, TR::RealRegister::NoReg);
+   conditions->addPostCondition(addressReg, TR::RealRegister::NoReg);
+   conditions->addPostCondition(valueReg, TR::RealRegister::NoReg);
+   conditions->addPostCondition(tempReg, TR::RealRegister::NoReg);
+
+   doneLabel->setEndInternalControlFlow();
+   generateLabelInstruction(cg, TR::InstOpCode::label, node, doneLabel, conditions);
+
+   node->setRegister(oldValueReg);
+   cg->stopUsingRegister(tempReg);
+
+   cg->decReferenceCount(addressNode);
+   cg->decReferenceCount(valueNode);
+
+   return oldValueReg;
+   }
+
+bool OMR::ARM64::CodeGenerator::inlineDirectCall(TR::Node *node, TR::Register *&resultReg)
+   {
+   TR::CodeGenerator *cg = self();
+   TR::Compilation *comp = cg->comp();
+   TR::SymbolReference *symRef = node->getSymbolReference();
+
+   if (symRef && symRef->getSymbol()->castToMethodSymbol()->isInlinedByCG())
+      {
+      if (comp->getSymRefTab()->isNonHelper(symRef, TR::SymbolReferenceTable::atomicAddSymbol))
+         {
+         resultReg = intrinsicAtomicAdd(node, cg);
+         return true;
+         }
+      else if (comp->getSymRefTab()->isNonHelper(symRef, TR::SymbolReferenceTable::atomicFetchAndAddSymbol))
+         {
+         resultReg = intrinsicAtomicFetchAndAdd(node, cg);
+         return true;
+         }
+      else if (comp->getSymRefTab()->isNonHelper(symRef, TR::SymbolReferenceTable::atomicSwapSymbol))
+         {
+         resultReg = intrinsicAtomicSwap(node, cg);
+         return true;
+         }
+      }
+
+   return false;
+   }
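
Note: for readers mapping the three intrinsics onto familiar primitives, the sequences above implement the same semantics as sequentially consistent `std::atomic` operations. A minimal sketch, not part of this patch (function names are illustrative; the 32-bit variants are identical with `int32_t`):

```cpp
#include <atomic>
#include <cstdint>

// atomicAddSymbol: [address] = [address] + value; the call's result is the value added
int64_t atomicAdd(std::atomic<int64_t> *address, int64_t value)
   {
   address->fetch_add(value, std::memory_order_seq_cst);
   return value;
   }

// atomicFetchAndAddSymbol: returns the value at [address] before the addition
int64_t atomicFetchAndAdd(std::atomic<int64_t> *address, int64_t value)
   {
   return address->fetch_add(value, std::memory_order_seq_cst);
   }

// atomicSwapSymbol: stores value and returns the value at [address] before the swap
int64_t atomicSwap(std::atomic<int64_t> *address, int64_t value)
   {
   return address->exchange(value, std::memory_order_seq_cst);
   }
```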
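The ldxr/stlxr + `dmb ish` shape follows the Linux kernel change linked in the comments: an acquire/release exclusive pair on its own orders the atomic operation but does not provide the full-barrier semantics that `volatile` accesses require. One way to experiment with the pattern outside the JIT is to compile the equivalent `__atomic` builtin for AArch64; a sketch, assuming GCC or Clang targeting a core without LSE atomics (`fetch_and_add64` is an illustrative name):

```cpp
#include <cstdint>

// Built with, e.g., gcc -O2 -march=armv8-a (no LSE extension), this lowers to
// an exclusive load/store retry loop comparable to the one the evaluator
// emits; the compiler may pick acquire/release exclusives rather than exactly
// the ldxr/stlxr + dmb ish sequence this patch uses.
extern "C" int64_t fetch_and_add64(int64_t *address, int64_t value)
   {
   return __atomic_fetch_add(address, value, __ATOMIC_SEQ_CST);
   }
```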
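`supportsNonHelper` is the query side of this contract: it lets common code ask whether the target can inline a given non-helper symbol before emitting a call to it, and `directCallEvaluator` then routes such calls through `inlineDirectCall`. A hypothetical caller-side guard, sketch only and not code from this patch:

```cpp
// Hypothetical: guard a transformation that rewrites an operation into an
// icall/lcall to the atomic non-helper symbol. If the target cannot inline
// it, the rewrite is skipped and the original form of the trees is kept.
if (cg->supportsNonHelper(TR::SymbolReferenceTable::atomicFetchAndAddSymbol))
   {
   // emit:  icall <atomicFetchAndAddSymbol>
   //          <address>
   //          <value>
   }
```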