From 448e5f103596a515ded571e07500ad14234f32c9 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Mon, 14 Jun 2021 22:50:03 -0700 Subject: [PATCH 01/18] Increase loop cloning max allowed condition blocks Allows inner loop of 3-nested loops (e.g., Array2 benchmark) to be cloned. --- src/coreclr/jit/loopcloning.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/loopcloning.cpp b/src/coreclr/jit/loopcloning.cpp index 0fe22420f783e..577d7ee4c1804 100644 --- a/src/coreclr/jit/loopcloning.cpp +++ b/src/coreclr/jit/loopcloning.cpp @@ -1176,6 +1176,7 @@ bool Compiler::optComputeDerefConditions(unsigned loopNum, LoopCloneContext* con if (maxRank == -1) { + JITDUMP("> maxRank undefined\n"); return false; } @@ -1184,12 +1185,13 @@ bool Compiler::optComputeDerefConditions(unsigned loopNum, LoopCloneContext* con // So add 1 after rank * 2. unsigned condBlocks = (unsigned)maxRank * 2 + 1; - // Heuristic to not create too many blocks. - // REVIEW: due to the definition of `condBlocks`, above, the effective max is 3 blocks, meaning - // `maxRank` of 1. Question: should the heuristic allow more blocks to be created in some situations? - // REVIEW: make this based on a COMPlus configuration? - if (condBlocks > 4) + // Heuristic to not create too many blocks. Defining as 5 allows, effectively, loop cloning on + // triply-nested loops. + // REVIEW: make this based on a COMPlus configuration, at least for debug? + const unsigned maxAllowedCondBlocks = 5; + if (condBlocks > maxAllowedCondBlocks) { + JITDUMP("> Too many condition blocks (%u > %u)\n", condBlocks, maxAllowedCondBlocks); return false; } From aaeb315e29c8e24aebecda5393e67a6b9e826f9b Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Fri, 18 Jun 2021 12:26:09 -0700 Subject: [PATCH 02/18] Clear GTF_INX_RNGCHK bit on loop cloning created index nodes to avoid unnecessary bounds checks. 
Revert max cloning condition blocks to 3; allowing more doesn't seem to improve performance (probably too many conditions before a not-sufficiently-executed loop, at least for the Array2 benchmark) --- src/coreclr/jit/loopcloning.cpp | 72 ++++++++++++++++++++------------- src/coreclr/jit/loopcloning.h | 1 + 2 files changed, 44 insertions(+), 29 deletions(-) diff --git a/src/coreclr/jit/loopcloning.cpp b/src/coreclr/jit/loopcloning.cpp index 577d7ee4c1804..8a7ea5b7bad45 100644 --- a/src/coreclr/jit/loopcloning.cpp +++ b/src/coreclr/jit/loopcloning.cpp @@ -39,6 +39,10 @@ GenTree* LC_Array::ToGenTree(Compiler* comp, BasicBlock* bb) { arr = comp->gtNewIndexRef(TYP_REF, arr, comp->gtNewLclvNode(arrIndex->indLcls[i], comp->lvaTable[arrIndex->indLcls[i]].lvType)); + + // Clear the range check flag: we guarantee that all necessary range checking has already + // been done by the time this array index expression is invoked. + arr->gtFlags &= ~GTF_INX_RNGCHK; } // If asked for arrlen invoke arr length operator. 
if (oper == ArrLen) @@ -395,26 +399,30 @@ void LoopCloneContext::PrintBlockConditions(unsigned loopNum) { printf("Block conditions:\n"); - JitExpandArrayStack*>* levelCond = blockConditions[loopNum]; - if (levelCond == nullptr || levelCond->Size() == 0) + JitExpandArrayStack*>* blockConds = blockConditions[loopNum]; + if (blockConds == nullptr || blockConds->Size() == 0) { printf("No block conditions\n"); return; } - for (unsigned i = 0; i < levelCond->Size(); ++i) + for (unsigned i = 0; i < blockConds->Size(); ++i) + { + PrintBlockLevelConditions(i, (*blockConds)[i]); + } +} +void LoopCloneContext::PrintBlockLevelConditions(unsigned level, JitExpandArrayStack* levelCond) +{ + printf("%d = ", level); + for (unsigned j = 0; j < levelCond->Size(); ++j) { - printf("%d = {", i); - for (unsigned j = 0; j < ((*levelCond)[i])->Size(); ++j) + if (j != 0) { - if (j != 0) - { - printf(" & "); - } - (*((*levelCond)[i]))[j].Print(); + printf(" & "); } - printf("}\n"); + (*levelCond)[j].Print(); } + printf("\n"); } #endif @@ -650,19 +658,19 @@ void LoopCloneContext::PrintConditions(unsigned loopNum) { if (conditions[loopNum] == nullptr) { - JITDUMP("NO conditions"); + printf("NO conditions"); return; } if (conditions[loopNum]->Size() == 0) { - JITDUMP("Conditions were optimized away! Will always take cloned path."); + printf("Conditions were optimized away! Will always take cloned path."); return; } for (unsigned i = 0; i < conditions[loopNum]->Size(); ++i) { if (i != 0) { - JITDUMP(" & "); + printf(" & "); } (*conditions[loopNum])[i].Print(); } @@ -711,6 +719,9 @@ void LoopCloneContext::CondToStmtInBlock(Compiler* comp comp->fgInsertStmtAtEnd(block, stmt); // Remorph. 
+ JITDUMP("Loop cloning condition tree before morphing:\n"); + DBEXEC(comp->verbose, comp->gtDispTree(jmpTrueTree)); + JITDUMP("\n"); comp->fgMorphBlockStmt(block, stmt DEBUGARG("Loop cloning condition")); } @@ -1000,9 +1011,9 @@ bool Compiler::optDeriveLoopCloningConditions(unsigned loopNum, LoopCloneContext return false; } } - JITDUMP("Conditions: ("); + JITDUMP("Conditions: "); DBEXEC(verbose, context->PrintConditions(loopNum)); - JITDUMP(")\n"); + JITDUMP("\n"); return true; } return false; @@ -1164,10 +1175,6 @@ bool Compiler::optComputeDerefConditions(unsigned loopNum, LoopCloneContext* con printf("Deref condition tree:\n"); for (unsigned i = 0; i < nodes.Size(); ++i) { - if (i != 0) - { - printf(","); - } nodes[i]->Print(); printf("\n"); } @@ -1185,10 +1192,10 @@ bool Compiler::optComputeDerefConditions(unsigned loopNum, LoopCloneContext* con // So add 1 after rank * 2. unsigned condBlocks = (unsigned)maxRank * 2 + 1; - // Heuristic to not create too many blocks. Defining as 5 allows, effectively, loop cloning on - // triply-nested loops. + // Heuristic to not create too many blocks. Defining as 3 allows, effectively, loop cloning on + // doubly-nested loops. // REVIEW: make this based on a COMPlus configuration, at least for debug? 
- const unsigned maxAllowedCondBlocks = 5; + const unsigned maxAllowedCondBlocks = 3; if (condBlocks > maxAllowedCondBlocks) { JITDUMP("> Too many condition blocks (%u > %u)\n", condBlocks, maxAllowedCondBlocks); @@ -1476,8 +1483,9 @@ BasicBlock* Compiler::optInsertLoopChoiceConditions(LoopCloneContext* context, BasicBlock* head, BasicBlock* slowHead) { - JITDUMP("Inserting loop cloning conditions\n"); + JITDUMP("Inserting loop " FMT_LP " loop choice conditions\n", loopNum); assert(context->HasBlockConditions(loopNum)); + assert(head->bbJumpKind == BBJ_COND); BasicBlock* curCond = head; JitExpandArrayStack*>* levelCond = context->GetBlockConditions(loopNum); @@ -1486,10 +1494,15 @@ BasicBlock* Compiler::optInsertLoopChoiceConditions(LoopCloneContext* context, bool isHeaderBlock = (curCond == head); // Flip the condition if header block. + JITDUMP("Adding loop " FMT_LP " level %u block conditions to " FMT_BB "\n ", loopNum, i, curCond->bbNum); + DBEXEC(verbose, context->PrintBlockLevelConditions(i, (*levelCond)[i])); context->CondToStmtInBlock(this, *((*levelCond)[i]), curCond, /*reverse*/ isHeaderBlock); // Create each condition block ensuring wiring between them. - BasicBlock* tmp = fgNewBBafter(BBJ_COND, isHeaderBlock ? slowHead : curCond, /*extendRegion*/ true); + BasicBlock* tmp = fgNewBBafter(BBJ_COND, isHeaderBlock ? slowHead : curCond, /*extendRegion*/ true); + tmp->inheritWeight(head); + tmp->bbNatLoopNum = head->bbNatLoopNum; + curCond->bbJumpDest = isHeaderBlock ? tmp : slowHead; JITDUMP("Adding " FMT_BB " -> " FMT_BB "\n", curCond->bbNum, curCond->bbJumpDest->bbNum); @@ -1502,13 +1515,13 @@ BasicBlock* Compiler::optInsertLoopChoiceConditions(LoopCloneContext* context, } curCond = tmp; - - curCond->inheritWeight(head); - curCond->bbNatLoopNum = head->bbNatLoopNum; - JITDUMP("Created new " FMT_BB " for new level %u\n", curCond->bbNum, i); } // Finally insert cloning conditions after all deref conditions have been inserted. 
+ JITDUMP("Adding loop " FMT_LP " cloning conditions to " FMT_BB "\n", loopNum, curCond->bbNum); + JITDUMP(" "); + DBEXEC(verbose, context->PrintConditions(loopNum)); + JITDUMP("\n"); context->CondToStmtInBlock(this, *(context->GetConditions(loopNum)), curCond, /*reverse*/ false); return curCond; } @@ -2235,6 +2248,7 @@ Compiler::fgWalkResult Compiler::optCanOptimizeByLoopCloning(GenTree* tree, Loop // Check that the array object local variable is invariant within the loop body. if (!optIsStackLocalInvariant(info->loopNum, arrIndex.arrLcl)) { + JITDUMP("V%02d is not loop invariant\n", arrIndex.arrLcl); return WALK_SKIP_SUBTREES; } diff --git a/src/coreclr/jit/loopcloning.h b/src/coreclr/jit/loopcloning.h index 6cc921c520db1..ecb03234fdcd0 100644 --- a/src/coreclr/jit/loopcloning.h +++ b/src/coreclr/jit/loopcloning.h @@ -739,6 +739,7 @@ struct LoopCloneContext #ifdef DEBUG // Print the block conditions for the loop. void PrintBlockConditions(unsigned loopNum); + void PrintBlockLevelConditions(unsigned level, JitExpandArrayStack* levelCond); #endif // Does the loop have block conditions? From cdcc120696aca7051e40880262f8641685e7a8aa Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Sun, 27 Jun 2021 14:14:41 -0700 Subject: [PATCH 03/18] Remove outer index bounds checks --- src/coreclr/jit/clrjit.natvis | 40 +++++++++++++++ src/coreclr/jit/compiler.h | 6 +-- src/coreclr/jit/loopcloning.cpp | 86 +++++++++++++++++++++++++++++++-- src/coreclr/jit/loopcloning.h | 18 +++---- 4 files changed, 131 insertions(+), 19 deletions(-) diff --git a/src/coreclr/jit/clrjit.natvis b/src/coreclr/jit/clrjit.natvis index 90a9ff703a471..e4747c5a0731f 100644 --- a/src/coreclr/jit/clrjit.natvis +++ b/src/coreclr/jit/clrjit.natvis @@ -5,6 +5,13 @@ Licensed to the .NET Foundation under one or more agreements. The .NET Foundation licenses this file to you under the MIT license. --> + @@ -183,4 +190,37 @@ The .NET Foundation licenses this file to you under the MIT license. 
+ + size={m_size,d} + Empty + + + m_size + m_members + + + + + + size={m_size,d} used={m_used,d} + Empty + + + m_used + m_members + + + + + + + + + {optType,en} + + (LcJaggedArrayOptInfo*)this,nd + (LcMdArrayOptInfo*)this,nd + + + diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 356fe6498d166..2f8cf7062e9d5 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6209,9 +6209,9 @@ class Compiler public: void optInit(); - GenTree* Compiler::optRemoveRangeCheck(GenTreeBoundsChk* check, GenTree* comma, Statement* stmt); - GenTree* Compiler::optRemoveStandaloneRangeCheck(GenTreeBoundsChk* check, Statement* stmt); - void Compiler::optRemoveCommaBasedRangeCheck(GenTree* comma, Statement* stmt); + GenTree* optRemoveRangeCheck(GenTreeBoundsChk* check, GenTree* comma, Statement* stmt); + GenTree* optRemoveStandaloneRangeCheck(GenTreeBoundsChk* check, Statement* stmt); + void optRemoveCommaBasedRangeCheck(GenTree* comma, Statement* stmt); bool optIsRangeCheckRemovable(GenTree* tree); protected: diff --git a/src/coreclr/jit/loopcloning.cpp b/src/coreclr/jit/loopcloning.cpp index 8a7ea5b7bad45..eac58c0326de3 100644 --- a/src/coreclr/jit/loopcloning.cpp +++ b/src/coreclr/jit/loopcloning.cpp @@ -12,6 +12,40 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "jitpch.h" +#ifdef DEBUG + +//-------------------------------------------------------------------------------------------------- +// ArrIndex::Print - debug print an ArrIndex struct in form: `V01[V02][V03]`. +// +// Arguments: +// dim (Optional) Print up to but not including this dimension. Default: print all dimensions. +// +void ArrIndex::Print(unsigned dim /* = -1 */) +{ + printf("V%02d", arrLcl); + for (unsigned i = 0; i < ((dim == (unsigned)-1) ? 
rank : dim); ++i) + { + printf("[V%02d]", indLcls.Get(i)); + } +} + +//-------------------------------------------------------------------------------------------------- +// ArrIndex::PrintBoundsCheckNodes - debug print an ArrIndex struct bounds check node tree ids in +// form: `[000125][000113]`. +// +// Arguments: +// dim (Optional) Print up to but not including this dimension. Default: print all dimensions. +// +void ArrIndex::PrintBoundsCheckNodes(unsigned dim /* = -1 */) +{ + for (unsigned i = 0; i < ((dim == (unsigned)-1) ? rank : dim); ++i) + { + Compiler::printTreeID(bndsChks.Get(i)); + } +} + +#endif // DEBUG + //-------------------------------------------------------------------------------------------------- // ToGenTree - Convert an arrLen operation into a gentree node. // @@ -976,7 +1010,7 @@ bool Compiler::optDeriveLoopCloningConditions(unsigned loopNum, LoopCloneContext for (unsigned i = 0; i < optInfos->Size(); ++i) { - LcOptInfo* optInfo = optInfos->GetRef(i); + LcOptInfo* optInfo = optInfos->Get(i); switch (optInfo->GetOptType()) { case LcOptInfo::LcJaggedArray: @@ -985,7 +1019,6 @@ bool Compiler::optDeriveLoopCloningConditions(unsigned loopNum, LoopCloneContext LcJaggedArrayOptInfo* arrIndexInfo = optInfo->AsLcJaggedArrayOptInfo(); LC_Array arrLen(LC_Array::Jagged, &arrIndexInfo->arrIndex, arrIndexInfo->dim, LC_Array::ArrLen); LC_Ident arrLenIdent = LC_Ident(arrLen); - LC_Condition cond(GT_LE, LC_Expr(ident), LC_Expr(arrLenIdent)); context->EnsureConditions(loopNum)->Push(cond); @@ -1263,14 +1296,57 @@ void Compiler::optPerformStaticOptimizations(unsigned loopNum, LoopCloneContext* JitExpandArrayStack* optInfos = context->GetLoopOptInfo(loopNum); for (unsigned i = 0; i < optInfos->Size(); ++i) { - LcOptInfo* optInfo = optInfos->GetRef(i); + LcOptInfo* optInfo = optInfos->Get(i); switch (optInfo->GetOptType()) { case LcOptInfo::LcJaggedArray: { LcJaggedArrayOptInfo* arrIndexInfo = optInfo->AsLcJaggedArrayOptInfo(); compCurBB = 
arrIndexInfo->arrIndex.useBlock; - optRemoveCommaBasedRangeCheck(arrIndexInfo->arrIndex.bndsChks[arrIndexInfo->dim], arrIndexInfo->stmt); + + // Remove all bounds checks for this array up to (and including) `arrIndexInfo->dim`. So, if that is 1, + // remove rank 0 and 1 bounds checks. + + for (unsigned dim = 0; dim <= arrIndexInfo->dim; dim++) + { +#ifdef DEBUG + if (verbose) + { + printf("Remove bounds check for stmt " FMT_STMT ", dim %d, ", arrIndexInfo->stmt->GetID(), dim); + arrIndexInfo->arrIndex.Print(); + printf(", bounds check nodes: "); + arrIndexInfo->arrIndex.PrintBoundsCheckNodes(); + printf("\n"); + } +#endif // DEBUG + + GenTree* bndsChkNode = arrIndexInfo->arrIndex.bndsChks[dim]; + if (bndsChkNode->gtGetOp1()->OperIsBoundsCheck()) + { + // This COMMA node will only represent a bounds check if we haven't already removed this + // bounds check in some other nesting cloned loop. For example, consider: + // for (i = 0; i < x; i++) + // for (j = 0; j < y; j++) + // a[i][j] = i + j; + // If the outer loop is cloned first, it will remove the a[i] bounds check from the optimized + // path. Later, when the inner loop is cloned, we want to remove the a[i][j] bounds check. If + // we clone the inner loop, we know that the a[i] bounds check isn't required because we'll add + // it to the loop cloning conditions. On the other hand, we can clone a loop where we get rid of + // the nested bounds check but nobody has gotten rid of the outer bounds check. As before, we know + // the outer bounds check is not needed because it's been added to the cloning conditions, so we + // can get rid of the bounds check here. + // + optRemoveCommaBasedRangeCheck(bndsChkNode, arrIndexInfo->stmt); + } + else + { + JITDUMP(" Bounds check already removed\n"); + + // If the bounds check node isn't there, it better have been converted to a GT_NOP. 
+ assert(bndsChkNode->gtGetOp1()->OperIs(GT_NOP)); + } + } + DBEXEC(dynamicPath, optDebugLogLoopCloning(arrIndexInfo->arrIndex.useBlock, arrIndexInfo->stmt)); } break; @@ -2241,6 +2317,8 @@ Compiler::fgWalkResult Compiler::optCanOptimizeByLoopCloning(GenTree* tree, Loop printTreeID(tree); printf(" which is equivalent to: "); arrIndex.Print(); + printf(", bounds check nodes: "); + arrIndex.PrintBoundsCheckNodes(); printf("\n"); } #endif diff --git a/src/coreclr/jit/loopcloning.h b/src/coreclr/jit/loopcloning.h index ecb03234fdcd0..6b4602dab2232 100644 --- a/src/coreclr/jit/loopcloning.h +++ b/src/coreclr/jit/loopcloning.h @@ -220,14 +220,8 @@ struct ArrIndex } #ifdef DEBUG - void Print(unsigned dim = -1) - { - printf("V%02d", arrLcl); - for (unsigned i = 0; i < ((dim == (unsigned)-1) ? rank : dim); ++i) - { - printf("[V%02d]", indLcls.GetRef(i)); - } - } + void Print(unsigned dim = -1); + void PrintBoundsCheckNodes(unsigned dim = -1); #endif }; @@ -260,9 +254,8 @@ struct LcOptInfo #include "loopcloningopts.h" }; - void* optInfo; OptType optType; - LcOptInfo(void* optInfo, OptType optType) : optInfo(optInfo), optType(optType) + LcOptInfo(OptType optType) : optType(optType) { } @@ -270,6 +263,7 @@ struct LcOptInfo { return optType; } + #define LC_OPT(en) \ en##OptInfo* As##en##OptInfo() \ { \ @@ -292,7 +286,7 @@ struct LcMdArrayOptInfo : public LcOptInfo ArrIndex* index; // "index" cached computation in the form of an ArrIndex representation. LcMdArrayOptInfo(GenTreeArrElem* arrElem, unsigned dim) - : LcOptInfo(this, LcMdArray), arrElem(arrElem), dim(dim), index(nullptr) + : LcOptInfo(LcMdArray), arrElem(arrElem), dim(dim), index(nullptr) { } @@ -325,7 +319,7 @@ struct LcJaggedArrayOptInfo : public LcOptInfo Statement* stmt; // "stmt" where the optimization opportunity occurs. 
LcJaggedArrayOptInfo(ArrIndex& arrIndex, unsigned dim, Statement* stmt) - : LcOptInfo(this, LcJaggedArray), dim(dim), arrIndex(arrIndex), stmt(stmt) + : LcOptInfo(LcJaggedArray), dim(dim), arrIndex(arrIndex), stmt(stmt) { } }; From 74d62df39650f90850c9e93a5aac54b78031fb1e Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Sun, 27 Jun 2021 14:44:17 -0700 Subject: [PATCH 04/18] Convert loop cloning data structures to `vector` for better debugging --- src/coreclr/jit/clrjit.natvis | 11 +++++++++++ src/coreclr/jit/loopcloning.h | 34 ++++++++++++++++------------------ 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/src/coreclr/jit/clrjit.natvis b/src/coreclr/jit/clrjit.natvis index e4747c5a0731f..b3c187474900f 100644 --- a/src/coreclr/jit/clrjit.natvis +++ b/src/coreclr/jit/clrjit.natvis @@ -190,6 +190,17 @@ Documentation for VS debugger format specifiers: https://docs.microsoft.com/en-u + + size={m_nSize,d} capacity={m_nCapacity,d} + Empty + + + m_nSize + m_pArray + + + + size={m_size,d} Empty diff --git a/src/coreclr/jit/loopcloning.h b/src/coreclr/jit/loopcloning.h index 6b4602dab2232..5106df7fd8bad 100644 --- a/src/coreclr/jit/loopcloning.h +++ b/src/coreclr/jit/loopcloning.h @@ -672,30 +672,28 @@ struct LoopCloneContext CompAllocator alloc; // The allocator // The array of optimization opportunities found in each loop. (loop x optimization-opportunities) - JitExpandArrayStack** optInfo; + jitstd::vector*> optInfo; // The array of conditions that influence which path to take for each loop. (loop x cloning-conditions) - JitExpandArrayStack** conditions; + jitstd::vector*> conditions; // The array of dereference conditions found in each loop. (loop x deref-conditions) - JitExpandArrayStack** derefs; + jitstd::vector*> derefs; // The array of block levels of conditions for each loop. 
(loop x level x conditions) - JitExpandArrayStack*>** blockConditions; - - LoopCloneContext(unsigned loopCount, CompAllocator alloc) : alloc(alloc) - { - optInfo = new (alloc) JitExpandArrayStack*[loopCount]; - conditions = new (alloc) JitExpandArrayStack*[loopCount]; - derefs = new (alloc) JitExpandArrayStack*[loopCount]; - blockConditions = new (alloc) JitExpandArrayStack*>*[loopCount]; - for (unsigned i = 0; i < loopCount; ++i) - { - optInfo[i] = nullptr; - conditions[i] = nullptr; - derefs[i] = nullptr; - blockConditions[i] = nullptr; - } + jitstd::vector*>*> blockConditions; + + LoopCloneContext(unsigned loopCount, CompAllocator alloc) : + alloc(alloc), + optInfo(alloc), + conditions(alloc), + derefs(alloc), + blockConditions(alloc) + { + optInfo.resize(loopCount, nullptr); + conditions.resize(loopCount, nullptr); + derefs.resize(loopCount, nullptr); + blockConditions.resize(loopCount, nullptr); } // Evaluate conditions into a JTRUE stmt and put it in the block. Reverse condition if 'reverse' is true. From 92a792af06803f001aa99150f0fbb1ac8f74dcb1 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Tue, 29 Jun 2021 16:33:41 -0700 Subject: [PATCH 05/18] Improve CSE dump output 1. "#if 0" the guts of the CSE dataflow; that's not useful to most people. 2. Add readable CSE number output to the CSE dataflow set output 3. Add FMT_CSE to commonize CSE number output. 4. Add PHASE_OPTIMIZE_VALNUM_CSES to the pre-phase output "allow list" and stop doing its own blocks/trees output. 5. Remove unused optCSECandidateTotal 6. Add functions `getCSEAvailBit` and `getCSEAvailCrossCallBit` to avoid hand-coding these bit calculations in multiple places, for the CSE dataflow set bits. 
--- src/coreclr/jit/block.h | 5 +- src/coreclr/jit/compiler.h | 58 +++++++-- src/coreclr/jit/compiler.hpp | 19 --- src/coreclr/jit/gentree.cpp | 2 +- src/coreclr/jit/jitstd/algorithm.h | 4 +- src/coreclr/jit/optcse.cpp | 186 ++++++++++++++++------------- src/coreclr/jit/optimizer.cpp | 9 +- src/coreclr/jit/phase.cpp | 5 +- 8 files changed, 163 insertions(+), 125 deletions(-) diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h index 0cbe51281b353..5d42a1626536f 100644 --- a/src/coreclr/jit/block.h +++ b/src/coreclr/jit/block.h @@ -29,7 +29,10 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "jithashtable.h" /*****************************************************************************/ -typedef BitVec EXPSET_TP; +typedef BitVec EXPSET_TP; +typedef BitVec_ValArg_T EXPSET_VALARG_TP; +typedef BitVec_ValRet_T EXPSET_VALRET_TP; + #if LARGE_EXPSET #define EXPSET_SZ 64 #else diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 2f8cf7062e9d5..79aa8221eb03d 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6700,7 +6700,7 @@ class Compiler // BitVec trait information for computing CSE availability using the CSE_DataFlow algorithm. // Two bits are allocated per CSE candidate to compute CSE availability // plus an extra bit to handle the initial unvisited case. - // (See CSE_DataFlow::EndMerge for an explaination of why this is necessary) + // (See CSE_DataFlow::EndMerge for an explanation of why this is necessary.) // // The two bits per CSE candidate have the following meanings: // 11 - The CSE is available, and is also available when considering calls as killing availability. @@ -6710,6 +6710,37 @@ class Compiler // BitVecTraits* cseLivenessTraits; + //----------------------------------------------------------------------------------------------------------------- + // genCSEnum2bit: Return the normalized index to use in the EXPSET_TP for the CSE with the given CSE index. 
+ // Each GenTree has a `gtCSEnum` field. Zero is reserved to mean this node is not a CSE, positive values indicate + // CSE uses, and negative values indicate CSE defs. The caller must pass a non-zero positive value, as from + // GET_CSE_INDEX(). + // + static unsigned genCSEnum2bit(unsigned CSEnum) + { + assert((CSEnum > 0) && (CSEnum <= MAX_CSE_CNT)); + return CSEnum - 1; + } + + //----------------------------------------------------------------------------------------------------------------- + // getCSEAvailBit: Return the bit used by CSE dataflow sets (bbCseGen, etc.) for the availability bit for a CSE. + // + static unsigned getCSEAvailBit(unsigned CSEnum) + { + return genCSEnum2bit(CSEnum) * 2; + } + + //----------------------------------------------------------------------------------------------------------------- + // getCSEAvailCrossCallBit: Return the bit used by CSE dataflow sets (bbCseGen, etc.) for the availability bit + // for a CSE considering calls as killing availability bit (see description above). + // + static unsigned getCSEAvailCrossCallBit(unsigned CSEnum) + { + return getCSEAvailBit(CSEnum) + 1; + } + + void optPrintCSEDataFlowSet(EXPSET_VALARG_TP cseDataFlowSet, bool includeBits = true); + EXPSET_TP cseCallKillsMask; // Computed once - A mask that is used to kill available CSEs at callsites /* Generic list of nodes - used by the CSE logic */ @@ -6844,9 +6875,12 @@ class Compiler return (enckey & ~TARGET_SIGN_BIT) << CSE_CONST_SHARED_LOW_BITS; } - /************************************************************************** - * Value Number based CSEs - *************************************************************************/ +/************************************************************************** + * Value Number based CSEs + *************************************************************************/ + +// String to use for formatting CSE numbers. Note that this is the positive number, e.g., from GET_CSE_INDEX(). 
+#define FMT_CSE "CSE #%02u" public: void optOptimizeValnumCSEs(); @@ -6854,16 +6888,15 @@ class Compiler protected: void optValnumCSE_Init(); unsigned optValnumCSE_Index(GenTree* tree, Statement* stmt); - unsigned optValnumCSE_Locate(); - void optValnumCSE_InitDataFlow(); - void optValnumCSE_DataFlow(); - void optValnumCSE_Availablity(); - void optValnumCSE_Heuristic(); + bool optValnumCSE_Locate(); + void optValnumCSE_InitDataFlow(); + void optValnumCSE_DataFlow(); + void optValnumCSE_Availablity(); + void optValnumCSE_Heuristic(); bool optDoCSE; // True when we have found a duplicate CSE tree - bool optValnumCSE_phase; // True when we are executing the optValnumCSE_phase - unsigned optCSECandidateTotal; // Grand total of CSE candidates for both Lexical and ValNum - unsigned optCSECandidateCount; // Count of CSE's candidates, reset for Lexical and ValNum CSE's + bool optValnumCSE_phase; // True when we are executing the optOptimizeValnumCSEs() phase + unsigned optCSECandidateCount; // Count of CSE's candidates unsigned optCSEstart; // The first local variable number that is a CSE unsigned optCSEcount; // The total count of CSE's introduced. BasicBlock::weight_t optCSEweight; // The weight of the current block when we are doing PerformCSE @@ -6888,6 +6921,7 @@ class Compiler bool optConfigDisableCSE(); bool optConfigDisableCSE2(); #endif + void optOptimizeCSEs(); struct isVarAssgDsc diff --git a/src/coreclr/jit/compiler.hpp b/src/coreclr/jit/compiler.hpp index d05bfc6bbeb9f..ca9626ee11c1f 100644 --- a/src/coreclr/jit/compiler.hpp +++ b/src/coreclr/jit/compiler.hpp @@ -763,25 +763,6 @@ inline double getR8LittleEndian(const BYTE* ptr) return *(double*)&val; } -/***************************************************************************** - * - * Return the normalized index to use in the EXPSET_TP for the CSE with - * the given CSE index. 
- * Each GenTree has the following field: - * signed char gtCSEnum; // 0 or the CSE index (negated if def) - * So zero is reserved to mean this node is not a CSE - * and postive values indicate CSE uses and negative values indicate CSE defs. - * The caller of this method must pass a non-zero postive value. - * This precondition is checked by the assert on the first line of this method. - */ - -inline unsigned int genCSEnum2bit(unsigned index) -{ - assert((index > 0) && (index <= EXPSET_SZ)); - - return (index - 1); -} - #ifdef DEBUG const char* genES2str(BitVecTraits* traits, EXPSET_TP set); const char* refCntWtd2str(BasicBlock::weight_t refCntWtd); diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index b56372d18d28f..ae8012322ddef 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -10291,7 +10291,7 @@ void Compiler::gtDispNode(GenTree* tree, IndentStack* indentStack, __in __in_z _ { if (IS_CSE_INDEX(tree->gtCSEnum)) { - printf("CSE #%02d (%s)", GET_CSE_INDEX(tree->gtCSEnum), (IS_CSE_USE(tree->gtCSEnum) ? "use" : "def")); + printf(FMT_CSE " (%s)", GET_CSE_INDEX(tree->gtCSEnum), (IS_CSE_USE(tree->gtCSEnum) ? "use" : "def")); } else { diff --git a/src/coreclr/jit/jitstd/algorithm.h b/src/coreclr/jit/jitstd/algorithm.h index 000639a5a1de8..9fa6fbb94dd54 100644 --- a/src/coreclr/jit/jitstd/algorithm.h +++ b/src/coreclr/jit/jitstd/algorithm.h @@ -102,9 +102,9 @@ void quick_sort(RandomAccessIterator first, RandomAccessIterator last, Less less // // It's not possible for newFirst to go past the end of the sort range: // - If newFirst reaches the pivot before newLast then the pivot is - // swapped to the right and we'll stop again when we reach it. + // swapped to the right and we'll stop again when we reach it. 
// - If newLast reaches the pivot before newFirst then the pivot is - // swapped to the left and the value at newFirst will take its place + // swapped to the left and the value at newFirst will take its place // to the right so less(newFirst, pivot) will again be false when the // old pivot's position is reached. do diff --git a/src/coreclr/jit/optcse.cpp b/src/coreclr/jit/optcse.cpp index b4137ec52ebbf..587349e489b55 100644 --- a/src/coreclr/jit/optcse.cpp +++ b/src/coreclr/jit/optcse.cpp @@ -166,7 +166,8 @@ Compiler::fgWalkResult Compiler::optCSE_MaskHelper(GenTree** pTree, fgWalkData* if (IS_CSE_INDEX(tree->gtCSEnum)) { unsigned cseIndex = GET_CSE_INDEX(tree->gtCSEnum); - unsigned cseBit = genCSEnum2bit(cseIndex); + // Note that we DO NOT use getCSEAvailBit() here, for the CSE_defMask/CSE_useMask + unsigned cseBit = genCSEnum2bit(cseIndex); if (IS_CSE_DEF(tree->gtCSEnum)) { BitVecOps::AddElemD(comp->cseMaskTraits, pUserData->CSE_defMask, cseBit); @@ -424,16 +425,16 @@ unsigned Compiler::optValnumCSE_Index(GenTree* tree, Statement* stmt) ValueNum vnLibNorm = vnStore->VNNormalValue(vnLib); // We use the normal value number because we want the CSE candidate to - // represent all expressions that produce the same normal value number + // represent all expressions that produce the same normal value number. // We will handle the case where we have different exception sets when // promoting the candidates. // // We do this because a GT_IND will usually have a NullPtrExc entry in its // exc set, but we may have cleared the GTF_EXCEPT flag and if so, it won't - // have an NullPtrExc, or we may have assigned the value of an GT_IND + // have an NullPtrExc, or we may have assigned the value of an GT_IND // into a LCL_VAR and then read it back later. 
// - // When we are promoting the CSE candidates we insure that any CSE + // When we are promoting the CSE candidates we ensure that any CSE // uses that we promote have an exc set that is the same as the CSE defs // or have an empty set. And that all of the CSE defs produced the required // set of exceptions for the CSE uses. @@ -502,7 +503,7 @@ unsigned Compiler::optValnumCSE_Index(GenTree* tree, Statement* stmt) key = vnLibNorm; } - // Make sure that the result of Is_Shared_Const_CSE(key) matches isSharedConst + // Make sure that the result of Is_Shared_Const_CSE(key) matches isSharedConst. // Note that when isSharedConst is true then we require that the TARGET_SIGN_BIT is set in the key // and otherwise we require that we never create a ValueNumber with the TARGET_SIGN_BIT set. // @@ -709,7 +710,6 @@ unsigned Compiler::optValnumCSE_Index(GenTree* tree, Statement* stmt) C_ASSERT((signed char)MAX_CSE_CNT == MAX_CSE_CNT); unsigned CSEindex = ++optCSECandidateCount; - // EXPSET_TP CSEmask = genCSEnum2bit(CSEindex); /* Record the new CSE index in the hashDsc */ hashDsc->csdIndex = CSEindex; @@ -746,16 +746,14 @@ unsigned Compiler::optValnumCSE_Index(GenTree* tree, Statement* stmt) } } -/***************************************************************************** - * - * Locate CSE candidates and assign indices to them - * return 0 if no CSE candidates were found - */ - -unsigned Compiler::optValnumCSE_Locate() +//------------------------------------------------------------------------ +// optValnumCSE_Locate: Locate CSE candidates and assign them indices. 
+// +// Returns: +// true if there are any CSE candidates, false otherwise +// +bool Compiler::optValnumCSE_Locate() { - // Locate CSE candidates and assign them indices - bool enableConstCSE = true; int configValue = JitConfig.JitConstCSE(); @@ -871,14 +869,14 @@ unsigned Compiler::optValnumCSE_Locate() if (!optDoCSE) { - return 0; + return false; } /* We're finished building the expression lookup table */ optCSEstop(); - return 1; + return true; } //------------------------------------------------------------------------ @@ -890,7 +888,7 @@ unsigned Compiler::optValnumCSE_Locate() // // Arguments: // compare - The compare node to check - +// void Compiler::optCseUpdateCheckedBoundMap(GenTree* compare) { assert(compare->OperIsCompare()); @@ -999,9 +997,9 @@ void Compiler::optValnumCSE_InitDataFlow() // Init traits and cseCallKillsMask bitvectors. cseLivenessTraits = new (getAllocator(CMK_CSE)) BitVecTraits(bitCount, this); cseCallKillsMask = BitVecOps::MakeEmpty(cseLivenessTraits); - for (unsigned inx = 0; inx < optCSECandidateCount; inx++) + for (unsigned inx = 1; inx <= optCSECandidateCount; inx++) { - unsigned cseAvailBit = inx * 2; + unsigned cseAvailBit = getCSEAvailBit(inx); // a one preserves availability and a zero kills the availability // we generate this kind of bit pattern: 101010101010 @@ -1047,7 +1045,7 @@ void Compiler::optValnumCSE_InitDataFlow() block->bbCseGen = BitVecOps::MakeEmpty(cseLivenessTraits); } - // We walk the set of CSE candidates and set the bit corresponsing to the CSEindex + // We walk the set of CSE candidates and set the bit corresponding to the CSEindex // in the block's bbCseGen bitset // for (unsigned inx = 0; inx < optCSECandidateCount; inx++) @@ -1060,16 +1058,16 @@ void Compiler::optValnumCSE_InitDataFlow() while (lst != nullptr) { BasicBlock* block = lst->tslBlock; - unsigned CseAvailBit = genCSEnum2bit(CSEindex) * 2; - unsigned cseAvailCrossCallBit = CseAvailBit + 1; + unsigned cseAvailBit = getCSEAvailBit(CSEindex); + 
unsigned cseAvailCrossCallBit = getCSEAvailCrossCallBit(CSEindex); - // This CSE is generated in 'block', we always set the CseAvailBit + // This CSE is generated in 'block', we always set the cseAvailBit // If this block does not contain a call, we also set cseAvailCrossCallBit // // If we have a call in this block then in the loop below we walk the trees // backwards to find any CSEs that are generated after the last call in the block. // - BitVecOps::AddElemD(cseLivenessTraits, block->bbCseGen, CseAvailBit); + BitVecOps::AddElemD(cseLivenessTraits, block->bbCseGen, cseAvailBit); if ((block->bbFlags & BBF_HAS_CALL) == 0) { BitVecOps::AddElemD(cseLivenessTraits, block->bbCseGen, cseAvailCrossCallBit); @@ -1113,7 +1111,7 @@ void Compiler::optValnumCSE_InitDataFlow() if (IS_CSE_INDEX(tree->gtCSEnum)) { unsigned CSEnum = GET_CSE_INDEX(tree->gtCSEnum); - unsigned cseAvailCrossCallBit = (genCSEnum2bit(CSEnum) * 2) + 1; + unsigned cseAvailCrossCallBit = getCSEAvailCrossCallBit(CSEnum); BitVecOps::AddElemD(cseLivenessTraits, block->bbCseGen, cseAvailCrossCallBit); } if (tree->OperGet() == GT_CALL) @@ -1142,15 +1140,16 @@ void Compiler::optValnumCSE_InitDataFlow() bool headerPrinted = false; for (BasicBlock* const block : Blocks()) { - if (block->bbCseGen != nullptr) + if (!BitVecOps::IsEmpty(cseLivenessTraits, block->bbCseGen)) { if (!headerPrinted) { printf("\nBlocks that generate CSE def/uses\n"); headerPrinted = true; } - printf(FMT_BB, block->bbNum); - printf(" cseGen = %s\n", genES2str(cseLivenessTraits, block->bbCseGen)); + printf(FMT_BB " cseGen = ", block->bbNum); + optPrintCSEDataFlowSet(block->bbCseGen); + printf("\n"); } } } @@ -1184,6 +1183,7 @@ class CSE_DataFlow // BitVecOps::Assign(m_comp->cseLivenessTraits, m_preMergeOut, block->bbCseOut); +#if 0 #ifdef DEBUG if (m_comp->verbose) { @@ -1191,11 +1191,13 @@ class CSE_DataFlow printf(" :: cseOut = %s\n", genES2str(m_comp->cseLivenessTraits, block->bbCseOut)); } #endif // DEBUG +#endif // 0 } // Merge: 
perform the merging of each of the predecessor's liveness values (since this is a forward analysis) void Merge(BasicBlock* block, BasicBlock* predBlock, unsigned dupCount) { +#if 0 #ifdef DEBUG if (m_comp->verbose) { @@ -1204,15 +1206,18 @@ class CSE_DataFlow printf(" :: cseOut = %s\n", genES2str(m_comp->cseLivenessTraits, block->bbCseOut)); } #endif // DEBUG +#endif // 0 BitVecOps::IntersectionD(m_comp->cseLivenessTraits, block->bbCseIn, predBlock->bbCseOut); +#if 0 #ifdef DEBUG if (m_comp->verbose) { printf(" => cseIn = %s\n", genES2str(m_comp->cseLivenessTraits, block->bbCseIn)); } #endif // DEBUG +#endif // 0 } //------------------------------------------------------------------------ @@ -1272,6 +1277,7 @@ class CSE_DataFlow // bool notDone = !BitVecOps::Equal(m_comp->cseLivenessTraits, block->bbCseOut, m_preMergeOut); +#if 0 #ifdef DEBUG if (m_comp->verbose) { @@ -1288,6 +1294,7 @@ class CSE_DataFlow notDone ? "true" : "false"); } #endif // DEBUG +#endif // 0 return notDone; } @@ -1330,10 +1337,12 @@ void Compiler::optValnumCSE_DataFlow() for (BasicBlock* const block : Blocks()) { - printf(FMT_BB, block->bbNum); - printf(" cseIn = %s,", genES2str(cseLivenessTraits, block->bbCseIn)); - printf(" cseGen = %s,", genES2str(cseLivenessTraits, block->bbCseGen)); - printf(" cseOut = %s", genES2str(cseLivenessTraits, block->bbCseOut)); + printf(FMT_BB " in gen out\n", block->bbNum); + optPrintCSEDataFlowSet(block->bbCseIn); + printf("\n"); + optPrintCSEDataFlowSet(block->bbCseGen); + printf("\n"); + optPrintCSEDataFlowSet(block->bbCseOut); printf("\n"); } @@ -1347,17 +1356,17 @@ void Compiler::optValnumCSE_DataFlow() // // Using the information computed by CSE_DataFlow determine for each // CSE whether the CSE is a definition (if the CSE was not available) -// or if the CSE is a use (if the CSE was previously made available) -// The implementation iterates of all blocks setting 'available_cses' +// or if the CSE is a use (if the CSE was previously made available). 
+// The implementation iterates over all blocks setting 'available_cses' // to the CSEs that are available at input to the block. // When a CSE expression is encountered it is classified as either // as a definition (if the CSE is not in the 'available_cses' set) or -// as a use (if the CSE is in the 'available_cses' set). If the CSE +// as a use (if the CSE is in the 'available_cses' set). If the CSE // is a definition then it is added to the 'available_cses' set. // // This algorithm uncovers the defs and uses gradually and as it does // so it also builds the exception set that all defs make: 'defExcSetCurrent' -// and the exception set that the uses we have seen depend upon: 'defExcSetPromise' +// and the exception set that the uses we have seen depend upon: 'defExcSetPromise'. // // Typically expressions with the same normal ValueNum generate exactly the // same exception sets. There are two way that we can get different exception @@ -1371,12 +1380,11 @@ void Compiler::optValnumCSE_DataFlow() // 2. We stored an expression into a LclVar or into Memory and read it later // e.g. t = p.a; // e1 = (t + q.b) :: e1 has one NullPtrExc and e2 has two. -// e2 = (p.a + q.b) but both compute the same normal value// +// e2 = (p.a + q.b) but both compute the same normal value // e.g. m.a = p.a; // e1 = (m.a + q.b) :: e1 and e2 have different exception sets. 
// e2 = (p.a + q.b) but both compute the same normal value // -// void Compiler::optValnumCSE_Availablity() { #ifdef DEBUG @@ -1411,12 +1419,12 @@ void Compiler::optValnumCSE_Availablity() if (IS_CSE_INDEX(tree->gtCSEnum)) { unsigned CSEnum = GET_CSE_INDEX(tree->gtCSEnum); - unsigned CseAvailBit = genCSEnum2bit(CSEnum) * 2; - unsigned cseAvailCrossCallBit = CseAvailBit + 1; + unsigned cseAvailBit = getCSEAvailBit(CSEnum); + unsigned cseAvailCrossCallBit = getCSEAvailCrossCallBit(CSEnum); CSEdsc* desc = optCSEfindDsc(CSEnum); BasicBlock::weight_t stmw = block->getBBWeight(this); - isUse = BitVecOps::IsMember(cseLivenessTraits, available_cses, CseAvailBit); + isUse = BitVecOps::IsMember(cseLivenessTraits, available_cses, cseAvailBit); isDef = !isUse; // If is isn't a CSE use, it is a CSE def // Is this a "use", that we haven't yet marked as live across a call @@ -1446,7 +1454,7 @@ void Compiler::optValnumCSE_Availablity() printf(FMT_BB " ", block->bbNum); printTreeID(tree); - printf(" %s of CSE #%02u [weight=%s]%s\n", isUse ? "Use" : "Def", CSEnum, refCntWtd2str(stmw), + printf(" %s of " FMT_CSE " [weight=%s]%s\n", isUse ? "Use" : "Def", CSEnum, refCntWtd2str(stmw), madeLiveAcrossCall ? " *** Now Live Across Call ***" : ""); } #endif // DEBUG @@ -1477,7 +1485,7 @@ void Compiler::optValnumCSE_Availablity() // Is defExcSetCurrent still set to the uninit marker value of VNForNull() ? 
if (desc->defExcSetCurrent == vnStore->VNForNull()) { - // This is the first time visited, so record this defs exeception set + // This is the first time visited, so record this defs exception set desc->defExcSetCurrent = theLiberalExcSet; } @@ -1589,7 +1597,7 @@ void Compiler::optValnumCSE_Availablity() tree->gtCSEnum = TO_CSE_DEF(tree->gtCSEnum); // This CSE becomes available after this def - BitVecOps::AddElemD(cseLivenessTraits, available_cses, CseAvailBit); + BitVecOps::AddElemD(cseLivenessTraits, available_cses, cseAvailBit); BitVecOps::AddElemD(cseLivenessTraits, available_cses, cseAvailCrossCallBit); } else // We are visiting a CSE use @@ -1636,7 +1644,7 @@ void Compiler::optValnumCSE_Availablity() if (!vnStore->VNExcIsSubset(desc->defExcSetPromise, theLiberalExcSet)) { // We can't safely make this into a CSE use, because this - // CSE use has an exeception set item that is not promised + // CSE use has an exception set item that is not promised // by all of our CSE defs. // // We will omit this CSE use from the graph and proceed, @@ -1660,7 +1668,7 @@ void Compiler::optValnumCSE_Availablity() // In order to determine if a CSE is live across a call, we model availablity using two bits and // kill all of the cseAvailCrossCallBit for each CSE whenever we see a GT_CALL (unless the call - // generates A cse) + // generates a CSE). 
// if (tree->OperGet() == GT_CALL) { @@ -1690,7 +1698,7 @@ void Compiler::optValnumCSE_Availablity() // available_cses // unsigned CSEnum = GET_CSE_INDEX(tree->gtCSEnum); - unsigned cseAvailCrossCallBit = (genCSEnum2bit(CSEnum) * 2) + 1; + unsigned cseAvailCrossCallBit = getCSEAvailCrossCallBit(CSEnum); BitVecOps::AddElemD(cseLivenessTraits, available_cses, cseAvailCrossCallBit); } @@ -2027,14 +2035,14 @@ class CSE_Heuristic if (!Compiler::Is_Shared_Const_CSE(dsc->csdHashKey)) { - printf("CSE #%02u, {$%-3x, $%-3x} useCnt=%d: [def=%3f, use=%3f, cost=%3u%s]\n :: ", + printf(FMT_CSE ", {$%-3x, $%-3x} useCnt=%d: [def=%3f, use=%3f, cost=%3u%s]\n :: ", dsc->csdIndex, dsc->csdHashKey, dsc->defExcSetPromise, dsc->csdUseCount, def, use, cost, dsc->csdLiveAcrossCall ? ", call" : " "); } else { size_t kVal = Compiler::Decode_Shared_Const_CSE_Value(dsc->csdHashKey); - printf("CSE #%02u, {K_%p} useCnt=%d: [def=%3f, use=%3f, cost=%3u%s]\n :: ", dsc->csdIndex, + printf(FMT_CSE ", {K_%p} useCnt=%d: [def=%3f, use=%3f, cost=%3u%s]\n :: ", dsc->csdIndex, dspPtr(kVal), dsc->csdUseCount, def, use, cost, dsc->csdLiveAcrossCall ? ", call" : " "); } @@ -2814,7 +2822,7 @@ class CSE_Heuristic if (dsc->csdDefCount == 1) { - JITDUMP("CSE #%02u is single-def, so associated CSE temp V%02u will be in SSA\n", dsc->csdIndex, + JITDUMP(FMT_CSE " is single-def, so associated CSE temp V%02u will be in SSA\n", dsc->csdIndex, cseLclVarNum); m_pCompiler->lvaTable[cseLclVarNum].lvInSsa = true; @@ -2931,7 +2939,7 @@ class CSE_Heuristic if (IS_CSE_INDEX(lst->tslTree->gtCSEnum)) { ValueNum currVN = m_pCompiler->vnStore->VNLiberalNormalValue(lst->tslTree->gtVNPair); - printf("0x%x(%s " FMT_VN ") ", lst->tslTree, + printf("[%06d](%s " FMT_VN ") ", m_pCompiler->dspTreeID(lst->tslTree), IS_CSE_USE(lst->tslTree->gtCSEnum) ? 
"use" : "def", currVN); } lst = lst->tslNext; @@ -2996,7 +3004,7 @@ class CSE_Heuristic #ifdef DEBUG if (m_pCompiler->verbose) { - printf("\nWorking on the replacement of the CSE #%02u use at ", exp->gtCSEnum); + printf("\nWorking on the replacement of the " FMT_CSE " use at ", exp->gtCSEnum); Compiler::printTreeID(exp); printf(" in " FMT_BB "\n", blk->bbNum); } @@ -3164,7 +3172,7 @@ class CSE_Heuristic #ifdef DEBUG if (m_pCompiler->verbose) { - printf("\nCSE #%02u def at ", GET_CSE_INDEX(exp->gtCSEnum)); + printf("\n" FMT_CSE " def at ", GET_CSE_INDEX(exp->gtCSEnum)); Compiler::printTreeID(exp); printf(" replaced in " FMT_BB " with def of V%02u\n", blk->bbNum, cseLclVarNum); } @@ -3314,13 +3322,13 @@ class CSE_Heuristic if (dsc->defExcSetPromise == ValueNumStore::NoVN) { - JITDUMP("Abandoned CSE #%02u because we had defs with different Exc sets\n", candidate.CseIndex()); + JITDUMP("Abandoned " FMT_CSE " because we had defs with different Exc sets\n", candidate.CseIndex()); continue; } if (dsc->csdStructHndMismatch) { - JITDUMP("Abandoned CSE #%02u because we had mismatching struct handles\n", candidate.CseIndex()); + JITDUMP("Abandoned " FMT_CSE " because we had mismatching struct handles\n", candidate.CseIndex()); continue; } @@ -3328,7 +3336,7 @@ class CSE_Heuristic if (candidate.UseCount() == 0) { - JITDUMP("Skipped CSE #%02u because use count is 0\n", candidate.CseIndex()); + JITDUMP("Skipped " FMT_CSE " because use count is 0\n", candidate.CseIndex()); continue; } @@ -3337,14 +3345,14 @@ class CSE_Heuristic { if (!Compiler::Is_Shared_Const_CSE(dsc->csdHashKey)) { - printf("\nConsidering CSE #%02u {$%-3x, $%-3x} [def=%3f, use=%3f, cost=%3u%s]\n", + printf("\nConsidering " FMT_CSE " {$%-3x, $%-3x} [def=%3f, use=%3f, cost=%3u%s]\n", candidate.CseIndex(), dsc->csdHashKey, dsc->defExcSetPromise, candidate.DefCount(), candidate.UseCount(), candidate.Cost(), dsc->csdLiveAcrossCall ? 
", call" : " "); } else { size_t kVal = Compiler::Decode_Shared_Const_CSE_Value(dsc->csdHashKey); - printf("\nConsidering CSE #%02u {K_%p} [def=%3f, use=%3f, cost=%3u%s]\n", candidate.CseIndex(), + printf("\nConsidering " FMT_CSE " {K_%p} [def=%3f, use=%3f, cost=%3u%s]\n", candidate.CseIndex(), dspPtr(kVal), candidate.DefCount(), candidate.UseCount(), candidate.Cost(), dsc->csdLiveAcrossCall ? ", call" : " "); } @@ -3428,11 +3436,6 @@ void Compiler::optValnumCSE_Heuristic() void Compiler::optOptimizeValnumCSEs() { #ifdef DEBUG - if (verbose) - { - printf("\n*************** In optOptimizeValnumCSEs()\n"); - } - if (optConfigDisableCSE()) { return; // Disabled by JitNoCSE @@ -3441,22 +3444,13 @@ void Compiler::optOptimizeValnumCSEs() optValnumCSE_phase = true; - /* Initialize the expression tracking logic */ - optValnumCSE_Init(); - /* Locate interesting expressions and assign indices to them */ - - if (optValnumCSE_Locate() > 0) + if (optValnumCSE_Locate()) { - optCSECandidateTotal += optCSECandidateCount; - optValnumCSE_InitDataFlow(); - optValnumCSE_DataFlow(); - optValnumCSE_Availablity(); - optValnumCSE_Heuristic(); } @@ -3807,21 +3801,11 @@ bool Compiler::optConfigDisableCSE2() void Compiler::optOptimizeCSEs() { -#ifdef DEBUG - if (verbose) - { - printf("\n*************** In optOptimizeCSEs()\n"); - printf("Blocks/Trees at start of optOptimizeCSE phase\n"); - fgDispBasicBlocks(true); - } -#endif // DEBUG - optCSECandidateCount = 0; optCSEstart = lvaCount; INDEBUG(optEnsureClearCSEInfo()); optOptimizeValnumCSEs(); - EndPhase(PHASE_OPTIMIZE_VALNUM_CSES); } /***************************************************************************** @@ -3873,4 +3857,38 @@ void Compiler::optEnsureClearCSEInfo() } } +//------------------------------------------------------------------------ +// optPrintCSEDataFlowSet: Print out one of the CSE dataflow sets bbCseGen, bbCseIn, bbCseOut, +// interpreting the bits in a more useful way for the dump. 
+// +// Arguments: +// cseDataFlowSet - One of the dataflow sets to display +// includeBits - Display the actual bits of the set as well +// +void Compiler::optPrintCSEDataFlowSet(EXPSET_VALARG_TP cseDataFlowSet, bool includeBits /* = true */) +{ + if (includeBits) + { + printf("%s ", genES2str(cseLivenessTraits, cseDataFlowSet)); + } + + bool first = true; + for (unsigned cseIndex = 1; cseIndex <= optCSECandidateCount; cseIndex++) + { + unsigned cseAvailBit = getCSEAvailBit(cseIndex); + unsigned cseAvailCrossCallBit = getCSEAvailCrossCallBit(cseIndex); + + if (BitVecOps::IsMember(cseLivenessTraits, cseDataFlowSet, cseAvailBit)) + { + if (!first) + { + printf(", "); + } + const bool isAvailCrossCall = BitVecOps::IsMember(cseLivenessTraits, cseDataFlowSet, cseAvailCrossCallBit); + printf(FMT_CSE "%s", cseIndex, isAvailCrossCall ? ".c" : ""); + first = false; + } + } +} + #endif // DEBUG diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index a7df5b95905a2..748d5feabb562 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -38,7 +38,6 @@ void Compiler::optInit() optNativeCallCount = 0; optAssertionCount = 0; optAssertionDep = nullptr; - optCSECandidateTotal = 0; optCSEstart = UINT_MAX; optCSEcount = 0; } @@ -5627,8 +5626,8 @@ void Compiler::optHoistLoopCode() #endif #if 0 - // The code in this #if has been useful in debugging loop cloning issues, by - // enabling selective enablement of the loop cloning optimization according to + // The code in this #if has been useful in debugging loop hoisting issues, by + // enabling selective enablement of the loop hoisting optimization according to // method hash. 
#ifdef DEBUG
     unsigned methHash = info.compMethodHash();
@@ -5650,7 +5649,7 @@ void Compiler::optHoistLoopCode()
         return;
     printf("Doing loop hoisting in %s (0x%x).\n", info.compFullName, methHash);
 #endif // DEBUG
-#endif // 0 -- debugging loop cloning issues
+#endif // 0 -- debugging loop hoisting issues
 
 #ifdef DEBUG
     if (verbose)
@@ -5899,6 +5898,8 @@ void Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt)
             printf("\n   LOOPV-FP(%d)=", pLoopDsc->lpLoopVarFPCount);
             lvaDispVarSet(loopFPVars);
+
+            printf("\n");
         }
 #endif
 }
diff --git a/src/coreclr/jit/phase.cpp b/src/coreclr/jit/phase.cpp
index 530f8e74cbba0..f08134bf11532 100644
--- a/src/coreclr/jit/phase.cpp
+++ b/src/coreclr/jit/phase.cpp
@@ -84,8 +84,9 @@ void Phase::PrePhase()
     //
     // Currently the list is just the set of phases that have custom
     // derivations from the Phase class.
-    static Phases s_allowlist[] = {PHASE_BUILD_SSA, PHASE_RATIONALIZE, PHASE_LOWERING, PHASE_STACK_LEVEL_SETTER};
-    bool doPrePhase = false;
+    static Phases s_allowlist[] = {PHASE_BUILD_SSA, PHASE_OPTIMIZE_VALNUM_CSES, PHASE_RATIONALIZE, PHASE_LOWERING,
+                                   PHASE_STACK_LEVEL_SETTER};
+    bool          doPrePhase = false;
 
     for (size_t i = 0; i < sizeof(s_allowlist) / sizeof(Phases); i++)
     {
 
From 589be444e7f8d8abde7a873b25d1dcd842615afc Mon Sep 17 00:00:00 2001
From: Bruce Forstall
Date: Tue, 29 Jun 2021 16:40:23 -0700
Subject: [PATCH 06/18] Mark cloned array indexes as non-faulting

When generating loop cloning conditions, mark array index expressions as non-faulting, as we have already null- and range-checked the array before generating an index expression.

I also added similar code to mark array length expressions as non-faulting, for the same reason. However, that leads to CQ losses because of downstream CSE effects.
--- src/coreclr/jit/gentree.h | 1 + src/coreclr/jit/loopcloning.cpp | 27 +++++++++++++++++++-------- src/coreclr/jit/loopcloning.h | 8 ++------ src/coreclr/jit/morph.cpp | 11 ++++++----- 4 files changed, 28 insertions(+), 19 deletions(-) diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index b043956468848..f88f3c2f775fc 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -494,6 +494,7 @@ enum GenTreeFlags : unsigned int GTF_INX_RNGCHK = 0x80000000, // GT_INDEX/GT_INDEX_ADDR -- the array reference should be range-checked. GTF_INX_STRING_LAYOUT = 0x40000000, // GT_INDEX -- this uses the special string array layout + GTF_INX_NONFAULTING = 0x20000000, // GT_INDEX -- the INDEX does not throw an exception (morph to GTF_IND_NONFAULTING) GTF_IND_TGT_NOT_HEAP = 0x80000000, // GT_IND -- the target is not on the heap GTF_IND_VOLATILE = 0x40000000, // GT_IND -- the load or store must use volatile sematics (this is a nop on X86) diff --git a/src/coreclr/jit/loopcloning.cpp b/src/coreclr/jit/loopcloning.cpp index eac58c0326de3..d7519568bc75b 100644 --- a/src/coreclr/jit/loopcloning.cpp +++ b/src/coreclr/jit/loopcloning.cpp @@ -74,14 +74,25 @@ GenTree* LC_Array::ToGenTree(Compiler* comp, BasicBlock* bb) arr = comp->gtNewIndexRef(TYP_REF, arr, comp->gtNewLclvNode(arrIndex->indLcls[i], comp->lvaTable[arrIndex->indLcls[i]].lvType)); - // Clear the range check flag: we guarantee that all necessary range checking has already - // been done by the time this array index expression is invoked. - arr->gtFlags &= ~GTF_INX_RNGCHK; + // Clear the range check flag and mark the index as non-faulting: we guarantee that all necessary range + // checking has already been done by the time this array index expression is invoked. + arr->gtFlags &= ~(GTF_INX_RNGCHK | GTF_EXCEPT); + arr->gtFlags |= GTF_INX_NONFAULTING; } // If asked for arrlen invoke arr length operator. 
if (oper == ArrLen) { GenTree* arrLen = comp->gtNewArrLen(TYP_INT, arr, OFFSETOF__CORINFO_Array__length, bb); + + // We already guaranteed (by a sequence of preceding checks) that the array length operator will not + // throw an exception because we null checked the base array. + // So, we should be able to do the following: + // arrLen->gtFlags &= ~GTF_EXCEPT; + // arrLen->gtFlags |= GTF_IND_NONFAULTING; + // However, we then end up with a mix of non-faulting array length operators as well as normal faulting + // array length operators in the slow-path of the cloned loops. CSE doesn't keep these separate, so bails + // out on creating CSEs on this very useful type of CSE, leading to CQ losses in the cloned loop fast path. + // TODO-CQ: fix this. return arrLen; } else @@ -1017,8 +1028,8 @@ bool Compiler::optDeriveLoopCloningConditions(unsigned loopNum, LoopCloneContext { // limit <= arrLen LcJaggedArrayOptInfo* arrIndexInfo = optInfo->AsLcJaggedArrayOptInfo(); - LC_Array arrLen(LC_Array::Jagged, &arrIndexInfo->arrIndex, arrIndexInfo->dim, LC_Array::ArrLen); - LC_Ident arrLenIdent = LC_Ident(arrLen); + LC_Array arrLen(LC_Array::Jagged, &arrIndexInfo->arrIndex, arrIndexInfo->dim, LC_Array::ArrLen); + LC_Ident arrLenIdent = LC_Ident(arrLen); LC_Condition cond(GT_LE, LC_Expr(ident), LC_Expr(arrLenIdent)); context->EnsureConditions(loopNum)->Push(cond); @@ -1332,9 +1343,9 @@ void Compiler::optPerformStaticOptimizations(unsigned loopNum, LoopCloneContext* // path. Later, when the inner loop is cloned, we want to remove the a[i][j] bounds check. If // we clone the inner loop, we know that the a[i] bounds check isn't required because we'll add // it to the loop cloning conditions. On the other hand, we can clone a loop where we get rid of - // the nested bounds check but nobody has gotten rid of the outer bounds check. 
As before, we know - // the outer bounds check is not needed because it's been added to the cloning conditions, so we - // can get rid of the bounds check here. + // the nested bounds check but nobody has gotten rid of the outer bounds check. As before, we + // know the outer bounds check is not needed because it's been added to the cloning conditions, + // so we can get rid of the bounds check here. // optRemoveCommaBasedRangeCheck(bndsChkNode, arrIndexInfo->stmt); } diff --git a/src/coreclr/jit/loopcloning.h b/src/coreclr/jit/loopcloning.h index 5106df7fd8bad..c5ed53c31bad8 100644 --- a/src/coreclr/jit/loopcloning.h +++ b/src/coreclr/jit/loopcloning.h @@ -683,12 +683,8 @@ struct LoopCloneContext // The array of block levels of conditions for each loop. (loop x level x conditions) jitstd::vector*>*> blockConditions; - LoopCloneContext(unsigned loopCount, CompAllocator alloc) : - alloc(alloc), - optInfo(alloc), - conditions(alloc), - derefs(alloc), - blockConditions(alloc) + LoopCloneContext(unsigned loopCount, CompAllocator alloc) + : alloc(alloc), optInfo(alloc), conditions(alloc), derefs(alloc), blockConditions(alloc) { optInfo.resize(loopCount, nullptr); conditions.resize(loopCount, nullptr); diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 2b40d8548c68f..c8dec1c1da1ad 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -5577,8 +5577,9 @@ GenTree* Compiler::fgMorphArrayIndex(GenTree* tree) GenTree* arrRef = asIndex->Arr(); GenTree* index = asIndex->Index(); - bool chkd = ((tree->gtFlags & GTF_INX_RNGCHK) != 0); // if false, range checking will be disabled - bool nCSE = ((tree->gtFlags & GTF_DONT_CSE) != 0); + bool chkd = ((tree->gtFlags & GTF_INX_RNGCHK) != 0); // if false, range checking will be disabled + bool indexNonFaulting = ((tree->gtFlags & GTF_INX_NONFAULTING) != 0); // if true, mark GTF_IND_NONFAULTING + bool nCSE = ((tree->gtFlags & GTF_DONT_CSE) != 0); GenTree* arrRefDefn = nullptr; // non-NULL if we 
need to allocate a temp for the arrRef expression GenTree* indexDefn = nullptr; // non-NULL if we need to allocate a temp for the index expression @@ -5738,9 +5739,9 @@ GenTree* Compiler::fgMorphArrayIndex(GenTree* tree) this->compFloatingPointUsed = true; } - // We've now consumed the GTF_INX_RNGCHK, and the node + // We've now consumed the GTF_INX_RNGCHK and GTF_INX_NONFAULTING, and the node // is no longer a GT_INDEX node. - tree->gtFlags &= ~GTF_INX_RNGCHK; + tree->gtFlags &= ~(GTF_INX_RNGCHK | GTF_INX_NONFAULTING); tree->AsOp()->gtOp1 = addr; @@ -5748,7 +5749,7 @@ GenTree* Compiler::fgMorphArrayIndex(GenTree* tree) tree->gtFlags |= GTF_IND_ARR_INDEX; // If there's a bounds check, the indir won't fault. - if (bndsChk) + if (bndsChk || indexNonFaulting) { tree->gtFlags |= GTF_IND_NONFAULTING; } From f45028f0b89e9bf1c87f8aa9fb2b3366400ca193 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Wed, 30 Jun 2021 08:40:01 -0700 Subject: [PATCH 07/18] Don't count zero-sized align instructions in PerfScore --- src/coreclr/jit/emit.cpp | 10 +++++----- src/coreclr/jit/emit.h | 2 ++ src/coreclr/jit/emitxarch.cpp | 11 +++++++++++ 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 352d1b2acfcd7..b687146ca2cd9 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -1184,7 +1184,7 @@ int emitter::instrDesc::idAddrUnion::iiaGetJitDataOffset() const //---------------------------------------------------------------------------------------- // insEvaluateExecutionCost: -// Returns the estimate execution cost fortyhe current instruction +// Returns the estimated execution cost for the current instruction // // Arguments: // id - The current instruction descriptor to be evaluated @@ -1193,8 +1193,6 @@ int emitter::instrDesc::idAddrUnion::iiaGetJitDataOffset() const // calls getInsExecutionCharacteristics and uses the result // to compute an estimated execution cost // -// Notes: -// float 
emitter::insEvaluateExecutionCost(instrDesc* id) { insExecutionCharacteristics result = getInsExecutionCharacteristics(id); @@ -1202,8 +1200,10 @@ float emitter::insEvaluateExecutionCost(instrDesc* id) float latency = result.insLatency; unsigned memAccessKind = result.insMemoryAccessKind; - // Check for PERFSCORE_THROUGHPUT_ILLEGAL and PERFSCORE_LATENCY_ILLEGAL - assert(throughput > 0.0); + // Check for PERFSCORE_THROUGHPUT_ILLEGAL and PERFSCORE_LATENCY_ILLEGAL. + // Note that 0.0 throughput is allowed for pseudo-instructions in the instrDesc list that won't actually + // generate code. + assert(throughput >= 0.0); assert(latency >= 0.0); if (memAccessKind == PERFSCORE_MEMORY_WRITE) diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 666201fc98bf0..c2ea67869acbc 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -1229,6 +1229,8 @@ class emitter #define PERFSCORE_THROUGHPUT_ILLEGAL -1024.0f +#define PERFSCORE_THROUGHPUT_ZERO 0.0f // Only used for pseudo-instructions that don't generate code + #define PERFSCORE_THROUGHPUT_6X (1.0f / 6.0f) // Hextuple issue #define PERFSCORE_THROUGHPUT_5X 0.20f // Pentuple issue #define PERFSCORE_THROUGHPUT_4X 0.25f // Quad issue diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index a0a5e3283d4e9..f4c5d30256710 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -14669,6 +14669,17 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins switch (ins) { case INS_align: +#if FEATURE_LOOP_ALIGN + if (id->idCodeSize() == 0) + { + // We're not going to generate any instruction, so it doesn't count for PerfScore. 
+ result.insThroughput = PERFSCORE_THROUGHPUT_ZERO; + result.insLatency = PERFSCORE_LATENCY_ZERO; + break; + } +#endif + FALLTHROUGH; + case INS_nop: case INS_int3: assert(memFmt == IF_NONE); From 0efd5c45f98424d952992e2101452db245c0bacb Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Thu, 1 Jul 2021 15:08:52 -0700 Subject: [PATCH 08/18] Add COMPlus_JitDasmWithAlignmentBoundaries This outputs the alignment boundaries without requiring outputting the actual addresses. It makes it easier to diff changes. --- src/coreclr/jit/compiler.cpp | 6 ++++++ src/coreclr/jit/compiler.h | 1 + src/coreclr/jit/emit.cpp | 4 ++-- src/coreclr/jit/jitconfigvalues.h | 3 +++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 7019d8afad6c5..bf49e63ab0cbb 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -2988,6 +2988,7 @@ void Compiler::compInitOptions(JitFlags* jitFlags) opts.disAsmSpilled = false; opts.disDiffable = false; opts.disAddr = false; + opts.disAlignment = false; opts.dspCode = false; opts.dspEHTable = false; opts.dspDebugInfo = false; @@ -3136,6 +3137,11 @@ void Compiler::compInitOptions(JitFlags* jitFlags) opts.disAddr = true; } + if (JitConfig.JitDasmWithAlignmentBoundaries() != 0) + { + opts.disAlignment = true; + } + if (JitConfig.JitLongAddress() != 0) { opts.compLongAddress = true; diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 79aa8221eb03d..414bed4fdc914 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -9338,6 +9338,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX bool disasmWithGC; // Display GC info interleaved with disassembly. 
bool disDiffable; // Makes the Disassembly code 'diff-able' bool disAddr; // Display process address next to each instruction in disassembly code + bool disAlignment; // Display alignment boundaries in disassembly code bool disAsm2; // Display native code after it is generated using external disassembler bool dspOrder; // Display names of each of the methods that we ngen/jit bool dspUnwind; // Display the unwind info output diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index b687146ca2cd9..44bec2c75fdf6 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -3824,7 +3824,7 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp) } // Print the alignment boundary - if ((emitComp->opts.disAsm || emitComp->verbose) && emitComp->opts.disAddr) + if ((emitComp->opts.disAsm || emitComp->verbose) && (emitComp->opts.disAddr || emitComp->opts.disAlignment)) { size_t currAddr = (size_t)*dp; size_t lastBoundaryAddr = currAddr & ~((size_t)emitComp->opts.compJitAlignLoopBoundary - 1); @@ -3867,7 +3867,7 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp) printf(" %dB boundary ...............................\n", (emitComp->opts.compJitAlignLoopBoundary)); } } -#endif +#endif // DEBUG return is; } diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index cc10adc0b8809..4c573b7575257 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -68,6 +68,9 @@ CONFIG_INTEGER(JitAlignLoopAdaptive, W("JitAlignLoopAdaptive"), 1) // If set, perform adaptive loop alignment that limits number of padding based on loop size. +// Print the alignment boundaries in disassembly. 
+CONFIG_INTEGER(JitDasmWithAlignmentBoundaries, W("JitDasmWithAlignmentBoundaries"), 0) + CONFIG_INTEGER(JitDirectAlloc, W("JitDirectAlloc"), 0) CONFIG_INTEGER(JitDoubleAlign, W("JitDoubleAlign"), 1) CONFIG_INTEGER(JitDumpASCII, W("JitDumpASCII"), 1) // Uses only ASCII characters in tree dumps From 546785b20643b83714cd7aaf327df7d5593e4edd Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Fri, 2 Jul 2021 17:49:36 -0700 Subject: [PATCH 09/18] Improve bounds check output --- src/coreclr/jit/loopcloning.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/loopcloning.cpp b/src/coreclr/jit/loopcloning.cpp index d7519568bc75b..fdefb3ed8658e 100644 --- a/src/coreclr/jit/loopcloning.cpp +++ b/src/coreclr/jit/loopcloning.cpp @@ -1320,10 +1320,14 @@ void Compiler::optPerformStaticOptimizations(unsigned loopNum, LoopCloneContext* for (unsigned dim = 0; dim <= arrIndexInfo->dim; dim++) { + GenTree* bndsChkNode = arrIndexInfo->arrIndex.bndsChks[dim]; + #ifdef DEBUG if (verbose) { - printf("Remove bounds check for stmt " FMT_STMT ", dim %d, ", arrIndexInfo->stmt->GetID(), dim); + printf("Remove bounds check "); + printTreeID(bndsChkNode->gtGetOp1()); + printf(" for " FMT_STMT ", dim% d, ", arrIndexInfo->stmt->GetID(), dim); arrIndexInfo->arrIndex.Print(); printf(", bounds check nodes: "); arrIndexInfo->arrIndex.PrintBoundsCheckNodes(); @@ -1331,7 +1335,6 @@ void Compiler::optPerformStaticOptimizations(unsigned loopNum, LoopCloneContext* } #endif // DEBUG - GenTree* bndsChkNode = arrIndexInfo->arrIndex.bndsChks[dim]; if (bndsChkNode->gtGetOp1()->OperIsBoundsCheck()) { // This COMMA node will only represent a bounds check if we've haven't already removed this @@ -2324,7 +2327,7 @@ Compiler::fgWalkResult Compiler::optCanOptimizeByLoopCloning(GenTree* tree, Loop #ifdef DEBUG if (verbose) { - printf("Found ArrIndex at tree "); + printf("Found ArrIndex at " FMT_BB " " FMT_STMT " tree ", arrIndex.useBlock->bbNum, info->stmt->GetID()); 
printTreeID(tree); printf(" which is equivalent to: "); arrIndex.Print(); From 514b99aa2524a4098f35eb84d38c20b703736a73 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Fri, 2 Jul 2021 18:56:44 -0700 Subject: [PATCH 10/18] Improve emitter label printing Create function for printing bound emitter labels. Also, add debug code to associate a BasicBlock with an insGroup, and output the block number and ID with the emitter label in JitDump, so it's easier to find where a group of generated instructions came from. --- src/coreclr/jit/codegencommon.cpp | 4 +- src/coreclr/jit/codegenlinear.cpp | 2 +- src/coreclr/jit/emit.cpp | 120 ++++++++++++++++++++---------- src/coreclr/jit/emit.h | 11 ++- src/coreclr/jit/emitarm.cpp | 6 +- src/coreclr/jit/emitarm64.cpp | 8 +- src/coreclr/jit/emitxarch.cpp | 4 +- 7 files changed, 104 insertions(+), 51 deletions(-) diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 78926e64fda65..f7e86748c4bfe 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -1070,7 +1070,7 @@ void CodeGen::genDefineTempLabel(BasicBlock* label) { genLogLabel(label); label->bbEmitCookie = GetEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur, - gcInfo.gcRegByrefSetCur, false DEBUG_ARG(label->bbNum)); + gcInfo.gcRegByrefSetCur, false DEBUG_ARG(label)); } // genDefineInlineTempLabel: Define an inline label that does not affect the GC @@ -2066,7 +2066,7 @@ void CodeGen::genInsertNopForUnwinder(BasicBlock* block) block->bbUnwindNopEmitCookie = GetEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur, - false DEBUG_ARG(block->bbNum)); + false DEBUG_ARG(block)); instGen(INS_nop); } diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index 8bf1f852ffd4d..e393862b11213 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -356,7 +356,7 @@ void CodeGen::genCodeForBBlist() 
// Mark a label and update the current set of live GC refs block->bbEmitCookie = GetEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur, - gcInfo.gcRegByrefSetCur, false DEBUG_ARG(block->bbNum)); + gcInfo.gcRegByrefSetCur, false DEBUG_ARG(block)); } if (block == compiler->fgFirstColdBlock) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 44bec2c75fdf6..beec4a91a572f 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -843,7 +843,7 @@ insGroup* emitter::emitSavIG(bool emitAdd) #ifdef DEBUG if (emitComp->opts.dspCode) { - printf("\n G_M%03u_IG%02u:", emitComp->compMethodID, ig->igNum); + printf("\n %s:", emitLabelString(ig)); if (emitComp->verbose) { printf(" ; offs=%06XH, funclet=%02u, bbWeight=%s", ig->igOffs, ig->igFuncIdx, @@ -2504,7 +2504,7 @@ bool emitter::emitNoGChelper(CORINFO_METHOD_HANDLE methHnd) void* emitter::emitAddLabel(VARSET_VALARG_TP GCvars, regMaskTP gcrefRegs, regMaskTP byrefRegs, - bool isFinallyTarget DEBUG_ARG(unsigned bbNum)) + bool isFinallyTarget DEBUG_ARG(BasicBlock* block)) { /* Create a new IG if the current one is non-empty */ @@ -2533,7 +2533,10 @@ void* emitter::emitAddLabel(VARSET_VALARG_TP GCvars, #endif // defined(FEATURE_EH_FUNCLETS) && defined(TARGET_ARM) #ifdef DEBUG - JITDUMP("Mapped " FMT_BB " to G_M%03u_IG%02u\n", bbNum, emitComp->compMethodID, emitCurIG->igNum); + JITDUMP("Mapped " FMT_BB " to %s\n", block->bbNum, emitLabelString(emitCurIG)); + + assert(emitCurIG->igBlock == nullptr); + emitCurIG->igBlock = block; if (EMIT_GC_VERBOSE) { @@ -2548,6 +2551,7 @@ void* emitter::emitAddLabel(VARSET_VALARG_TP GCvars, printf("\n"); } #endif + return emitCurIG; } @@ -2561,6 +2565,40 @@ void* emitter::emitAddInlineLabel() return emitCurIG; } +#ifdef DEBUG + +//----------------------------------------------------------------------------- +// emitPrintLabel: Print the assembly label for an insGroup. 
We could use emitter::emitLabelString() +// to be consistent, but that seems silly. +// +void emitter::emitPrintLabel(insGroup* ig) +{ + printf("G_M%03u_IG%02u", emitComp->compMethodID, ig->igNum); +} + +//----------------------------------------------------------------------------- +// emitLabelString: Return label string for an insGroup, for use in debug output. +// This can be called up to four times in a single 'printf' before the static buffers +// get reused. +// +// Returns: +// String with insGroup label +// +const char* emitter::emitLabelString(insGroup* ig) +{ + const int TEMP_BUFFER_LEN = 40; + static unsigned curBuf = 0; + static char buf[4][TEMP_BUFFER_LEN]; + const char* retbuf; + + sprintf_s(buf[curBuf], TEMP_BUFFER_LEN, "G_M%03u_IG%02u", emitComp->compMethodID, ig->igNum); + retbuf = buf[curBuf]; + curBuf = (curBuf + 1) % 4; + return retbuf; +} + +#endif // DEBUG + #ifdef TARGET_ARMARCH // Does the argument location point to an IG at the end of a function or funclet? @@ -3502,7 +3540,7 @@ void emitter::emitDispIG(insGroup* ig, insGroup* igPrev, bool verbose) const int TEMP_BUFFER_LEN = 40; char buff[TEMP_BUFFER_LEN]; - sprintf_s(buff, TEMP_BUFFER_LEN, "G_M%03u_IG%02u: ", emitComp->compMethodID, ig->igNum); + sprintf_s(buff, TEMP_BUFFER_LEN, "%s: ", emitLabelString(ig)); printf("%s; ", buff); // We dump less information when we're only interleaving GC info with a disassembly listing, @@ -3599,9 +3637,16 @@ void emitter::emitDispIG(insGroup* ig, insGroup* igPrev, bool verbose) else { const char* separator = ""; + + if (jitdump && (ig->igBlock != nullptr)) + { + printf("%s%s", separator, ig->igBlock->dspToString()); + separator = ", "; + } + if (jitdump) { - printf("offs=%06XH, size=%04XH", ig->igOffs, ig->igSize); + printf("%soffs=%06XH, size=%04XH", separator, ig->igOffs, ig->igSize); separator = ", "; } @@ -4229,7 +4274,7 @@ void emitter::emitJumpDistBind() { if (tgtIG) { - printf(" to G_M%03u_IG%02u\n", emitComp->compMethodID, tgtIG->igNum); + 
printf(" to %s\n", emitLabelString(tgtIG)); } else { @@ -4687,8 +4732,7 @@ void emitter::emitLoopAlignment() // all IGs that follows this IG and participate in a loop. emitCurIG->igFlags |= IGF_LOOP_ALIGN; - JITDUMP("Adding 'align' instruction of %d bytes in G_M%03u_IG%02u.\n", paddingBytes, emitComp->compMethodID, - emitCurIG->igNum); + JITDUMP("Adding 'align' instruction of %d bytes in %s.\n", paddingBytes, emitLabelString(emitCurIG)); #ifdef DEBUG emitComp->loopAlignCandidates++; @@ -4973,9 +5017,9 @@ void emitter::emitLoopAlignAdjustments() alignInstr = prevAlignInstr; } - JITDUMP("Adjusted alignment of G_M%03u_IG%02u from %u to %u.\n", emitComp->compMethodID, alignIG->igNum, + JITDUMP("Adjusted alignment of %s from %u to %u.\n", emitLabelString(alignIG), estimatedPaddingNeeded, actualPaddingNeeded); - JITDUMP("Adjusted size of G_M%03u_IG%02u from %u to %u.\n", emitComp->compMethodID, alignIG->igNum, + JITDUMP("Adjusted size of %s from %u to %u.\n", emitLabelString(alignIG), (alignIG->igSize + diff), alignIG->igSize); } @@ -4985,7 +5029,7 @@ void emitter::emitLoopAlignAdjustments() insGroup* adjOffUptoIG = alignInstr->idaNext != nullptr ? 
alignInstr->idaNext->idaIG : emitIGlast; while ((adjOffIG != nullptr) && (adjOffIG->igNum <= adjOffUptoIG->igNum)) { - JITDUMP("Adjusted offset of G_M%03u_IG%02u from %04X to %04X\n", emitComp->compMethodID, adjOffIG->igNum, + JITDUMP("Adjusted offset of %s from %04X to %04X\n", emitLabelString(adjOffIG), adjOffIG->igOffs, (adjOffIG->igOffs - alignBytesRemoved)); adjOffIG->igOffs -= alignBytesRemoved; adjOffIG = adjOffIG->igNext; @@ -5045,8 +5089,8 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs // No padding if loop is already aligned if ((offset & (alignmentBoundary - 1)) == 0) { - JITDUMP(";; Skip alignment: 'Loop at G_M%03u_IG%02u already aligned at %dB boundary.'\n", - emitComp->compMethodID, ig->igNext->igNum, alignmentBoundary); + JITDUMP(";; Skip alignment: 'Loop at %s already aligned at %dB boundary.'\n", + emitLabelString(ig->igNext), alignmentBoundary); return 0; } @@ -5070,8 +5114,8 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs // No padding if loop is big if (loopSize > maxLoopSize) { - JITDUMP(";; Skip alignment: 'Loop at G_M%03u_IG%02u is big. LoopSize= %d, MaxLoopSize= %d.'\n", - emitComp->compMethodID, ig->igNext->igNum, loopSize, maxLoopSize); + JITDUMP(";; Skip alignment: 'Loop at %s is big. 
LoopSize= %d, MaxLoopSize= %d.'\n", + emitLabelString(ig->igNext), loopSize, maxLoopSize); return 0; } @@ -5097,17 +5141,16 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs if (nPaddingBytes == 0) { skipPadding = true; - JITDUMP(";; Skip alignment: 'Loop at G_M%03u_IG%02u already aligned at %uB boundary.'\n", - emitComp->compMethodID, ig->igNext->igNum, alignmentBoundary); + JITDUMP(";; Skip alignment: 'Loop at %s already aligned at %uB boundary.'\n", + emitLabelString(ig->igNext), alignmentBoundary); } // Check if the alignment exceeds new maxPadding limit else if (nPaddingBytes > nMaxPaddingBytes) { skipPadding = true; - JITDUMP(";; Skip alignment: 'Loop at G_M%03u_IG%02u PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, " + JITDUMP(";; Skip alignment: 'Loop at %s PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, " "AlignmentBoundary= %dB.'\n", - emitComp->compMethodID, ig->igNext->igNum, nPaddingBytes, nMaxPaddingBytes, loopSize, - alignmentBoundary); + emitLabelString(ig->igNext), nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary); } } @@ -5129,8 +5172,8 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs else { // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment. - JITDUMP(";; Skip alignment: 'Loop at G_M%03u_IG%02u is aligned to fit in %d blocks of %d chunks.'\n", - emitComp->compMethodID, ig->igNext->igNum, minBlocksNeededForLoop, alignmentBoundary); + JITDUMP(";; Skip alignment: 'Loop at %s is aligned to fit in %d blocks of %d chunks.'\n", + emitLabelString(ig->igNext), minBlocksNeededForLoop, alignmentBoundary); } } } @@ -5158,13 +5201,13 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs else { // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment. 
- JITDUMP(";; Skip alignment: 'Loop at G_M%03u_IG%02u is aligned to fit in %d blocks of %d chunks.'\n", - emitComp->compMethodID, ig->igNext->igNum, minBlocksNeededForLoop, alignmentBoundary); + JITDUMP(";; Skip alignment: 'Loop at %s is aligned to fit in %d blocks of %d chunks.'\n", + emitLabelString(ig->igNext), minBlocksNeededForLoop, alignmentBoundary); } } - JITDUMP(";; Calculated padding to add %d bytes to align G_M%03u_IG%02u at %dB boundary.\n", paddingToAdd, - emitComp->compMethodID, ig->igNext->igNum, alignmentBoundary); + JITDUMP(";; Calculated padding to add %d bytes to align %s at %dB boundary.\n", paddingToAdd, + emitLabelString(ig->igNext), alignmentBoundary); // Either no padding is added because it is too expensive or the offset gets aligned // to the alignment boundary @@ -5846,7 +5889,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, } else { - printf("\nG_M%03u_IG%02u:", emitComp->compMethodID, ig->igNum); + printf("\n%s:", emitLabelString(ig)); if (!emitComp->opts.disDiffable) { printf(" ;; offset=%04XH", emitCurCodeOffs(cp)); @@ -6808,11 +6851,8 @@ void emitter::emitDispDataSec(dataSecDsc* section) BasicBlock* block = reinterpret_cast<BasicBlock**>(data->dsCont)[i]; insGroup* ig = static_cast<insGroup*>(emitCodeGetCookie(block)); - const char* blockLabelFormat = "G_M%03u_IG%02u"; - char blockLabel[64]; - char firstLabel[64]; - sprintf_s(blockLabel, _countof(blockLabel), blockLabelFormat, emitComp->compMethodID, ig->igNum); - sprintf_s(firstLabel, _countof(firstLabel), blockLabelFormat, emitComp->compMethodID, igFirst->igNum); + const char* blockLabel = emitLabelString(ig); + const char* firstLabel = emitLabelString(igFirst); if (isRelative) { @@ -7932,11 +7972,6 @@ insGroup* emitter::emitAllocIG() ig->igSelf = ig; #endif -#if defined(DEBUG) || defined(LATE_DISASM) - ig->igWeight = getCurrentBlockWeight(); - ig->igPerfScore = 0.0; -#endif - #if EMITTER_STATS emitTotalIGcnt += 1; emitTotalIGsize += sz; @@ -7974,6 +8009,11 @@ void
ig->igFlags = 0; +#if defined(DEBUG) || defined(LATE_DISASM) + ig->igWeight = getCurrentBlockWeight(); + ig->igPerfScore = 0.0; +#endif + /* Zero out some fields to avoid printing garbage in JitDumps. These really only need to be set in DEBUG, but do it in all cases to make sure we act the same in non-DEBUG builds. @@ -7986,6 +8026,10 @@ void emitter::emitInitIG(insGroup* ig) #if FEATURE_LOOP_ALIGN ig->igLoopBackEdge = nullptr; #endif + +#ifdef DEBUG + ig->igBlock = nullptr; +#endif } /***************************************************************************** @@ -8584,7 +8628,7 @@ const char* emitter::emitOffsetToLabel(unsigned offs) if (ig->igOffs == offs) { // Found it! - sprintf_s(buf[curBuf], TEMP_BUFFER_LEN, "G_M%03u_IG%02u", emitComp->compMethodID, ig->igNum); + sprintf_s(buf[curBuf], TEMP_BUFFER_LEN, "%s", emitLabelString(ig)); retbuf = buf[curBuf]; curBuf = (curBuf + 1) % 4; return retbuf; diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index c2ea67869acbc..4c796c272cdcd 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -349,6 +349,10 @@ struct insGroup return (igFlags & IGF_LOOP_ALIGN) != 0; } +#ifdef DEBUG + BasicBlock* igBlock; // The BasicBlock that generated code into this insGroup. +#endif // DEBUG + }; // end of struct insGroup // For AMD64 the maximum prolog/epilog size supported on the OS is 256 bytes @@ -1904,13 +1908,18 @@ class emitter void* emitAddLabel(VARSET_VALARG_TP GCvars, regMaskTP gcrefRegs, regMaskTP byrefRegs, - bool isFinallyTarget = false DEBUG_ARG(unsigned bbNum = 0)); + bool isFinallyTarget = false DEBUG_ARG(BasicBlock* block = nullptr)); // Same as above, except the label is added and is conceptually "inline" in // the current block. Thus it extends the previous block and the emitter // continues to track GC info as if there was no label. 
void* emitAddInlineLabel(); +#ifdef DEBUG + void emitPrintLabel(insGroup* ig); + const char* emitLabelString(insGroup* ig); +#endif + #ifdef TARGET_ARMARCH void emitGetInstrDescs(insGroup* ig, instrDesc** id, int* insCnt); diff --git a/src/coreclr/jit/emitarm.cpp b/src/coreclr/jit/emitarm.cpp index 6953df5b49a7c..5b01adbd14a5d 100644 --- a/src/coreclr/jit/emitarm.cpp +++ b/src/coreclr/jit/emitarm.cpp @@ -7292,7 +7292,7 @@ void emitter::emitDispInsHelp( lab = (insGroup*)emitCodeGetCookie(*bbp++); assert(lab); - printf("\n DD G_M%03u_IG%02u", emitComp->compMethodID, lab->igNum); + printf("\n DD %s", emitLabelString(lab)); } while (--cnt); } } @@ -7601,7 +7601,7 @@ void emitter::emitDispInsHelp( case IF_T2_M1: // Load Label emitDispReg(id->idReg1(), attr, true); if (id->idIsBound()) - printf("G_M%03u_IG%02u", emitComp->compMethodID, id->idAddr()->iiaIGlabel->igNum); + emitPrintLabel(id->idAddr()->iiaIGlabel); else printf("L_M%03u_" FMT_BB, emitComp->compMethodID, id->idAddr()->iiaBBlabel->bbNum); break; @@ -7646,7 +7646,7 @@ void emitter::emitDispInsHelp( } } else if (id->idIsBound()) - printf("G_M%03u_IG%02u", emitComp->compMethodID, id->idAddr()->iiaIGlabel->igNum); + emitPrintLabel(id->idAddr()->iiaIGlabel); else printf("L_M%03u_" FMT_BB, emitComp->compMethodID, id->idAddr()->iiaBBlabel->bbNum); } diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 37e19a219d265..84c49d94723d4 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -12286,7 +12286,7 @@ void emitter::emitDispIns( } else if (id->idIsBound()) { - printf("G_M%03u_IG%02u", emitComp->compMethodID, id->idAddr()->iiaIGlabel->igNum); + emitPrintLabel(id->idAddr()->iiaIGlabel); } else { @@ -12324,7 +12324,7 @@ void emitter::emitDispIns( emitDispReg(id->idReg1(), size, true); if (id->idIsBound()) { - printf("G_M%03u_IG%02u", emitComp->compMethodID, id->idAddr()->iiaIGlabel->igNum); + emitPrintLabel(id->idAddr()->iiaIGlabel); } else { @@ -12338,7 +12338,7 
@@ void emitter::emitDispIns( emitDispImm(emitGetInsSC(id), true); if (id->idIsBound()) { - printf("G_M%03u_IG%02u", emitComp->compMethodID, id->idAddr()->iiaIGlabel->igNum); + emitPrintLabel(id->idAddr()->iiaIGlabel); } else { @@ -12463,7 +12463,7 @@ void emitter::emitDispIns( } else if (id->idIsBound()) { - printf("G_M%03u_IG%02u", emitComp->compMethodID, id->idAddr()->iiaIGlabel->igNum); + emitPrintLabel(id->idAddr()->iiaIGlabel); } else { diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index f4c5d30256710..2fc1c467ed729 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -8452,7 +8452,7 @@ void emitter::emitDispAddrMode(instrDesc* id, bool noDetail) lab = (insGroup*)emitCodeGetCookie(*bbp++); assert(lab); - printf("\n D" SIZE_LETTER " G_M%03u_IG%02u", emitComp->compMethodID, lab->igNum); + printf("\n D" SIZE_LETTER " %s", emitLabelString(lab)); } while (--cnt); } } @@ -9649,7 +9649,7 @@ void emitter::emitDispIns( } else { - printf("G_M%03u_IG%02u", emitComp->compMethodID, id->idAddr()->iiaIGlabel->igNum); + emitPrintLabel(id->idAddr()->iiaIGlabel); } } else From a3cecf7fcf33ee9f47c4a7ad6d02407a85c1a383 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Fri, 2 Jul 2021 19:01:19 -0700 Subject: [PATCH 11/18] Formatting --- src/coreclr/jit/codegencommon.cpp | 5 ++--- src/coreclr/jit/emit.cpp | 24 ++++++++++++------------ src/coreclr/jit/emit.h | 6 +++--- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index f7e86748c4bfe..820533da00065 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -2064,9 +2064,8 @@ void CodeGen::genInsertNopForUnwinder(BasicBlock* block) // block starts an EH region. If we pointed the existing bbEmitCookie here, then the NOP // would be executed, which we would prefer not to do. 
- block->bbUnwindNopEmitCookie = - GetEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur, - false DEBUG_ARG(block)); + block->bbUnwindNopEmitCookie = GetEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur, + gcInfo.gcRegByrefSetCur, false DEBUG_ARG(block)); instGen(INS_nop); } diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index beec4a91a572f..1aca2912cca2d 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -2586,8 +2586,8 @@ void emitter::emitPrintLabel(insGroup* ig) // const char* emitter::emitLabelString(insGroup* ig) { - const int TEMP_BUFFER_LEN = 40; - static unsigned curBuf = 0; + const int TEMP_BUFFER_LEN = 40; + static unsigned curBuf = 0; static char buf[4][TEMP_BUFFER_LEN]; const char* retbuf; @@ -5017,10 +5017,10 @@ void emitter::emitLoopAlignAdjustments() alignInstr = prevAlignInstr; } - JITDUMP("Adjusted alignment of %s from %u to %u.\n", emitLabelString(alignIG), - estimatedPaddingNeeded, actualPaddingNeeded); - JITDUMP("Adjusted size of %s from %u to %u.\n", emitLabelString(alignIG), - (alignIG->igSize + diff), alignIG->igSize); + JITDUMP("Adjusted alignment of %s from %u to %u.\n", emitLabelString(alignIG), estimatedPaddingNeeded, + actualPaddingNeeded); + JITDUMP("Adjusted size of %s from %u to %u.\n", emitLabelString(alignIG), (alignIG->igSize + diff), + alignIG->igSize); } // Adjust the offset of all IGs starting from next IG until we reach the IG having the next @@ -5029,8 +5029,8 @@ void emitter::emitLoopAlignAdjustments() insGroup* adjOffUptoIG = alignInstr->idaNext != nullptr ? 
alignInstr->idaNext->idaIG : emitIGlast; while ((adjOffIG != nullptr) && (adjOffIG->igNum <= adjOffUptoIG->igNum)) { - JITDUMP("Adjusted offset of %s from %04X to %04X\n", emitLabelString(adjOffIG), - adjOffIG->igOffs, (adjOffIG->igOffs - alignBytesRemoved)); + JITDUMP("Adjusted offset of %s from %04X to %04X\n", emitLabelString(adjOffIG), adjOffIG->igOffs, + (adjOffIG->igOffs - alignBytesRemoved)); adjOffIG->igOffs -= alignBytesRemoved; adjOffIG = adjOffIG->igNext; } @@ -5089,8 +5089,8 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs // No padding if loop is already aligned if ((offset & (alignmentBoundary - 1)) == 0) { - JITDUMP(";; Skip alignment: 'Loop at %s already aligned at %dB boundary.'\n", - emitLabelString(ig->igNext), alignmentBoundary); + JITDUMP(";; Skip alignment: 'Loop at %s already aligned at %dB boundary.'\n", emitLabelString(ig->igNext), + alignmentBoundary); return 0; } @@ -5114,8 +5114,8 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs // No padding if loop is big if (loopSize > maxLoopSize) { - JITDUMP(";; Skip alignment: 'Loop at %s is big. LoopSize= %d, MaxLoopSize= %d.'\n", - emitLabelString(ig->igNext), loopSize, maxLoopSize); + JITDUMP(";; Skip alignment: 'Loop at %s is big. LoopSize= %d, MaxLoopSize= %d.'\n", emitLabelString(ig->igNext), + loopSize, maxLoopSize); return 0; } diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 4c796c272cdcd..d811646e2e28f 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -351,7 +351,7 @@ struct insGroup #ifdef DEBUG BasicBlock* igBlock; // The BasicBlock that generated code into this insGroup. 
-#endif // DEBUG +#endif // DEBUG }; // end of struct insGroup @@ -1233,7 +1233,7 @@ class emitter #define PERFSCORE_THROUGHPUT_ILLEGAL -1024.0f -#define PERFSCORE_THROUGHPUT_ZERO 0.0f // Only used for pseudo-instructions that don't generate code +#define PERFSCORE_THROUGHPUT_ZERO 0.0f // Only used for pseudo-instructions that don't generate code #define PERFSCORE_THROUGHPUT_6X (1.0f / 6.0f) // Hextuple issue #define PERFSCORE_THROUGHPUT_5X 0.20f // Pentuple issue @@ -1908,7 +1908,7 @@ class emitter void* emitAddLabel(VARSET_VALARG_TP GCvars, regMaskTP gcrefRegs, regMaskTP byrefRegs, - bool isFinallyTarget = false DEBUG_ARG(BasicBlock* block = nullptr)); + bool isFinallyTarget = false DEBUG_ARG(BasicBlock* block = nullptr)); // Same as above, except the label is added and is conceptually "inline" in // the current block. Thus it extends the previous block and the emitter From 141c44ba485aeb6ac4d056eed418642fc60bba41 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Tue, 6 Jul 2021 17:33:49 -0700 Subject: [PATCH 12/18] Clear BBF_LOOP_PREHEADER bit when compacting empty pre-header block --- src/coreclr/jit/fgopt.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 3b1b1739919d5..92a82c346a545 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -1620,6 +1620,9 @@ void Compiler::fgCompactBlocks(BasicBlock* block, BasicBlock* bNext) } } bNext->bbPreds = nullptr; + + // `block` can no longer be a loop pre-header (if it was before). 
+ block->bbFlags &= ~BBF_LOOP_PREHEADER; } else { From 653a7e21e84d9e25ae47c762e3ac3c7a869f0b55 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Tue, 6 Jul 2021 17:34:29 -0700 Subject: [PATCH 13/18] Keep track of all basic blocks that contribute code to an insGroup --- src/coreclr/jit/emit.cpp | 35 +++++++++++++++++++++++++---------- src/coreclr/jit/emit.h | 5 +++-- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 1aca2912cca2d..41d63a6c63150 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -1524,6 +1524,14 @@ void* emitter::emitAllocAnyInstr(size_t sz, emitAttr opsz) emitCurIGinsCnt++; +#ifdef DEBUG + if (emitComp->compCurBB != emitCurIG->lastGeneratedBlock) + { + emitCurIG->igBlocks.push_back(emitComp->compCurBB); + emitCurIG->lastGeneratedBlock = emitComp->compCurBB; + } +#endif // DEBUG + return id; } @@ -2535,9 +2543,6 @@ void* emitter::emitAddLabel(VARSET_VALARG_TP GCvars, #ifdef DEBUG JITDUMP("Mapped " FMT_BB " to %s\n", block->bbNum, emitLabelString(emitCurIG)); - assert(emitCurIG->igBlock == nullptr); - emitCurIG->igBlock = block; - if (EMIT_GC_VERBOSE) { printf("Label: IG%02u, GCvars=%s ", emitCurIG->igNum, VarSetOps::ToString(emitComp, GCvars)); @@ -3638,12 +3643,6 @@ void emitter::emitDispIG(insGroup* ig, insGroup* igPrev, bool verbose) { const char* separator = ""; - if (jitdump && (ig->igBlock != nullptr)) - { - printf("%s%s", separator, ig->igBlock->dspToString()); - separator = ", "; - } - if (jitdump) { printf("%soffs=%06XH, size=%04XH", separator, ig->igOffs, ig->igSize); @@ -3687,6 +3686,15 @@ void emitter::emitDispIG(insGroup* ig, insGroup* igPrev, bool verbose) } #endif // FEATURE_LOOP_ALIGN + if (jitdump && !ig->igBlocks.empty()) + { + for (auto block : ig->igBlocks) + { + printf("%s%s", separator, block->dspToString()); + separator = ", "; + } + } + emitDispIGflags(ig->igFlags); if (ig == emitCurIG) @@ -8028,7 +8036,9 @@ void 
emitter::emitInitIG(insGroup* ig) #endif #ifdef DEBUG - ig->igBlock = nullptr; + ig->lastGeneratedBlock = nullptr; + // Explicitly call the constructor, since IGs don't actually have a constructor. + ig->igBlocks.jitstd::list<BasicBlock*>::list(emitComp->getAllocator(CMK_LoopOpt)); #endif } @@ -8094,6 +8104,11 @@ void emitter::emitNxtIG(bool extend) // We've created a new IG; no need to force another one. emitForceNewIG = false; + +#ifdef DEBUG + // We haven't written any code into the IG yet, so clear our record of the last block written to the IG. + emitCurIG->lastGeneratedBlock = nullptr; +#endif } /***************************************************************************** diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index d811646e2e28f..5c3ce04ead955 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -350,8 +350,9 @@ struct insGroup } #ifdef DEBUG - BasicBlock* igBlock; // The BasicBlock that generated code into this insGroup. -#endif // DEBUG + BasicBlock* lastGeneratedBlock; // The last block that generated code into this insGroup. + jitstd::list<BasicBlock*> igBlocks; // All the blocks that generated code into this insGroup. +#endif }; // end of struct insGroup From 82f28282c451fdbb5892763a7ee6f8cee6dd61e4 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Wed, 7 Jul 2021 13:28:47 -0700 Subject: [PATCH 14/18] Update display of Intel jcc erratum branches in dump For instructions or instruction sequences which match the Intel jcc erratum criteria, note that in the alignment boundary dump. Also, a few fixes: 1. Move the alignment boundary dumping from `emitIssue1Instr` to `emitEndCodeGen` to avoid the possibility of reading the next instruction in a group when there is no next instruction. 2. Create `IsJccInstruction` and `IsJmpInstruction` functions for use by the jcc criteria detection, and fix that detection to fix a few omissions/errors. 3. 
Change the jcc criteria detection to be hard-coded to 32 byte boundaries instead of assuming `compJitAlignLoopBoundary` is 32. An example: ``` cmp r11d, dword ptr [rax+8] ; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (cmp: 0 ; jcc erratum) 32B boundary ............................... jae G_M42486_IG103 ``` In this case, the `cmp` doesn't cross the boundary, it is adjacent (the zero indicates the number of bytes of the instruction which cross the boundary), followed by the `jae` which starts after the boundary. Indicating the jcc erratum criteria can help point out potential performance issues due to unlucky alignment of these instructions in asm diffs. --- src/coreclr/jit/emit.cpp | 141 ++++++++++++++++++++++------------ src/coreclr/jit/emit.h | 4 +- src/coreclr/jit/emitxarch.cpp | 32 ++++++++ src/coreclr/jit/emitxarch.h | 3 + 4 files changed, 127 insertions(+), 53 deletions(-) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 41d63a6c63150..e5d026eb6418e 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -773,7 +773,7 @@ insGroup* emitter::emitSavIG(bool emitAdd) memcpy(id, emitCurIGfreeBase, sz); #ifdef DEBUG - if (false && emitComp->verbose) // this is not useful in normal dumps (hence it is normally under if (false) + if (false && emitComp->verbose) // this is not useful in normal dumps (hence it is normally under if (false)) { // If there's an error during emission, we may want to connect the post-copy address // of an instrDesc with the pre-copy address (the one that was originally created). 
This @@ -3786,10 +3786,6 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp) { size_t is; -#ifdef DEBUG - size_t beforeAddr = (size_t)*dp; -#endif - /* Record the beginning offset of the instruction */ BYTE* curInsAdr = *dp; @@ -3875,51 +3871,6 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp) id->idDebugOnlyInfo()->idNum, is, emitSizeOfInsDsc(id)); assert(is == emitSizeOfInsDsc(id)); } - - // Print the alignment boundary - if ((emitComp->opts.disAsm || emitComp->verbose) && (emitComp->opts.disAddr || emitComp->opts.disAlignment)) - { - size_t currAddr = (size_t)*dp; - size_t lastBoundaryAddr = currAddr & ~((size_t)emitComp->opts.compJitAlignLoopBoundary - 1); - - // draw boundary if beforeAddr was before the lastBoundary. - if (beforeAddr < lastBoundaryAddr) - { - printf("; "); - instruction currIns = id->idIns(); - -#if defined(TARGET_XARCH) - - // https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf - bool isJccAffectedIns = - ((currIns >= INS_i_jmp && currIns < INS_align) || (currIns == INS_call) || (currIns == INS_ret)); - - instrDesc* nextId = id; - castto(nextId, BYTE*) += is; - instruction nextIns = nextId->idIns(); - if ((currIns == INS_cmp) || (currIns == INS_test) || (currIns == INS_add) || (currIns == INS_sub) || - (currIns == INS_and) || (currIns == INS_inc) || (currIns == INS_dec)) - { - isJccAffectedIns |= (nextIns >= INS_i_jmp && nextIns < INS_align); - } -#else - bool isJccAffectedIns = false; -#endif - - // Indicate if instruction is at at 32B boundary or is splitted - unsigned bytesCrossedBoundary = (currAddr & (emitComp->opts.compJitAlignLoopBoundary - 1)); - if ((bytesCrossedBoundary != 0) || (isJccAffectedIns && bytesCrossedBoundary == 0)) - { - printf("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (%s: %d)", codeGen->genInsName(id->idIns()), - bytesCrossedBoundary); - } - else - { - printf("..............................."); - } - printf(" %dB 
boundary ...............................\n", (emitComp->opts.compJitAlignLoopBoundary)); - } - } #endif // DEBUG return is; @@ -6014,9 +5965,97 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, emitCurIG = ig; - for (unsigned cnt = ig->igInsCnt; cnt; cnt--) + for (unsigned cnt = ig->igInsCnt; cnt > 0; cnt--) { +#ifdef DEBUG + size_t curInstrAddr = (size_t)cp; + instrDesc* curInstrDesc = id; +#endif + castto(id, BYTE*) += emitIssue1Instr(ig, id, &cp); + +#ifdef DEBUG + // Print the alignment boundary + if ((emitComp->opts.disAsm || emitComp->verbose) && (emitComp->opts.disAddr || emitComp->opts.disAlignment)) + { + size_t afterInstrAddr = (size_t)cp; + instruction curIns = curInstrDesc->idIns(); + bool isJccAffectedIns = false; + +#if defined(TARGET_XARCH) + + // Determine if this instruction is part of a set that matches the Intel jcc erratum characteristic + // described here: + // https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf + // This is the case when a jump instruction crosses a 32-byte boundary, or ends on a 32-byte boundary. + // "Jump instruction" in this case includes conditional jump (jcc), macro-fused op-jcc (where 'op' is + // one of cmp, test, add, sub, and, inc, or dec), direct unconditional jump, indirect jump, + // direct/indirect call, and return. + + size_t jccAlignBoundary = 32; + size_t jccAlignBoundaryMask = jccAlignBoundary - 1; + size_t jccLastBoundaryAddr = afterInstrAddr & ~jccAlignBoundaryMask; + + if (curInstrAddr < jccLastBoundaryAddr) + { + isJccAffectedIns = IsJccInstruction(curIns) || IsJmpInstruction(curIns) || (curIns == INS_call) || + (curIns == INS_ret); + + // For op-Jcc there are two cases: (1) curIns is the jcc, in which case the above condition + // already covers us. (2) curIns is the `op` and the next instruction is the `jcc`. 
Note that + // we will never have a `jcc` as the first instruction of a group, so we don't need to worry + // about looking ahead to the next group after a an `op` of `op-Jcc`. + + if (!isJccAffectedIns && (cnt > 1)) + { + // The current `id` is valid, namely, there is another instruction in this group. + instruction nextIns = id->idIns(); + if (((curIns == INS_cmp) || (curIns == INS_test) || (curIns == INS_add) || + (curIns == INS_sub) || (curIns == INS_and) || (curIns == INS_inc) || + (curIns == INS_dec)) && + IsJccInstruction(nextIns)) + { + isJccAffectedIns = true; + } + } + + if (isJccAffectedIns) + { + unsigned bytesCrossedBoundary = (unsigned)(afterInstrAddr & jccAlignBoundaryMask); + printf("; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (%s: %d ; jcc erratum) %dB boundary " + "...............................\n", + codeGen->genInsName(curIns), bytesCrossedBoundary, jccAlignBoundary); + } + } + +#endif // TARGET_XARCH + + // Jcc affected instruction boundaries were printed above; handle other cases here. + if (!isJccAffectedIns) + { + size_t alignBoundaryMask = (size_t)emitComp->opts.compJitAlignLoopBoundary - 1; + size_t lastBoundaryAddr = afterInstrAddr & ~alignBoundaryMask; + + // draw boundary if beforeAddr was before the lastBoundary. 
+ if (curInstrAddr < lastBoundaryAddr) + { + // Indicate if instruction is at the alignment boundary or is split + unsigned bytesCrossedBoundary = (unsigned)(afterInstrAddr & alignBoundaryMask); + if (bytesCrossedBoundary != 0) + { + printf("; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (%s: %d)", codeGen->genInsName(curIns), + bytesCrossedBoundary); + } + else + { + printf("; ..............................."); + } + printf(" %dB boundary ...............................\n", + emitComp->opts.compJitAlignLoopBoundary); + } + } + } +#endif // DEBUG } #ifdef DEBUG diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 5c3ce04ead955..dee571837b79f 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -350,8 +350,8 @@ struct insGroup } #ifdef DEBUG - BasicBlock* lastGeneratedBlock; // The last block that generated code into this insGroup. - jitstd::list<BasicBlock*> igBlocks; // All the blocks that generated code into this insGroup. + BasicBlock* lastGeneratedBlock; // The last block that generated code into this insGroup. + jitstd::list<BasicBlock*> igBlocks; // All the blocks that generated code into this insGroup. #endif }; // end of struct insGroup diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 2fc1c467ed729..0acfff02c757c 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -4292,6 +4292,38 @@ bool emitter::IsMovInstruction(instruction ins) } } +//------------------------------------------------------------------------ +// IsJccInstruction: Determine if an instruction is a conditional jump instruction. 
+// +// Arguments: +// ins -- The instruction being checked +// +// Return Value: +// true if the instruction qualifies; otherwise, false +// +bool emitter::IsJccInstruction(instruction ins) +{ + return ((ins >= INS_jo) && (ins <= INS_jg)) || ((ins >= INS_l_jo) && (ins <= INS_l_jg)); +} + +//------------------------------------------------------------------------ +// IsJmpInstruction: Determine if an instruction is a jump instruction but NOT a conditional jump instruction. +// +// Arguments: +// ins -- The instruction being checked +// +// Return Value: +// true if the instruction qualifies; otherwise, false +// +bool emitter::IsJmpInstruction(instruction ins) +{ + return +#ifdef TARGET_AMD64 + (ins == INS_rex_jmp) || +#endif + (ins == INS_i_jmp) || (ins == INS_jmp) || (ins == INS_l_jmp); +} + //---------------------------------------------------------------------------------------- // IsRedundantMov: // Check if the current `mov` instruction is redundant and can be omitted. diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 8260445686be0..295486120e29b 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -98,6 +98,9 @@ static bool IsMovInstruction(instruction ins); bool IsRedundantMov( instruction ins, insFormat fmt, emitAttr size, regNumber dst, regNumber src, bool canIgnoreSideEffects); +static bool IsJccInstruction(instruction ins); +static bool IsJmpInstruction(instruction ins); + bool AreUpper32BitsZero(regNumber reg); bool AreFlagsSetToZeroCmp(regNumber reg, emitAttr opSize, genTreeOps treeOps); From 1ecd09178f499db05e55ce068fb34ccf4c342631 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Wed, 7 Jul 2021 14:30:55 -0700 Subject: [PATCH 15/18] Display full instruction name in alignment and other messages XArch sometimes prepends a "v" to the instruction names from the instruction table. 
Add a function `genInsDisplayName` to create the full instruction name that should be displayed, and use that in most places an instruction name will be displayed, such as in the alignment messages, and normal disassembly. Use this instead of the raw `genInsName`. This could be extended to handle arm32 appending an "s", but I didn't want to touch arm32 with this change. --- src/coreclr/jit/codegen.h | 5 +-- src/coreclr/jit/emit.cpp | 8 ++--- src/coreclr/jit/emitxarch.cpp | 38 +++++++++----------- src/coreclr/jit/emitxarch.h | 15 ++++++-- src/coreclr/jit/instr.cpp | 68 +++++++++++++++++++---------------- 5 files changed, 70 insertions(+), 64 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 132a763c06b01..626cb3e5b7bd6 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -216,6 +216,7 @@ class CodeGen final : public CodeGenInterface unsigned genCurDispOffset; static const char* genInsName(instruction ins); + const char* genInsDisplayName(emitter::instrDesc* id); #endif // DEBUG //------------------------------------------------------------------------- @@ -1503,10 +1504,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void instGen_Store_Reg_Into_Lcl(var_types dstType, regNumber srcReg, int varNum, int offs); -#ifdef DEBUG - void __cdecl instDisp(instruction ins, bool noNL, const char* fmt, ...); -#endif - #ifdef TARGET_XARCH instruction genMapShiftInsToShiftByConstantIns(instruction ins, int shiftByValue); #endif // TARGET_XARCH diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index e5d026eb6418e..04e40ac282663 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -1241,7 +1241,7 @@ float emitter::insEvaluateExecutionCost(instrDesc* id) void emitter::perfScoreUnhandledInstruction(instrDesc* id, insExecutionCharacteristics* pResult) { #ifdef DEBUG - printf("PerfScore: unhandled instruction: %s, format %s", codeGen->genInsName(id->idIns()), + 
printf("PerfScore: unhandled instruction: %s, format %s", codeGen->genInsDisplayName(id), emitIfName(id->idInsFmt())); assert(!"PerfScore: unhandled instruction"); #endif @@ -6024,7 +6024,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, unsigned bytesCrossedBoundary = (unsigned)(afterInstrAddr & jccAlignBoundaryMask); printf("; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (%s: %d ; jcc erratum) %dB boundary " "...............................\n", - codeGen->genInsName(curIns), bytesCrossedBoundary, jccAlignBoundary); + codeGen->genInsDisplayName(curInstrDesc), bytesCrossedBoundary, jccAlignBoundary); } } @@ -6043,8 +6043,8 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, unsigned bytesCrossedBoundary = (unsigned)(afterInstrAddr & alignBoundaryMask); if (bytesCrossedBoundary != 0) { - printf("; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (%s: %d)", codeGen->genInsName(curIns), - bytesCrossedBoundary); + printf("; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (%s: %d)", + codeGen->genInsDisplayName(curInstrDesc), bytesCrossedBoundary); } else { diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 0acfff02c757c..64711d645e10f 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -24,37 +24,37 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "emit.h" #include "codegen.h" -bool IsSSEInstruction(instruction ins) +bool emitter::IsSSEInstruction(instruction ins) { return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_SSE_INSTRUCTION); } -bool IsSSEOrAVXInstruction(instruction ins) +bool emitter::IsSSEOrAVXInstruction(instruction ins) { return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION); } -bool IsAVXOnlyInstruction(instruction ins) +bool emitter::IsAVXOnlyInstruction(instruction ins) { return (ins >= INS_FIRST_AVX_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION); } -bool IsFMAInstruction(instruction ins) +bool emitter::IsFMAInstruction(instruction ins) { return (ins >= 
INS_FIRST_FMA_INSTRUCTION) && (ins <= INS_LAST_FMA_INSTRUCTION); } -bool IsAVXVNNIInstruction(instruction ins) +bool emitter::IsAVXVNNIInstruction(instruction ins) { return (ins >= INS_FIRST_AVXVNNI_INSTRUCTION) && (ins <= INS_LAST_AVXVNNI_INSTRUCTION); } -bool IsBMIInstruction(instruction ins) +bool emitter::IsBMIInstruction(instruction ins) { return (ins >= INS_FIRST_BMI_INSTRUCTION) && (ins <= INS_LAST_BMI_INSTRUCTION); } -regNumber getBmiRegNumber(instruction ins) +regNumber emitter::getBmiRegNumber(instruction ins) { switch (ins) { @@ -81,7 +81,7 @@ regNumber getBmiRegNumber(instruction ins) } } -regNumber getSseShiftRegNumber(instruction ins) +regNumber emitter::getSseShiftRegNumber(instruction ins) { switch (ins) { @@ -123,7 +123,7 @@ regNumber getSseShiftRegNumber(instruction ins) } } -bool emitter::IsAVXInstruction(instruction ins) +bool emitter::IsAVXInstruction(instruction ins) const { return UseVEXEncoding() && IsSSEOrAVXInstruction(ins); } @@ -438,7 +438,7 @@ bool emitter::Is4ByteSSEInstruction(instruction ins) // Returns true if this instruction requires a VEX prefix // All AVX instructions require a VEX prefix -bool emitter::TakesVexPrefix(instruction ins) +bool emitter::TakesVexPrefix(instruction ins) const { // special case vzeroupper as it requires 2-byte VEX prefix // special case the fencing, movnti and the prefetch instructions as they never take a VEX prefix @@ -514,7 +514,7 @@ emitter::code_t emitter::AddVexPrefix(instruction ins, code_t code, emitAttr att } // Returns true if this instruction, for the given EA_SIZE(attr), will require a REX.W prefix -bool TakesRexWPrefix(instruction ins, emitAttr attr) +bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr) { // Because the current implementation of AVX does not have a way to distinguish between the register // size specification (128 vs. 256 bits) and the operand size specification (32 vs. 
64 bits), where both are @@ -8716,22 +8716,16 @@ void emitter::emitDispIns( /* Display the instruction name */ - sstr = codeGen->genInsName(ins); + sstr = codeGen->genInsDisplayName(id); + printf(" %-9s", sstr); - if (IsAVXInstruction(ins) && !IsBMIInstruction(ins)) - { - printf(" v%-8s", sstr); - } - else - { - printf(" %-9s", sstr); - } #ifndef HOST_UNIX - if (strnlen_s(sstr, 10) >= 8) + if (strnlen_s(sstr, 10) >= 9) #else // HOST_UNIX - if (strnlen(sstr, 10) >= 8) + if (strnlen(sstr, 10) >= 9) #endif // HOST_UNIX { + // Make sure there's at least one space after the instruction name, for very long instruction names. printf(" "); } diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 295486120e29b..f952a6f649f44 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -83,7 +83,15 @@ code_t insEncodeOpreg(instruction ins, regNumber reg, emitAttr size); unsigned insSSval(unsigned scale); -bool IsAVXInstruction(instruction ins); +static bool IsSSEInstruction(instruction ins); +static bool IsSSEOrAVXInstruction(instruction ins); +static bool IsAVXOnlyInstruction(instruction ins); +static bool IsFMAInstruction(instruction ins); +static bool IsAVXVNNIInstruction(instruction ins); +static bool IsBMIInstruction(instruction ins); +static regNumber getBmiRegNumber(instruction ins); +static regNumber getSseShiftRegNumber(instruction ins); +bool IsAVXInstruction(instruction ins) const; code_t insEncodeMIreg(instruction ins, regNumber reg, emitAttr size, code_t code); code_t AddRexWPrefix(instruction ins, code_t code); @@ -119,7 +127,8 @@ bool hasRexPrefix(code_t code) #define VEX_PREFIX_MASK_3BYTE 0xFF000000000000ULL #define VEX_PREFIX_CODE_3BYTE 0xC4000000000000ULL -bool TakesVexPrefix(instruction ins); +bool TakesVexPrefix(instruction ins) const; +static bool TakesRexWPrefix(instruction ins, emitAttr attr); // Returns true if the instruction encoding already contains VEX prefix bool hasVexPrefix(code_t code) @@ -145,7 +154,7 @@ 
code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr } bool useVEXEncodings; -bool UseVEXEncoding() +bool UseVEXEncoding() const { return useVEXEncodings; } diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 9ce10458fb08e..2fab211ee19b3 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -24,11 +24,12 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /*****************************************************************************/ #ifdef DEBUG -/***************************************************************************** - * - * Returns the string representation of the given CPU instruction. - */ - +//----------------------------------------------------------------------------- +// genInsName: Returns the string representation of the given CPU instruction, as +// it exists in the instruction table. Note that some architectures don't encode the +// name completely in the table: xarch sometimes prepends a "v", and arm sometimes +// appends a "s". Use `genInsDisplayName()` to get a fully-formed name. +// const char* CodeGen::genInsName(instruction ins) { // clang-format off @@ -77,36 +78,41 @@ const char* CodeGen::genInsName(instruction ins) return insNames[ins]; } -void __cdecl CodeGen::instDisp(instruction ins, bool noNL, const char* fmt, ...) +//----------------------------------------------------------------------------- +// genInsDisplayName: Get a fully-formed instruction display name. This only handles +// the xarch case of prepending a "v", not the arm case of appending an "s". +// This can be called up to four times in a single 'printf' before the static buffers +// get reused. 
+// +// Returns: +// String with instruction name +// +const char* CodeGen::genInsDisplayName(emitter::instrDesc* id) { - if (compiler->opts.dspCode) - { - /* Display the instruction offset within the emit block */ - - // printf("[%08X:%04X]", GetEmitter().emitCodeCurBlock(), GetEmitter().emitCodeOffsInBlock()); - - /* Display the FP stack depth (before the instruction is executed) */ - - // printf("[FP=%02u] ", genGetFPstkLevel()); - - /* Display the instruction mnemonic */ - printf(" "); + instruction ins = id->idIns(); + const char* insName = genInsName(ins); - printf(" %-8s", genInsName(ins)); + const int TEMP_BUFFER_LEN = 40; + static unsigned curBuf = 0; + static char buf[4][TEMP_BUFFER_LEN]; + const char* retbuf; - if (fmt) - { - va_list args; - va_start(args, fmt); - vprintf(fmt, args); - va_end(args); - } - - if (!noNL) - { - printf("\n"); - } +#ifdef TARGET_XARCH + if (GetEmitter()->IsAVXInstruction(ins) && !GetEmitter()->IsBMIInstruction(ins)) + { + sprintf_s(buf[curBuf], TEMP_BUFFER_LEN, "v%s", insName); + retbuf = buf[curBuf]; + curBuf = (curBuf + 1) % 4; + return retbuf; + } + else + { + // If we don't need any adjustments, just return it directly. + return insName; } +#else + return insName; +#endif } /*****************************************************************************/ From 35de73fb220fa2aab5568cc78bddc976de5a2787 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Wed, 7 Jul 2021 18:22:41 -0700 Subject: [PATCH 16/18] Fix build --- src/coreclr/jit/emit.h | 10 +++++----- src/coreclr/jit/instr.cpp | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index dee571837b79f..e4bde18562f64 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -247,6 +247,11 @@ struct insGroup double igPerfScore; // The PerfScore for this insGroup #endif +#ifdef DEBUG + BasicBlock* lastGeneratedBlock; // The last block that generated code into this insGroup. 
+ jitstd::list igBlocks; // All the blocks that generated code into this insGroup. +#endif + UNATIVE_OFFSET igNum; // for ordering (and display) purposes UNATIVE_OFFSET igOffs; // offset of this group within method unsigned int igFuncIdx; // Which function/funclet does this belong to? (Index into Compiler::compFuncInfos array.) @@ -349,11 +354,6 @@ struct insGroup return (igFlags & IGF_LOOP_ALIGN) != 0; } -#ifdef DEBUG - BasicBlock* lastGeneratedBlock; // The last block that generated code into this insGroup. - jitstd::list igBlocks; // All the blocks that generated code into this insGroup. -#endif - }; // end of struct insGroup // For AMD64 the maximum prolog/epilog size supported on the OS is 256 bytes diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 2fab211ee19b3..39aa801f4e793 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -92,12 +92,12 @@ const char* CodeGen::genInsDisplayName(emitter::instrDesc* id) instruction ins = id->idIns(); const char* insName = genInsName(ins); +#ifdef TARGET_XARCH const int TEMP_BUFFER_LEN = 40; static unsigned curBuf = 0; static char buf[4][TEMP_BUFFER_LEN]; const char* retbuf; -#ifdef TARGET_XARCH if (GetEmitter()->IsAVXInstruction(ins) && !GetEmitter()->IsBMIInstruction(ins)) { sprintf_s(buf[curBuf], TEMP_BUFFER_LEN, "v%s", insName); From 3bb31102aee503d559201428917313d081c69341 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Fri, 9 Jul 2021 12:47:05 -0700 Subject: [PATCH 17/18] Code review feedback 1. Rename GTF_INX_NONFAULTING to GTF_INX_NOFAULT to increase clarity compared to existing GTF_IND_NONFAULTING. 2. Minor cleanup in genInsDisplayName. 
--- src/coreclr/jit/gentree.h | 2 +- src/coreclr/jit/instr.cpp | 9 ++------- src/coreclr/jit/loopcloning.cpp | 2 +- src/coreclr/jit/morph.cpp | 8 ++++---- 4 files changed, 8 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index f88f3c2f775fc..aed7bef018fbb 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -494,7 +494,7 @@ enum GenTreeFlags : unsigned int GTF_INX_RNGCHK = 0x80000000, // GT_INDEX/GT_INDEX_ADDR -- the array reference should be range-checked. GTF_INX_STRING_LAYOUT = 0x40000000, // GT_INDEX -- this uses the special string array layout - GTF_INX_NONFAULTING = 0x20000000, // GT_INDEX -- the INDEX does not throw an exception (morph to GTF_IND_NONFAULTING) + GTF_INX_NOFAULT = 0x20000000, // GT_INDEX -- the INDEX does not throw an exception (morph to GTF_IND_NONFAULTING) GTF_IND_TGT_NOT_HEAP = 0x80000000, // GT_IND -- the target is not on the heap GTF_IND_VOLATILE = 0x40000000, // GT_IND -- the load or store must use volatile sematics (this is a nop on X86) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 39aa801f4e793..752626f20184d 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -105,14 +105,9 @@ const char* CodeGen::genInsDisplayName(emitter::instrDesc* id) curBuf = (curBuf + 1) % 4; return retbuf; } - else - { - // If we don't need any adjustments, just return it directly. 
- return insName; - } -#else +#endif // TARGET_XARCH + return insName; -#endif } /*****************************************************************************/ diff --git a/src/coreclr/jit/loopcloning.cpp b/src/coreclr/jit/loopcloning.cpp index fdefb3ed8658e..c3991e663e1d9 100644 --- a/src/coreclr/jit/loopcloning.cpp +++ b/src/coreclr/jit/loopcloning.cpp @@ -77,7 +77,7 @@ GenTree* LC_Array::ToGenTree(Compiler* comp, BasicBlock* bb) // Clear the range check flag and mark the index as non-faulting: we guarantee that all necessary range // checking has already been done by the time this array index expression is invoked. arr->gtFlags &= ~(GTF_INX_RNGCHK | GTF_EXCEPT); - arr->gtFlags |= GTF_INX_NONFAULTING; + arr->gtFlags |= GTF_INX_NOFAULT; } // If asked for arrlen invoke arr length operator. if (oper == ArrLen) diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index c8dec1c1da1ad..a64676cc8a8d1 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -5577,8 +5577,8 @@ GenTree* Compiler::fgMorphArrayIndex(GenTree* tree) GenTree* arrRef = asIndex->Arr(); GenTree* index = asIndex->Index(); - bool chkd = ((tree->gtFlags & GTF_INX_RNGCHK) != 0); // if false, range checking will be disabled - bool indexNonFaulting = ((tree->gtFlags & GTF_INX_NONFAULTING) != 0); // if true, mark GTF_IND_NONFAULTING + bool chkd = ((tree->gtFlags & GTF_INX_RNGCHK) != 0); // if false, range checking will be disabled + bool indexNonFaulting = ((tree->gtFlags & GTF_INX_NOFAULT) != 0); // if true, mark GTF_IND_NONFAULTING bool nCSE = ((tree->gtFlags & GTF_DONT_CSE) != 0); GenTree* arrRefDefn = nullptr; // non-NULL if we need to allocate a temp for the arrRef expression @@ -5739,9 +5739,9 @@ GenTree* Compiler::fgMorphArrayIndex(GenTree* tree) this->compFloatingPointUsed = true; } - // We've now consumed the GTF_INX_RNGCHK and GTF_INX_NONFAULTING, and the node + // We've now consumed the GTF_INX_RNGCHK and GTF_INX_NOFAULT, and the node // is no longer a GT_INDEX 
node. - tree->gtFlags &= ~(GTF_INX_RNGCHK | GTF_INX_NONFAULTING); + tree->gtFlags &= ~(GTF_INX_RNGCHK | GTF_INX_NOFAULT); tree->AsOp()->gtOp1 = addr; From 182f57932293d3e42506807879e1963ff6e96a9e Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Fri, 9 Jul 2021 15:32:55 -0700 Subject: [PATCH 18/18] Formatting --- src/coreclr/jit/morph.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index a64676cc8a8d1..915aee27f1f9e 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -5577,8 +5577,8 @@ GenTree* Compiler::fgMorphArrayIndex(GenTree* tree) GenTree* arrRef = asIndex->Arr(); GenTree* index = asIndex->Index(); - bool chkd = ((tree->gtFlags & GTF_INX_RNGCHK) != 0); // if false, range checking will be disabled - bool indexNonFaulting = ((tree->gtFlags & GTF_INX_NOFAULT) != 0); // if true, mark GTF_IND_NONFAULTING + bool chkd = ((tree->gtFlags & GTF_INX_RNGCHK) != 0); // if false, range checking will be disabled + bool indexNonFaulting = ((tree->gtFlags & GTF_INX_NOFAULT) != 0); // if true, mark GTF_IND_NONFAULTING bool nCSE = ((tree->gtFlags & GTF_DONT_CSE) != 0); GenTree* arrRefDefn = nullptr; // non-NULL if we need to allocate a temp for the arrRef expression