diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h
index 0cbe51281b353..5d42a1626536f 100644
--- a/src/coreclr/jit/block.h
+++ b/src/coreclr/jit/block.h
@@ -29,7 +29,10 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 #include "jithashtable.h"

 /*****************************************************************************/
-typedef BitVec EXPSET_TP;
+typedef BitVec          EXPSET_TP;
+typedef BitVec_ValArg_T EXPSET_VALARG_TP;
+typedef BitVec_ValRet_T EXPSET_VALRET_TP;
+
 #if LARGE_EXPSET
 #define EXPSET_SZ 64
 #else
diff --git a/src/coreclr/jit/clrjit.natvis b/src/coreclr/jit/clrjit.natvis
index 90a9ff703a471..b3c187474900f 100644
--- a/src/coreclr/jit/clrjit.natvis
+++ b/src/coreclr/jit/clrjit.natvis
@@ -5,6 +5,13 @@ Licensed to the .NET Foundation under one or more agreements.
 The .NET Foundation licenses this file to you under the MIT license.
 -->
+
@@ -183,4 +190,48 @@ The .NET Foundation licenses this file to you under the MIT license.
+
+  <Type Name="jitstd::vector&lt;*&gt;">
+    <DisplayString Condition="m_nSize &gt; 0">size={m_nSize,d} capacity={m_nCapacity,d}</DisplayString>
+    <DisplayString>Empty</DisplayString>
+    <Expand>
+      <ArrayItems>
+        <Size>m_nSize</Size>
+        <ValuePointer>m_pArray</ValuePointer>
+      </ArrayItems>
+    </Expand>
+  </Type>
+
+  <Type Name="JitExpandArray&lt;*&gt;">
+    <DisplayString Condition="m_size &gt; 0">size={m_size,d}</DisplayString>
+    <DisplayString>Empty</DisplayString>
+    <Expand>
+      <ArrayItems>
+        <Size>m_size</Size>
+        <ValuePointer>m_members</ValuePointer>
+      </ArrayItems>
+    </Expand>
+  </Type>
+
+  <Type Name="JitExpandArrayStack&lt;*&gt;">
+    <DisplayString Condition="m_used &gt; 0">size={m_size,d} used={m_used,d}</DisplayString>
+    <DisplayString>Empty</DisplayString>
+    <Expand>
+      <ArrayItems>
+        <Size>m_used</Size>
+        <ValuePointer>m_members</ValuePointer>
+      </ArrayItems>
+    </Expand>
+  </Type>
+
+  <Type Name="LcOptInfo">
+    <DisplayString>{optType,en}</DisplayString>
+    <Expand>
+      <ExpandedItem Condition="optType == LcOptInfo::LcJaggedArray">(LcJaggedArrayOptInfo*)this,nd</ExpandedItem>
+      <ExpandedItem Condition="optType == LcOptInfo::LcMdArray">(LcMdArrayOptInfo*)this,nd</ExpandedItem>
+    </Expand>
+  </Type>
+
diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index 132a763c06b01..626cb3e5b7bd6 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -216,6 +216,7 @@ class CodeGen final : public CodeGenInterface
     unsigned genCurDispOffset;

     static const char* genInsName(instruction ins);
+    const char*        genInsDisplayName(emitter::instrDesc* id);
 #endif // DEBUG

 //-------------------------------------------------------------------------
@@ -1503,10 +1504,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

     void instGen_Store_Reg_Into_Lcl(var_types dstType, regNumber srcReg, int varNum, int offs);

-#ifdef DEBUG
-    void __cdecl instDisp(instruction ins, bool noNL, const char* fmt, ...);
-#endif
-
 #ifdef TARGET_XARCH
     instruction genMapShiftInsToShiftByConstantIns(instruction ins, int shiftByValue);
 #endif // TARGET_XARCH
diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp
index 78926e64fda65..820533da00065 100644
--- a/src/coreclr/jit/codegencommon.cpp
+++ b/src/coreclr/jit/codegencommon.cpp
@@ -1070,7 +1070,7 @@ void CodeGen::genDefineTempLabel(BasicBlock* label)
 {
     genLogLabel(label);
     label->bbEmitCookie = GetEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur,
-                                                     gcInfo.gcRegByrefSetCur, false DEBUG_ARG(label->bbNum));
+                                                     gcInfo.gcRegByrefSetCur, false DEBUG_ARG(label));
 }

 // genDefineInlineTempLabel: Define an inline label that does not affect the GC
@@ -2064,9 +2064,8 @@ void CodeGen::genInsertNopForUnwinder(BasicBlock* block)
     // block starts an EH region. If we pointed the existing bbEmitCookie here, then the NOP
     // would be executed, which we would prefer not to do.
-    block->bbUnwindNopEmitCookie =
-        GetEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur, gcInfo.gcRegByrefSetCur,
-                                   false DEBUG_ARG(block->bbNum));
+    block->bbUnwindNopEmitCookie = GetEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur,
+                                                              gcInfo.gcRegByrefSetCur, false DEBUG_ARG(block));

     instGen(INS_nop);
 }
diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index 8bf1f852ffd4d..e393862b11213 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -356,7 +356,7 @@ void CodeGen::genCodeForBBlist()
             // Mark a label and update the current set of live GC refs

             block->bbEmitCookie = GetEmitter()->emitAddLabel(gcInfo.gcVarPtrSetCur, gcInfo.gcRegGCrefSetCur,
-                                                             gcInfo.gcRegByrefSetCur, false DEBUG_ARG(block->bbNum));
+                                                             gcInfo.gcRegByrefSetCur, false DEBUG_ARG(block));
         }

         if (block == compiler->fgFirstColdBlock)
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 7019d8afad6c5..bf49e63ab0cbb 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2988,6 +2988,7 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
     opts.disAsmSpilled = false;
     opts.disDiffable   = false;
     opts.disAddr       = false;
+    opts.disAlignment  = false;
     opts.dspCode       = false;
     opts.dspEHTable    = false;
     opts.dspDebugInfo  = false;
@@ -3136,6 +3137,11 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
             opts.disAddr = true;
         }

+        if (JitConfig.JitDasmWithAlignmentBoundaries() != 0)
+        {
+            opts.disAlignment = true;
+        }
+
         if (JitConfig.JitLongAddress() != 0)
         {
             opts.compLongAddress = true;
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 356fe6498d166..414bed4fdc914 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -6209,9 +6209,9 @@ class Compiler
 public:
     void optInit();

-    GenTree* Compiler::optRemoveRangeCheck(GenTreeBoundsChk* check, GenTree* comma, Statement* stmt);
-    GenTree* Compiler::optRemoveStandaloneRangeCheck(GenTreeBoundsChk* check, Statement* stmt);
-    void Compiler::optRemoveCommaBasedRangeCheck(GenTree* comma, Statement* stmt);
+    GenTree* optRemoveRangeCheck(GenTreeBoundsChk* check, GenTree* comma, Statement* stmt);
+    GenTree* optRemoveStandaloneRangeCheck(GenTreeBoundsChk* check, Statement* stmt);
+    void optRemoveCommaBasedRangeCheck(GenTree* comma, Statement* stmt);

     bool optIsRangeCheckRemovable(GenTree* tree);

 protected:
@@ -6700,7 +6700,7 @@ class Compiler
     // BitVec trait information for computing CSE availability using the CSE_DataFlow algorithm.
     // Two bits are allocated per CSE candidate to compute CSE availability
     // plus an extra bit to handle the initial unvisited case.
-    // (See CSE_DataFlow::EndMerge for an explaination of why this is necessary)
+    // (See CSE_DataFlow::EndMerge for an explanation of why this is necessary.)
     //
     // The two bits per CSE candidate have the following meanings:
     // 11 - The CSE is available, and is also available when considering calls as killing availability.
     //
     BitVecTraits* cseLivenessTraits;

+    //-----------------------------------------------------------------------------------------------------------------
+    // genCSEnum2bit: Return the normalized index to use in the EXPSET_TP for the CSE with the given CSE index.
+    // Each GenTree has a `gtCSEnum` field. Zero is reserved to mean this node is not a CSE, positive values indicate
+    // CSE uses, and negative values indicate CSE defs.
The caller must pass a non-zero positive value, as from + // GET_CSE_INDEX(). + // + static unsigned genCSEnum2bit(unsigned CSEnum) + { + assert((CSEnum > 0) && (CSEnum <= MAX_CSE_CNT)); + return CSEnum - 1; + } + + //----------------------------------------------------------------------------------------------------------------- + // getCSEAvailBit: Return the bit used by CSE dataflow sets (bbCseGen, etc.) for the availability bit for a CSE. + // + static unsigned getCSEAvailBit(unsigned CSEnum) + { + return genCSEnum2bit(CSEnum) * 2; + } + + //----------------------------------------------------------------------------------------------------------------- + // getCSEAvailCrossCallBit: Return the bit used by CSE dataflow sets (bbCseGen, etc.) for the availability bit + // for a CSE considering calls as killing availability bit (see description above). + // + static unsigned getCSEAvailCrossCallBit(unsigned CSEnum) + { + return getCSEAvailBit(CSEnum) + 1; + } + + void optPrintCSEDataFlowSet(EXPSET_VALARG_TP cseDataFlowSet, bool includeBits = true); + EXPSET_TP cseCallKillsMask; // Computed once - A mask that is used to kill available CSEs at callsites /* Generic list of nodes - used by the CSE logic */ @@ -6844,9 +6875,12 @@ class Compiler return (enckey & ~TARGET_SIGN_BIT) << CSE_CONST_SHARED_LOW_BITS; } - /************************************************************************** - * Value Number based CSEs - *************************************************************************/ +/************************************************************************** + * Value Number based CSEs + *************************************************************************/ + +// String to use for formatting CSE numbers. Note that this is the positive number, e.g., from GET_CSE_INDEX(). +#define FMT_CSE "CSE #%02u" public: void optOptimizeValnumCSEs(); @@ -6854,16 +6888,15 @@ class Compiler protected: void optValnumCSE_Init(); unsigned optValnumCSE_Index(GenTree* tree, Statement* stmt); - unsigned optValnumCSE_Locate(); - void optValnumCSE_InitDataFlow(); - void optValnumCSE_DataFlow(); - void optValnumCSE_Availablity(); - void optValnumCSE_Heuristic(); + bool optValnumCSE_Locate(); + void optValnumCSE_InitDataFlow(); + void optValnumCSE_DataFlow(); + void optValnumCSE_Availablity(); + void optValnumCSE_Heuristic(); bool optDoCSE; // True when we have found a duplicate CSE tree - bool optValnumCSE_phase; // True when we are executing the optValnumCSE_phase - unsigned optCSECandidateTotal; // Grand total of CSE candidates for both Lexical and ValNum - unsigned optCSECandidateCount; // Count of CSE's candidates, reset for Lexical and ValNum CSE's + bool optValnumCSE_phase; // True when we are executing the optOptimizeValnumCSEs() phase + unsigned optCSECandidateCount; // Count of CSE's candidates unsigned optCSEstart; // The first local variable number that is a CSE unsigned optCSEcount; // The total count of CSE's introduced. BasicBlock::weight_t optCSEweight; // The weight of the current block when we are doing PerformCSE @@ -6888,6 +6921,7 @@ class Compiler bool optConfigDisableCSE(); bool optConfigDisableCSE2(); #endif + void optOptimizeCSEs(); struct isVarAssgDsc @@ -9304,6 +9338,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX bool disasmWithGC; // Display GC info interleaved with disassembly. 
bool disDiffable; // Makes the Disassembly code 'diff-able' bool disAddr; // Display process address next to each instruction in disassembly code + bool disAlignment; // Display alignment boundaries in disassembly code bool disAsm2; // Display native code after it is generated using external disassembler bool dspOrder; // Display names of each of the methods that we ngen/jit bool dspUnwind; // Display the unwind info output diff --git a/src/coreclr/jit/compiler.hpp b/src/coreclr/jit/compiler.hpp index d05bfc6bbeb9f..ca9626ee11c1f 100644 --- a/src/coreclr/jit/compiler.hpp +++ b/src/coreclr/jit/compiler.hpp @@ -763,25 +763,6 @@ inline double getR8LittleEndian(const BYTE* ptr) return *(double*)&val; } -/***************************************************************************** - * - * Return the normalized index to use in the EXPSET_TP for the CSE with - * the given CSE index. - * Each GenTree has the following field: - * signed char gtCSEnum; // 0 or the CSE index (negated if def) - * So zero is reserved to mean this node is not a CSE - * and postive values indicate CSE uses and negative values indicate CSE defs. - * The caller of this method must pass a non-zero postive value. - * This precondition is checked by the assert on the first line of this method. - */ - -inline unsigned int genCSEnum2bit(unsigned index) -{ - assert((index > 0) && (index <= EXPSET_SZ)); - - return (index - 1); -} - #ifdef DEBUG const char* genES2str(BitVecTraits* traits, EXPSET_TP set); const char* refCntWtd2str(BasicBlock::weight_t refCntWtd); diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 352d1b2acfcd7..04e40ac282663 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -773,7 +773,7 @@ insGroup* emitter::emitSavIG(bool emitAdd) memcpy(id, emitCurIGfreeBase, sz); #ifdef DEBUG - if (false && emitComp->verbose) // this is not useful in normal dumps (hence it is normally under if (false) + if (false && emitComp->verbose) // this is not useful in normal dumps (hence it is normally under if (false)) { // If there's an error during emission, we may want to connect the post-copy address // of an instrDesc with the pre-copy address (the one that was originally created). 
This @@ -843,7 +843,7 @@ insGroup* emitter::emitSavIG(bool emitAdd) #ifdef DEBUG if (emitComp->opts.dspCode) { - printf("\n G_M%03u_IG%02u:", emitComp->compMethodID, ig->igNum); + printf("\n %s:", emitLabelString(ig)); if (emitComp->verbose) { printf(" ; offs=%06XH, funclet=%02u, bbWeight=%s", ig->igOffs, ig->igFuncIdx, @@ -1184,7 +1184,7 @@ int emitter::instrDesc::idAddrUnion::iiaGetJitDataOffset() const //---------------------------------------------------------------------------------------- // insEvaluateExecutionCost: -// Returns the estimate execution cost fortyhe current instruction +// Returns the estimated execution cost for the current instruction // // Arguments: // id - The current instruction descriptor to be evaluated @@ -1193,8 +1193,6 @@ int emitter::instrDesc::idAddrUnion::iiaGetJitDataOffset() const // calls getInsExecutionCharacteristics and uses the result // to compute an estimated execution cost // -// Notes: -// float emitter::insEvaluateExecutionCost(instrDesc* id) { insExecutionCharacteristics result = getInsExecutionCharacteristics(id); @@ -1202,8 +1200,10 @@ float emitter::insEvaluateExecutionCost(instrDesc* id) float latency = result.insLatency; unsigned memAccessKind = result.insMemoryAccessKind; - // Check for PERFSCORE_THROUGHPUT_ILLEGAL and PERFSCORE_LATENCY_ILLEGAL - assert(throughput > 0.0); + // Check for PERFSCORE_THROUGHPUT_ILLEGAL and PERFSCORE_LATENCY_ILLEGAL. + // Note that 0.0 throughput is allowed for pseudo-instructions in the instrDesc list that won't actually + // generate code. + assert(throughput >= 0.0); assert(latency >= 0.0); if (memAccessKind == PERFSCORE_MEMORY_WRITE) @@ -1241,7 +1241,7 @@ float emitter::insEvaluateExecutionCost(instrDesc* id) void emitter::perfScoreUnhandledInstruction(instrDesc* id, insExecutionCharacteristics* pResult) { #ifdef DEBUG - printf("PerfScore: unhandled instruction: %s, format %s", codeGen->genInsName(id->idIns()), + printf("PerfScore: unhandled instruction: %s, format %s", codeGen->genInsDisplayName(id), emitIfName(id->idInsFmt())); assert(!"PerfScore: unhandled instruction"); #endif @@ -1524,6 +1524,14 @@ void* emitter::emitAllocAnyInstr(size_t sz, emitAttr opsz) emitCurIGinsCnt++; +#ifdef DEBUG + if (emitComp->compCurBB != emitCurIG->lastGeneratedBlock) + { + emitCurIG->igBlocks.push_back(emitComp->compCurBB); + emitCurIG->lastGeneratedBlock = emitComp->compCurBB; + } +#endif // DEBUG + return id; } @@ -2504,7 +2512,7 @@ bool emitter::emitNoGChelper(CORINFO_METHOD_HANDLE methHnd) void* emitter::emitAddLabel(VARSET_VALARG_TP GCvars, regMaskTP gcrefRegs, regMaskTP byrefRegs, - bool isFinallyTarget DEBUG_ARG(unsigned bbNum)) + bool isFinallyTarget DEBUG_ARG(BasicBlock* block)) { /* Create a new IG if the current one is non-empty */ @@ -2533,7 +2541,7 @@ void* emitter::emitAddLabel(VARSET_VALARG_TP GCvars, #endif // defined(FEATURE_EH_FUNCLETS) && defined(TARGET_ARM) #ifdef DEBUG - JITDUMP("Mapped " FMT_BB " to G_M%03u_IG%02u\n", bbNum, emitComp->compMethodID, emitCurIG->igNum); + JITDUMP("Mapped " FMT_BB " to %s\n", block->bbNum, emitLabelString(emitCurIG)); if (EMIT_GC_VERBOSE) { @@ -2548,6 +2556,7 @@ void* emitter::emitAddLabel(VARSET_VALARG_TP GCvars, printf("\n"); } #endif + return emitCurIG; } @@ -2561,6 +2570,40 @@ void* emitter::emitAddInlineLabel() return emitCurIG; } +#ifdef DEBUG + +//----------------------------------------------------------------------------- +// emitPrintLabel: Print the assembly label for an insGroup. 
We could use emitter::emitLabelString() +// to be consistent, but that seems silly. +// +void emitter::emitPrintLabel(insGroup* ig) +{ + printf("G_M%03u_IG%02u", emitComp->compMethodID, ig->igNum); +} + +//----------------------------------------------------------------------------- +// emitLabelString: Return label string for an insGroup, for use in debug output. +// This can be called up to four times in a single 'printf' before the static buffers +// get reused. +// +// Returns: +// String with insGroup label +// +const char* emitter::emitLabelString(insGroup* ig) +{ + const int TEMP_BUFFER_LEN = 40; + static unsigned curBuf = 0; + static char buf[4][TEMP_BUFFER_LEN]; + const char* retbuf; + + sprintf_s(buf[curBuf], TEMP_BUFFER_LEN, "G_M%03u_IG%02u", emitComp->compMethodID, ig->igNum); + retbuf = buf[curBuf]; + curBuf = (curBuf + 1) % 4; + return retbuf; +} + +#endif // DEBUG + #ifdef TARGET_ARMARCH // Does the argument location point to an IG at the end of a function or funclet? @@ -3502,7 +3545,7 @@ void emitter::emitDispIG(insGroup* ig, insGroup* igPrev, bool verbose) const int TEMP_BUFFER_LEN = 40; char buff[TEMP_BUFFER_LEN]; - sprintf_s(buff, TEMP_BUFFER_LEN, "G_M%03u_IG%02u: ", emitComp->compMethodID, ig->igNum); + sprintf_s(buff, TEMP_BUFFER_LEN, "%s: ", emitLabelString(ig)); printf("%s; ", buff); // We dump less information when we're only interleaving GC info with a disassembly listing, @@ -3599,9 +3642,10 @@ void emitter::emitDispIG(insGroup* ig, insGroup* igPrev, bool verbose) else { const char* separator = ""; + if (jitdump) { - printf("offs=%06XH, size=%04XH", ig->igOffs, ig->igSize); + printf("%soffs=%06XH, size=%04XH", separator, ig->igOffs, ig->igSize); separator = ", "; } @@ -3642,6 +3686,15 @@ void emitter::emitDispIG(insGroup* ig, insGroup* igPrev, bool verbose) } #endif // FEATURE_LOOP_ALIGN + if (jitdump && !ig->igBlocks.empty()) + { + for (auto block : ig->igBlocks) + { + printf("%s%s", separator, block->dspToString()); + separator = ", "; + } + } + emitDispIGflags(ig->igFlags); if (ig == emitCurIG) @@ -3733,10 +3786,6 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp) { size_t is; -#ifdef DEBUG - size_t beforeAddr = (size_t)*dp; -#endif - /* Record the beginning offset of the instruction */ BYTE* curInsAdr = *dp; @@ -3822,52 +3871,7 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp) id->idDebugOnlyInfo()->idNum, is, emitSizeOfInsDsc(id)); assert(is == emitSizeOfInsDsc(id)); } - - // Print the alignment boundary - if ((emitComp->opts.disAsm || emitComp->verbose) && emitComp->opts.disAddr) - { - size_t currAddr = (size_t)*dp; - size_t lastBoundaryAddr = currAddr & ~((size_t)emitComp->opts.compJitAlignLoopBoundary - 1); - - // draw boundary if beforeAddr was before the lastBoundary. 
- if (beforeAddr < lastBoundaryAddr) - { - printf("; "); - instruction currIns = id->idIns(); - -#if defined(TARGET_XARCH) - - // https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf - bool isJccAffectedIns = - ((currIns >= INS_i_jmp && currIns < INS_align) || (currIns == INS_call) || (currIns == INS_ret)); - - instrDesc* nextId = id; - castto(nextId, BYTE*) += is; - instruction nextIns = nextId->idIns(); - if ((currIns == INS_cmp) || (currIns == INS_test) || (currIns == INS_add) || (currIns == INS_sub) || - (currIns == INS_and) || (currIns == INS_inc) || (currIns == INS_dec)) - { - isJccAffectedIns |= (nextIns >= INS_i_jmp && nextIns < INS_align); - } -#else - bool isJccAffectedIns = false; -#endif - - // Indicate if instruction is at at 32B boundary or is splitted - unsigned bytesCrossedBoundary = (currAddr & (emitComp->opts.compJitAlignLoopBoundary - 1)); - if ((bytesCrossedBoundary != 0) || (isJccAffectedIns && bytesCrossedBoundary == 0)) - { - printf("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (%s: %d)", codeGen->genInsName(id->idIns()), - bytesCrossedBoundary); - } - else - { - printf("..............................."); - } - printf(" %dB boundary ...............................\n", (emitComp->opts.compJitAlignLoopBoundary)); - } - } -#endif +#endif // DEBUG return is; } @@ -4229,7 +4233,7 @@ void emitter::emitJumpDistBind() { if (tgtIG) { - printf(" to G_M%03u_IG%02u\n", emitComp->compMethodID, tgtIG->igNum); + printf(" to %s\n", emitLabelString(tgtIG)); } else { @@ -4687,8 +4691,7 @@ void emitter::emitLoopAlignment() // all IGs that follows this IG and participate in a loop. emitCurIG->igFlags |= IGF_LOOP_ALIGN; - JITDUMP("Adding 'align' instruction of %d bytes in G_M%03u_IG%02u.\n", paddingBytes, emitComp->compMethodID, - emitCurIG->igNum); + JITDUMP("Adding 'align' instruction of %d bytes in %s.\n", paddingBytes, emitLabelString(emitCurIG)); #ifdef DEBUG emitComp->loopAlignCandidates++; @@ -4973,10 +4976,10 @@ void emitter::emitLoopAlignAdjustments() alignInstr = prevAlignInstr; } - JITDUMP("Adjusted alignment of G_M%03u_IG%02u from %u to %u.\n", emitComp->compMethodID, alignIG->igNum, - estimatedPaddingNeeded, actualPaddingNeeded); - JITDUMP("Adjusted size of G_M%03u_IG%02u from %u to %u.\n", emitComp->compMethodID, alignIG->igNum, - (alignIG->igSize + diff), alignIG->igSize); + JITDUMP("Adjusted alignment of %s from %u to %u.\n", emitLabelString(alignIG), estimatedPaddingNeeded, + actualPaddingNeeded); + JITDUMP("Adjusted size of %s from %u to %u.\n", emitLabelString(alignIG), (alignIG->igSize + diff), + alignIG->igSize); } // Adjust the offset of all IGs starting from next IG until we reach the IG having the next @@ -4985,8 +4988,8 @@ void emitter::emitLoopAlignAdjustments() insGroup* adjOffUptoIG = alignInstr->idaNext != nullptr ? 
alignInstr->idaNext->idaIG : emitIGlast; while ((adjOffIG != nullptr) && (adjOffIG->igNum <= adjOffUptoIG->igNum)) { - JITDUMP("Adjusted offset of G_M%03u_IG%02u from %04X to %04X\n", emitComp->compMethodID, adjOffIG->igNum, - adjOffIG->igOffs, (adjOffIG->igOffs - alignBytesRemoved)); + JITDUMP("Adjusted offset of %s from %04X to %04X\n", emitLabelString(adjOffIG), adjOffIG->igOffs, + (adjOffIG->igOffs - alignBytesRemoved)); adjOffIG->igOffs -= alignBytesRemoved; adjOffIG = adjOffIG->igNext; } @@ -5045,8 +5048,8 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs // No padding if loop is already aligned if ((offset & (alignmentBoundary - 1)) == 0) { - JITDUMP(";; Skip alignment: 'Loop at G_M%03u_IG%02u already aligned at %dB boundary.'\n", - emitComp->compMethodID, ig->igNext->igNum, alignmentBoundary); + JITDUMP(";; Skip alignment: 'Loop at %s already aligned at %dB boundary.'\n", emitLabelString(ig->igNext), + alignmentBoundary); return 0; } @@ -5070,8 +5073,8 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs // No padding if loop is big if (loopSize > maxLoopSize) { - JITDUMP(";; Skip alignment: 'Loop at G_M%03u_IG%02u is big. LoopSize= %d, MaxLoopSize= %d.'\n", - emitComp->compMethodID, ig->igNext->igNum, loopSize, maxLoopSize); + JITDUMP(";; Skip alignment: 'Loop at %s is big. LoopSize= %d, MaxLoopSize= %d.'\n", emitLabelString(ig->igNext), + loopSize, maxLoopSize); return 0; } @@ -5097,17 +5100,16 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs if (nPaddingBytes == 0) { skipPadding = true; - JITDUMP(";; Skip alignment: 'Loop at G_M%03u_IG%02u already aligned at %uB boundary.'\n", - emitComp->compMethodID, ig->igNext->igNum, alignmentBoundary); + JITDUMP(";; Skip alignment: 'Loop at %s already aligned at %uB boundary.'\n", + emitLabelString(ig->igNext), alignmentBoundary); } // Check if the alignment exceeds new maxPadding limit else if (nPaddingBytes > nMaxPaddingBytes) { skipPadding = true; - JITDUMP(";; Skip alignment: 'Loop at G_M%03u_IG%02u PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, " + JITDUMP(";; Skip alignment: 'Loop at %s PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, " "AlignmentBoundary= %dB.'\n", - emitComp->compMethodID, ig->igNext->igNum, nPaddingBytes, nMaxPaddingBytes, loopSize, - alignmentBoundary); + emitLabelString(ig->igNext), nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary); } } @@ -5129,8 +5131,8 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs else { // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment. - JITDUMP(";; Skip alignment: 'Loop at G_M%03u_IG%02u is aligned to fit in %d blocks of %d chunks.'\n", - emitComp->compMethodID, ig->igNext->igNum, minBlocksNeededForLoop, alignmentBoundary); + JITDUMP(";; Skip alignment: 'Loop at %s is aligned to fit in %d blocks of %d chunks.'\n", + emitLabelString(ig->igNext), minBlocksNeededForLoop, alignmentBoundary); } } } @@ -5158,13 +5160,13 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs else { // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment. 
- JITDUMP(";; Skip alignment: 'Loop at G_M%03u_IG%02u is aligned to fit in %d blocks of %d chunks.'\n", - emitComp->compMethodID, ig->igNext->igNum, minBlocksNeededForLoop, alignmentBoundary); + JITDUMP(";; Skip alignment: 'Loop at %s is aligned to fit in %d blocks of %d chunks.'\n", + emitLabelString(ig->igNext), minBlocksNeededForLoop, alignmentBoundary); } } - JITDUMP(";; Calculated padding to add %d bytes to align G_M%03u_IG%02u at %dB boundary.\n", paddingToAdd, - emitComp->compMethodID, ig->igNext->igNum, alignmentBoundary); + JITDUMP(";; Calculated padding to add %d bytes to align %s at %dB boundary.\n", paddingToAdd, + emitLabelString(ig->igNext), alignmentBoundary); // Either no padding is added because it is too expensive or the offset gets aligned // to the alignment boundary @@ -5846,7 +5848,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, } else { - printf("\nG_M%03u_IG%02u:", emitComp->compMethodID, ig->igNum); + printf("\n%s:", emitLabelString(ig)); if (!emitComp->opts.disDiffable) { printf(" ;; offset=%04XH", emitCurCodeOffs(cp)); @@ -5963,9 +5965,97 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, emitCurIG = ig; - for (unsigned cnt = ig->igInsCnt; cnt; cnt--) + for (unsigned cnt = ig->igInsCnt; cnt > 0; cnt--) { +#ifdef DEBUG + size_t curInstrAddr = (size_t)cp; + instrDesc* curInstrDesc = id; +#endif + castto(id, BYTE*) += emitIssue1Instr(ig, id, &cp); + +#ifdef DEBUG + // Print the alignment boundary + if ((emitComp->opts.disAsm || emitComp->verbose) && (emitComp->opts.disAddr || emitComp->opts.disAlignment)) + { + size_t afterInstrAddr = (size_t)cp; + instruction curIns = curInstrDesc->idIns(); + bool isJccAffectedIns = false; + +#if defined(TARGET_XARCH) + + // Determine if this instruction is part of a set that matches the Intel jcc erratum characteristic + // described here: + // https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf + // This is the case when a jump instruction crosses a 32-byte boundary, or ends on a 32-byte boundary. + // "Jump instruction" in this case includes conditional jump (jcc), macro-fused op-jcc (where 'op' is + // one of cmp, test, add, sub, and, inc, or dec), direct unconditional jump, indirect jump, + // direct/indirect call, and return. + + size_t jccAlignBoundary = 32; + size_t jccAlignBoundaryMask = jccAlignBoundary - 1; + size_t jccLastBoundaryAddr = afterInstrAddr & ~jccAlignBoundaryMask; + + if (curInstrAddr < jccLastBoundaryAddr) + { + isJccAffectedIns = IsJccInstruction(curIns) || IsJmpInstruction(curIns) || (curIns == INS_call) || + (curIns == INS_ret); + + // For op-Jcc there are two cases: (1) curIns is the jcc, in which case the above condition + // already covers us. (2) curIns is the `op` and the next instruction is the `jcc`. Note that + // we will never have a `jcc` as the first instruction of a group, so we don't need to worry + // about looking ahead to the next group after a an `op` of `op-Jcc`. + + if (!isJccAffectedIns && (cnt > 1)) + { + // The current `id` is valid, namely, there is another instruction in this group. 
+ instruction nextIns = id->idIns(); + if (((curIns == INS_cmp) || (curIns == INS_test) || (curIns == INS_add) || + (curIns == INS_sub) || (curIns == INS_and) || (curIns == INS_inc) || + (curIns == INS_dec)) && + IsJccInstruction(nextIns)) + { + isJccAffectedIns = true; + } + } + + if (isJccAffectedIns) + { + unsigned bytesCrossedBoundary = (unsigned)(afterInstrAddr & jccAlignBoundaryMask); + printf("; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (%s: %d ; jcc erratum) %dB boundary " + "...............................\n", + codeGen->genInsDisplayName(curInstrDesc), bytesCrossedBoundary, jccAlignBoundary); + } + } + +#endif // TARGET_XARCH + + // Jcc affected instruction boundaries were printed above; handle other cases here. + if (!isJccAffectedIns) + { + size_t alignBoundaryMask = (size_t)emitComp->opts.compJitAlignLoopBoundary - 1; + size_t lastBoundaryAddr = afterInstrAddr & ~alignBoundaryMask; + + // draw boundary if beforeAddr was before the lastBoundary. + if (curInstrAddr < lastBoundaryAddr) + { + // Indicate if instruction is at the alignment boundary or is split + unsigned bytesCrossedBoundary = (unsigned)(afterInstrAddr & alignBoundaryMask); + if (bytesCrossedBoundary != 0) + { + printf("; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (%s: %d)", + codeGen->genInsDisplayName(curInstrDesc), bytesCrossedBoundary); + } + else + { + printf("; ..............................."); + } + printf(" %dB boundary ...............................\n", + emitComp->opts.compJitAlignLoopBoundary); + } + } + } +#endif // DEBUG } #ifdef DEBUG @@ -6808,11 +6898,8 @@ void emitter::emitDispDataSec(dataSecDsc* section) BasicBlock* block = reinterpret_cast(data->dsCont)[i]; insGroup* ig = static_cast(emitCodeGetCookie(block)); - const char* blockLabelFormat = "G_M%03u_IG%02u"; - char blockLabel[64]; - char firstLabel[64]; - sprintf_s(blockLabel, _countof(blockLabel), blockLabelFormat, emitComp->compMethodID, ig->igNum); - sprintf_s(firstLabel, _countof(firstLabel), blockLabelFormat, emitComp->compMethodID, igFirst->igNum); + const char* blockLabel = emitLabelString(ig); + const char* firstLabel = emitLabelString(igFirst); if (isRelative) { @@ -7932,11 +8019,6 @@ insGroup* emitter::emitAllocIG() ig->igSelf = ig; #endif -#if defined(DEBUG) || defined(LATE_DISASM) - ig->igWeight = getCurrentBlockWeight(); - ig->igPerfScore = 0.0; -#endif - #if EMITTER_STATS emitTotalIGcnt += 1; emitTotalIGsize += sz; @@ -7974,6 +8056,11 @@ void emitter::emitInitIG(insGroup* ig) ig->igFlags = 0; +#if defined(DEBUG) || defined(LATE_DISASM) + ig->igWeight = getCurrentBlockWeight(); + ig->igPerfScore = 0.0; +#endif + /* Zero out some fields to avoid printing garbage in JitDumps. These really only need to be set in DEBUG, but do it in all cases to make sure we act the same in non-DEBUG builds. @@ -7986,6 +8073,12 @@ void emitter::emitInitIG(insGroup* ig) #if FEATURE_LOOP_ALIGN ig->igLoopBackEdge = nullptr; #endif + +#ifdef DEBUG + ig->lastGeneratedBlock = nullptr; + // Explicitly call the constructor, since IGs don't actually have a constructor. + ig->igBlocks.jitstd::list::list(emitComp->getAllocator(CMK_LoopOpt)); +#endif } /***************************************************************************** @@ -8050,6 +8143,11 @@ void emitter::emitNxtIG(bool extend) // We've created a new IG; no need to force another one. emitForceNewIG = false; + +#ifdef DEBUG + // We haven't written any code into the IG yet, so clear our record of the last block written to the IG. 
+ emitCurIG->lastGeneratedBlock = nullptr; +#endif } /***************************************************************************** @@ -8584,7 +8682,7 @@ const char* emitter::emitOffsetToLabel(unsigned offs) if (ig->igOffs == offs) { // Found it! - sprintf_s(buf[curBuf], TEMP_BUFFER_LEN, "G_M%03u_IG%02u", emitComp->compMethodID, ig->igNum); + sprintf_s(buf[curBuf], TEMP_BUFFER_LEN, "%s", emitLabelString(ig)); retbuf = buf[curBuf]; curBuf = (curBuf + 1) % 4; return retbuf; diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 666201fc98bf0..e4bde18562f64 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -247,6 +247,11 @@ struct insGroup double igPerfScore; // The PerfScore for this insGroup #endif +#ifdef DEBUG + BasicBlock* lastGeneratedBlock; // The last block that generated code into this insGroup. + jitstd::list igBlocks; // All the blocks that generated code into this insGroup. +#endif + UNATIVE_OFFSET igNum; // for ordering (and display) purposes UNATIVE_OFFSET igOffs; // offset of this group within method unsigned int igFuncIdx; // Which function/funclet does this belong to? (Index into Compiler::compFuncInfos array.) @@ -1229,6 +1234,8 @@ class emitter #define PERFSCORE_THROUGHPUT_ILLEGAL -1024.0f +#define PERFSCORE_THROUGHPUT_ZERO 0.0f // Only used for pseudo-instructions that don't generate code + #define PERFSCORE_THROUGHPUT_6X (1.0f / 6.0f) // Hextuple issue #define PERFSCORE_THROUGHPUT_5X 0.20f // Pentuple issue #define PERFSCORE_THROUGHPUT_4X 0.25f // Quad issue @@ -1902,13 +1909,18 @@ class emitter void* emitAddLabel(VARSET_VALARG_TP GCvars, regMaskTP gcrefRegs, regMaskTP byrefRegs, - bool isFinallyTarget = false DEBUG_ARG(unsigned bbNum = 0)); + bool isFinallyTarget = false DEBUG_ARG(BasicBlock* block = nullptr)); // Same as above, except the label is added and is conceptually "inline" in // the current block. Thus it extends the previous block and the emitter // continues to track GC info as if there was no label. 
void* emitAddInlineLabel(); +#ifdef DEBUG + void emitPrintLabel(insGroup* ig); + const char* emitLabelString(insGroup* ig); +#endif + #ifdef TARGET_ARMARCH void emitGetInstrDescs(insGroup* ig, instrDesc** id, int* insCnt); diff --git a/src/coreclr/jit/emitarm.cpp b/src/coreclr/jit/emitarm.cpp index 6953df5b49a7c..5b01adbd14a5d 100644 --- a/src/coreclr/jit/emitarm.cpp +++ b/src/coreclr/jit/emitarm.cpp @@ -7292,7 +7292,7 @@ void emitter::emitDispInsHelp( lab = (insGroup*)emitCodeGetCookie(*bbp++); assert(lab); - printf("\n DD G_M%03u_IG%02u", emitComp->compMethodID, lab->igNum); + printf("\n DD %s", emitLabelString(lab)); } while (--cnt); } } @@ -7601,7 +7601,7 @@ void emitter::emitDispInsHelp( case IF_T2_M1: // Load Label emitDispReg(id->idReg1(), attr, true); if (id->idIsBound()) - printf("G_M%03u_IG%02u", emitComp->compMethodID, id->idAddr()->iiaIGlabel->igNum); + emitPrintLabel(id->idAddr()->iiaIGlabel); else printf("L_M%03u_" FMT_BB, emitComp->compMethodID, id->idAddr()->iiaBBlabel->bbNum); break; @@ -7646,7 +7646,7 @@ void emitter::emitDispInsHelp( } } else if (id->idIsBound()) - printf("G_M%03u_IG%02u", emitComp->compMethodID, id->idAddr()->iiaIGlabel->igNum); + emitPrintLabel(id->idAddr()->iiaIGlabel); else printf("L_M%03u_" FMT_BB, emitComp->compMethodID, id->idAddr()->iiaBBlabel->bbNum); } diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 37e19a219d265..84c49d94723d4 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -12286,7 +12286,7 @@ void emitter::emitDispIns( } else if (id->idIsBound()) { - printf("G_M%03u_IG%02u", emitComp->compMethodID, id->idAddr()->iiaIGlabel->igNum); + emitPrintLabel(id->idAddr()->iiaIGlabel); } else { @@ -12324,7 +12324,7 @@ void emitter::emitDispIns( emitDispReg(id->idReg1(), size, true); if (id->idIsBound()) { - printf("G_M%03u_IG%02u", emitComp->compMethodID, id->idAddr()->iiaIGlabel->igNum); + emitPrintLabel(id->idAddr()->iiaIGlabel); } else { @@ -12338,7 +12338,7 @@ void emitter::emitDispIns( emitDispImm(emitGetInsSC(id), true); if (id->idIsBound()) { - printf("G_M%03u_IG%02u", emitComp->compMethodID, id->idAddr()->iiaIGlabel->igNum); + emitPrintLabel(id->idAddr()->iiaIGlabel); } else { @@ -12463,7 +12463,7 @@ void emitter::emitDispIns( } else if (id->idIsBound()) { - printf("G_M%03u_IG%02u", emitComp->compMethodID, id->idAddr()->iiaIGlabel->igNum); + emitPrintLabel(id->idAddr()->iiaIGlabel); } else { diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index a0a5e3283d4e9..64711d645e10f 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -24,37 +24,37 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "emit.h" #include "codegen.h" -bool IsSSEInstruction(instruction ins) +bool emitter::IsSSEInstruction(instruction ins) { return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_SSE_INSTRUCTION); } -bool IsSSEOrAVXInstruction(instruction ins) +bool emitter::IsSSEOrAVXInstruction(instruction ins) { return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION); } -bool IsAVXOnlyInstruction(instruction ins) +bool emitter::IsAVXOnlyInstruction(instruction ins) { return (ins >= INS_FIRST_AVX_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION); } -bool IsFMAInstruction(instruction ins) +bool emitter::IsFMAInstruction(instruction ins) { return (ins >= INS_FIRST_FMA_INSTRUCTION) && (ins <= INS_LAST_FMA_INSTRUCTION); } -bool IsAVXVNNIInstruction(instruction ins) +bool 
emitter::IsAVXVNNIInstruction(instruction ins) { return (ins >= INS_FIRST_AVXVNNI_INSTRUCTION) && (ins <= INS_LAST_AVXVNNI_INSTRUCTION); } -bool IsBMIInstruction(instruction ins) +bool emitter::IsBMIInstruction(instruction ins) { return (ins >= INS_FIRST_BMI_INSTRUCTION) && (ins <= INS_LAST_BMI_INSTRUCTION); } -regNumber getBmiRegNumber(instruction ins) +regNumber emitter::getBmiRegNumber(instruction ins) { switch (ins) { @@ -81,7 +81,7 @@ regNumber getBmiRegNumber(instruction ins) } } -regNumber getSseShiftRegNumber(instruction ins) +regNumber emitter::getSseShiftRegNumber(instruction ins) { switch (ins) { @@ -123,7 +123,7 @@ regNumber getSseShiftRegNumber(instruction ins) } } -bool emitter::IsAVXInstruction(instruction ins) +bool emitter::IsAVXInstruction(instruction ins) const { return UseVEXEncoding() && IsSSEOrAVXInstruction(ins); } @@ -438,7 +438,7 @@ bool emitter::Is4ByteSSEInstruction(instruction ins) // Returns true if this instruction requires a VEX prefix // All AVX instructions require a VEX prefix -bool emitter::TakesVexPrefix(instruction ins) +bool emitter::TakesVexPrefix(instruction ins) const { // special case vzeroupper as it requires 2-byte VEX prefix // special case the fencing, movnti and the prefetch instructions as they never take a VEX prefix @@ -514,7 +514,7 @@ emitter::code_t emitter::AddVexPrefix(instruction ins, code_t code, emitAttr att } // Returns true if this instruction, for the given EA_SIZE(attr), will require a REX.W prefix -bool TakesRexWPrefix(instruction ins, emitAttr attr) +bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr) { // Because the current implementation of AVX does not have a way to distinguish between the register // size specification (128 vs. 256 bits) and the operand size specification (32 vs. 64 bits), where both are @@ -4292,6 +4292,38 @@ bool emitter::IsMovInstruction(instruction ins) } } +//------------------------------------------------------------------------ +// IsJccInstruction: Determine if an instruction is a conditional jump instruction. +// +// Arguments: +// ins -- The instruction being checked +// +// Return Value: +// true if the instruction qualifies; otherwise, false +// +bool emitter::IsJccInstruction(instruction ins) +{ + return ((ins >= INS_jo) && (ins <= INS_jg)) || ((ins >= INS_l_jo) && (ins <= INS_l_jg)); +} + +//------------------------------------------------------------------------ +// IsJmpInstruction: Determine if an instruction is a jump instruction but NOT a conditional jump instruction. +// +// Arguments: +// ins -- The instruction being checked +// +// Return Value: +// true if the instruction qualifies; otherwise, false +// +bool emitter::IsJmpInstruction(instruction ins) +{ + return +#ifdef TARGET_AMD64 + (ins == INS_rex_jmp) || +#endif + (ins == INS_i_jmp) || (ins == INS_jmp) || (ins == INS_l_jmp); +} + //---------------------------------------------------------------------------------------- // IsRedundantMov: // Check if the current `mov` instruction is redundant and can be omitted. 
@@ -8452,7 +8484,7 @@ void emitter::emitDispAddrMode(instrDesc* id, bool noDetail) lab = (insGroup*)emitCodeGetCookie(*bbp++); assert(lab); - printf("\n D" SIZE_LETTER " G_M%03u_IG%02u", emitComp->compMethodID, lab->igNum); + printf("\n D" SIZE_LETTER " %s", emitLabelString(lab)); } while (--cnt); } } @@ -8684,22 +8716,16 @@ void emitter::emitDispIns( /* Display the instruction name */ - sstr = codeGen->genInsName(ins); + sstr = codeGen->genInsDisplayName(id); + printf(" %-9s", sstr); - if (IsAVXInstruction(ins) && !IsBMIInstruction(ins)) - { - printf(" v%-8s", sstr); - } - else - { - printf(" %-9s", sstr); - } #ifndef HOST_UNIX - if (strnlen_s(sstr, 10) >= 8) + if (strnlen_s(sstr, 10) >= 9) #else // HOST_UNIX - if (strnlen(sstr, 10) >= 8) + if (strnlen(sstr, 10) >= 9) #endif // HOST_UNIX { + // Make sure there's at least one space after the instruction name, for very long instruction names. printf(" "); } @@ -9649,7 +9675,7 @@ void emitter::emitDispIns( } else { - printf("G_M%03u_IG%02u", emitComp->compMethodID, id->idAddr()->iiaIGlabel->igNum); + emitPrintLabel(id->idAddr()->iiaIGlabel); } } else @@ -14669,6 +14695,17 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins switch (ins) { case INS_align: +#if FEATURE_LOOP_ALIGN + if (id->idCodeSize() == 0) + { + // We're not going to generate any instruction, so it doesn't count for PerfScore. + result.insThroughput = PERFSCORE_THROUGHPUT_ZERO; + result.insLatency = PERFSCORE_LATENCY_ZERO; + break; + } +#endif + FALLTHROUGH; + case INS_nop: case INS_int3: assert(memFmt == IF_NONE); diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 8260445686be0..f952a6f649f44 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -83,7 +83,15 @@ code_t insEncodeOpreg(instruction ins, regNumber reg, emitAttr size); unsigned insSSval(unsigned scale); -bool IsAVXInstruction(instruction ins); +static bool IsSSEInstruction(instruction ins); +static bool IsSSEOrAVXInstruction(instruction ins); +static bool IsAVXOnlyInstruction(instruction ins); +static bool IsFMAInstruction(instruction ins); +static bool IsAVXVNNIInstruction(instruction ins); +static bool IsBMIInstruction(instruction ins); +static regNumber getBmiRegNumber(instruction ins); +static regNumber getSseShiftRegNumber(instruction ins); +bool IsAVXInstruction(instruction ins) const; code_t insEncodeMIreg(instruction ins, regNumber reg, emitAttr size, code_t code); code_t AddRexWPrefix(instruction ins, code_t code); @@ -98,6 +106,9 @@ static bool IsMovInstruction(instruction ins); bool IsRedundantMov( instruction ins, insFormat fmt, emitAttr size, regNumber dst, regNumber src, bool canIgnoreSideEffects); +static bool IsJccInstruction(instruction ins); +static bool IsJmpInstruction(instruction ins); + bool AreUpper32BitsZero(regNumber reg); bool AreFlagsSetToZeroCmp(regNumber reg, emitAttr opSize, genTreeOps treeOps); @@ -116,7 +127,8 @@ bool hasRexPrefix(code_t code) #define VEX_PREFIX_MASK_3BYTE 0xFF000000000000ULL #define VEX_PREFIX_CODE_3BYTE 0xC4000000000000ULL -bool TakesVexPrefix(instruction ins); +bool TakesVexPrefix(instruction ins) const; +static bool TakesRexWPrefix(instruction ins, emitAttr attr); // Returns true if the instruction encoding already contains VEX prefix bool hasVexPrefix(code_t code) @@ -142,7 +154,7 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr } bool useVEXEncodings; -bool UseVEXEncoding() +bool UseVEXEncoding() const { return useVEXEncodings; } diff --git 
a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 3b1b1739919d5..92a82c346a545 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -1620,6 +1620,9 @@ void Compiler::fgCompactBlocks(BasicBlock* block, BasicBlock* bNext) } } bNext->bbPreds = nullptr; + + // `block` can no longer be a loop pre-header (if it was before). + block->bbFlags &= ~BBF_LOOP_PREHEADER; } else { diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index b56372d18d28f..ae8012322ddef 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -10291,7 +10291,7 @@ void Compiler::gtDispNode(GenTree* tree, IndentStack* indentStack, __in __in_z _ { if (IS_CSE_INDEX(tree->gtCSEnum)) { - printf("CSE #%02d (%s)", GET_CSE_INDEX(tree->gtCSEnum), (IS_CSE_USE(tree->gtCSEnum) ? "use" : "def")); + printf(FMT_CSE " (%s)", GET_CSE_INDEX(tree->gtCSEnum), (IS_CSE_USE(tree->gtCSEnum) ? "use" : "def")); } else { diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index b043956468848..aed7bef018fbb 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -494,6 +494,7 @@ enum GenTreeFlags : unsigned int GTF_INX_RNGCHK = 0x80000000, // GT_INDEX/GT_INDEX_ADDR -- the array reference should be range-checked. GTF_INX_STRING_LAYOUT = 0x40000000, // GT_INDEX -- this uses the special string array layout + GTF_INX_NOFAULT = 0x20000000, // GT_INDEX -- the INDEX does not throw an exception (morph to GTF_IND_NONFAULTING) GTF_IND_TGT_NOT_HEAP = 0x80000000, // GT_IND -- the target is not on the heap GTF_IND_VOLATILE = 0x40000000, // GT_IND -- the load or store must use volatile sematics (this is a nop on X86) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 9ce10458fb08e..752626f20184d 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -24,11 +24,12 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /*****************************************************************************/ #ifdef DEBUG -/***************************************************************************** - * - * Returns the string representation of the given CPU instruction. - */ - +//----------------------------------------------------------------------------- +// genInsName: Returns the string representation of the given CPU instruction, as +// it exists in the instruction table. Note that some architectures don't encode the +// name completely in the table: xarch sometimes prepends a "v", and arm sometimes +// appends a "s". Use `genInsDisplayName()` to get a fully-formed name. +// const char* CodeGen::genInsName(instruction ins) { // clang-format off @@ -77,36 +78,36 @@ const char* CodeGen::genInsName(instruction ins) return insNames[ins]; } -void __cdecl CodeGen::instDisp(instruction ins, bool noNL, const char* fmt, ...) +//----------------------------------------------------------------------------- +// genInsDisplayName: Get a fully-formed instruction display name. This only handles +// the xarch case of prepending a "v", not the arm case of appending an "s". +// This can be called up to four times in a single 'printf' before the static buffers +// get reused. 
+// +// Returns: +// String with instruction name +// +const char* CodeGen::genInsDisplayName(emitter::instrDesc* id) { - if (compiler->opts.dspCode) - { - /* Display the instruction offset within the emit block */ - - // printf("[%08X:%04X]", GetEmitter().emitCodeCurBlock(), GetEmitter().emitCodeOffsInBlock()); - - /* Display the FP stack depth (before the instruction is executed) */ - - // printf("[FP=%02u] ", genGetFPstkLevel()); + instruction ins = id->idIns(); + const char* insName = genInsName(ins); - /* Display the instruction mnemonic */ - printf(" "); - - printf(" %-8s", genInsName(ins)); - - if (fmt) - { - va_list args; - va_start(args, fmt); - vprintf(fmt, args); - va_end(args); - } +#ifdef TARGET_XARCH + const int TEMP_BUFFER_LEN = 40; + static unsigned curBuf = 0; + static char buf[4][TEMP_BUFFER_LEN]; + const char* retbuf; - if (!noNL) - { - printf("\n"); - } + if (GetEmitter()->IsAVXInstruction(ins) && !GetEmitter()->IsBMIInstruction(ins)) + { + sprintf_s(buf[curBuf], TEMP_BUFFER_LEN, "v%s", insName); + retbuf = buf[curBuf]; + curBuf = (curBuf + 1) % 4; + return retbuf; } +#endif // TARGET_XARCH + + return insName; } /*****************************************************************************/ diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index cc10adc0b8809..4c573b7575257 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -68,6 +68,9 @@ CONFIG_INTEGER(JitAlignLoopAdaptive, W("JitAlignLoopAdaptive"), 1) // If set, perform adaptive loop alignment that limits number of padding based on loop size. +// Print the alignment boundaries in disassembly. +CONFIG_INTEGER(JitDasmWithAlignmentBoundaries, W("JitDasmWithAlignmentBoundaries"), 0) + CONFIG_INTEGER(JitDirectAlloc, W("JitDirectAlloc"), 0) CONFIG_INTEGER(JitDoubleAlign, W("JitDoubleAlign"), 1) CONFIG_INTEGER(JitDumpASCII, W("JitDumpASCII"), 1) // Uses only ASCII characters in tree dumps diff --git a/src/coreclr/jit/jitstd/algorithm.h b/src/coreclr/jit/jitstd/algorithm.h index 000639a5a1de8..9fa6fbb94dd54 100644 --- a/src/coreclr/jit/jitstd/algorithm.h +++ b/src/coreclr/jit/jitstd/algorithm.h @@ -102,9 +102,9 @@ void quick_sort(RandomAccessIterator first, RandomAccessIterator last, Less less // // It's not possible for newFirst to go past the end of the sort range: // - If newFirst reaches the pivot before newLast then the pivot is - // swapped to the right and we'll stop again when we reach it. + // swapped to the right and we'll stop again when we reach it. // - If newLast reaches the pivot before newFirst then the pivot is - // swapped to the left and the value at newFirst will take its place + // swapped to the left and the value at newFirst will take its place // to the right so less(newFirst, pivot) will again be false when the // old pivot's position is reached. do diff --git a/src/coreclr/jit/loopcloning.cpp b/src/coreclr/jit/loopcloning.cpp index 0fe22420f783e..c3991e663e1d9 100644 --- a/src/coreclr/jit/loopcloning.cpp +++ b/src/coreclr/jit/loopcloning.cpp @@ -12,6 +12,40 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "jitpch.h" +#ifdef DEBUG + +//-------------------------------------------------------------------------------------------------- +// ArrIndex::Print - debug print an ArrIndex struct in form: `V01[V02][V03]`. +// +// Arguments: +// dim (Optional) Print up to but not including this dimension. Default: print all dimensions. 
+//
+void ArrIndex::Print(unsigned dim /* = -1 */)
+{
+    printf("V%02d", arrLcl);
+    for (unsigned i = 0; i < ((dim == (unsigned)-1) ? rank : dim); ++i)
+    {
+        printf("[V%02d]", indLcls.Get(i));
+    }
+}
+
+//--------------------------------------------------------------------------------------------------
+// ArrIndex::PrintBoundsCheckNodes - debug print an ArrIndex struct's bounds check node tree ids in
+// the form: `[000125][000113]`.
+//
+// Arguments:
+//      dim     (Optional) Print up to but not including this dimension. Default: print all dimensions.
+//
+void ArrIndex::PrintBoundsCheckNodes(unsigned dim /* = -1 */)
+{
+    for (unsigned i = 0; i < ((dim == (unsigned)-1) ? rank : dim); ++i)
+    {
+        Compiler::printTreeID(bndsChks.Get(i));
+    }
+}
+
+#endif // DEBUG
+
 //--------------------------------------------------------------------------------------------------
 // ToGenTree - Convert an arrLen operation into a gentree node.
 //
@@ -39,11 +73,26 @@ GenTree* LC_Array::ToGenTree(Compiler* comp, BasicBlock* bb)
         {
             arr = comp->gtNewIndexRef(TYP_REF, arr, comp->gtNewLclvNode(arrIndex->indLcls[i],
                                                                         comp->lvaTable[arrIndex->indLcls[i]].lvType));
+
+            // Clear the range check flag and mark the index as non-faulting: we guarantee that all necessary range
+            // checking has already been done by the time this array index expression is invoked.
+            arr->gtFlags &= ~(GTF_INX_RNGCHK | GTF_EXCEPT);
+            arr->gtFlags |= GTF_INX_NOFAULT;
         }
         // If asked for arrlen invoke arr length operator.
         if (oper == ArrLen)
         {
             GenTree* arrLen = comp->gtNewArrLen(TYP_INT, arr, OFFSETOF__CORINFO_Array__length, bb);
+
+            // We already guaranteed (by a sequence of preceding checks) that the array length operator will not
+            // throw an exception because we null checked the base array.
+            // So, we should be able to do the following:
+            //     arrLen->gtFlags &= ~GTF_EXCEPT;
+            //     arrLen->gtFlags |= GTF_IND_NONFAULTING;
+            // However, we then end up with a mix of non-faulting array length operators as well as normal faulting
+            // array length operators in the slow-path of the cloned loops. CSE doesn't keep these separate, so bails
+            // out on creating CSEs on this very useful type of CSE, leading to CQ losses in the cloned loop fast path.
+            // TODO-CQ: fix this.
             return arrLen;
         }
         else
@@ -395,26 +444,30 @@ void LoopCloneContext::PrintBlockConditions(unsigned loopNum)
 {
     printf("Block conditions:\n");

-    JitExpandArrayStack<JitExpandArrayStack<LC_Condition>*>* levelCond = blockConditions[loopNum];
-    if (levelCond == nullptr || levelCond->Size() == 0)
+    JitExpandArrayStack<JitExpandArrayStack<LC_Condition>*>* blockConds = blockConditions[loopNum];
+    if (blockConds == nullptr || blockConds->Size() == 0)
     {
         printf("No block conditions\n");
         return;
     }

-    for (unsigned i = 0; i < levelCond->Size(); ++i)
+    for (unsigned i = 0; i < blockConds->Size(); ++i)
+    {
+        PrintBlockLevelConditions(i, (*blockConds)[i]);
+    }
+}
+void LoopCloneContext::PrintBlockLevelConditions(unsigned level, JitExpandArrayStack<LC_Condition>* levelCond)
+{
+    printf("%d = ", level);
+    for (unsigned j = 0; j < levelCond->Size(); ++j)
     {
-        printf("%d = {", i);
-        for (unsigned j = 0; j < ((*levelCond)[i])->Size(); ++j)
+        if (j != 0)
         {
-            if (j != 0)
-            {
-                printf(" & ");
-            }
-            (*((*levelCond)[i]))[j].Print();
+            printf(" & ");
         }
-        printf("}\n");
+        (*levelCond)[j].Print();
     }
+    printf("\n");
 }

 #endif

@@ -650,19 +703,19 @@ void LoopCloneContext::PrintConditions(unsigned loopNum)
 {
     if (conditions[loopNum] == nullptr)
     {
-        JITDUMP("NO conditions");
+        printf("NO conditions");
         return;
     }
     if (conditions[loopNum]->Size() == 0)
     {
-        JITDUMP("Conditions were optimized away! Will always take cloned path.");
+        printf("Conditions were optimized away! Will always take cloned path.");
         return;
     }
     for (unsigned i = 0; i < conditions[loopNum]->Size(); ++i)
     {
         if (i != 0)
         {
-            JITDUMP(" & ");
+            printf(" & ");
         }
         (*conditions[loopNum])[i].Print();
     }
@@ -711,6 +764,9 @@ void LoopCloneContext::CondToStmtInBlock(Compiler* comp
     comp->fgInsertStmtAtEnd(block, stmt);

     // Remorph.
+    JITDUMP("Loop cloning condition tree before morphing:\n");
+    DBEXEC(comp->verbose, comp->gtDispTree(jmpTrueTree));
+    JITDUMP("\n");
     comp->fgMorphBlockStmt(block, stmt DEBUGARG("Loop cloning condition"));
 }

@@ -965,16 +1021,15 @@ bool Compiler::optDeriveLoopCloningConditions(unsigned loopNum, LoopCloneContext
     for (unsigned i = 0; i < optInfos->Size(); ++i)
     {
-        LcOptInfo* optInfo = optInfos->GetRef(i);
+        LcOptInfo* optInfo = optInfos->Get(i);
         switch (optInfo->GetOptType())
         {
             case LcOptInfo::LcJaggedArray:
             {
                 // limit <= arrLen
                 LcJaggedArrayOptInfo* arrIndexInfo = optInfo->AsLcJaggedArrayOptInfo();
-                LC_Array arrLen(LC_Array::Jagged, &arrIndexInfo->arrIndex, arrIndexInfo->dim, LC_Array::ArrLen);
-                LC_Ident arrLenIdent = LC_Ident(arrLen);
-
+                LC_Array arrLen(LC_Array::Jagged, &arrIndexInfo->arrIndex, arrIndexInfo->dim, LC_Array::ArrLen);
+                LC_Ident arrLenIdent = LC_Ident(arrLen);
                 LC_Condition cond(GT_LE, LC_Expr(ident), LC_Expr(arrLenIdent));
                 context->EnsureConditions(loopNum)->Push(cond);
@@ -1000,9 +1055,9 @@ bool Compiler::optDeriveLoopCloningConditions(unsigned loopNum, LoopCloneContext
                 return false;
             }
         }
-        JITDUMP("Conditions: (");
+        JITDUMP("Conditions: ");
         DBEXEC(verbose, context->PrintConditions(loopNum));
-        JITDUMP(")\n");
+        JITDUMP("\n");
         return true;
     }
     return false;
@@ -1164,10 +1219,6 @@ bool Compiler::optComputeDerefConditions(unsigned loopNum, LoopCloneContext* con
         printf("Deref condition tree:\n");
         for (unsigned i = 0; i < nodes.Size(); ++i)
        {
-            if (i != 0)
-            {
-                printf(",");
-            }
             nodes[i]->Print();
             printf("\n");
         }
@@ -1176,6 +1227,7 @@
     if (maxRank == -1)
     {
+        JITDUMP("> maxRank undefined\n");
         return false;
     }
@@ -1184,12 +1236,13 @@
     // So add 1 after rank * 2.
     unsigned condBlocks = (unsigned)maxRank * 2 + 1;

-    // Heuristic to not create too many blocks.
-    // REVIEW: due to the definition of `condBlocks`, above, the effective max is 3 blocks, meaning
-    // `maxRank` of 1. Question: should the heuristic allow more blocks to be created in some situations?
-    // REVIEW: make this based on a COMPlus configuration?
-    if (condBlocks > 4)
+    // Heuristic to not create too many blocks. Defining as 3 allows, effectively, loop cloning on
+    // doubly-nested loops.
+    // REVIEW: make this based on a COMPlus configuration, at least for debug?
+    const unsigned maxAllowedCondBlocks = 3;
+    if (condBlocks > maxAllowedCondBlocks)
     {
+        JITDUMP("> Too many condition blocks (%u > %u)\n", condBlocks, maxAllowedCondBlocks);
         return false;
     }
@@ -1254,14 +1307,60 @@ void Compiler::optPerformStaticOptimizations(unsigned loopNum, LoopCloneContext*
     JitExpandArrayStack<LcOptInfo*>* optInfos = context->GetLoopOptInfo(loopNum);
     for (unsigned i = 0; i < optInfos->Size(); ++i)
     {
-        LcOptInfo* optInfo = optInfos->GetRef(i);
+        LcOptInfo* optInfo = optInfos->Get(i);
         switch (optInfo->GetOptType())
         {
             case LcOptInfo::LcJaggedArray:
             {
                 LcJaggedArrayOptInfo* arrIndexInfo = optInfo->AsLcJaggedArrayOptInfo();
                 compCurBB                          = arrIndexInfo->arrIndex.useBlock;
-                optRemoveCommaBasedRangeCheck(arrIndexInfo->arrIndex.bndsChks[arrIndexInfo->dim], arrIndexInfo->stmt);
+
+                // Remove all bounds checks for this array up to (and including) `arrIndexInfo->dim`. So, if that is 1,
+                // remove rank 0 and 1 bounds checks.
+
+                for (unsigned dim = 0; dim <= arrIndexInfo->dim; dim++)
+                {
+                    GenTree* bndsChkNode = arrIndexInfo->arrIndex.bndsChks[dim];
+
+#ifdef DEBUG
+                    if (verbose)
+                    {
+                        printf("Remove bounds check ");
+                        printTreeID(bndsChkNode->gtGetOp1());
+                        printf(" for " FMT_STMT ", dim %d, ", arrIndexInfo->stmt->GetID(), dim);
+                        arrIndexInfo->arrIndex.Print();
+                        printf(", bounds check nodes: ");
+                        arrIndexInfo->arrIndex.PrintBoundsCheckNodes();
+                        printf("\n");
+                    }
+#endif // DEBUG
+
+                    if (bndsChkNode->gtGetOp1()->OperIsBoundsCheck())
+                    {
+                        // This COMMA node will only represent a bounds check if we haven't already removed this
+                        // bounds check in some other nesting cloned loop. For example, consider:
+                        //   for (i = 0; i < x; i++)
+                        //       for (j = 0; j < y; j++)
+                        //           a[i][j] = i + j;
+                        // If the outer loop is cloned first, it will remove the a[i] bounds check from the optimized
+                        // path. Later, when the inner loop is cloned, we want to remove the a[i][j] bounds check. If
+                        // we clone the inner loop, we know that the a[i] bounds check isn't required because we'll add
+                        // it to the loop cloning conditions. On the other hand, we can clone a loop where we get rid of
+                        // the nested bounds check but nobody has gotten rid of the outer bounds check. As before, we
+                        // know the outer bounds check is not needed because it's been added to the cloning conditions,
+                        // so we can get rid of the bounds check here.
+                        //
+                        optRemoveCommaBasedRangeCheck(bndsChkNode, arrIndexInfo->stmt);
+                    }
+                    else
+                    {
+                        JITDUMP("  Bounds check already removed\n");
+
+                        // If the bounds check node isn't there, it better have been converted to a GT_NOP.
+                        assert(bndsChkNode->gtGetOp1()->OperIs(GT_NOP));
+                    }
+                }
+
                 DBEXEC(dynamicPath, optDebugLogLoopCloning(arrIndexInfo->arrIndex.useBlock, arrIndexInfo->stmt));
             }
             break;
@@ -1474,8 +1573,9 @@ BasicBlock* Compiler::optInsertLoopChoiceConditions(LoopCloneContext* context,
                                                     BasicBlock*       head,
                                                     BasicBlock*       slowHead)
 {
-    JITDUMP("Inserting loop cloning conditions\n");
+    JITDUMP("Inserting loop " FMT_LP " loop choice conditions\n", loopNum);
     assert(context->HasBlockConditions(loopNum));
+    assert(head->bbJumpKind == BBJ_COND);

     BasicBlock* curCond = head;
     JitExpandArrayStack<JitExpandArrayStack<LC_Condition>*>* levelCond = context->GetBlockConditions(loopNum);
@@ -1484,10 +1584,15 @@ BasicBlock* Compiler::optInsertLoopChoiceConditions(LoopCloneContext* context,
         bool isHeaderBlock = (curCond == head);

         // Flip the condition if header block.
+ JITDUMP("Adding loop " FMT_LP " level %u block conditions to " FMT_BB "\n ", loopNum, i, curCond->bbNum); + DBEXEC(verbose, context->PrintBlockLevelConditions(i, (*levelCond)[i])); context->CondToStmtInBlock(this, *((*levelCond)[i]), curCond, /*reverse*/ isHeaderBlock); // Create each condition block ensuring wiring between them. - BasicBlock* tmp = fgNewBBafter(BBJ_COND, isHeaderBlock ? slowHead : curCond, /*extendRegion*/ true); + BasicBlock* tmp = fgNewBBafter(BBJ_COND, isHeaderBlock ? slowHead : curCond, /*extendRegion*/ true); + tmp->inheritWeight(head); + tmp->bbNatLoopNum = head->bbNatLoopNum; + curCond->bbJumpDest = isHeaderBlock ? tmp : slowHead; JITDUMP("Adding " FMT_BB " -> " FMT_BB "\n", curCond->bbNum, curCond->bbJumpDest->bbNum); @@ -1500,13 +1605,13 @@ BasicBlock* Compiler::optInsertLoopChoiceConditions(LoopCloneContext* context, } curCond = tmp; - - curCond->inheritWeight(head); - curCond->bbNatLoopNum = head->bbNatLoopNum; - JITDUMP("Created new " FMT_BB " for new level %u\n", curCond->bbNum, i); } // Finally insert cloning conditions after all deref conditions have been inserted. + JITDUMP("Adding loop " FMT_LP " cloning conditions to " FMT_BB "\n", loopNum, curCond->bbNum); + JITDUMP(" "); + DBEXEC(verbose, context->PrintConditions(loopNum)); + JITDUMP("\n"); context->CondToStmtInBlock(this, *(context->GetConditions(loopNum)), curCond, /*reverse*/ false); return curCond; } @@ -2222,10 +2327,12 @@ Compiler::fgWalkResult Compiler::optCanOptimizeByLoopCloning(GenTree* tree, Loop #ifdef DEBUG if (verbose) { - printf("Found ArrIndex at tree "); + printf("Found ArrIndex at " FMT_BB " " FMT_STMT " tree ", arrIndex.useBlock->bbNum, info->stmt->GetID()); printTreeID(tree); printf(" which is equivalent to: "); arrIndex.Print(); + printf(", bounds check nodes: "); + arrIndex.PrintBoundsCheckNodes(); printf("\n"); } #endif @@ -2233,6 +2340,7 @@ Compiler::fgWalkResult Compiler::optCanOptimizeByLoopCloning(GenTree* tree, Loop // Check that the array object local variable is invariant within the loop body. if (!optIsStackLocalInvariant(info->loopNum, arrIndex.arrLcl)) { + JITDUMP("V%02d is not loop invariant\n", arrIndex.arrLcl); return WALK_SKIP_SUBTREES; } diff --git a/src/coreclr/jit/loopcloning.h b/src/coreclr/jit/loopcloning.h index 6cc921c520db1..c5ed53c31bad8 100644 --- a/src/coreclr/jit/loopcloning.h +++ b/src/coreclr/jit/loopcloning.h @@ -220,14 +220,8 @@ struct ArrIndex } #ifdef DEBUG - void Print(unsigned dim = -1) - { - printf("V%02d", arrLcl); - for (unsigned i = 0; i < ((dim == (unsigned)-1) ? rank : dim); ++i) - { - printf("[V%02d]", indLcls.GetRef(i)); - } - } + void Print(unsigned dim = -1); + void PrintBoundsCheckNodes(unsigned dim = -1); #endif }; @@ -260,9 +254,8 @@ struct LcOptInfo #include "loopcloningopts.h" }; - void* optInfo; OptType optType; - LcOptInfo(void* optInfo, OptType optType) : optInfo(optInfo), optType(optType) + LcOptInfo(OptType optType) : optType(optType) { } @@ -270,6 +263,7 @@ struct LcOptInfo { return optType; } + #define LC_OPT(en) \ en##OptInfo* As##en##OptInfo() \ { \ @@ -292,7 +286,7 @@ struct LcMdArrayOptInfo : public LcOptInfo ArrIndex* index; // "index" cached computation in the form of an ArrIndex representation. 
LcMdArrayOptInfo(GenTreeArrElem* arrElem, unsigned dim) - : LcOptInfo(this, LcMdArray), arrElem(arrElem), dim(dim), index(nullptr) + : LcOptInfo(LcMdArray), arrElem(arrElem), dim(dim), index(nullptr) { } @@ -325,7 +319,7 @@ struct LcJaggedArrayOptInfo : public LcOptInfo Statement* stmt; // "stmt" where the optimization opportunity occurs. LcJaggedArrayOptInfo(ArrIndex& arrIndex, unsigned dim, Statement* stmt) - : LcOptInfo(this, LcJaggedArray), dim(dim), arrIndex(arrIndex), stmt(stmt) + : LcOptInfo(LcJaggedArray), dim(dim), arrIndex(arrIndex), stmt(stmt) { } }; @@ -678,30 +672,24 @@ struct LoopCloneContext CompAllocator alloc; // The allocator // The array of optimization opportunities found in each loop. (loop x optimization-opportunities) - JitExpandArrayStack** optInfo; + jitstd::vector*> optInfo; // The array of conditions that influence which path to take for each loop. (loop x cloning-conditions) - JitExpandArrayStack** conditions; + jitstd::vector*> conditions; // The array of dereference conditions found in each loop. (loop x deref-conditions) - JitExpandArrayStack** derefs; + jitstd::vector*> derefs; // The array of block levels of conditions for each loop. (loop x level x conditions) - JitExpandArrayStack*>** blockConditions; + jitstd::vector*>*> blockConditions; - LoopCloneContext(unsigned loopCount, CompAllocator alloc) : alloc(alloc) + LoopCloneContext(unsigned loopCount, CompAllocator alloc) + : alloc(alloc), optInfo(alloc), conditions(alloc), derefs(alloc), blockConditions(alloc) { - optInfo = new (alloc) JitExpandArrayStack*[loopCount]; - conditions = new (alloc) JitExpandArrayStack*[loopCount]; - derefs = new (alloc) JitExpandArrayStack*[loopCount]; - blockConditions = new (alloc) JitExpandArrayStack*>*[loopCount]; - for (unsigned i = 0; i < loopCount; ++i) - { - optInfo[i] = nullptr; - conditions[i] = nullptr; - derefs[i] = nullptr; - blockConditions[i] = nullptr; - } + optInfo.resize(loopCount, nullptr); + conditions.resize(loopCount, nullptr); + derefs.resize(loopCount, nullptr); + blockConditions.resize(loopCount, nullptr); } // Evaluate conditions into a JTRUE stmt and put it in the block. Reverse condition if 'reverse' is true. @@ -739,6 +727,7 @@ struct LoopCloneContext #ifdef DEBUG // Print the block conditions for the loop. void PrintBlockConditions(unsigned loopNum); + void PrintBlockLevelConditions(unsigned level, JitExpandArrayStack* levelCond); #endif // Does the loop have block conditions? 
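An aside on the container swap in LoopCloneContext above: one resize(count, value) call replaces the manual new[] plus nullptr-initialization loop of the old constructor. A std::vector analogue (jitstd::vector is assumed to mirror this slice of the std::vector interface, as its use above suggests):

```cpp
#include <cassert>
#include <vector>

struct OptInfoStack; // stand-in for JitExpandArrayStack<LcOptInfo*>

int main()
{
    const unsigned loopCount = 4;

    // One call does the allocation and the per-slot nullptr initialization.
    std::vector<OptInfoStack*> optInfo;
    optInfo.resize(loopCount, nullptr);

    assert(optInfo.size() == loopCount);
    assert(optInfo[2] == nullptr); // every loop starts with "no opt info recorded"
    return 0;
}
```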
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 2b40d8548c68f..915aee27f1f9e 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -5577,8 +5577,9 @@ GenTree* Compiler::fgMorphArrayIndex(GenTree* tree) GenTree* arrRef = asIndex->Arr(); GenTree* index = asIndex->Index(); - bool chkd = ((tree->gtFlags & GTF_INX_RNGCHK) != 0); // if false, range checking will be disabled - bool nCSE = ((tree->gtFlags & GTF_DONT_CSE) != 0); + bool chkd = ((tree->gtFlags & GTF_INX_RNGCHK) != 0); // if false, range checking will be disabled + bool indexNonFaulting = ((tree->gtFlags & GTF_INX_NOFAULT) != 0); // if true, mark GTF_IND_NONFAULTING + bool nCSE = ((tree->gtFlags & GTF_DONT_CSE) != 0); GenTree* arrRefDefn = nullptr; // non-NULL if we need to allocate a temp for the arrRef expression GenTree* indexDefn = nullptr; // non-NULL if we need to allocate a temp for the index expression @@ -5738,9 +5739,9 @@ GenTree* Compiler::fgMorphArrayIndex(GenTree* tree) this->compFloatingPointUsed = true; } - // We've now consumed the GTF_INX_RNGCHK, and the node + // We've now consumed the GTF_INX_RNGCHK and GTF_INX_NOFAULT, and the node // is no longer a GT_INDEX node. - tree->gtFlags &= ~GTF_INX_RNGCHK; + tree->gtFlags &= ~(GTF_INX_RNGCHK | GTF_INX_NOFAULT); tree->AsOp()->gtOp1 = addr; @@ -5748,7 +5749,7 @@ GenTree* Compiler::fgMorphArrayIndex(GenTree* tree) tree->gtFlags |= GTF_IND_ARR_INDEX; // If there's a bounds check, the indir won't fault. - if (bndsChk) + if (bndsChk || indexNonFaulting) { tree->gtFlags |= GTF_IND_NONFAULTING; } diff --git a/src/coreclr/jit/optcse.cpp b/src/coreclr/jit/optcse.cpp index b4137ec52ebbf..587349e489b55 100644 --- a/src/coreclr/jit/optcse.cpp +++ b/src/coreclr/jit/optcse.cpp @@ -166,7 +166,8 @@ Compiler::fgWalkResult Compiler::optCSE_MaskHelper(GenTree** pTree, fgWalkData* if (IS_CSE_INDEX(tree->gtCSEnum)) { unsigned cseIndex = GET_CSE_INDEX(tree->gtCSEnum); - unsigned cseBit = genCSEnum2bit(cseIndex); + // Note that we DO NOT use getCSEAvailBit() here for the CSE_defMask/CSE_useMask + unsigned cseBit = genCSEnum2bit(cseIndex); if (IS_CSE_DEF(tree->gtCSEnum)) { BitVecOps::AddElemD(comp->cseMaskTraits, pUserData->CSE_defMask, cseBit); @@ -424,16 +425,16 @@ unsigned Compiler::optValnumCSE_Index(GenTree* tree, Statement* stmt) ValueNum vnLibNorm = vnStore->VNNormalValue(vnLib); // We use the normal value number because we want the CSE candidate to - // represent all expressions that produce the same normal value number + // represent all expressions that produce the same normal value number. // We will handle the case where we have different exception sets when // promoting the candidates. // // We do this because a GT_IND will usually have a NullPtrExc entry in its // exc set, but we may have cleared the GTF_EXCEPT flag and if so, it won't - // have an NullPtrExc, or we may have assigned the value of an GT_IND + // have a NullPtrExc, or we may have assigned the value of a GT_IND // into a LCL_VAR and then read it back later. // - // When we are promoting the CSE candidates we insure that any CSE + // When we are promoting the CSE candidates we ensure that any CSE // uses that we promote have an exc set that is the same as the CSE defs // or have an empty set. And that all of the CSE defs produced the required // set of exceptions for the CSE uses.
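An aside on the normal-value/exception-set split described in the comment above: two expressions can be the same CSE candidate even when their exception sets differ. A toy model of that idea (invented types, far simpler than the real ValueNumStore):

```cpp
#include <cstdio>
#include <set>
#include <string>

// Toy model: a value number pairs a "normal" value with an exception set.
// CSE candidacy keys on the normal part only.
struct VNPair
{
    unsigned              normal;
    std::set<std::string> excSet;
};

int main()
{
    // t = p.a; e1 = (t + q.b) -- one possible null-pointer exception (on q)
    VNPair e1{0x2a, {"NullPtrExc(q)"}};
    // e2 = (p.a + q.b) -- two possible null-pointer exceptions (on p and q)
    VNPair e2{0x2a, {"NullPtrExc(p)", "NullPtrExc(q)"}};

    // Same normal value number => same CSE candidate; the exc-set mismatch is
    // reconciled later, when candidates are promoted.
    printf("same CSE candidate: %s\n", (e1.normal == e2.normal) ? "yes" : "no");
    return 0;
}
```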
@@ -502,7 +503,7 @@ unsigned Compiler::optValnumCSE_Index(GenTree* tree, Statement* stmt) key = vnLibNorm; } - // Make sure that the result of Is_Shared_Const_CSE(key) matches isSharedConst + // Make sure that the result of Is_Shared_Const_CSE(key) matches isSharedConst. // Note that when isSharedConst is true then we require that the TARGET_SIGN_BIT is set in the key // and otherwise we require that we never create a ValueNumber with the TARGET_SIGN_BIT set. // @@ -709,7 +710,6 @@ unsigned Compiler::optValnumCSE_Index(GenTree* tree, Statement* stmt) C_ASSERT((signed char)MAX_CSE_CNT == MAX_CSE_CNT); unsigned CSEindex = ++optCSECandidateCount; - // EXPSET_TP CSEmask = genCSEnum2bit(CSEindex); /* Record the new CSE index in the hashDsc */ hashDsc->csdIndex = CSEindex; @@ -746,16 +746,14 @@ unsigned Compiler::optValnumCSE_Index(GenTree* tree, Statement* stmt) } } -/***************************************************************************** - * - * Locate CSE candidates and assign indices to them - * return 0 if no CSE candidates were found - */ - -unsigned Compiler::optValnumCSE_Locate() +//------------------------------------------------------------------------ +// optValnumCSE_Locate: Locate CSE candidates and assign them indices. +// +// Returns: +// true if there are any CSE candidates, false otherwise +// +bool Compiler::optValnumCSE_Locate() { - // Locate CSE candidates and assign them indices - bool enableConstCSE = true; int configValue = JitConfig.JitConstCSE(); @@ -871,14 +869,14 @@ unsigned Compiler::optValnumCSE_Locate() if (!optDoCSE) { - return 0; + return false; } /* We're finished building the expression lookup table */ optCSEstop(); - return 1; + return true; } //------------------------------------------------------------------------ @@ -890,7 +888,7 @@ unsigned Compiler::optValnumCSE_Locate() // // Arguments: // compare - The compare node to check - +// void Compiler::optCseUpdateCheckedBoundMap(GenTree* compare) { assert(compare->OperIsCompare()); @@ -999,9 +997,9 @@ void Compiler::optValnumCSE_InitDataFlow() // Init traits and cseCallKillsMask bitvectors. 
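Before the initialization code that follows, a standalone model of the two-bits-per-candidate layout it relies on. The helper bodies here are assumptions inferred from how getCSEAvailBit and getCSEAvailCrossCallBit are used in this patch, not copied from compiler.h:

```cpp
#include <cstdio>

// Assumed definitions, inferred from their uses in this patch: candidate n
// owns bit 2*(n-1) ("available") and bit 2*(n-1)+1 ("available across calls").
unsigned getCSEAvailBit(unsigned cseIndex)          { return (cseIndex - 1) * 2; }
unsigned getCSEAvailCrossCallBit(unsigned cseIndex) { return getCSEAvailBit(cseIndex) + 1; }

int main()
{
    const unsigned candidateCount = 3;

    // The call-kills mask keeps every avail bit and clears every cross-call
    // bit, producing the alternating bit pattern the comments mention.
    unsigned long long callKillsMask = 0;
    for (unsigned inx = 1; inx <= candidateCount; inx++)
    {
        callKillsMask |= 1ull << getCSEAvailBit(inx);
    }
    printf("callKillsMask = 0x%llx\n", callKillsMask); // 0x15 == 0b010101
    return 0;
}
```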
cseLivenessTraits = new (getAllocator(CMK_CSE)) BitVecTraits(bitCount, this); cseCallKillsMask = BitVecOps::MakeEmpty(cseLivenessTraits); - for (unsigned inx = 0; inx < optCSECandidateCount; inx++) + for (unsigned inx = 1; inx <= optCSECandidateCount; inx++) { - unsigned cseAvailBit = inx * 2; + unsigned cseAvailBit = getCSEAvailBit(inx); // a one preserves availability and a zero kills the availability // we generate this kind of bit pattern: 101010101010 @@ -1047,7 +1045,7 @@ void Compiler::optValnumCSE_InitDataFlow() block->bbCseGen = BitVecOps::MakeEmpty(cseLivenessTraits); } - // We walk the set of CSE candidates and set the bit corresponsing to the CSEindex + // We walk the set of CSE candidates and set the bit corresponding to the CSEindex // in the block's bbCseGen bitset // for (unsigned inx = 0; inx < optCSECandidateCount; inx++) @@ -1060,16 +1058,16 @@ void Compiler::optValnumCSE_InitDataFlow() while (lst != nullptr) { BasicBlock* block = lst->tslBlock; - unsigned CseAvailBit = genCSEnum2bit(CSEindex) * 2; - unsigned cseAvailCrossCallBit = CseAvailBit + 1; + unsigned cseAvailBit = getCSEAvailBit(CSEindex); + unsigned cseAvailCrossCallBit = getCSEAvailCrossCallBit(CSEindex); - // This CSE is generated in 'block', we always set the CseAvailBit + // This CSE is generated in 'block', we always set the cseAvailBit // If this block does not contain a call, we also set cseAvailCrossCallBit // // If we have a call in this block then in the loop below we walk the trees // backwards to find any CSEs that are generated after the last call in the block. // - BitVecOps::AddElemD(cseLivenessTraits, block->bbCseGen, CseAvailBit); + BitVecOps::AddElemD(cseLivenessTraits, block->bbCseGen, cseAvailBit); if ((block->bbFlags & BBF_HAS_CALL) == 0) { BitVecOps::AddElemD(cseLivenessTraits, block->bbCseGen, cseAvailCrossCallBit); @@ -1113,7 +1111,7 @@ void Compiler::optValnumCSE_InitDataFlow() if (IS_CSE_INDEX(tree->gtCSEnum)) { unsigned CSEnum = GET_CSE_INDEX(tree->gtCSEnum); - unsigned cseAvailCrossCallBit = (genCSEnum2bit(CSEnum) * 2) + 1; + unsigned cseAvailCrossCallBit = getCSEAvailCrossCallBit(CSEnum); BitVecOps::AddElemD(cseLivenessTraits, block->bbCseGen, cseAvailCrossCallBit); } if (tree->OperGet() == GT_CALL) @@ -1142,15 +1140,16 @@ void Compiler::optValnumCSE_InitDataFlow() bool headerPrinted = false; for (BasicBlock* const block : Blocks()) { - if (block->bbCseGen != nullptr) + if (!BitVecOps::IsEmpty(cseLivenessTraits, block->bbCseGen)) { if (!headerPrinted) { printf("\nBlocks that generate CSE def/uses\n"); headerPrinted = true; } - printf(FMT_BB, block->bbNum); - printf(" cseGen = %s\n", genES2str(cseLivenessTraits, block->bbCseGen)); + printf(FMT_BB " cseGen = ", block->bbNum); + optPrintCSEDataFlowSet(block->bbCseGen); + printf("\n"); } } } @@ -1184,6 +1183,7 @@ class CSE_DataFlow // BitVecOps::Assign(m_comp->cseLivenessTraits, m_preMergeOut, block->bbCseOut); +#if 0 #ifdef DEBUG if (m_comp->verbose) { @@ -1191,11 +1191,13 @@ class CSE_DataFlow printf(" :: cseOut = %s\n", genES2str(m_comp->cseLivenessTraits, block->bbCseOut)); } #endif // DEBUG +#endif // 0 } // Merge: perform the merging of each of the predecessor's liveness values (since this is a forward analysis) void Merge(BasicBlock* block, BasicBlock* predBlock, unsigned dupCount) { +#if 0 #ifdef DEBUG if (m_comp->verbose) { @@ -1204,15 +1206,18 @@ class CSE_DataFlow printf(" :: cseOut = %s\n", genES2str(m_comp->cseLivenessTraits, block->bbCseOut)); } #endif // DEBUG +#endif // 0 
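The statement that follows is the heart of Merge(): a CSE is available on entry to a block only if it is available along every predecessor edge. A minimal model of what the intersection computes, with uint64_t standing in for BitVec and invented values:

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    // Two candidates, two bits each (avail, cross-call), as modeled earlier.
    uint64_t predOut1 = 0b1111; // CSE #01 and #02 fully available, incl. across calls
    uint64_t predOut2 = 0b0111; // CSE #01 fully available; #02 available but not across a call

    // What BitVecOps::IntersectionD computes for bbCseIn.
    uint64_t cseIn = predOut1 & predOut2;
    printf("cseIn = 0x%llx\n", (unsigned long long)cseIn); // 0x7: #02 loses its cross-call bit
    return 0;
}
```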
BitVecOps::IntersectionD(m_comp->cseLivenessTraits, block->bbCseIn, predBlock->bbCseOut); +#if 0 #ifdef DEBUG if (m_comp->verbose) { printf(" => cseIn = %s\n", genES2str(m_comp->cseLivenessTraits, block->bbCseIn)); } #endif // DEBUG +#endif // 0 } //------------------------------------------------------------------------ @@ -1272,6 +1277,7 @@ class CSE_DataFlow // bool notDone = !BitVecOps::Equal(m_comp->cseLivenessTraits, block->bbCseOut, m_preMergeOut); +#if 0 #ifdef DEBUG if (m_comp->verbose) { @@ -1288,6 +1294,7 @@ class CSE_DataFlow notDone ? "true" : "false"); } #endif // DEBUG +#endif // 0 return notDone; } @@ -1330,10 +1337,12 @@ void Compiler::optValnumCSE_DataFlow() for (BasicBlock* const block : Blocks()) { - printf(FMT_BB, block->bbNum); - printf(" cseIn = %s,", genES2str(cseLivenessTraits, block->bbCseIn)); - printf(" cseGen = %s,", genES2str(cseLivenessTraits, block->bbCseGen)); - printf(" cseOut = %s", genES2str(cseLivenessTraits, block->bbCseOut)); + printf(FMT_BB " in gen out\n", block->bbNum); + optPrintCSEDataFlowSet(block->bbCseIn); + printf("\n"); + optPrintCSEDataFlowSet(block->bbCseGen); + printf("\n"); + optPrintCSEDataFlowSet(block->bbCseOut); printf("\n"); } @@ -1347,17 +1356,17 @@ void Compiler::optValnumCSE_DataFlow() // // Using the information computed by CSE_DataFlow determine for each // CSE whether the CSE is a definition (if the CSE was not available) -// or if the CSE is a use (if the CSE was previously made available) -// The implementation iterates of all blocks setting 'available_cses' +// or if the CSE is a use (if the CSE was previously made available). +// The implementation iterates over all blocks setting 'available_cses' // to the CSEs that are available at input to the block. // When a CSE expression is encountered it is classified as either // as a definition (if the CSE is not in the 'available_cses' set) or -// as a use (if the CSE is in the 'available_cses' set). If the CSE +// as a use (if the CSE is in the 'available_cses' set). If the CSE // is a definition then it is added to the 'available_cses' set. // // This algorithm uncovers the defs and uses gradually and as it does // so it also builds the exception set that all defs make: 'defExcSetCurrent' -// and the exception set that the uses we have seen depend upon: 'defExcSetPromise' +// and the exception set that the uses we have seen depend upon: 'defExcSetPromise'. // // Typically expressions with the same normal ValueNum generate exactly the // same exception sets. There are two way that we can get different exception @@ -1371,12 +1380,11 @@ void Compiler::optValnumCSE_DataFlow() // 2. We stored an expression into a LclVar or into Memory and read it later // e.g. t = p.a; // e1 = (t + q.b) :: e1 has one NullPtrExc and e2 has two. -// e2 = (p.a + q.b) but both compute the same normal value// +// e2 = (p.a + q.b) but both compute the same normal value // e.g. m.a = p.a; // e1 = (m.a + q.b) :: e1 and e2 have different exception sets. 
// e2 = (p.a + q.b) but both compute the same normal value // -// void Compiler::optValnumCSE_Availablity() { #ifdef DEBUG @@ -1411,12 +1419,12 @@ void Compiler::optValnumCSE_Availablity() if (IS_CSE_INDEX(tree->gtCSEnum)) { unsigned CSEnum = GET_CSE_INDEX(tree->gtCSEnum); - unsigned CseAvailBit = genCSEnum2bit(CSEnum) * 2; - unsigned cseAvailCrossCallBit = CseAvailBit + 1; + unsigned cseAvailBit = getCSEAvailBit(CSEnum); + unsigned cseAvailCrossCallBit = getCSEAvailCrossCallBit(CSEnum); CSEdsc* desc = optCSEfindDsc(CSEnum); BasicBlock::weight_t stmw = block->getBBWeight(this); - isUse = BitVecOps::IsMember(cseLivenessTraits, available_cses, CseAvailBit); + isUse = BitVecOps::IsMember(cseLivenessTraits, available_cses, cseAvailBit); isDef = !isUse; // If it isn't a CSE use, it is a CSE def // Is this a "use", that we haven't yet marked as live across a call @@ -1446,7 +1454,7 @@ printf(FMT_BB " ", block->bbNum); printTreeID(tree); - printf(" %s of CSE #%02u [weight=%s]%s\n", isUse ? "Use" : "Def", CSEnum, refCntWtd2str(stmw), + printf(" %s of " FMT_CSE " [weight=%s]%s\n", isUse ? "Use" : "Def", CSEnum, refCntWtd2str(stmw), madeLiveAcrossCall ? " *** Now Live Across Call ***" : ""); } #endif // DEBUG @@ -1477,7 +1485,7 @@ // Is defExcSetCurrent still set to the uninit marker value of VNForNull() ? if (desc->defExcSetCurrent == vnStore->VNForNull()) { - // This is the first time visited, so record this defs exeception set + // This is the first time visited, so record this def's exception set desc->defExcSetCurrent = theLiberalExcSet; } @@ -1589,7 +1597,7 @@ tree->gtCSEnum = TO_CSE_DEF(tree->gtCSEnum); // This CSE becomes available after this def - BitVecOps::AddElemD(cseLivenessTraits, available_cses, CseAvailBit); + BitVecOps::AddElemD(cseLivenessTraits, available_cses, cseAvailBit); BitVecOps::AddElemD(cseLivenessTraits, available_cses, cseAvailCrossCallBit); } else // We are visiting a CSE use @@ -1636,7 +1644,7 @@ if (!vnStore->VNExcIsSubset(desc->defExcSetPromise, theLiberalExcSet)) { // We can't safely make this into a CSE use, because this - // CSE use has an exeception set item that is not promised + // CSE use has an exception set item that is not promised // by all of our CSE defs. // // We will omit this CSE use from the graph and proceed, @@ -1660,7 +1668,7 @@ // In order to determine if a CSE is live across a call, we model availability using two bits and // kill all of the cseAvailCrossCallBit for each CSE whenever we see a GT_CALL (unless the call - // generates A cse) + // generates a CSE). // if (tree->OperGet() == GT_CALL) { @@ -1690,7 +1698,7 @@ // available_cses // unsigned CSEnum = GET_CSE_INDEX(tree->gtCSEnum); - unsigned cseAvailCrossCallBit = (genCSEnum2bit(CSEnum) * 2) + 1; + unsigned cseAvailCrossCallBit = getCSEAvailCrossCallBit(CSEnum); BitVecOps::AddElemD(cseLivenessTraits, available_cses, cseAvailCrossCallBit); } @@ -2027,14 +2035,14 @@ class CSE_Heuristic if (!Compiler::Is_Shared_Const_CSE(dsc->csdHashKey)) { - printf("CSE #%02u, {$%-3x, $%-3x} useCnt=%d: [def=%3f, use=%3f, cost=%3u%s]\n :: ", + printf(FMT_CSE ", {$%-3x, $%-3x} useCnt=%d: [def=%3f, use=%3f, cost=%3u%s]\n :: ", dsc->csdIndex, dsc->csdHashKey, dsc->defExcSetPromise, dsc->csdUseCount, def, use, cost, dsc->csdLiveAcrossCall ?
", call" : " "); } else { size_t kVal = Compiler::Decode_Shared_Const_CSE_Value(dsc->csdHashKey); - printf("CSE #%02u, {K_%p} useCnt=%d: [def=%3f, use=%3f, cost=%3u%s]\n :: ", dsc->csdIndex, + printf(FMT_CSE ", {K_%p} useCnt=%d: [def=%3f, use=%3f, cost=%3u%s]\n :: ", dsc->csdIndex, dspPtr(kVal), dsc->csdUseCount, def, use, cost, dsc->csdLiveAcrossCall ? ", call" : " "); } @@ -2814,7 +2822,7 @@ class CSE_Heuristic if (dsc->csdDefCount == 1) { - JITDUMP("CSE #%02u is single-def, so associated CSE temp V%02u will be in SSA\n", dsc->csdIndex, + JITDUMP(FMT_CSE " is single-def, so associated CSE temp V%02u will be in SSA\n", dsc->csdIndex, cseLclVarNum); m_pCompiler->lvaTable[cseLclVarNum].lvInSsa = true; @@ -2931,7 +2939,7 @@ class CSE_Heuristic if (IS_CSE_INDEX(lst->tslTree->gtCSEnum)) { ValueNum currVN = m_pCompiler->vnStore->VNLiberalNormalValue(lst->tslTree->gtVNPair); - printf("0x%x(%s " FMT_VN ") ", lst->tslTree, + printf("[%06d](%s " FMT_VN ") ", m_pCompiler->dspTreeID(lst->tslTree), IS_CSE_USE(lst->tslTree->gtCSEnum) ? "use" : "def", currVN); } lst = lst->tslNext; @@ -2996,7 +3004,7 @@ class CSE_Heuristic #ifdef DEBUG if (m_pCompiler->verbose) { - printf("\nWorking on the replacement of the CSE #%02u use at ", exp->gtCSEnum); + printf("\nWorking on the replacement of the " FMT_CSE " use at ", exp->gtCSEnum); Compiler::printTreeID(exp); printf(" in " FMT_BB "\n", blk->bbNum); } @@ -3164,7 +3172,7 @@ class CSE_Heuristic #ifdef DEBUG if (m_pCompiler->verbose) { - printf("\nCSE #%02u def at ", GET_CSE_INDEX(exp->gtCSEnum)); + printf("\n" FMT_CSE " def at ", GET_CSE_INDEX(exp->gtCSEnum)); Compiler::printTreeID(exp); printf(" replaced in " FMT_BB " with def of V%02u\n", blk->bbNum, cseLclVarNum); } @@ -3314,13 +3322,13 @@ class CSE_Heuristic if (dsc->defExcSetPromise == ValueNumStore::NoVN) { - JITDUMP("Abandoned CSE #%02u because we had defs with different Exc sets\n", candidate.CseIndex()); + JITDUMP("Abandoned " FMT_CSE " because we had defs with different Exc sets\n", candidate.CseIndex()); continue; } if (dsc->csdStructHndMismatch) { - JITDUMP("Abandoned CSE #%02u because we had mismatching struct handles\n", candidate.CseIndex()); + JITDUMP("Abandoned " FMT_CSE " because we had mismatching struct handles\n", candidate.CseIndex()); continue; } @@ -3328,7 +3336,7 @@ class CSE_Heuristic if (candidate.UseCount() == 0) { - JITDUMP("Skipped CSE #%02u because use count is 0\n", candidate.CseIndex()); + JITDUMP("Skipped " FMT_CSE " because use count is 0\n", candidate.CseIndex()); continue; } @@ -3337,14 +3345,14 @@ class CSE_Heuristic { if (!Compiler::Is_Shared_Const_CSE(dsc->csdHashKey)) { - printf("\nConsidering CSE #%02u {$%-3x, $%-3x} [def=%3f, use=%3f, cost=%3u%s]\n", + printf("\nConsidering " FMT_CSE " {$%-3x, $%-3x} [def=%3f, use=%3f, cost=%3u%s]\n", candidate.CseIndex(), dsc->csdHashKey, dsc->defExcSetPromise, candidate.DefCount(), candidate.UseCount(), candidate.Cost(), dsc->csdLiveAcrossCall ? ", call" : " "); } else { size_t kVal = Compiler::Decode_Shared_Const_CSE_Value(dsc->csdHashKey); - printf("\nConsidering CSE #%02u {K_%p} [def=%3f, use=%3f, cost=%3u%s]\n", candidate.CseIndex(), + printf("\nConsidering " FMT_CSE " {K_%p} [def=%3f, use=%3f, cost=%3u%s]\n", candidate.CseIndex(), dspPtr(kVal), candidate.DefCount(), candidate.UseCount(), candidate.Cost(), dsc->csdLiveAcrossCall ? 
", call" : " "); } @@ -3428,11 +3436,6 @@ void Compiler::optValnumCSE_Heuristic() void Compiler::optOptimizeValnumCSEs() { #ifdef DEBUG - if (verbose) - { - printf("\n*************** In optOptimizeValnumCSEs()\n"); - } - if (optConfigDisableCSE()) { return; // Disabled by JitNoCSE @@ -3441,22 +3444,13 @@ void Compiler::optOptimizeValnumCSEs() optValnumCSE_phase = true; - /* Initialize the expression tracking logic */ - optValnumCSE_Init(); - /* Locate interesting expressions and assign indices to them */ - - if (optValnumCSE_Locate() > 0) + if (optValnumCSE_Locate()) { - optCSECandidateTotal += optCSECandidateCount; - optValnumCSE_InitDataFlow(); - optValnumCSE_DataFlow(); - optValnumCSE_Availablity(); - optValnumCSE_Heuristic(); } @@ -3807,21 +3801,11 @@ bool Compiler::optConfigDisableCSE2() void Compiler::optOptimizeCSEs() { -#ifdef DEBUG - if (verbose) - { - printf("\n*************** In optOptimizeCSEs()\n"); - printf("Blocks/Trees at start of optOptimizeCSE phase\n"); - fgDispBasicBlocks(true); - } -#endif // DEBUG - optCSECandidateCount = 0; optCSEstart = lvaCount; INDEBUG(optEnsureClearCSEInfo()); optOptimizeValnumCSEs(); - EndPhase(PHASE_OPTIMIZE_VALNUM_CSES); } /***************************************************************************** @@ -3873,4 +3857,38 @@ void Compiler::optEnsureClearCSEInfo() } } +//------------------------------------------------------------------------ +// optPrintCSEDataFlowSet: Print out one of the CSE dataflow sets bbCseGen, bbCseIn, bbCseOut, +// interpreting the bits in a more useful way for the dump. +// +// Arguments: +// cseDataFlowSet - One of the dataflow sets to display +// includeBits - Display the actual bits of the set as well +// +void Compiler::optPrintCSEDataFlowSet(EXPSET_VALARG_TP cseDataFlowSet, bool includeBits /* = true */) +{ + if (includeBits) + { + printf("%s ", genES2str(cseLivenessTraits, cseDataFlowSet)); + } + + bool first = true; + for (unsigned cseIndex = 1; cseIndex <= optCSECandidateCount; cseIndex++) + { + unsigned cseAvailBit = getCSEAvailBit(cseIndex); + unsigned cseAvailCrossCallBit = getCSEAvailCrossCallBit(cseIndex); + + if (BitVecOps::IsMember(cseLivenessTraits, cseDataFlowSet, cseAvailBit)) + { + if (!first) + { + printf(", "); + } + const bool isAvailCrossCall = BitVecOps::IsMember(cseLivenessTraits, cseDataFlowSet, cseAvailCrossCallBit); + printf(FMT_CSE "%s", cseIndex, isAvailCrossCall ? ".c" : ""); + first = false; + } + } +} + #endif // DEBUG diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index a7df5b95905a2..748d5feabb562 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -38,7 +38,6 @@ void Compiler::optInit() optNativeCallCount = 0; optAssertionCount = 0; optAssertionDep = nullptr; - optCSECandidateTotal = 0; optCSEstart = UINT_MAX; optCSEcount = 0; } @@ -5627,8 +5626,8 @@ void Compiler::optHoistLoopCode() #endif #if 0 - // The code in this #if has been useful in debugging loop cloning issues, by - // enabling selective enablement of the loop cloning optimization according to + // The code in this #if has been useful in debugging loop hoisting issues, by + // enabling selective enablement of the loop hoisting optimization according to // method hash. 
#ifdef DEBUG unsigned methHash = info.compMethodHash(); @@ -5650,7 +5649,7 @@ void Compiler::optHoistLoopCode() return; printf("Doing loop hoisting in %s (0x%x).\n", info.compFullName, methHash); #endif // DEBUG -#endif // 0 -- debugging loop cloning issues +#endif // 0 -- debugging loop hoisting issues #ifdef DEBUG if (verbose) @@ -5899,6 +5898,8 @@ void Compiler::optHoistThisLoop(unsigned lnum, LoopHoistContext* hoistCtxt) printf("\n LOOPV-FP(%d)=", pLoopDsc->lpLoopVarFPCount); lvaDispVarSet(loopFPVars); + + printf("\n"); } #endif } diff --git a/src/coreclr/jit/phase.cpp b/src/coreclr/jit/phase.cpp index 530f8e74cbba0..f08134bf11532 100644 --- a/src/coreclr/jit/phase.cpp +++ b/src/coreclr/jit/phase.cpp @@ -84,8 +84,9 @@ void Phase::PrePhase() // // Currently the list is just the set of phases that have custom // derivations from the Phase class. - static Phases s_allowlist[] = {PHASE_BUILD_SSA, PHASE_RATIONALIZE, PHASE_LOWERING, PHASE_STACK_LEVEL_SETTER}; - bool doPrePhase = false; + static Phases s_allowlist[] = {PHASE_BUILD_SSA, PHASE_OPTIMIZE_VALNUM_CSES, PHASE_RATIONALIZE, PHASE_LOWERING, + PHASE_STACK_LEVEL_SETTER}; + bool doPrePhase = false; for (size_t i = 0; i < sizeof(s_allowlist) / sizeof(Phases); i++) {
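Finally, a standalone model of the allowlist membership test this loop performs; the Phases values below are invented stand-ins for the real enum:

```cpp
#include <cstddef>
#include <cstdio>

enum Phases // invented subset; the real enum lives in the JIT's phase list
{
    PHASE_BUILD_SSA,
    PHASE_OPTIMIZE_VALNUM_CSES,
    PHASE_RATIONALIZE,
    PHASE_LOWERING,
    PHASE_STACK_LEVEL_SETTER,
    PHASE_MORPH
};

int main()
{
    static Phases s_allowlist[] = {PHASE_BUILD_SSA, PHASE_OPTIMIZE_VALNUM_CSES, PHASE_RATIONALIZE,
                                   PHASE_LOWERING, PHASE_STACK_LEVEL_SETTER};
    Phases m_phase    = PHASE_OPTIMIZE_VALNUM_CSES; // newly allowlisted by this change
    bool   doPrePhase = false;
    for (size_t i = 0; i < sizeof(s_allowlist) / sizeof(Phases); i++)
    {
        if (s_allowlist[i] == m_phase)
        {
            doPrePhase = true;
            break;
        }
    }
    printf("doPrePhase = %s\n", doPrePhase ? "true" : "false"); // true
    return 0;
}
```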