Enregister EH var that are single def #47307

Merged · 14 commits · Mar 9, 2021
23 changes: 23 additions & 0 deletions src/coreclr/jit/compiler.h
@@ -454,6 +454,11 @@ class LclVarDsc
// before lvaMarkLocalVars: identifies ref type locals that can get type updates
// after lvaMarkLocalVars: identifies locals that are suitable for optAddCopies

unsigned char lvEhWriteThruCandidate : 1; // variable has a single def and hence is a register candidate
// if it is an EH variable

unsigned char lvDisqualifyForEhWriteThru : 1; // tracks variables that are disqualified from register candidacy

#if ASSERTION_PROP
unsigned char lvDisqualify : 1; // variable is no longer OK for add copy optimization
unsigned char lvVolatileHint : 1; // hint for AssertionProp
@@ -7369,6 +7374,24 @@ class Compiler

void raMarkStkVars();

#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
#if defined(TARGET_AMD64)
static bool varTypeNeedsPartialCalleeSave(var_types type)
{
return (type == TYP_SIMD32);
}
#elif defined(TARGET_ARM64)
static bool varTypeNeedsPartialCalleeSave(var_types type)
{
// The ARM64 ABI only requires the callee to save the lower 8 bytes of the FP callee-saved registers.
// For SIMD types longer than 8 bytes, the caller is responsible for saving and restoring the upper bytes.
return ((type == TYP_SIMD16) || (type == TYP_SIMD12));
}
#else // !defined(TARGET_AMD64) && !defined(TARGET_ARM64)
#error("Unknown target architecture for FEATURE_SIMD")
#endif // !defined(TARGET_AMD64) && !defined(TARGET_ARM64)
#endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE
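
Hoisting this helper out of LinearScan and onto Compiler as a static (the lsra.h hunk below deletes the old copy) lets lclvars.cpp consult it when marking EH write-thru candidates. A minimal sketch of the call-site pattern, not taken from the diff:

// Sketch only; varDsc is a LclVarDsc* for some live local.
if (Compiler::varTypeNeedsPartialCalleeSave(varDsc->lvType))
{
    // True for TYP_SIMD32 on AMD64 and TYP_SIMD12/TYP_SIMD16 on ARM64: the
    // upper bytes are caller-saved, so the register allocator must build
    // upper-vector save/restore positions around calls for this local.
}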

protected:
// Some things are used by both LSRA and regpredict allocators.

2 changes: 1 addition & 1 deletion src/coreclr/jit/jitconfigvalues.h
@@ -274,7 +274,7 @@ CONFIG_INTEGER(EnablePOPCNT, W("EnablePOPCNT"), 1) // Enable POPCNT
CONFIG_INTEGER(EnableAVX, W("EnableAVX"), 0)
#endif // !defined(TARGET_AMD64) && !defined(TARGET_X86)

CONFIG_INTEGER(EnableEHWriteThru, W("EnableEHWriteThru"), 0) // Enable the register allocator to support EH-write thru:
CONFIG_INTEGER(EnableEHWriteThru, W("EnableEHWriteThru"), 1) // Enable the register allocator to support EH-write thru:
// partial enregistration of vars exposed on EH boundaries
CONFIG_INTEGER(EnableMultiRegLocals, W("EnableMultiRegLocals"), 1) // Enable the enregistration of locals that are
// defined or used in a multireg context.
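
For context, a sketch of how a switch like this is consumed, assuming the standard accessor that CONFIG_INTEGER generates; the actual wiring lives in compiler.cpp, outside this diff:

// Sketch, not the exact dotnet/runtime code. With the default flipped to 1,
// EH-exposed vars become register candidates unless the user overrides the
// switch, e.g. by setting the COMPlus_EnableEHWriteThru environment
// variable to 0 to get the old behavior back.
bool enregEHVars = (JitConfig.EnableEHWriteThru() != 0);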
47 changes: 43 additions & 4 deletions src/coreclr/jit/lclvars.cpp
@@ -2614,14 +2614,16 @@ void Compiler::lvaSetVarLiveInOutOfHandler(unsigned varNum)
{
noway_assert(lvaTable[i].lvIsStructField);
lvaTable[i].lvLiveInOutOfHndlr = 1;
if (!lvaEnregEHVars)
// For now, only enregister an EH var if it is single-def and its refCnt > 1.
if (!lvaEnregEHVars || !lvaTable[i].lvEhWriteThruCandidate || lvaTable[i].lvRefCnt() <= 1)
{
lvaSetVarDoNotEnregister(i DEBUGARG(DNER_LiveInOutOfHandler));
}
}
}

if (!lvaEnregEHVars)
// For now, only enregister an EH var if it is single-def and its refCnt > 1.
if (!lvaEnregEHVars || !varDsc->lvEhWriteThruCandidate || varDsc->lvRefCnt() <= 1)
{
lvaSetVarDoNotEnregister(varNum DEBUGARG(DNER_LiveInOutOfHandler));
}
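
For a concrete picture of the locals this gate admits, a hypothetical example (mine, not from the PR):

// Hypothetical: 'x' is live into the handler but defined exactly once, so
// the stack home written at its single def is always current. Reads can be
// served from a register while the handler safely reads the memory copy.
int x = ComputeSomething(); // single def; refCnt > 1 from the uses below
try
{
    DoWork(x); // use from a register
}
catch (...)
{
    Log(x);    // use on the EH path reads the up-to-date stack home
}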
@@ -4075,6 +4077,35 @@ void Compiler::lvaMarkLclRefs(GenTree* tree, BasicBlock* block, Statement* stmt,
BlockSetOps::AddElemD(this, varDsc->lvRefBlks, block->bbNum);
}
}

if (!varDsc->lvDisqualifyForEhWriteThru)
{
if (tree->gtFlags & GTF_VAR_DEF)
{
bool bbInALoop = (block->bbFlags & BBF_BACKWARD_JUMP) != 0;
bool bbIsReturn = block->bbJumpKind == BBJ_RETURN;
bool needsExplicitZeroInit = fgVarNeedsExplicitZeroInit(lclNum, bbInALoop, bbIsReturn);

if ((varDsc->lvEhWriteThruCandidate == true) || (needsExplicitZeroInit == true) ||
(tree->gtFlags & GTF_COLON_COND) || (tree->gtFlags & GTF_VAR_USEASG))

(Review thread, marked as resolved by kunalspathak)

Member:
Maybe add a comment on why these cause exclusion?

USEASG I can guess, although it seems it would be redundant with restricting to single def?

COLON_COND is a mystery to me; why does it matter? It goes away during morph.

Member (Author):
To be honest, I just copied the USEASG and COLON_COND conditions from the lvSingleDef logic above. Yes, I don't think we need them, especially because we already check varDsc->lvEhWriteThruCandidate.
{
varDsc->lvEhWriteThruCandidate = false;
varDsc->lvDisqualifyForEhWriteThru = true;
}
else
{
#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
// TODO-CQ: If the varType needs partial callee save, conservatively do not enregister
// such variables. In the future, we need to enable enregistration for these variables.
if (!varTypeNeedsPartialCalleeSave(varDsc->lvType))
#endif
{
varDsc->lvEhWriteThruCandidate = true;
}
}
}
}

#endif // ASSERTION_PROP

bool allowStructs = false;
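
The candidate-marking rule added above, distilled into a standalone predicate; this is a paraphrase for readability, not PR code:

// Paraphrase only: a local remains an EH write-thru candidate while it has
// exactly one unconditional, whole-variable def that needs no explicit
// zero-init; anything else disqualifies it.
bool KeepAsEhWriteThruCandidate(LclVarDsc* varDsc, GenTree* def, bool needsExplicitZeroInit)
{
    bool secondDef      = varDsc->lvEhWriteThruCandidate;       // a def was already seen
    bool conditionalDef = (def->gtFlags & GTF_COLON_COND) != 0; // def under a ?: condition
    bool partialDef     = (def->gtFlags & GTF_VAR_USEASG) != 0; // read-modify-write def
    if (secondDef || needsExplicitZeroInit || conditionalDef || partialDef)
    {
        return false; // the real code also sets lvDisqualifyForEhWriteThru here
    }
#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
    // Conservatively skip types needing partial callee save (the TODO-CQ above).
    if (Compiler::varTypeNeedsPartialCalleeSave(varDsc->lvType))
    {
        return false;
    }
#endif
    return true;
}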
@@ -4178,6 +4209,8 @@ void Compiler::lvaMarkLocalVars(BasicBlock* block, bool isRecompute)

Compiler::fgWalkResult PreOrderVisit(GenTree** use, GenTree* user)
{
// TODO: Stop passing isRecompute once we are sure that this assert is never hit.
assert(!m_isRecompute);
m_compiler->lvaMarkLclRefs(*use, m_block, m_stmt, m_isRecompute);
return WALK_CONTINUE;
}
@@ -4437,7 +4470,13 @@ void Compiler::lvaComputeRefCounts(bool isRecompute, bool setSlotNumbers)

// Set initial value for lvSingleDef for explicit and implicit
// argument locals as they are "defined" on entry.
varDsc->lvSingleDef = varDsc->lvIsParam;
// However, if we are just recomputing the ref counts, retain the value
// that was set by past phases.
if (!isRecompute)
{
varDsc->lvSingleDef = varDsc->lvIsParam;
(Review thread)

Member:
I don't quite understand this -- if we dead-store something, we can turn it from multiple def to single def. I suppose that won't happen for live-into-handler vars, but it can for other vars.

Member (Author):
> I suppose that won't happen for live into handler vars but it can for other vars.

I am not 100% sure that is the case.

Currently, in the PR, I am marking lvEhWriteThruCandidate in one of the early phases of lvaMarkLocalVars(). I have to do it there because that information is used during SSA building and Lowering to determine whether a varDsc should be enregistered. The weird part of the logic around lvaSetVarLiveInOutOfHandler() is that during SSA building we might set varDsc->lvDoNotEnregister = 1, but if things change later, during Lowering, and we want to reset lvDoNotEnregister to enregister that EH variable, we currently don't do that. That should be fixed, IMO.

As per your suggestion, I should definitely take dead stores etc. into account and defer setting lvEhWriteThruCandidate until the ref counts are recomputed during Lowering, but by then it would be too late: we might already have marked the varDsc as "do not enregister".

So there may be some refactoring involved to make these things happen in the right order, and I would rather do that in a separate PR. Another follow-up is to support local fields, which this PR skips when marking lvEhWriteThruCandidate. Perhaps the logic that sets lvEhWriteThruCandidate should ideally live around this code:

varDsc->lvEhWriteThruCandidate = varDsc->lvIsParam;
}
}

// Remember current state of generic context use, and prepare
@@ -7194,7 +7233,7 @@ void Compiler::lvaDumpFrameLocation(unsigned lclNum)
baseReg = EBPbased ? REG_FPBASE : REG_SPBASE;
#endif

printf("[%2s%1s0x%02X] ", getRegName(baseReg), (offset < 0 ? "-" : "+"), (offset < 0 ? -offset : offset));
printf("[%2s%1s%02XH] ", getRegName(baseReg), (offset < 0 ? "-" : "+"), (offset < 0 ? -offset : offset));
}

/*****************************************************************************
10 changes: 5 additions & 5 deletions src/coreclr/jit/lsra.cpp
@@ -1781,7 +1781,7 @@ void LinearScan::identifyCandidates()

if (varDsc->lvLiveInOutOfHndlr)
{
newInt->isWriteThru = true;
newInt->isWriteThru = varDsc->lvEhWriteThruCandidate;
setIntervalAsSpilled(newInt);
}

@@ -1796,7 +1796,7 @@
// Additionally, when we are generating code for a target with partial SIMD callee-save
// (AVX on non-UNIX amd64 and 16-byte vectors on arm64), we keep a separate set of the
// LargeVectorType vars.
if (varTypeNeedsPartialCalleeSave(varDsc->lvType))
if (Compiler::varTypeNeedsPartialCalleeSave(varDsc->lvType))
{
largeVectorVarCount++;
VarSetOps::AddElemD(compiler, largeVectorVars, varDsc->lvVarIndex);
@@ -5050,7 +5050,7 @@ void LinearScan::processBlockEndLocations(BasicBlock* currentBlock)
}
#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
// Ensure that we have no partially-spilled large vector locals.
assert(!varTypeNeedsPartialCalleeSave(interval->registerType) || !interval->isPartiallySpilled);
assert(!Compiler::varTypeNeedsPartialCalleeSave(interval->registerType) || !interval->isPartiallySpilled);
#endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE
}
INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_END_BB));
@@ -6922,7 +6922,7 @@ void LinearScan::insertUpperVectorSave(GenTree* tree,
}

LclVarDsc* varDsc = compiler->lvaTable + lclVarInterval->varNum;
assert(varTypeNeedsPartialCalleeSave(varDsc->lvType));
assert(Compiler::varTypeNeedsPartialCalleeSave(varDsc->lvType));

// On Arm64, we must always have a register to save the upper half,
// while on x86 we can spill directly to memory.
@@ -7003,7 +7003,7 @@ void LinearScan::insertUpperVectorRestore(GenTree* tree,
// lclVar as spilled).
assert(lclVarReg != REG_NA);
LclVarDsc* varDsc = compiler->lvaTable + lclVarInterval->varNum;
assert(varTypeNeedsPartialCalleeSave(varDsc->lvType));
assert(Compiler::varTypeNeedsPartialCalleeSave(varDsc->lvType));

GenTree* restoreLcl = nullptr;
restoreLcl = compiler->gtNewLclvNode(lclVarInterval->varNum, varDsc->lvType);
17 changes: 2 additions & 15 deletions src/coreclr/jit/lsra.h
@@ -988,7 +988,7 @@ class LinearScan : public LinearScanInterface

void resolveConflictingDefAndUse(Interval* interval, RefPosition* defRefPosition);

void buildRefPositionsForNode(GenTree* tree, BasicBlock* block, LsraLocation loc);
void buildRefPositionsForNode(GenTree* tree, LsraLocation loc);

#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
void buildUpperVectorSaveRefPositions(GenTree* tree, LsraLocation currentLoc, regMaskTP fpCalleeKillSet);
@@ -1500,23 +1500,10 @@ class LinearScan : public LinearScanInterface

#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
#if defined(TARGET_AMD64)
static bool varTypeNeedsPartialCalleeSave(var_types type)
{
return (type == TYP_SIMD32);
}
static const var_types LargeVectorSaveType = TYP_SIMD16;
#elif defined(TARGET_ARM64)
static bool varTypeNeedsPartialCalleeSave(var_types type)
{
// ARM64 ABI FP Callee save registers only require Callee to save lower 8 Bytes
// For SIMD types longer than 8 bytes Caller is responsible for saving and restoring Upper bytes.
return ((type == TYP_SIMD16) || (type == TYP_SIMD12));
}
static const var_types LargeVectorSaveType = TYP_DOUBLE;
#else // !defined(TARGET_AMD64) && !defined(TARGET_ARM64)
#error("Unknown target architecture for FEATURE_SIMD")
static const var_types LargeVectorSaveType = TYP_DOUBLE;
#endif // !defined(TARGET_AMD64) && !defined(TARGET_ARM64)

// Set of large vector (TYP_SIMD32 on AVX) variables.
VARSET_TP largeVectorVars;
// Set of large vector (TYP_SIMD32 on AVX) variables to consider for callee-save registers.
13 changes: 6 additions & 7 deletions src/coreclr/jit/lsrabuild.cpp
@@ -1160,7 +1160,7 @@ bool LinearScan::buildKillPositionsForNode(GenTree* tree, LsraLocation currentLo
{
LclVarDsc* varDsc = compiler->lvaGetDescByTrackedIndex(varIndex);
#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
if (varTypeNeedsPartialCalleeSave(varDsc->lvType))
if (Compiler::varTypeNeedsPartialCalleeSave(varDsc->lvType))
{
if (!VarSetOps::IsMember(compiler, largeVectorCalleeSaveCandidateVars, varIndex))
{
@@ -1424,7 +1424,7 @@ void LinearScan::buildInternalRegisterUses()
void LinearScan::makeUpperVectorInterval(unsigned varIndex)
{
Interval* lclVarInterval = getIntervalForLocalVar(varIndex);
assert(varTypeNeedsPartialCalleeSave(lclVarInterval->registerType));
assert(Compiler::varTypeNeedsPartialCalleeSave(lclVarInterval->registerType));
Interval* newInt = newInterval(LargeVectorSaveType);
newInt->relatedInterval = lclVarInterval;
newInt->isUpperVector = true;
@@ -1506,7 +1506,7 @@ void LinearScan::buildUpperVectorSaveRefPositions(GenTree* tree, LsraLocation cu
for (RefInfoListNode *listNode = defList.Begin(), *end = defList.End(); listNode != end;
listNode = listNode->Next())
{
if (varTypeNeedsPartialCalleeSave(listNode->treeNode->TypeGet()))
if (Compiler::varTypeNeedsPartialCalleeSave(listNode->treeNode->TypeGet()))
{
// In the rare case where such an interval is live across nested calls, we don't need to insert another.
if (listNode->ref->getInterval()->recentRefPosition->refType != RefTypeUpperVectorSave)
@@ -1637,10 +1637,9 @@ int LinearScan::ComputeAvailableSrcCount(GenTree* node)
//
// Arguments:
// tree - The node for which we are building RefPositions
// block - The BasicBlock in which the node resides
// currentLoc - The LsraLocation of the given node
//
void LinearScan::buildRefPositionsForNode(GenTree* tree, BasicBlock* block, LsraLocation currentLoc)
void LinearScan::buildRefPositionsForNode(GenTree* tree, LsraLocation currentLoc)
{
// The LIR traversal doesn't visit GT_LIST or GT_ARGPLACE nodes.
// GT_CLS_VAR nodes should have been eliminated by rationalizer.
@@ -2351,7 +2350,7 @@ void LinearScan::buildIntervals()
node->SetRegNum(node->GetRegNum());
#endif

buildRefPositionsForNode(node, block, currentLoc);
buildRefPositionsForNode(node, currentLoc);

#ifdef DEBUG
if (currentLoc > maxNodeLocation)
@@ -3232,7 +3231,7 @@ void LinearScan::BuildStoreLocDef(GenTreeLclVarCommon* storeLoc,
def->regOptional = true;
}
#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
if (varTypeNeedsPartialCalleeSave(varDefInterval->registerType))
if (Compiler::varTypeNeedsPartialCalleeSave(varDefInterval->registerType))
{
varDefInterval->isPartiallySpilled = false;
}
2 changes: 1 addition & 1 deletion src/coreclr/scripts/superpmi.py
@@ -1346,7 +1346,7 @@ def save_repro_mc_files(temp_location, coreclr_args, repro_base_command_line):
shutil.copy2(item, repro_location)

logging.info("")
logging.info("Repro .mc files created for failures:")
logging.info("Repro {} .mc file(s) created for failures:".format(len(repro_files)))
for item in repro_files:
logging.info(item)

@@ -25,39 +25,31 @@ internal static class AsyncMethodBuilderCore // debugger depends on this exact n
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.stateMachine);
}

// enregistrer variables with 0 post-fix so they can be used in registers without EH forcing them to stack
// Capture references to Thread Contexts
Thread currentThread0 = Thread.CurrentThread;
Thread currentThread = currentThread0;
ExecutionContext? previousExecutionCtx0 = currentThread0._executionContext;
Thread currentThread = Thread.CurrentThread;

// Store current ExecutionContext and SynchronizationContext as "previousXxx".
// This allows us to restore them and undo any Context changes made in stateMachine.MoveNext
// so that they won't "leak" out of the first await.
ExecutionContext? previousExecutionCtx = previousExecutionCtx0;
SynchronizationContext? previousSyncCtx = currentThread0._synchronizationContext;
ExecutionContext? previousExecutionCtx = currentThread._executionContext;
SynchronizationContext? previousSyncCtx = currentThread._synchronizationContext;

try
{
stateMachine.MoveNext();
}
finally
{
// Re-enregistrer variables post EH with 1 post-fix so they can be used in registers rather than from stack
SynchronizationContext? previousSyncCtx1 = previousSyncCtx;
Thread currentThread1 = currentThread;
// The common case is that these have not changed, so avoid the cost of a write barrier if not needed.
if (previousSyncCtx1 != currentThread1._synchronizationContext)
if (previousSyncCtx != currentThread._synchronizationContext)
{
// Restore changed SynchronizationContext back to previous
currentThread1._synchronizationContext = previousSyncCtx1;
currentThread._synchronizationContext = previousSyncCtx;
}

ExecutionContext? previousExecutionCtx1 = previousExecutionCtx;
ExecutionContext? currentExecutionCtx1 = currentThread1._executionContext;
if (previousExecutionCtx1 != currentExecutionCtx1)
ExecutionContext? currentExecutionCtx = currentThread._executionContext;
if (previousExecutionCtx != currentExecutionCtx)
{
ExecutionContext.RestoreChangedContextToThread(currentThread1, previousExecutionCtx1, currentExecutionCtx1);
ExecutionContext.RestoreChangedContextToThread(currentThread, previousExecutionCtx, currentExecutionCtx);
}
}
}