Skip to content

Commit

Permalink
Added PerfScore support for Arm64
Browse files Browse the repository at this point in the history
Based upon arm_cortex_a55_software_optimization_guide_v2.pdf
  • Loading branch information
briansull committed Dec 12, 2019
1 parent 96d9a47 commit c3dacce
Show file tree
Hide file tree
Showing 5 changed files with 952 additions and 45 deletions.
30 changes: 29 additions & 1 deletion src/coreclr/src/jit/emit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1107,7 +1107,7 @@ float emitter::insEvaluateExecutionCost(instrDesc* id)

if (memAccessKind == PERFSCORE_MEMORY_WRITE)
{
// We assume that we won't read back from memory for the next WR_GENERAL (3) cycles
// We assume that we won't read back from memory for the next WR_GENERAL cycles
// Thus we normally won't pay latency costs for writes.
latency = max(0.0f, latency - PERFSCORE_LATENCY_WR_GENERAL);
}
Expand All @@ -1121,6 +1121,34 @@ float emitter::insEvaluateExecutionCost(instrDesc* id)
return max(throughput, latency);
}

//------------------------------------------------------------------------------------
// perfScoreUnhandledInstruction:
//    Helper method used to report an instruction that PerfScore does not handle.
//
// Arguments:
//    id      - The current instruction descriptor to be evaluated
//    pResult - pointer to struct holding the instruction characteristics;
//              when this method returns, its throughput and latency fields
//              have been set to the default values
//
// Notes:
//    When validating that the PerfScore handles every instruction,
//    change the '#if 0' below into '#ifdef DEBUG'. The method will then
//    print the instruction name and instruction format and assert
//    instead of quietly returning.
//
//    Otherwise it only stores the default throughput and latency of 1 cycle.
//    The insMemoryAccessKind field of pResult is not modified here.
//
void emitter::perfScoreUnhandledInstruction(instrDesc* id, insExecutionCharacteristics* pResult)
{
// Change this to #ifdef DEBUG to assert on any unhandled instructions
#if 0
    // End the message with a newline so it is not run together with whatever is printed next.
    printf("PerfScore: unhandled instruction: %s, format %s\n", codeGen->genInsName(id->idIns()), emitIfName(id->idInsFmt()));
    assert(!"PerfScore: unhandled instruction");
#endif
    pResult->insThroughput = PERFSCORE_THROUGHPUT_1C;
    pResult->insLatency    = PERFSCORE_LATENCY_1C;
}

#endif // defined(DEBUG) || defined(LATE_DISASM)

//----------------------------------------------------------------------------------------
Expand Down
60 changes: 44 additions & 16 deletions src/coreclr/src/jit/emit.h
Original file line number Diff line number Diff line change
Expand Up @@ -1233,19 +1233,17 @@ class emitter
#define PERFSCORE_THROUGHPUT_4C 4.0f // slower - 4 cycles
#define PERFSCORE_THROUGHPUT_5C 5.0f // slower - 5 cycles
#define PERFSCORE_THROUGHPUT_6C 6.0f // slower - 6 cycles
#define PERFSCORE_THROUGHPUT_9C 9.0f // slower - 9 cycles
#define PERFSCORE_THROUGHPUT_10C 10.0f // slower - 10 cycles
#define PERFSCORE_THROUGHPUT_13C 13.0f // slower - 13 cycles
#define PERFSCORE_THROUGHPUT_19C 19.0f // slower - 19 cycles
#define PERFSCORE_THROUGHPUT_25C 25.0f // slower - 25 cycles
#define PERFSCORE_THROUGHPUT_33C 33.0f // slower - 33 cycles
#define PERFSCORE_THROUGHPUT_52C 52.0f // slower - 52 cycles
#define PERFSCORE_THROUGHPUT_57C 57.0f // slower - 57 cycles

#define PERFSCORE_THROUGHPUT_DEFAULT PERFSCORE_THROUGHPUT_1C

#define PERFSCORE_LATENCY_ILLEGAL -1024.0f

#define PERFSCORE_LATENCY_DEFAULT 1.0f

#define PERFSCORE_LATENCY_ZERO 0.0f
#define PERFSCORE_LATENCY_1C 1.0f
#define PERFSCORE_LATENCY_2C 2.0f
Expand All @@ -1260,7 +1258,11 @@ class emitter
#define PERFSCORE_LATENCY_11C 11.0f
#define PERFSCORE_LATENCY_12C 12.0f
#define PERFSCORE_LATENCY_13C 13.0f
#define PERFSCORE_LATENCY_15C 15.0f
#define PERFSCORE_LATENCY_16C 16.0f
#define PERFSCORE_LATENCY_18C 18.0f
#define PERFSCORE_LATENCY_20C 20.0f
#define PERFSCORE_LATENCY_22C 22.0f
#define PERFSCORE_LATENCY_23C 23.0f
#define PERFSCORE_LATENCY_26C 26.0f
#define PERFSCORE_LATENCY_62C 62.0f
Expand All @@ -1271,22 +1273,46 @@ class emitter
#define PERFSCORE_LATENCY_BRANCH_COND 2.0f // includes cost of a possible misprediction
#define PERFSCORE_LATENCY_BRANCH_INDIRECT 2.0f // includes cost of a possible misprediction

#if defined(_TARGET_XARCH_)

// a read, write or modify from stack location, possible def-to-use latency from L0 cache
#define PERFSCORE_LATENCY_RD_STACK PERFSCORE_LATENCY_2C
#define PERFSCORE_LATENCY_WR_STACK PERFSCORE_LATENCY_2C
#define PERFSCORE_LATENCY_RD_WR_STACK PERFSCORE_LATENCY_5C

// a read, write or modify from constant location, possible def to use latency from L0 cache
#define PERFSCORE_LATENCY_RD_CONST_ADDR PERFSCORE_LATENCY_2C
#define PERFSCORE_LATENCY_WR_CONST_ADDR PERFSCORE_LATENCY_2C
#define PERFSCORE_LATENCY_RD_WR_CONST_ADDR PERFSCORE_LATENCY_5C

// a read, write or modify from memory location, possible def to use latency from L0 or L1 cache
// plus an extra cost (of 1.0) for an increased chance of a cache miss
#define PERFSCORE_LATENCY_RD_GENERAL PERFSCORE_LATENCY_3C
#define PERFSCORE_LATENCY_WR_GENERAL PERFSCORE_LATENCY_3C
#define PERFSCORE_LATENCY_RD_WR_GENERAL PERFSCORE_LATENCY_6C

#elif defined(_TARGET_ARM64_) || defined(_TARGET_ARM_)

// a read, write or modify from stack location, possible def-to-use latency from L0 cache
#define PERFSCORE_LATENCY_RD_STACK 2.0f
#define PERFSCORE_LATENCY_WR_STACK 2.0f
#define PERFSCORE_LATENCY_RD_WR_STACK 5.0f
#define PERFSCORE_LATENCY_RD_STACK PERFSCORE_LATENCY_3C
#define PERFSCORE_LATENCY_WR_STACK PERFSCORE_LATENCY_1C
#define PERFSCORE_LATENCY_RD_WR_STACK PERFSCORE_LATENCY_3C

// a read, write or modify from constant location, possible def to use latency from L0 cache
#define PERFSCORE_LATENCY_RD_CONST_ADDR 2.0f
#define PERFSCORE_LATENCY_WR_CONST_ADDR 2.0f
#define PERFSCORE_LATENCY_RD_WR_CONST_ADDR 5.0f
#define PERFSCORE_LATENCY_RD_CONST_ADDR PERFSCORE_LATENCY_3C
#define PERFSCORE_LATENCY_WR_CONST_ADDR PERFSCORE_LATENCY_1C
#define PERFSCORE_LATENCY_RD_WR_CONST_ADDR PERFSCORE_LATENCY_3C

// a read, write or modify from memory location, possible def to use latency from L0 or L1 cache
// plus an extra cost (of 1.0) for an increased chance of a cache miss
#define PERFSCORE_LATENCY_RD_GENERAL 3.0f
#define PERFSCORE_LATENCY_WR_GENERAL 3.0f
#define PERFSCORE_LATENCY_RD_WR_GENERAL 6.0f
#define PERFSCORE_LATENCY_RD_GENERAL PERFSCORE_LATENCY_4C
#define PERFSCORE_LATENCY_WR_GENERAL PERFSCORE_LATENCY_1C
#define PERFSCORE_LATENCY_RD_WR_GENERAL PERFSCORE_LATENCY_4C

#endif // _TARGET_XXX

// Make this an enum:
//
#define PERFSCORE_MEMORY_NONE 0
#define PERFSCORE_MEMORY_READ 1
#define PERFSCORE_MEMORY_WRITE 2
Expand All @@ -1295,8 +1321,7 @@ class emitter
#define PERFSCORE_CODESIZE_COST_HOT 0.10f
#define PERFSCORE_CODESIZE_COST_COLD 0.01f

#define PERFSCORE_CALLEE_SPILL_COST \
0.75f // heuristicly derived - actual cost is one push and one pop, in the prolog/epilog
#define PERFSCORE_CALLEE_SPILL_COST 0.75f

struct insExecutionCharacteristics
{
Expand All @@ -1305,9 +1330,12 @@ class emitter
unsigned insMemoryAccessKind;
};

insExecutionCharacteristics getInsExecutionCharacteristics(instrDesc* id);
float insEvaluateExecutionCost(instrDesc* id);

insExecutionCharacteristics getInsExecutionCharacteristics(instrDesc* id);

void emitter::perfScoreUnhandledInstruction(instrDesc* id, insExecutionCharacteristics* result);

#endif // defined(DEBUG) || defined(LATE_DISASM)

BasicBlock::weight_t getCurrentBlockWeight();
Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/src/jit/emitarm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7991,8 +7991,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins

// ToDo: Calculate actual throughput and latency values
//
result.insThroughput = PERFSCORE_THROUGHPUT_DEFAULT;
result.insLatency = PERFSCORE_LATENCY_DEFAULT;
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency = PERFSCORE_LATENCY_1C;

return result;
}
Expand Down
Loading

0 comments on commit c3dacce

Please sign in to comment.