diff --git a/src/coreclr/src/jit/emit.cpp b/src/coreclr/src/jit/emit.cpp index 32b4a772a4a66..2e3c44448b374 100644 --- a/src/coreclr/src/jit/emit.cpp +++ b/src/coreclr/src/jit/emit.cpp @@ -1107,7 +1107,7 @@ float emitter::insEvaluateExecutionCost(instrDesc* id) if (memAccessKind == PERFSCORE_MEMORY_WRITE) { - // We assume that we won't read back from memory for the next WR_GENERAL (3) cycles + // We assume that we won't read back from memory for the next WR_GENERAL cycles // Thus we normally won't pay latency costs for writes. latency = max(0.0f, latency - PERFSCORE_LATENCY_WR_GENERAL); } @@ -1121,6 +1121,34 @@ float emitter::insEvaluateExecutionCost(instrDesc* id) return max(throughput, latency); } +//------------------------------------------------------------------------------------ +// perfScoreUnhandledInstruction: +// Helper method used to report an unhandled instruction +// +// Arguments: +// id - The current instruction descriptor to be evaluated +// pResult - pointer to struct holding the instruction characteristics +// if we return these are updated with default values +// +// Notes: +// When validating that the PerfScore handles every instruction. +// the #if 0 block is changed into a #ifdef DEBUG +// We will print the instruction and instruction group +// and instead of returning we will assert +// +// Otherwise we will return default latencies of 1 cycle. 
+// +void emitter::perfScoreUnhandledInstruction(instrDesc* id, insExecutionCharacteristics* pResult) +{ +// Change this to #ifdef DEBUG to assert on any unhandled instructions +#if 0 + printf("PerfScore: unhandled instruction: %s, format %s\n", codeGen->genInsName(id->idIns()), emitIfName(id->idInsFmt())); + assert(!"PerfScore: unhandled instruction"); +#endif + pResult->insThroughput = PERFSCORE_THROUGHPUT_1C; + pResult->insLatency = PERFSCORE_LATENCY_1C; +} + #endif // defined(DEBUG) || defined(LATE_DISASM) //---------------------------------------------------------------------------------------- diff --git a/src/coreclr/src/jit/emit.h b/src/coreclr/src/jit/emit.h index 1bd579b2649fc..78e34087e1fea 100644 --- a/src/coreclr/src/jit/emit.h +++ b/src/coreclr/src/jit/emit.h @@ -1233,19 +1233,17 @@ class emitter #define PERFSCORE_THROUGHPUT_4C 4.0f // slower - 4 cycles #define PERFSCORE_THROUGHPUT_5C 5.0f // slower - 5 cycles #define PERFSCORE_THROUGHPUT_6C 6.0f // slower - 6 cycles +#define PERFSCORE_THROUGHPUT_9C 9.0f // slower - 9 cycles #define PERFSCORE_THROUGHPUT_10C 10.0f // slower - 10 cycles #define PERFSCORE_THROUGHPUT_13C 13.0f // slower - 13 cycles +#define PERFSCORE_THROUGHPUT_19C 19.0f // slower - 19 cycles #define PERFSCORE_THROUGHPUT_25C 25.0f // slower - 25 cycles #define PERFSCORE_THROUGHPUT_33C 33.0f // slower - 33 cycles #define PERFSCORE_THROUGHPUT_52C 52.0f // slower - 52 cycles #define PERFSCORE_THROUGHPUT_57C 57.0f // slower - 57 cycles -#define PERFSCORE_THROUGHPUT_DEFAULT PERFSCORE_THROUGHPUT_1C - #define PERFSCORE_LATENCY_ILLEGAL -1024.0f -#define PERFSCORE_LATENCY_DEFAULT 1.0f - #define PERFSCORE_LATENCY_ZERO 0.0f #define PERFSCORE_LATENCY_1C 1.0f #define PERFSCORE_LATENCY_2C 2.0f @@ -1260,7 +1258,11 @@ class emitter #define PERFSCORE_LATENCY_11C 11.0f #define PERFSCORE_LATENCY_12C 12.0f #define PERFSCORE_LATENCY_13C 13.0f +#define PERFSCORE_LATENCY_15C 15.0f #define PERFSCORE_LATENCY_16C 16.0f +#define PERFSCORE_LATENCY_18C 18.0f +#define
PERFSCORE_LATENCY_20C 20.0f +#define PERFSCORE_LATENCY_22C 22.0f #define PERFSCORE_LATENCY_23C 23.0f #define PERFSCORE_LATENCY_26C 26.0f #define PERFSCORE_LATENCY_62C 62.0f @@ -1271,22 +1273,46 @@ class emitter #define PERFSCORE_LATENCY_BRANCH_COND 2.0f // includes cost of a possible misprediction #define PERFSCORE_LATENCY_BRANCH_INDIRECT 2.0f // includes cost of a possible misprediction +#if defined(_TARGET_XARCH_) + +// a read,write or modify from stack location, possible def to use latency from L0 cache +#define PERFSCORE_LATENCY_RD_STACK PERFSCORE_LATENCY_2C +#define PERFSCORE_LATENCY_WR_STACK PERFSCORE_LATENCY_2C +#define PERFSCORE_LATENCY_RD_WR_STACK PERFSCORE_LATENCY_5C + +// a read, write or modify from constant location, possible def to use latency from L0 cache +#define PERFSCORE_LATENCY_RD_CONST_ADDR PERFSCORE_LATENCY_2C +#define PERFSCORE_LATENCY_WR_CONST_ADDR PERFSCORE_LATENCY_2C +#define PERFSCORE_LATENCY_RD_WR_CONST_ADDR PERFSCORE_LATENCY_5C + +// a read, write or modify from memory location, possible def to use latency from L0 or L1 cache +// plus an extra cost (of 1.0) for a increased chance of a cache miss +#define PERFSCORE_LATENCY_RD_GENERAL PERFSCORE_LATENCY_3C +#define PERFSCORE_LATENCY_WR_GENERAL PERFSCORE_LATENCY_3C +#define PERFSCORE_LATENCY_RD_WR_GENERAL PERFSCORE_LATENCY_6C + +#elif defined(_TARGET_ARM64_) || defined(_TARGET_ARM_) + // a read,write or modify from stack location, possible def to use latency from L0 cache -#define PERFSCORE_LATENCY_RD_STACK 2.0f -#define PERFSCORE_LATENCY_WR_STACK 2.0f -#define PERFSCORE_LATENCY_RD_WR_STACK 5.0f +#define PERFSCORE_LATENCY_RD_STACK PERFSCORE_LATENCY_3C +#define PERFSCORE_LATENCY_WR_STACK PERFSCORE_LATENCY_1C +#define PERFSCORE_LATENCY_RD_WR_STACK PERFSCORE_LATENCY_3C // a read, write or modify from constant location, possible def to use latency from L0 cache -#define PERFSCORE_LATENCY_RD_CONST_ADDR 2.0f -#define PERFSCORE_LATENCY_WR_CONST_ADDR 2.0f -#define PERFSCORE_LATENCY_RD_WR_CONST_ADDR 
5.0f +#define PERFSCORE_LATENCY_RD_CONST_ADDR PERFSCORE_LATENCY_3C +#define PERFSCORE_LATENCY_WR_CONST_ADDR PERFSCORE_LATENCY_1C +#define PERFSCORE_LATENCY_RD_WR_CONST_ADDR PERFSCORE_LATENCY_3C // a read, write or modify from memory location, possible def to use latency from L0 or L1 cache // plus an extra cost (of 1.0) for a increased chance of a cache miss -#define PERFSCORE_LATENCY_RD_GENERAL 3.0f -#define PERFSCORE_LATENCY_WR_GENERAL 3.0f -#define PERFSCORE_LATENCY_RD_WR_GENERAL 6.0f +#define PERFSCORE_LATENCY_RD_GENERAL PERFSCORE_LATENCY_4C +#define PERFSCORE_LATENCY_WR_GENERAL PERFSCORE_LATENCY_1C +#define PERFSCORE_LATENCY_RD_WR_GENERAL PERFSCORE_LATENCY_4C +#endif // _TARGET_XXX + +// Make this an enum: +// #define PERFSCORE_MEMORY_NONE 0 #define PERFSCORE_MEMORY_READ 1 #define PERFSCORE_MEMORY_WRITE 2 @@ -1295,8 +1321,7 @@ class emitter #define PERFSCORE_CODESIZE_COST_HOT 0.10f #define PERFSCORE_CODESIZE_COST_COLD 0.01f -#define PERFSCORE_CALLEE_SPILL_COST \ - 0.75f // heuristicly derived - actual cost is one push and one pop, in the prolog/epilog +#define PERFSCORE_CALLEE_SPILL_COST 0.75f struct insExecutionCharacteristics { @@ -1305,9 +1330,13 @@ class emitter unsigned insMemoryAccessKind; }; - insExecutionCharacteristics getInsExecutionCharacteristics(instrDesc* id); float insEvaluateExecutionCost(instrDesc* id); + insExecutionCharacteristics getInsExecutionCharacteristics(instrDesc* id); + + // Handles an instruction the PerfScore tables do not cover: stores 1-cycle throughput/latency in *pResult + void perfScoreUnhandledInstruction(instrDesc* id, insExecutionCharacteristics* pResult); + #endif // defined(DEBUG) || defined(LATE_DISASM) BasicBlock::weight_t getCurrentBlockWeight(); diff --git a/src/coreclr/src/jit/emitarm.cpp b/src/coreclr/src/jit/emitarm.cpp index 69af7d488a07d..05964694b02d6 100644 --- a/src/coreclr/src/jit/emitarm.cpp +++ b/src/coreclr/src/jit/emitarm.cpp @@ -7991,8 +7991,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins // ToDo: Calculate actual throughput and latency values // -
result.insThroughput = PERFSCORE_THROUGHPUT_DEFAULT; - result.insLatency = PERFSCORE_LATENCY_DEFAULT; + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_1C; return result; } diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index 427a1c0ece6af..ad88954a25036 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -12195,6 +12195,11 @@ void emitter::getMemoryOperation(instrDesc* id, unsigned* pMemAccessKind, bool* // A struct containing the current instruction execution characteristics // // Notes: +// The instruction latencies and throughput values returned by this function +// are from +// +// The Arm Cortex-A55 Software Optimization Guide: +// https://static.docs.arm.com/epm128372/20/arm_cortex_a55_software_optimization_guide_v2.pdf // emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(instrDesc* id) { @@ -12209,25 +12214,878 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins result.insThroughput = PERFSCORE_THROUGHPUT_ILLEGAL; result.insLatency = PERFSCORE_LATENCY_ILLEGAL; + // Initialize insLatency based upon the instruction's memAccessKind and local access values + // if (memAccessKind == PERFSCORE_MEMORY_READ) { - result.insLatency = PERFSCORE_LATENCY_4C; + result.insLatency = isLocalAccess ? PERFSCORE_LATENCY_RD_STACK : PERFSCORE_LATENCY_RD_GENERAL; } else if (memAccessKind == PERFSCORE_MEMORY_WRITE) { - result.insLatency = PERFSCORE_LATENCY_1C; + result.insLatency = isLocalAccess ? PERFSCORE_LATENCY_WR_STACK : PERFSCORE_LATENCY_WR_GENERAL; } else if (memAccessKind == PERFSCORE_MEMORY_READ_WRITE) { - result.insLatency = PERFSCORE_LATENCY_5C; + result.insLatency = isLocalAccess ? 
PERFSCORE_LATENCY_RD_WR_STACK : PERFSCORE_LATENCY_RD_WR_GENERAL; } - // ToDo: Determine individual instruction throughput as latency - // - result.insThroughput = PERFSCORE_THROUGHPUT_DEFAULT; - if (memAccessKind == PERFSCORE_MEMORY_NONE) + switch (insFmt) { - result.insLatency = PERFSCORE_LATENCY_DEFAULT; + // + // Branch Instructions + // + + case IF_BI_0A: // b, bl_local + case IF_BI_0C: // bl, b_tail + result.insThroughput = PERFSCORE_THROUGHPUT_1C; // but is Dual Issue + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + case IF_BI_0B: // beq, bne, bge, blt, bgt, ble, ... + case IF_BI_1A: // cbz, cbnz + case IF_BI_1B: // tbz, tbnz + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + case IF_BR_1B: // blr, br_tail + if (ins == INS_blr) + { + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + } + // otherwise we should have a br_tail instruction + assert(ins == INS_br_tail); + __fallthrough; + case IF_BR_1A: // ret, br + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + // + // Arithmetic and logical instructions + // + + // ALU, basic + case IF_DR_3A: // add, adds, adc, adcs, and, ands, bic, bics, + // eon, eor, orn, orr, sub, subs, sbc, sbcs + // asr, asrv, lsl, lslv, lsr, lsrv, ror, rorv + // sdiv, udiv, mul, smull, smulh, umull, umulh, mneg + case IF_DR_2A: // cmp, cmn, tst + + switch (ins) + { + case INS_mul: + case INS_smull: + case INS_umull: + case INS_mneg: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + break; + + case INS_smulh: + case INS_umulh: + result.insThroughput = PERFSCORE_THROUGHPUT_3C; + result.insLatency = PERFSCORE_LATENCY_6C; + break; + + case INS_sdiv: + case INS_udiv: + if (id->idOpSize() == EA_4BYTE) + { + result.insThroughput = PERFSCORE_THROUGHPUT_4C; + result.insLatency = PERFSCORE_LATENCY_12C; + break; + } + else + { + 
assert(id->idOpSize() == EA_8BYTE); + result.insThroughput = PERFSCORE_THROUGHPUT_4C; + result.insLatency = PERFSCORE_LATENCY_20C; + break; + } + + case INS_add: + case INS_adds: + case INS_adc: + case INS_adcs: + case INS_and: + case INS_ands: + case INS_bic: + case INS_bics: + case INS_eon: + case INS_eor: + case INS_orn: + case INS_orr: + case INS_sub: + case INS_subs: + case INS_sbc: + case INS_sbcs: + case INS_asr: + case INS_lsl: + case INS_lsr: + case INS_ror: + case INS_cmp: + case INS_cmn: + case INS_tst: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + case INS_asrv: + case INS_lslv: + case INS_lsrv: + case INS_rorv: + // variable shift by register + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + default: + // all other instructions + perfScoreUnhandledInstruction(id, &result); + break; + } + break; // leave the outer switch: the cases below would overwrite the values chosen above + // ALU, basic immediate + case IF_DI_1A: // cmp, cmn + case IF_DI_1C: // tst + case IF_DI_1D: // mov reg, imm(N,r,s) + case IF_DI_1F: // ccmp, ccmn + case IF_DI_2A: // add, adds, sub, subs + case IF_DI_2C: // and, ands, eor, orr + + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + case IF_DR_2D: // cinc, cinv, cneg + case IF_DR_2E: // mov, neg, mvn, negs + + case IF_DI_1E: // adr, adrp + case IF_DI_1B: // mov, movk, movn, movz + + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + // ALU, shift by immediate + case IF_DR_3B: // add, adds, and, ands, bic, bics, + // eon, eor, orn, orr, sub, subs + case IF_DR_2B: // cmp, cmn, tst + case IF_DR_2F: // neg, negs, mvn + case IF_DI_2B: // ror + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + + // ALU, extend, scale + case IF_DR_3C: // add, adc, and, bic, eon, eor, orn, orr, sub, sbc + case IF_DR_2C: // cmp + case IF_DR_2J: // sha1h +
result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + // ALU, Conditional select + case IF_DR_1D: // cset, csetm + case IF_DR_3D: // csel, csinc, csinv, csneg + + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + // ALU, Conditional compare + case IF_DR_2I: // ccmp, ccmn + + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + // Multiply accumulate + case IF_DR_4A: // madd, msub, smaddl, smsubl, umaddl, umsubl + if (id->idOpSize() == EA_4BYTE) + { + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + break; + } + else + { + assert(id->idOpSize() == EA_8BYTE); + result.insThroughput = PERFSCORE_THROUGHPUT_5C; + result.insLatency = PERFSCORE_LATENCY_3C; + break; + } + + // Miscellaneous Data Processing instructions + case IF_DR_3E: // extr + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + + case IF_DR_2H: // sxtb, sxth, sxtw, uxtb, uxth, sha1h + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + case IF_DI_2D: // lsl, lsr, asr, sbfm, bfm, ubfm, sbfiz, bfi, ubfiz, sbfx, bfxil, ubfx + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + + case IF_DR_2G: // mov sp, cls, clz, rbit, rev16, rev32, rev + if (ins == INS_rbit) + { + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + } + else + { + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + } + + // + // Load/Store Instructions + // + + case IF_LS_1A: // ldr, ldrsw (literal, pc relative immediate) + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + break; + + case IF_LS_2A: // ldr, ldrsw, ldrb, ldrh, ldrsb, ldrsh, str, strb, strh (no immediate) + // ldar, ldarb,
ldarh, ldxr, ldxrb, ldxrh, + // ldaxr, ldaxrb, ldaxrh, stlr, stlrb, stlrh + + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + // ToDo: store release have 2/4 cycle latency + break; + + case IF_LS_2B: // ldr, ldrsw, ldrb, ldrh, ldrsb, ldrsh, str, strb, strh (scaled immediate) + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + break; + + case IF_LS_2C: // ldr, ldrsw, ldrb, ldrh, ldrsb, ldrsh, str, strb, strh + // ldur, ldurb, ldurh, ldursb, ldursh, ldursw, stur, sturb, sturh + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + break; + + case IF_LS_3A: // ldr, ldrsw, ldrb, ldrh, ldrsb, ldrsh, str, strb, strh (register extend, scale 2,4,8) + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + break; + + case IF_LS_3B: // ldp, ldpsw, ldnp, stp, stnp (load/store pair zero offset) + case IF_LS_3C: // load/store pair with offset pre/post inc + if (memAccessKind == PERFSCORE_MEMORY_READ) + { + // ldp, ldpsw, ldnp + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + if (emitIGisInEpilog(emitCurIG) && (ins == INS_ldp)) + { + // Reduce latency for ldp instructions in the epilog + // + result.insLatency = PERFSCORE_LATENCY_2C; + } + else if (id->idOpSize() == EA_8BYTE) // X-form + { + // the X-reg variant has an extra cycle of latency + // and two cycle throughput + result.insLatency += 1.0; + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + } + } + else // store instructions + { + // stp, stnp + assert(memAccessKind == PERFSCORE_MEMORY_WRITE); + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + } + break; + + case IF_LS_3D: // stxr, stxrb, stxrh, stlxr, stlxrb, stlxrh + // Store exclusive register, returning status + assert(emitInsIsStore(ins)); + // @ToDo - find out the actual latency + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = max(PERFSCORE_LATENCY_4C, result.insLatency); + break; + + case IF_LS_3E: // ARMv8.1 LSE Atomics + if (memAccessKind == PERFSCORE_MEMORY_WRITE) + { + // staddb, staddlb, staddh, staddlh, stadd,
staddl + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_2C; + } + else + { + assert(memAccessKind == PERFSCORE_MEMORY_READ_WRITE); + result.insThroughput = PERFSCORE_THROUGHPUT_3C; + result.insLatency = max(PERFSCORE_LATENCY_3C, result.insLatency); + } + break; + + case IF_LS_2D: // ld1 (vector - multiple structures) + case IF_LS_2E: // ld1 (vector - single structure) + case IF_LS_3F: // ld1 (vector - multiple structures) + case IF_LS_3G: // ld1 (vector - single structure) + if (id->idOpSize() == EA_8BYTE) + { + // D-form + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + } + else + { + // Q-form + assert(id->idOpSize() == EA_16BYTE); + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_4C; + } + break; + + case IF_SN_0A: // bkpt, nop + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_ZERO; + break; + + case IF_SI_0B: // dmb, dsb, isb + // @ToDo - find out the actual latency + result.insThroughput = PERFSCORE_THROUGHPUT_10C; + result.insLatency = PERFSCORE_LATENCY_10C; + break; + + case IF_DV_2J: // fcvt Vd Vn + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + + case IF_DV_2K: // fcmp Vd Vn + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + case IF_DV_1A: // fmov - immediate (scalar) + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + case IF_DV_1B: // fmov, orr, bic, movi, mvni (immediate vector) + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + case IF_DV_1C: // fcmp vn, #0.0 + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + break; + + case IF_DV_2A: // fabs, fneg, fsqrt, fcvtXX, frintX, scvtf, ucvtf, fcmXX (vector) + switch (ins) + { + case 
INS_fabs: + case INS_fneg: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = (id->idOpSize() == EA_8BYTE) ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_3C / 2; + break; + + case INS_fsqrt: + if ((id->idInsOpt() == INS_OPTS_2S) || (id->idInsOpt() == INS_OPTS_4S)) + { + // S-form + result.insThroughput = PERFSCORE_THROUGHPUT_3C; + result.insLatency = PERFSCORE_LATENCY_11C; + } + else + { + // D-form + assert(id->idInsOpt() == INS_OPTS_2D); + result.insThroughput = PERFSCORE_THROUGHPUT_6C; + result.insLatency = PERFSCORE_LATENCY_18C; + } + break; + + case INS_fcvtas: + case INS_fcvtau: + case INS_fcvtms: + case INS_fcvtmu: + case INS_fcvtns: + case INS_fcvtnu: + case INS_fcvtps: + case INS_fcvtpu: + case INS_fcvtzs: + case INS_fcvtzu: + case INS_frinta: + case INS_frinti: + case INS_frintm: + case INS_frintn: + case INS_frintp: + case INS_frintx: + case INS_frintz: + case INS_scvtf: + case INS_ucvtf: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + + case INS_fcmeq: + case INS_fcmge: + case INS_fcmgt: + case INS_fcmle: + case INS_fcmlt: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + + default: + // all other instructions + perfScoreUnhandledInstruction(id, &result); + break; + } + break; + + case IF_DV_2G: // fmov, fabs, fneg, fsqrt, faddp, fcmXX, fcvtXX, frintX, scvtf, ucvtf (scalar) + switch (ins) + { + case INS_fmov: + // FP move, vector register + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + case INS_fabs: + case INS_fneg: + case INS_faddp: + + case INS_fcvtas: + case INS_fcvtau: + case INS_fcvtms: + case INS_fcvtmu: + case INS_fcvtns: + case INS_fcvtnu: + case INS_fcvtps: + case INS_fcvtpu: + case INS_fcvtzs: + case INS_fcvtzu: + case INS_scvtf: + case INS_ucvtf: + + case INS_frinta: + case INS_frinti: + case INS_frintm: + case INS_frintn: + case INS_frintp: + case 
INS_frintx: + case INS_frintz: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_3C; + break; + + case INS_fcvtl: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + + case INS_fsqrt: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + result.insThroughput = PERFSCORE_THROUGHPUT_19C; + result.insLatency = PERFSCORE_LATENCY_22C; + } + else + { + // S-form + assert(id->idOpSize() == EA_4BYTE); + result.insThroughput = PERFSCORE_THROUGHPUT_9C; + result.insLatency = PERFSCORE_LATENCY_12C; + } + break; + + default: + // all other instructions + perfScoreUnhandledInstruction(id, &result); + break; + } + break; + + case IF_DV_3B: // fadd, fsub, fdiv, fmul, fmulx, fmla, fmls, fmin, fmax, fabd, faddp, fcmXX (vector) + switch (ins) + { + case INS_fmin: + case INS_fmax: + case INS_fabd: + case INS_fadd: + case INS_fsub: + case INS_faddp: + case INS_fmul: + case INS_fmulx: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + + case INS_fcmeq: + case INS_fcmge: + case INS_fcmgt: + case INS_fcmle: + case INS_fcmlt: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + + case INS_fdiv: + if ((id->idInsOpt() == INS_OPTS_2S) || (id->idInsOpt() == INS_OPTS_4S)) + { + // S-form + result.insThroughput = PERFSCORE_THROUGHPUT_10C; + result.insLatency = PERFSCORE_LATENCY_13C; + } + else + { + // D-form + assert(id->idInsOpt() == INS_OPTS_2D); + result.insThroughput = PERFSCORE_THROUGHPUT_10C; + result.insLatency = PERFSCORE_LATENCY_22C; + } + break; + + default: + // all other instructions + perfScoreUnhandledInstruction(id, &result); + break; + } + break; + + case IF_DV_3BI: // fmul, fmulx, fmla, fmls (vector by elem) + case IF_DV_3AI: // mul, mla, mls (vector by elem) + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + + case IF_DV_4A: // 
fmadd, fmsub, fnmadd, fnmsub (scalar) + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + + case IF_DV_3D: // fadd, fsub, fdiv, fmul, fmulx, fmla, fmls, fmin, fmax, fabd, faddp, fcmXX (scalar) + switch (ins) + { + case INS_fadd: + case INS_fsub: + case INS_fmin: + case INS_fmax: + case INS_fmul: + case INS_fmulx: + case INS_fnmul: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + + case INS_fdiv: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + result.insThroughput = PERFSCORE_THROUGHPUT_6C; + result.insLatency = PERFSCORE_LATENCY_15C; + } + else + { + // S-form + assert(id->idOpSize() == EA_4BYTE); + result.insThroughput = PERFSCORE_THROUGHPUT_3C; + result.insLatency = PERFSCORE_LATENCY_10C; + } + break; + + default: + // all other instructions + perfScoreUnhandledInstruction(id, &result); + break; + } + break; + + case IF_DV_2H: // fmov, fcvtXX - to general + // fmov : FP transfer to general register + // fcvtaXX : FP convert from vector to general + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_3C; + break; + + case IF_DV_2I: // fmov, Xcvtf - from general + switch (ins) + { + case INS_fmov: + // FP transfer from general register + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + + case INS_scvtf: + case INS_ucvtf: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_5C; + break; + + default: + // all other instructions + perfScoreUnhandledInstruction(id, &result); + break; + } + break; + + case IF_DV_3C: // mov, and, bic, eor, mvn, orn, bsl, bit, bif (vector) + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + case IF_DV_2E: // mov, dup (scalar) + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + + case IF_DV_2F:
// mov, ins (element) + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + + case IF_DV_2B: // smov, umov - to general) + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + + case IF_DV_2C: // mov, dup, ins - from general) + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + if (ins == INS_dup) + { + result.insLatency = PERFSCORE_LATENCY_3C; + } + else + { + assert((ins == INS_ins) || (ins == INS_mov)); + result.insLatency = PERFSCORE_LATENCY_2C; + } + break; + + case IF_DV_2D: // dup (dvector) + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + + case IF_DV_3A: // (vector) + // add, sub, mul, mla, mls, cmeq, cmge, cmgt, cmhi, cmhs, ctst, + // pmul, saba, uaba, sabd, uabd, umin, umax, smin, smax, + switch (ins) + { + case INS_add: + case INS_sub: + case INS_cmeq: + case INS_cmge: + case INS_cmgt: + case INS_cmhi: + case INS_cmhs: + case INS_umin: + case INS_umax: + case INS_smin: + case INS_smax: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + + case INS_ctst: + case INS_pmul: + case INS_sabd: + case INS_uabd: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_3C; + break; + + case INS_mul: + case INS_mla: + case INS_mls: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + + case INS_saba: + case INS_uaba: + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + + default: + // all other instructions + perfScoreUnhandledInstruction(id, &result); + break; + } + break; + + case IF_DV_3DI: // mul, mla, mls (scalar by elem) + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + + case IF_DV_3E: // add, sub, cmeq, cmge, cmgt, cmhi, cmhs, ctst, (scalar) + result.insThroughput 
= PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + + case IF_DV_2L: // abs, neg, cmeq, cmge, cmgt, cmle, cmlt (scalar) + case IF_DV_2M: // (vector) + // abs, neg, mvn, not, cmeq, cmge, cmgt, cmle, cmlt, + // addv, saddlv, uaddlv, smaxv, sminv, umaxv, uminv + // cls, clz, cnt, rbit, rev16, rev32, rev64, + // xtn, xtn2, shll, shll2 + switch (ins) + { + case INS_abs: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_3C; + break; + + case INS_addv: + case INS_saddlv: + case INS_uaddlv: + case INS_cls: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + break; + + case INS_sminv: + case INS_smaxv: + case INS_uminv: + case INS_umaxv: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + + case INS_cmeq: + case INS_cmge: + case INS_cmgt: + case INS_cmle: + case INS_cmlt: + + case INS_clz: + case INS_cnt: + case INS_rbit: + case INS_rev16: + case INS_rev32: + case INS_rev64: + case INS_xtn: + case INS_xtn2: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + + case INS_mvn: + case INS_not: + case INS_neg: + case INS_shll: + case INS_shll2: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + default: + // all other instructions + perfScoreUnhandledInstruction(id, &result); + break; + } + break; + + case IF_DV_2N: // sshr, ssra, srshr, srsra, shl, ushr, usra, urshr, ursra, sri, sli (shift by immediate - + // scalar) + case IF_DV_2O: // sshr, ssra, srshr, srsra, shl, ushr, usra, urshr, ursra, sri, sli (shift by immediate - + // vector) + // sshll, sshll2, ushll, ushll2, shrn, shrn2, rshrn, rshrn2, sxtl, sxtl2, uxtl, uxtl2 + switch (ins) + { + case INS_shl: + case INS_shrn: + case INS_shrn2: + case INS_sli: + case INS_sri: + case INS_sshr: + case INS_ushr: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; +
result.insLatency = PERFSCORE_LATENCY_2C; + break; + + case INS_shll: + case INS_shll2: + case INS_sshll: + case INS_sshll2: + case INS_ushll: + case INS_ushll2: + case INS_sxtl: + case INS_sxtl2: + case INS_uxtl: + case INS_uxtl2: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + + case INS_rshrn: + case INS_rshrn2: + case INS_srshr: + case INS_urshr: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_3C; + break; + + default: + // all other instructions + perfScoreUnhandledInstruction(id, &result); + break; + } + break; + + case IF_DV_2P: // aese, aesd, aesmc, aesimc, sha1su1, sha256su0 + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + + case IF_DV_3F: // sha1c, sha1m, sha1p, sha1su0, sha256h, sha256h2, sha256su1 (vector) + switch (ins) + { + case INS_sha1su0: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + + case INS_sha256su0: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + break; + + case INS_sha1c: + case INS_sha1m: + case INS_sha1p: + case INS_sha256h: + case INS_sha256h2: + case INS_sha256su1: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + + default: + // all other instructions + perfScoreUnhandledInstruction(id, &result); + break; + } + break; + + case IF_SI_0A: // brk imm16 + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + default: + // all other instructions + perfScoreUnhandledInstruction(id, &result); + break; } return result; diff --git a/src/coreclr/src/jit/emitxarch.cpp b/src/coreclr/src/jit/emitxarch.cpp index 33d8592de0c93..90b254f00d28f 100644 --- a/src/coreclr/src/jit/emitxarch.cpp +++ b/src/coreclr/src/jit/emitxarch.cpp @@ -14024,10 +14024,8 @@ emitter::insExecutionCharacteristics 
emitter::getInsExecutionCharacteristics(ins break; default: - // all others - assert(!"unreached"); - result.insThroughput = PERFSCORE_THROUGHPUT_DEFAULT; - result.insLatency = PERFSCORE_LATENCY_DEFAULT; + // unhandled instruction insFmt combination + perfScoreUnhandledInstruction(id, &result); break; } break; @@ -14051,9 +14049,9 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; default: - // all others - assert(!"unreached"); - result.insThroughput = PERFSCORE_THROUGHPUT_DEFAULT; + // unhandled instruction insFmt combination + perfScoreUnhandledInstruction(id, &result); + break; } break; @@ -14084,9 +14082,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; default: - // all others - assert(!"unreached"); - result.insThroughput = PERFSCORE_THROUGHPUT_DEFAULT; + // unhandled instruction insFmt combination + perfScoreUnhandledInstruction(id, &result); break; } break; @@ -14220,8 +14217,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; default: - assert(!"Unhandled insFmt for INS_call"); - result.insThroughput = PERFSCORE_THROUGHPUT_DEFAULT; + // unhandled instruction, insFmt combination + perfScoreUnhandledInstruction(id, &result); break; } break; @@ -15063,12 +15060,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; default: - // static instruction s_ins = ins; - // printf("Unsupported instruction: %s", codeGen->genInsName(ins)); - // assert(!"Unhandled ins for getInsExecutionCharacteristics"); - // all other ins - result.insThroughput = PERFSCORE_THROUGHPUT_DEFAULT; - result.insLatency = PERFSCORE_LATENCY_DEFAULT; + // unhandled instruction insFmt combination + perfScoreUnhandledInstruction(id, &result); break; }