diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc index ebd6e2a4d4c..c5e0a4ebbd5 100644 --- a/src/cpu/o3/commit.cc +++ b/src/cpu/o3/commit.cc @@ -165,35 +165,27 @@ Commit::CommitStats::CommitStats(CPU *cpu, Commit *commit) ADD_STAT(committedInstType, statistics::units::Count::get(), "Class of committed instruction"), ADD_STAT(commitEligibleSamples, statistics::units::Cycle::get(), - "number cycles where commit BW limit reached") -{ - using namespace statistics; + "number cycles where commit BW limit reached"), + ADD_STAT(committedInst, statistics::units::Count::get(), + "Required for Top-Down, number of committed instructions") { + using namespace statistics; - commitSquashedInsts.prereq(commitSquashedInsts); - commitNonSpecStalls.prereq(commitNonSpecStalls); - branchMispredicts.prereq(branchMispredicts); + commitSquashedInsts.prereq(commitSquashedInsts); + commitNonSpecStalls.prereq(commitNonSpecStalls); + branchMispredicts.prereq(branchMispredicts); - numCommittedDist - .init(0,commit->commitWidth,1) - .flags(statistics::pdf); + numCommittedDist.init(0, commit->commitWidth, 1).flags(statistics::pdf); - amos - .init(cpu->numThreads) - .flags(total); + amos.init(cpu->numThreads).flags(total); - membars - .init(cpu->numThreads) - .flags(total); + membars.init(cpu->numThreads).flags(total); - functionCalls - .init(commit->numThreads) - .flags(total); + functionCalls.init(commit->numThreads).flags(total); - committedInstType - .init(commit->numThreads,enums::Num_OpClass) - .flags(total | pdf | dist); + committedInstType.init(commit->numThreads, enums::Num_OpClass) + .flags(total | pdf | dist); - committedInstType.ysubnames(enums::OpClassStrings); + committedInstType.ysubnames(enums::OpClassStrings); } void @@ -1104,6 +1096,7 @@ Commit::commitInsts() DPRINTF(CommitRate, "%i\n", num_committed); stats.numCommittedDist.sample(num_committed); + stats.committedInst += num_committed; if (num_committed == commitWidth) { stats.commitEligibleSamples++; diff --git 
a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh index 4fff9fe892d..3f16f1adc87 100644 --- a/src/cpu/o3/commit.hh +++ b/src/cpu/o3/commit.hh @@ -490,7 +490,12 @@ class Commit /** Number of cycles where the commit bandwidth limit is reached. */ statistics::Scalar commitEligibleSamples; + /** Top-Down methodology: total number of committed instructions, accumulated from num_committed in Commit::commitInsts(). */ + statistics::Scalar committedInst; } stats; + + public: + /** Read-only access to the commit statistics group; the Top-Down formulas in cpu.cc read committedInst through it. */ const CommitStats &getStats() const { return stats; } }; } // namespace o3 diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index 568ca436637..71ef8865779 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -355,7 +355,8 @@ CPU::CPUStats::CPUStats(CPU *cpu) "to idling"), ADD_STAT(quiesceCycles, statistics::units::Cycle::get(), "Total number of cycles that CPU has spent quiesced or waiting " - "for an interrupt") + "for an interrupt"), + topDownStats(cpu) { // Register any of the O3CPU's stats here. timesIdled @@ -368,6 +369,198 @@ CPU::CPUStats::CPUStats(CPU *cpu) .prereq(quiesceCycles); } +CPU::CPUStats::TopDownStats::TopDownStats(CPU *cpu) + : statistics::Group(cpu, "TopDownStats"), topDownL1(cpu), topDownFbL2(cpu), + topDownBsL2(cpu), topDownBbL2(cpu), topDownBbMem(cpu) {} + +CPU::CPUStats::TopDownStats::TopDownL1::TopDownL1(CPU *cpu) + : statistics::Group(cpu, "TopDownL1"), + ADD_STAT(frontendBound, + statistics::units::Rate::get(), + "Frontend Bound, fraction of slots lost due to frontend " + "undersupplying the backend"), + ADD_STAT(badSpeculation, + statistics::units::Rate::get(), + "Bad Speculation, fraction of slots lost due to mispeculation"), + ADD_STAT(backendBound, + statistics::units::Rate::get(), + "Backend Bound, fraction of slots lost due to backend resource" + " constraints."), + ADD_STAT( + retiring, + statistics::units::Rate::get(), + "Retiring, fraction of slots successfully retired by the backend") { + + // Total Slots + statistics::Temp totalSlots = + cpu->rename.getWidth() * cpu->baseStats.numCycles; + + // L1 Frontend Bound +
frontendBound = cpu->fetch.getStats().fetchBubbles / (totalSlots); + + // L1 Bad Speculation + // Recovery cycles for mispredictions detected at Decode + int recoveryCycleToDecode = cpu->decode.getFetchToDecodeDelay(); + + auto decodeBranchMispred = + (int)recoveryCycleToDecode * cpu->decode.getStats().branchMispred; + + // Recovery cycles for mispredictions detected at IEW + int recoveryCycleToIEW = cpu->decode.getFetchToDecodeDelay() + + cpu->rename.getDecodeToRenameDelay() + + cpu->iew.getRenameToIEWDelay(); + + auto iewBadSpec = + (int)recoveryCycleToIEW * (cpu->iew.getStats().branchMispredicts + + cpu->iew.getStats().memOrderViolationEvents); + + // Number of wasted slots due to bad speculation + auto wastedSlots = cpu->rename.getStats().renamedInsts - + cpu->commit.getStats().committedInst; + + badSpeculation = (wastedSlots + (decodeBranchMispred + iewBadSpec) * + cpu->rename.getWidth()) / + (totalSlots); + + // L1 Retiring + retiring = cpu->commit.getStats().committedInst / (totalSlots); + + // L1 Backend Bound + backendBound = 1 - (frontendBound + badSpeculation + retiring); +} + +CPU::CPUStats::TopDownStats::TopDownFrontendBoundL2::TopDownFrontendBoundL2( + CPU *cpu) + : statistics::Group(cpu, "TopDownL2_FrontendBound"), + ADD_STAT(fetchLatency, + statistics::units::Rate::get(), + "Fetch Latency Bound, frontend stalls due to instruction cache " + "inefficiency"), + ADD_STAT(fetchBandwidth, + statistics::units::Rate::get(), + "Fetch Bandwidth Bound, frontend stalls due to decoder " + "inefficiency") { + // Frontend L2 + fetchLatency = + cpu->fetch.getStats().fetchBubblesMax / (cpu->baseStats.numCycles); + fetchBandwidth = + cpu->cpuStats.topDownStats.topDownL1.frontendBound - fetchLatency; +} + +CPU::CPUStats::TopDownStats::TopDownBadSpeculationL2 ::TopDownBadSpeculationL2( + CPU *cpu) + : statistics::Group(cpu, "TopDownL2_BadSpeculation"), + ADD_STAT(branchMissPredicts, + statistics::units::Rate::get(), + "Branch Miss Predicts"), + ADD_STAT(machineClears, 
+ statistics::units::Rate::get(), + "Memory Order Violations") { + auto &iewMissPred = cpu->iew.getStats().branchMispredicts; + auto &decodeMissPred = cpu->decode.getStats().branchMispred; + auto &memOrderViolations = cpu->iew.getStats().memOrderViolationEvents; + + /* Share of bad-speculation events that are branch mispredicts (IEW plus decode) rather than memory order violations. */ auto brMispredictFraction = + (iewMissPred + decodeMissPred) / + (iewMissPred + decodeMissPred + memOrderViolations); + + branchMissPredicts = brMispredictFraction * + cpu->cpuStats.topDownStats.topDownL1.badSpeculation; + + machineClears = + cpu->cpuStats.topDownStats.topDownL1.badSpeculation - branchMissPredicts; +} + +CPU::CPUStats::TopDownStats::TopDownBackendBoundL2::TopDownBackendBoundL2( + CPU *cpu) + : statistics::Group(cpu, "TopDownL2_BackendBound"), + ADD_STAT(memoryBound, + statistics::units::Rate::get(), + "Memory Bound, backend stalls due to memory subsystem"), + ADD_STAT( + coreBound, + statistics::units::Rate::get(), + "Core Bound, backend stalls due to functional unit constraints") { + // Backend L2. NOTE(review): executionStalls is assigned here but has no ADD_STAT entry in the initializer list above (only memoryBound and coreBound do) -- confirm an unregistered Formula is intended. + executionStalls = (cpu->iew.instQueue.getStats().numInstsExec0 - + cpu->rename.getStats().idleCycles + + cpu->iew.instQueue.getStats().numInstsExec1 + + cpu->iew.instQueue.getStats().numInstsExec2) / + (cpu->baseStats.numCycles); + auto memoryBoundRaw = (cpu->iew.instQueue.getStats().loadStallCycles + + cpu->rename.getStats().storeStalls) / + (cpu->baseStats.numCycles); + auto coreBoundRaw = executionStalls - memoryBoundRaw; + + auto &totalBackendBound = cpu->cpuStats.topDownStats.topDownL1.backendBound; + + memoryBound = + memoryBoundRaw / (memoryBoundRaw + coreBoundRaw) * (totalBackendBound); + coreBound = + coreBoundRaw / (memoryBoundRaw + coreBoundRaw) * (totalBackendBound); +} + +CPU::CPUStats::TopDownStats::TopDownBackendBoundL3::TopDownBackendBoundL3( + CPU *cpu) + : statistics::Group(cpu, "TopDownL3_BackendBound_MemoryBound"), + ADD_STAT(l1Bound, + statistics::units::Rate::get(), + "L1 Cache Bound"), + ADD_STAT(l2Bound, + statistics::units::Rate::get(), + "L2 Cache Bound"), +
ADD_STAT(l3Bound, + statistics::units::Rate::get(), + "L3 Cache Bound"), + ADD_STAT(extMemBound, + statistics::units::Rate::get(), + "External Memory Bound"), + ADD_STAT(storeBound, + statistics::units::Rate::get(), + "Store Bound") { + + auto &totalBackendBound = cpu->cpuStats.topDownStats.topDownBbL2.memoryBound; + + // Backend Bound / Memory Bound L3 + auto l1BoundRaw = (cpu->iew.instQueue.getStats().loadStallCycles - + cpu->iew.instQueue.getStats().L1miss) / + (cpu->baseStats.numCycles); + auto l2BoundRaw = (cpu->iew.instQueue.getStats().L1miss - + cpu->iew.instQueue.getStats().L2miss) / + (cpu->baseStats.numCycles); + auto l3BoundRaw = (cpu->iew.instQueue.getStats().L2miss - + cpu->iew.instQueue.getStats().L3miss) / + (cpu->baseStats.numCycles); + auto extMemBoundRaw = + (cpu->iew.instQueue.getStats().L3miss) / (cpu->baseStats.numCycles); + auto storeBoundRaw = + (cpu->rename.getStats().storeStalls) / (cpu->baseStats.numCycles); + + auto totalMemoryBound = + l1BoundRaw + l2BoundRaw + l3BoundRaw + extMemBoundRaw + storeBoundRaw; + + l1Bound = l1BoundRaw / totalMemoryBound * totalBackendBound; + l2Bound = l2BoundRaw / totalMemoryBound * totalBackendBound; + l3Bound = l3BoundRaw / totalMemoryBound * totalBackendBound; + extMemBound = extMemBoundRaw / totalMemoryBound * totalBackendBound; + storeBound = storeBoundRaw / totalMemoryBound * totalBackendBound; +} + void CPU::tick() { diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh index fcc34d0f986..0b8d0e43135 100644 --- a/src/cpu/o3/cpu.hh +++ b/src/cpu/o3/cpu.hh @@ -601,6 +601,56 @@ class CPU : public BaseCPU /** Stat for total number of cycles the CPU spends descheduled due to a * quiesce operation or waiting for an interrupt. 
*/ statistics::Scalar quiesceCycles; + + struct TopDownStats : statistics::Group { + TopDownStats(CPU *cpu); + + struct TopDownL1 : statistics::Group { + TopDownL1(CPU *cpu); + statistics::Formula frontendBound; + statistics::Formula badSpeculation; + statistics::Formula backendBound; + statistics::Formula retiring; + } topDownL1; + + struct TopDownFrontendBoundL2 : statistics::Group { + TopDownFrontendBoundL2(CPU *cpu); + statistics::Formula fetchLatency; + statistics::Formula fetchBandwidth; + } topDownFbL2; + + struct TopDownBadSpeculationL2 : statistics::Group{ + TopDownBadSpeculationL2(CPU *cpu); + statistics::Formula branchMissPredicts; + statistics::Formula machineClears; + } topDownBsL2; + + struct TopDownBackendBoundL2 : statistics::Group { + TopDownBackendBoundL2(CPU *cpu); + statistics::Formula executionStalls; + statistics::Formula memoryBound; + statistics::Formula coreBound; + } topDownBbL2; + + struct TopDownBackendBoundL3 : statistics::Group { + TopDownBackendBoundL3(CPU *cpu); + statistics::Formula l1Bound; + statistics::Formula l2Bound; + statistics::Formula l3Bound; + statistics::Formula extMemBound; + statistics::Formula storeBound; + } topDownBbMem; + + // struct TopDownFrontendBoundL3 : statistics::Group { + // TopDownFrontendBoundL3(CPU *cpu); + // statistics::Formula iTlbMiss; + // statistics::Formula iCacheMiss; + // statistics::Formula branchResteer; + // statistics::Formula others; + // } topDownFlL3; + + } topDownStats; + } cpuStats; public: diff --git a/src/cpu/o3/decode.cc b/src/cpu/o3/decode.cc index c5123663185..13176fc3715 100644 --- a/src/cpu/o3/decode.cc +++ b/src/cpu/o3/decode.cc @@ -158,7 +158,13 @@ Decode::DecodeStats::DecodeStats(CPU *cpu) ADD_STAT(decodedInsts, statistics::units::Count::get(), "Number of instructions handled by decode"), ADD_STAT(squashedInsts, statistics::units::Count::get(), - "Number of squashed instructions handled by decode") + "Number of squashed instructions handled by decode"), + 
ADD_STAT(fetchBubbles, statistics::units::Count::get(), + "Stat for Top-Down Methodology, number of instructions not " + "delivered to backend"), + ADD_STAT(fetchBubblesMax, statistics::units::Count::get(), + "Stat for Top-Down Methodology, number of cycles in which no " + "instructions are delivered to backend") { idleCycles.prereq(idleCycles); blockedCycles.prereq(blockedCycles); @@ -170,6 +176,8 @@ Decode::DecodeStats::DecodeStats(CPU *cpu) controlMispred.prereq(controlMispred); decodedInsts.prereq(decodedInsts); squashedInsts.prereq(squashedInsts); + fetchBubbles.prereq(fetchBubbles); + fetchBubblesMax.prereq(fetchBubblesMax); } void @@ -565,6 +573,8 @@ Decode::tick() toRenameIndex = 0; + /* Per-cycle bubble budget: starts at decodeWidth and is decremented as instructions are decoded. NOTE(review): it is reset once per tick but decremented per thread, and the member is unsigned -- confirm it cannot wrap below zero when more than one thread is active. */ fetchBubbles = decodeWidth; + list::iterator threads = activeThreads->begin(); list::iterator end = activeThreads->end(); @@ -578,6 +588,13 @@ Decode::tick() status_change = checkSignalsAndUpdate(tid) || status_change; decode(status_change, tid); + + // Skip the bubble accounting while this thread is squashing (e.g. a branch misprediction detected while decoding) + if (!(decodeStatus[tid] == Squashing)) { + stats.fetchBubbles += fetchBubbles; + if (fetchBubbles == decodeWidth) + stats.fetchBubblesMax++; + } } if (status_change) { @@ -602,9 +619,11 @@ Decode::decode(bool &status_change, ThreadID tid) // check if stall conditions have passed if (decodeStatus[tid] == Blocked) { - ++stats.blockedCycles; + fetchBubbles -= decodeWidth; + ++stats.blockedCycles; } else if (decodeStatus[tid] == Squashing) { - ++stats.squashCycles; + fetchBubbles -= decodeWidth; + ++stats.squashCycles; } // Decode should try to decode as many instructions as its bandwidth @@ -702,6 +721,7 @@ Decode::decodeInsts(ThreadID tid) ++toRenameIndex; ++stats.decodedInsts; --insts_available; + --fetchBubbles; #if TRACING_ON if (debug::O3PipeView) { diff --git a/src/cpu/o3/decode.hh b/src/cpu/o3/decode.hh index 6b0e20ea281..5dbab781f96 100644 --- a/src/cpu/o3/decode.hh +++ b/src/cpu/o3/decode.hh @@ -299,6 +299,8 @@ class Decode */ bool
squashAfterDelaySlot[MaxThreads]; + unsigned fetchBubbles = 0; + struct DecodeStats : public statistics::Group { DecodeStats(CPU *cpu); @@ -325,7 +327,18 @@ class Decode statistics::Scalar decodedInsts; /** Stat for total number of squashed instructions. */ statistics::Scalar squashedInsts; + /** Stat for Top-Down Methodology, number of instructions not delivered + * to backend */ + statistics::Scalar fetchBubbles; + /** Stat for Top-Down Methodology, number of cycles in which no + * instructions are delivered to backend */ + statistics::Scalar fetchBubblesMax; } stats; + + public: + const DecodeStats &getStats() const { return stats; } + + Cycles getFetchToDecodeDelay() { return fetchToDecodeDelay; } }; } // namespace o3 diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index c4e76fdaf4b..d51fe29f232 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -159,85 +159,73 @@ Fetch::regProbePoints() Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch) : statistics::Group(cpu, "fetch"), - ADD_STAT(predictedBranches, statistics::units::Count::get(), - "Number of branches that fetch has predicted taken"), - ADD_STAT(cycles, statistics::units::Cycle::get(), - "Number of cycles fetch has run and was not squashing or " - "blocked"), - ADD_STAT(squashCycles, statistics::units::Cycle::get(), - "Number of cycles fetch has spent squashing"), - ADD_STAT(tlbCycles, statistics::units::Cycle::get(), - "Number of cycles fetch has spent waiting for tlb"), - ADD_STAT(ftqStallCycles, statistics::units::Cycle::get(), - "Number of cycles fetch has spent waiting for FTQ to fill."), - ADD_STAT(idleCycles, statistics::units::Cycle::get(), - "Number of cycles fetch was idle"), - ADD_STAT(blockedCycles, statistics::units::Cycle::get(), - "Number of cycles fetch has spent blocked"), - ADD_STAT(miscStallCycles, statistics::units::Cycle::get(), - "Number of cycles fetch has spent waiting on interrupts, or bad " - "addresses, or out of MSHRs"), - ADD_STAT(pendingDrainCycles, 
statistics::units::Cycle::get(), - "Number of cycles fetch has spent waiting on pipes to drain"), - ADD_STAT(noActiveThreadStallCycles, statistics::units::Cycle::get(), - "Number of stall cycles due to no active thread to fetch from"), - ADD_STAT(pendingTrapStallCycles, statistics::units::Cycle::get(), - "Number of stall cycles due to pending traps"), - ADD_STAT(pendingQuiesceStallCycles, statistics::units::Cycle::get(), - "Number of stall cycles due to pending quiesce instructions"), - ADD_STAT(icacheWaitRetryStallCycles, statistics::units::Cycle::get(), - "Number of stall cycles due to full MSHR"), - ADD_STAT(cacheLines, statistics::units::Count::get(), - "Number of cache lines fetched"), - ADD_STAT(icacheSquashes, statistics::units::Count::get(), - "Number of outstanding Icache misses that were squashed"), - ADD_STAT(tlbSquashes, statistics::units::Count::get(), - "Number of outstanding ITLB misses that were squashed"), - ADD_STAT(nisnDist, statistics::units::Count::get(), - "Number of instructions fetched each cycle (Total)"), - ADD_STAT(idleRate, statistics::units::Ratio::get(), - "Ratio of cycles fetch was idle", - idleCycles / cpu->baseStats.numCycles) -{ - predictedBranches - .prereq(predictedBranches); - cycles - .prereq(cycles); - squashCycles - .prereq(squashCycles); - tlbCycles - .prereq(tlbCycles); - ftqStallCycles - .prereq(ftqStallCycles); - idleCycles - .prereq(idleCycles); - blockedCycles - .prereq(blockedCycles); - cacheLines - .prereq(cacheLines); - miscStallCycles - .prereq(miscStallCycles); - pendingDrainCycles - .prereq(pendingDrainCycles); - noActiveThreadStallCycles - .prereq(noActiveThreadStallCycles); - pendingTrapStallCycles - .prereq(pendingTrapStallCycles); - pendingQuiesceStallCycles - .prereq(pendingQuiesceStallCycles); - icacheWaitRetryStallCycles - .prereq(icacheWaitRetryStallCycles); - icacheSquashes - .prereq(icacheSquashes); - tlbSquashes - .prereq(tlbSquashes); - nisnDist - .init(/* base value */ 0, - /* last value */ 
fetch->fetchWidth, - /* bucket size */ 1) - .flags(statistics::pdf); - idleRate - .prereq(idleRate); + ADD_STAT(predictedBranches, statistics::units::Count::get(), + "Number of branches that fetch has predicted taken"), + ADD_STAT(cycles, statistics::units::Cycle::get(), + "Number of cycles fetch has run and was not squashing or " + "blocked"), + ADD_STAT(squashCycles, statistics::units::Cycle::get(), + "Number of cycles fetch has spent squashing"), + ADD_STAT(tlbCycles, statistics::units::Cycle::get(), + "Number of cycles fetch has spent waiting for tlb"), + ADD_STAT(ftqStallCycles, statistics::units::Cycle::get(), + "Number of cycles fetch has spent waiting for FTQ to fill."), + ADD_STAT(idleCycles, statistics::units::Cycle::get(), + "Number of cycles fetch was idle"), + ADD_STAT(blockedCycles, statistics::units::Cycle::get(), + "Number of cycles fetch has spent blocked"), + ADD_STAT(miscStallCycles, statistics::units::Cycle::get(), + "Number of cycles fetch has spent waiting on interrupts, or bad " + "addresses, or out of MSHRs"), + ADD_STAT(pendingDrainCycles, statistics::units::Cycle::get(), + "Number of cycles fetch has spent waiting on pipes to drain"), + ADD_STAT(noActiveThreadStallCycles, statistics::units::Cycle::get(), + "Number of stall cycles due to no active thread to fetch from"), + ADD_STAT(pendingTrapStallCycles, statistics::units::Cycle::get(), + "Number of stall cycles due to pending traps"), + ADD_STAT(pendingQuiesceStallCycles, statistics::units::Cycle::get(), + "Number of stall cycles due to pending quiesce instructions"), + ADD_STAT(icacheWaitRetryStallCycles, statistics::units::Cycle::get(), + "Number of stall cycles due to full MSHR"), + ADD_STAT(cacheLines, statistics::units::Count::get(), + "Number of cache lines fetched"), + ADD_STAT(icacheSquashes, statistics::units::Count::get(), + "Number of outstanding Icache misses that were squashed"), + ADD_STAT(tlbSquashes, statistics::units::Count::get(), + "Number of outstanding ITLB misses 
that were squashed"), + ADD_STAT(nisnDist, statistics::units::Count::get(), + "Number of instructions fetched each cycle (Total)"), + ADD_STAT(idleRate, statistics::units::Ratio::get(), + "Ratio of cycles fetch was idle", + idleCycles / cpu->baseStats.numCycles), + ADD_STAT(fetchBubbles, statistics::units::Count::get(), + "Stat for Top-Down Methodology, number of instructions not " + "delivered to backend"), + ADD_STAT(fetchBubblesMax, statistics::units::Count::get(), + "Stat for Top-Down Methodology, number of cycles in which no " + "instructions are delivered to backend") { + predictedBranches.prereq(predictedBranches); + cycles.prereq(cycles); + squashCycles.prereq(squashCycles); + tlbCycles.prereq(tlbCycles); + ftqStallCycles.prereq(ftqStallCycles); + idleCycles.prereq(idleCycles); + blockedCycles.prereq(blockedCycles); + cacheLines.prereq(cacheLines); + miscStallCycles.prereq(miscStallCycles); + pendingDrainCycles.prereq(pendingDrainCycles); + noActiveThreadStallCycles.prereq(noActiveThreadStallCycles); + pendingTrapStallCycles.prereq(pendingTrapStallCycles); + pendingQuiesceStallCycles.prereq(pendingQuiesceStallCycles); + icacheWaitRetryStallCycles.prereq(icacheWaitRetryStallCycles); + icacheSquashes.prereq(icacheSquashes); + tlbSquashes.prereq(tlbSquashes); + nisnDist + .init(/* base value */ 0, + /* last value */ fetch->fetchWidth, + /* bucket size */ 1) + .flags(statistics::pdf); + idleRate.prereq(idleRate); } void Fetch::setTimeBuffer(TimeBuffer *time_buffer) @@ -913,6 +901,20 @@ Fetch::tick() tid_itr = activeThreads->begin(); } + bool backendStall = false; + + for (ThreadID i = 0; i < numThreads; ++i) { + if ((fetchStatus[i] == Squashing) || (stalls[i].decode) || + (fetchStatus[i] == Blocked)) + backendStall = true; + } + + if (!backendStall) { + fetchStats.fetchBubbles += (fetchWidth - insts_to_decode); + if (insts_to_decode == 0) + fetchStats.fetchBubblesMax++; + } + // If there was activity this cycle, inform the CPU of it. 
if (wroteToTimeBuffer) { DPRINTF(Activity, "Activity this cycle.\n"); diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 05d92e37567..0ce07064df0 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -591,7 +591,18 @@ class Fetch statistics::Distribution nisnDist; /** Rate of how often fetch was idle. */ statistics::Formula idleRate; + /** Stat for Top-Down Methodology, number of instructions not delivered + * to backend */ + statistics::Scalar fetchBubbles; + /** Stat for Top-Down Methodology, number of cycles in which no + * instructions are delivered to backend */ + statistics::Scalar fetchBubblesMax; } fetchStats; + + public: + const FetchStatGroup &getStats() const { return fetchStats; } + + ThreadStatus getStatus(int tid) { return fetchStatus[tid]; } }; } // namespace o3 diff --git a/src/cpu/o3/fu_pool.hh b/src/cpu/o3/fu_pool.hh index f0f01c38d40..7589ce18e72 100644 --- a/src/cpu/o3/fu_pool.hh +++ b/src/cpu/o3/fu_pool.hh @@ -203,6 +203,16 @@ class FUPool : public SimObject /** Takes over from another CPU's thread. */ void takeOverFrom() {}; + + /** Returns the number of FUs currently marked busy, i.e. how many of the numFU entries in unitBusy are set. */ + int numBusyFUs() const { + int busy = 0; + for (int i = 0; i < numFU; ++i) { + if (unitBusy[i]) + busy++; + } + return busy; + } }; } // namespace o3 diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index a01c6b9deca..db491315b40 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -1331,6 +1331,7 @@ IEW::executeInsts() ++iewStats.memOrderViolationEvents; } + } else { // Reset any state associated with redirects that will not // be used. diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index 4fe8227dcc8..3b4b463e1d0 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -414,7 +414,6 @@ class IEW /** Maximum size of the skid buffer. */ unsigned skidBufferMax; - struct IEWStats : public statistics::Group { IEWStats(CPU *cpu); @@ -475,6 +474,11 @@ class IEW /** Average number of woken instructions per writeback.
*/ statistics::Formula wbFanout; } iewStats; + + public: + const IEWStats &getStats() const { return iewStats; } + + Cycles getRenameToIEWDelay() { return renameToIEWDelay; } }; } // namespace o3 diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc index b3cf330c37e..04ff7c94d08 100644 --- a/src/cpu/o3/inst_queue.cc +++ b/src/cpu/o3/inst_queue.cc @@ -180,47 +180,58 @@ InstructionQueue::name() const InstructionQueue::IQStats::IQStats(CPU *cpu, const unsigned &total_width) : statistics::Group(cpu), ADD_STAT(instsAdded, statistics::units::Count::get(), - "Number of instructions added to the IQ (excludes non-spec)"), + "Number of instructions added to the IQ (excludes non-spec)"), ADD_STAT(nonSpecInstsAdded, statistics::units::Count::get(), - "Number of non-speculative instructions added to the IQ"), + "Number of non-speculative instructions added to the IQ"), ADD_STAT(instsIssued, statistics::units::Count::get(), - "Number of instructions issued"), + "Number of instructions issued"), ADD_STAT(intInstsIssued, statistics::units::Count::get(), - "Number of integer instructions issued"), + "Number of integer instructions issued"), ADD_STAT(floatInstsIssued, statistics::units::Count::get(), - "Number of float instructions issued"), + "Number of float instructions issued"), ADD_STAT(branchInstsIssued, statistics::units::Count::get(), - "Number of branch instructions issued"), + "Number of branch instructions issued"), ADD_STAT(memInstsIssued, statistics::units::Count::get(), - "Number of memory instructions issued"), + "Number of memory instructions issued"), ADD_STAT(miscInstsIssued, statistics::units::Count::get(), - "Number of miscellaneous instructions issued"), + "Number of miscellaneous instructions issued"), ADD_STAT(squashedInstsIssued, statistics::units::Count::get(), - "Number of squashed instructions issued"), + "Number of squashed instructions issued"), ADD_STAT(squashedInstsExamined, statistics::units::Count::get(), - "Number of squashed 
instructions iterated over during squash; " - "mainly for profiling"), + "Number of squashed instructions iterated over during squash; " + "mainly for profiling"), ADD_STAT(squashedOperandsExamined, statistics::units::Count::get(), - "Number of squashed operands that are examined and possibly " - "removed from graph"), + "Number of squashed operands that are examined and possibly " + "removed from graph"), ADD_STAT(squashedNonSpecRemoved, statistics::units::Count::get(), - "Number of squashed non-spec instructions that were removed"), + "Number of squashed non-spec instructions that were removed"), ADD_STAT(numIssuedDist, statistics::units::Count::get(), - "Number of insts issued each cycle"), + "Number of insts issued each cycle"), ADD_STAT(statFuBusy, statistics::units::Count::get(), - "attempts to use FU when none available"), + "attempts to use FU when none available"), ADD_STAT(statIssuedInstType, statistics::units::Count::get(), - "Number of instructions issued per FU type, per thread"), + "Number of instructions issued per FU type, per thread"), ADD_STAT(issueRate, statistics::units::Rate< statistics::units::Count, statistics::units::Cycle>::get(), - "Inst issue rate", instsIssued / cpu->baseStats.numCycles), + "Inst issue rate", instsIssued / cpu->baseStats.numCycles), ADD_STAT(fuBusy, statistics::units::Count::get(), "FU busy when requested"), ADD_STAT(fuBusyRate, statistics::units::Rate< statistics::units::Count, statistics::units::Count>::get(), - "FU busy rate (busy events/executed inst)") + "FU busy rate (busy events/executed inst)"), + ADD_STAT(numInstsExec0, statistics::units::Count::get(), + "0 instructions executed in a cycle"), + ADD_STAT(numInstsExec1, statistics::units::Count::get(), + "1 instruction executed in a cycle"), + ADD_STAT(numInstsExec2, statistics::units::Count::get(), + "2 instructions executed in a cycle"), + ADD_STAT(loadStallCycles, statistics::units::Cycle::get(), + "Top down, no uops executed and at least 1 in-flight load"), + 
ADD_STAT(L1miss, statistics::units::Cycle::get(), "l1miss"), + ADD_STAT(L2miss, statistics::units::Cycle::get(), "l2miss"), + ADD_STAT(L3miss, statistics::units::Cycle::get(), "l1miss") { - instsAdded - .prereq(instsAdded); + instsAdded. + prereq(instsAdded); nonSpecInstsAdded .prereq(nonSpecInstsAdded); @@ -254,54 +265,52 @@ InstructionQueue::IQStats::IQStats(CPU *cpu, const unsigned &total_width) squashedNonSpecRemoved .prereq(squashedNonSpecRemoved); -/* - queueResDist - .init(Num_OpClasses, 0, 99, 2) - .name(name() + ".IQ:residence:") - .desc("cycles from dispatch to issue") - .flags(total | pdf | cdf ) - ; - for (int i = 0; i < Num_OpClasses; ++i) { - queueResDist.subname(i, opClassStrings[i]); - } -*/ + /* + queueResDist + .init(Num_OpClasses, 0, 99, 2) + .name(name() + ".IQ:residence:") + .desc("cycles from dispatch to issue") + .flags(total | pdf | cdf ) + ; + for (int i = 0; i < Num_OpClasses; ++i) { + queueResDist.subname(i, opClassStrings[i]); + } + */ numIssuedDist .init(0,total_width,1) .flags(statistics::pdf) ; -/* - dist_unissued - .init(Num_OpClasses+2) - .name(name() + ".unissued_cause") - .desc("Reason ready instruction not issued") - .flags(pdf | dist) - ; - for (int i=0; i < (Num_OpClasses + 2); ++i) { - dist_unissued.subname(i, unissued_names[i]); - } -*/ - statIssuedInstType - .init(cpu->numThreads,enums::Num_OpClass) - .flags(statistics::total | statistics::pdf | statistics::dist) - ; - statIssuedInstType.ysubnames(enums::OpClassStrings); - - // - // How long did instructions for a particular FU type wait prior to issue - // -/* - issueDelayDist - .init(Num_OpClasses,0,99,2) - .name(name() + ".") - .desc("cycles from operands ready to issue") - .flags(pdf | cdf) - ; - for (int i=0; inumThreads, enums::Num_OpClass) + .flags(statistics::total | statistics::pdf | statistics::dist); + statIssuedInstType.ysubnames(enums::OpClassStrings); + + // + // How long did instructions for a particular FU type wait prior to issue + // + /* + issueDelayDist + 
.init(Num_OpClasses,0,99,2) + .name(name() + ".") + .desc("cycles from operands ready to issue") + .flags(pdf | cdf) + ; + for (int i=0; inumThreads) @@ -323,6 +332,14 @@ InstructionQueue::IQStats::IQStats(CPU *cpu, const unsigned &total_width) .flags(statistics::total) ; fuBusyRate = fuBusy / instsIssued; + + numInstsExec0.prereq(numInstsExec0); + numInstsExec1.prereq(numInstsExec1); + numInstsExec2.prereq(numInstsExec2); + + L1miss.prereq(L1miss); + L2miss.prereq(L2miss); + L3miss.prereq(L3miss); } InstructionQueue::IQIOStats::IQIOStats(statistics::Group *parent) @@ -928,6 +945,30 @@ InstructionQueue::scheduleReadyInsts() } else { DPRINTF(IQ, "Not able to schedule any instructions.\n"); } + + int numBusyFUs = fuPool->numBusyFUs(); + + if (numBusyFUs == 0) + iqStats.numInstsExec0++; + else if (numBusyFUs == 1) + iqStats.numInstsExec1++; + else if (numBusyFUs == 2) + iqStats.numInstsExec2++; + + if (fuPool->isDrained() && + iewStage->ldstQueue.numLoads()) { // numLoads returns for all threads, + // change it to a single thread + iqStats.loadStallCycles++; + if (iewStage->ldstQueue.anyCacheLevelMisses(3)) { + iqStats.L1miss++; + iqStats.L2miss++; + iqStats.L3miss++; + } else if (iewStage->ldstQueue.anyCacheLevelMisses(2)) { + iqStats.L1miss++; + iqStats.L2miss++; + } else if (iewStage->ldstQueue.anyCacheLevelMisses(1)) + iqStats.L1miss++; + } } void diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh index 57928e74784..7b50b2b5380 100644 --- a/src/cpu/o3/inst_queue.hh +++ b/src/cpu/o3/inst_queue.hh @@ -541,6 +541,17 @@ class InstructionQueue statistics::Vector fuBusy; /** Number of times the FU was busy per instruction issued. 
*/ statistics::Formula fuBusyRate; + + /* Top down, cycles in which few ops are executed */ + statistics::Scalar numInstsExec0; + statistics::Scalar numInstsExec1; + statistics::Scalar numInstsExec2; + + /*Top down, MemStalls.AnyLoad*/ + statistics::Scalar loadStallCycles; + statistics::Scalar L1miss; + statistics::Scalar L2miss; + statistics::Scalar L3miss; } iqStats; public: @@ -561,6 +572,9 @@ class InstructionQueue statistics::Scalar fpAluAccesses; statistics::Scalar vecAluAccesses; } iqIOStats; + + public: + const IQStats &getStats() const { return iqStats; } }; } // namespace o3 diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index ad63fef633c..f81ead16cf2 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -452,6 +452,20 @@ LSQ::sendRetryResp() dcachePort.sendRetryResp(); } +bool +LSQ::anyCacheLevelMisses(int level) { + for (LSQUnit &unit : thread) { + for (auto &entry : unit.loadQueue) { + if (entry.valid() && entry.hasRequest()) { + auto req = entry.request()->mainReq(); + if (req->getAccessDepth() == level) + return true; + } + } + } + return false; +} + bool LSQ::recvTimingResp(PacketPtr pkt) { diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index c208a9fd378..d88d1aae667 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -923,6 +923,8 @@ class LSQ void sendRetryResp(); + bool anyCacheLevelMisses(int level); + protected: /** D-cache is blocked */ bool _cacheBlocked; diff --git a/src/cpu/o3/rename.cc b/src/cpu/o3/rename.cc index 83fd67f457d..8ded3719324 100644 --- a/src/cpu/o3/rename.cc +++ b/src/cpu/o3/rename.cc @@ -45,6 +45,7 @@ #include "cpu/o3/cpu.hh" #include "cpu/o3/dyn_inst.hh" +#include "cpu/o3/fu_pool.hh" #include "cpu/o3/limits.hh" #include "cpu/reg_class.hh" #include "debug/Activity.hh" @@ -147,10 +148,12 @@ Rename::RenameStats::RenameStats(statistics::Group *parent) ADD_STAT(skidInsts, statistics::units::Count::get(), "count of insts added to the skid buffer"), ADD_STAT(intReturned, statistics::units::Count::get(), - "count 
of registers freed and written back to integer free list"), + "count of registers freed and written back to integer free list"), ADD_STAT(fpReturned, statistics::units::Count::get(), - "count of registers freed and written back to floating point free list") - + "count of registers freed and written back to floating point free list"), + ADD_STAT(storeStalls, statistics::units::Cycle::get(), + "Number of cycles with few uops executed and no more stores " + "can be issued") { squashCycles.prereq(squashCycles); idleCycles.prereq(idleCycles); @@ -184,6 +187,7 @@ Rename::RenameStats::RenameStats(statistics::Group *parent) intReturned.prereq(intReturned); fpReturned.prereq(fpReturned); + storeStalls.prereq(storeStalls); } void @@ -649,6 +653,9 @@ Rename::renameInsts(ThreadID tid) tid); source = SQ; incrFullStat(source); + if (iew_ptr->fuPool->isDrained()) { + stats.storeStalls++; + } break; } } diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh index 0782645b29c..c8cfcb75416 100644 --- a/src/cpu/o3/rename.hh +++ b/src/cpu/o3/rename.hh @@ -540,7 +540,16 @@ class Rename statistics::Scalar intReturned; /** Number of registers freed and written back to floating point free list*/ statistics::Scalar fpReturned; + /** Top-Down: incremented in renameInsts() when the stall source is SQ and the FU pool reports drained. */ + statistics::Scalar storeStalls; } stats; + + public: + const RenameStats &getStats() const { return stats; } + + unsigned getWidth() const { return renameWidth; } + + int getDecodeToRenameDelay() { return decodeToRenameDelay; } }; } // namespace o3