Skip to content

Commit

Permalink
mem-ruby: Reduce handshaking between CorePair and dir
Browse files Browse the repository at this point in the history
Currently, when data is downgraded by MOESI_AMD_Base-CorePair (e.g., due to a replacement),
the downgrade requires a 4-way handshake between the CorePair and the dir.
Specifically, the CorePair sends a message telling the dir it would like to downgrade; then
the dir sends an ACK back; then the CorePair writes the data back; and finally
the dir ACKs the writeback.
This is very inefficient and not representative of how modern protocols perform a downgrade.
Accordingly, this commit updates the downgrade support so that the CorePair writes back
the data immediately and the dir then ACKs it.
Thus, this approach requires only a 2-way handshake.

Change-Id: I7ebc85bb03e8ce46a8847e3240fc170120e9fcd6
  • Loading branch information
Neeraj Surawar committed May 16, 2024
1 parent 0df5635 commit d8c01e0
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 23 deletions.
27 changes: 11 additions & 16 deletions src/mem/ruby/protocol/MOESI_AMD_Base-CorePair.sm
Original file line number Diff line number Diff line change
Expand Up @@ -730,7 +730,7 @@ machine(MachineType:CorePair, "CP-like Core Coherence")
out_msg.DataBlk := cache_entry.DataBlk;
assert(cache_entry.Dirty);
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.Type := CoherenceRequestType:VicDirty;
out_msg.InitialRequestTime := curCycle();
if (cache_entry.CacheState == State:O) {
Expand Down Expand Up @@ -1114,20 +1114,19 @@ machine(MachineType:CorePair, "CP-like Core Coherence")
}
}

// Now that data is written back immediately when a line is downgraded,
// this action is no longer used. It is kept for reference in case a
// future change needs a separate data writeback to the dir.
action(wb_data, "wb", desc="write back data") {
peek(responseToCore_in, ResponseMsg) {
peek(mandatoryQueue_in, RubyRequest) {
enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:CPUData;
out_msg.Sender := machineID;
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
out_msg.DataBlk := tbe.DataBlk;
out_msg.Dirty := tbe.Dirty;
if (tbe.Shared) {
out_msg.NbReqShared := true;
} else {
out_msg.NbReqShared := false;
}
out_msg.DataBlk := cache_entry.DataBlk; // Data only used for WBs
out_msg.Dirty := cache_entry.Dirty;
out_msg.NbReqShared := false;
out_msg.State := CoherenceState:Shared; // faux info
out_msg.MessageSize := MessageSizeType:Writeback_Data;
DPRINTF(RubySlicc, "%s\n", out_msg);
Expand Down Expand Up @@ -1682,7 +1681,7 @@ machine(MachineType:CorePair, "CP-like Core Coherence")
p_popMandatoryQueue;
}

transition({Ms, M0, M1, O}, Ifetch0_L1miss, MO_S0) {L1ITagArrayRead, L2DataArrayRead, L2TagArrayRead} {
transition({Ms, M0, M1, O}, Ifetch0_L1miss, MO_S0) {L1ITagArrayRead, L2DataArrayRead, L2TagArrayWrite} {
l2m_profileMiss; // permissions miss
l1im_profileMiss;
ai_allocateL1I;
Expand All @@ -1693,7 +1692,7 @@ machine(MachineType:CorePair, "CP-like Core Coherence")
p_popMandatoryQueue;
}

transition({Ms, M0, M1, O}, Ifetch1_L1miss, MO_S1) {L1ITagArrayRead, L2TagArrayRead, L2DataArrayRead } {
transition({Ms, M0, M1, O}, Ifetch1_L1miss, MO_S1) {L1ITagArrayRead, L2TagArrayWrite, L2DataArrayRead } {
l2m_profileMiss; // permissions miss
l1im_profileMiss;
ai_allocateL1I;
Expand Down Expand Up @@ -2065,7 +2064,7 @@ machine(MachineType:CorePair, "CP-like Core Coherence")
ii_invIcache;
}

transition({Ms, M0, M1, O}, L2_Repl, MO_I) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead, L1D1TagArrayRead} {
transition({Ms, M0, M1, O}, L2_Repl, MO_I) {L2TagArrayWrite, L2DataArrayRead, L1D0TagArrayRead, L1D1TagArrayRead} {
forward_eviction_to_cpu0;
forward_eviction_to_cpu1;
t_allocateTBE;
Expand Down Expand Up @@ -2427,19 +2426,16 @@ machine(MachineType:CorePair, "CP-like Core Coherence")
}

transition(MO_I, NB_AckWB, I) {L2TagArrayWrite} {
wb_data;
d_deallocateTBE;
pr_popResponseQueue;
}

transition(ES_I, NB_AckWB, I) {L2TagArrayWrite} {
wb_data;
d_deallocateTBE;
pr_popResponseQueue;
}

transition(MO_S0, NB_AckWB, S0) {L2TagArrayWrite} {
wb_data;
i2_invL2;
a2_allocateL2;
d_deallocateTBE; // FOO
Expand All @@ -2448,7 +2444,6 @@ machine(MachineType:CorePair, "CP-like Core Coherence")
}

transition(MO_S1, NB_AckWB, S1) {L2TagArrayWrite} {
wb_data;
i2_invL2;
a2_allocateL2;
d_deallocateTBE; // FOO
Expand Down
37 changes: 30 additions & 7 deletions src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,9 @@ machine(MachineType:Directory, "AMD Baseline protocol")
if (in_msg.Type == CoherenceResponseType:CPUPrbResp) {
trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe);
} else if (in_msg.Type == CoherenceResponseType:CPUData) {
//Deprecated - WriteBack data (wb_data) is now sent with VicDirty itself
//Leaving this in for reference
assert(0);
trigger(Event:CPUData, in_msg.addr, entry, tbe);
} else if (in_msg.Type == CoherenceResponseType:StaleNotif) {
trigger(Event:StaleWB, in_msg.addr, entry, tbe);
Expand Down Expand Up @@ -968,7 +971,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
}

action(d_writeDataToMemory, "d", desc="Write data to memory") {
peek(responseNetwork_in, ResponseMsg) {
peek(requestNetwork_in, CPURequestMsg) {
enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
out_msg.addr := address;
out_msg.Type := MemoryRequestType:MEMORY_WB;
Expand Down Expand Up @@ -1175,12 +1178,12 @@ machine(MachineType:Directory, "AMD Baseline protocol")
}

action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") {
peek(responseNetwork_in, ResponseMsg) {
peek(requestNetwork_in, CPURequestMsg) {
if (L3CacheMemory.isTagPresent(address)) {
CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) ");
entry.DataBlk := in_msg.DataBlk;
entry.LastSender := in_msg.Sender;
entry.LastSender := in_msg.Requestor;
assert(is_valid(tbe));
//The controller always allocates a TBE entry upon receipt of a request from L2 caches.
//L3Hit flag is used by the hit profiling action pr_profileL3HitMiss to determine hit or miss.
Expand All @@ -1205,7 +1208,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
APPEND_TRANSITION_COMMENT(" al wrote data to L3 ");
entry.DataBlk := in_msg.DataBlk;

entry.LastSender := in_msg.Sender;
entry.LastSender := in_msg.Requestor;
}
}
}
Expand Down Expand Up @@ -1397,22 +1400,38 @@ machine(MachineType:Directory, "AMD Baseline protocol")
p_popRequestQueue;
}

transition(U, VicDirty, BL) {L3TagArrayRead} {
transition(U, VicDirty, U) {L3TagArrayRead, L3DataArrayWrite} {
t_allocateTBE;
w_sendResponseWBAck;
d_writeDataToMemory;
al_allocateL3Block;
pr_profileL3HitMiss; //Must come after al_allocateL3Block and before dt_deallocateTBE
wada_wakeUpAllDependentsAddr;
dt_deallocateTBE;
p_popRequestQueue;
}

transition(U, VicClean, BL) {L3TagArrayRead} {
transition(U, VicClean, U) {L3TagArrayRead, L3DataArrayWrite} {
t_allocateTBE;
w_sendResponseWBAck;
d_writeDataToMemory;
al_allocateL3Block;
pr_profileL3HitMiss; //Must come after al_allocateL3Block and before dt_deallocateTBE
wada_wakeUpAllDependentsAddr;
dt_deallocateTBE;
p_popRequestQueue;
}

// In state BL, a VicDirty or VicClean event is not serviced here: the only
// action is zz_recycleRequestQueue, and no next state is given.
// (Presumably the request is re-queued for a later retry -- the action body
// is not visible in this view; confirm.)
transition(BL, {VicDirty, VicClean}) {
zz_recycleRequestQueue;
}

// CPUData is now deprecated: writeback data is sent with the VicDirty request
// itself, and the in-port asserts before triggering Event:CPUData.
// NOTE(review): this handler recycles the *request* queue, but the CPUData
// event is raised from a message of CoherenceResponseType:CPUData -- confirm
// that the request queue is really the intended one.
// NOTE(review): another transition on (BL, CPUData) is defined in this file;
// verify that the two definitions do not conflict.
transition(BL, {CPUData}) {
zz_recycleRequestQueue;
}

//CPUData is now deprecated
transition(BL, CPUData, U) {L3TagArrayWrite, L3DataArrayWrite} {
d_writeDataToMemory;
al_allocateL3Block;
Expand All @@ -1423,7 +1442,11 @@ machine(MachineType:Directory, "AMD Baseline protocol")
}

transition(BL, StaleWB, U) {L3TagArrayWrite} {
dt_deallocateTBE;
wada_wakeUpAllDependentsAddr;
pr_popResponseQueue;
}

// A StaleWB event received in state U leaves the directory in U. The
// transition is tagged with the L3TagArrayWrite resource, invokes
// wada_wakeUpAllDependentsAddr, and then pops the response queue.
// No TBE action is invoked on this path.
transition(U, StaleWB, U) {L3TagArrayWrite} {
wada_wakeUpAllDependentsAddr;
pr_popResponseQueue;
}
Expand Down

0 comments on commit d8c01e0

Please sign in to comment.