Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/profiler/non determinism #586

Merged
merged 4 commits into from
May 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion rtlib/functions/dp_read.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ void __dp_read(LID lid, ADDR addr, char *var) {
timers->stop_and_add(TimerRegion::STACK_CHECK_READ_ACCESS);
// !TEST

// addAccessInfo(true, lid, var, addr);
int64_t workerID =
((addr - (addr % 4)) % (NUM_WORKERS * 4)) / 4; // implicit "floor"
AccessInfo &current = tempAddrChunks[workerID][tempAddrCount[workerID]++];
Expand Down
118 changes: 7 additions & 111 deletions rtlib/iFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -694,83 +694,6 @@ string getMemoryRegionIdFromAddr(string fallback, ADDR addr) {
*/
}

/**
 * Records a single memory access in the staging chunk of the worker that is
 * selected by the accessed address, and hands the chunk over to that worker's
 * queue once it holds CHUNK_SIZE entries.
 *
 * @param isRead true for a read access, false for a write access
 * @param lid    line id of the access; loop metadata is OR-ed into its upper
 *               32 bits here, so those bits are expected to arrive empty
 * @param var    name of the accessed variable; also used as the fallback for
 *               the memory region lookup
 * @param addr   accessed address; determines the worker and is stored as-is
 */
void addAccessInfo(bool isRead, LID lid, char *var, ADDR addr) {
  timers->start(TimerRegion::ADD_ACCESS_INFO);

#ifdef DP_RTLIB_VERBOSE
  cout << "enter addAccessInfo\n";
#endif
  // Round the address down to a multiple of 4 and map the resulting word
  // index onto the range [0, NUM_WORKERS).
  int64_t workerID =
      ((addr - (addr % 4)) % (NUM_WORKERS * 4)) / 4; // implicit "floor"
  numAccesses[workerID]++;
  AccessInfo &current = tempAddrChunks[workerID][tempAddrCount[workerID]++];
  current.isRead = isRead;
  current.lid = lid;
  current.var = var;
  current.AAvar = getMemoryRegionIdFromAddr(var, addr);
  current.addr = addr;

  // Loop metadata packed into the upper 32 bits of current.lid:
  //   bits 56-63 : masked id of the first loop on the stack
  //                (0xFF when no loop is active)
  //   bit  55    : iteration count of the top loop is valid
  //   bits 48-54 : masked iteration count of the top loop
  //   bit  47    : iteration count of the loop below the top is valid
  //   bits 40-46 : masked iteration count of the loop below the top
  //   bit  39    : iteration count of the loop two below the top is valid
  //   bits 32-38 : masked iteration count of the loop two below the top
  // The masked values are sufficient because the metadata is only used to
  // check for different iterations, not for exact values.
  const auto loopDepth = loopStack->size();
  if (loopDepth > 0) {
    current.lid |= ((LID)(loopStack->first().loopID & 0xFF))
                   << 56; // add masked loop id
    current.lid |= ((LID)(loopStack->top().count & 0x7F))
                   << 48; // add masked loop count
    current.lid |= (LID)0x0080000000000000; // mark loop count valid

    if (loopDepth >= 2) {
      current.lid |= ((LID)(loopStack->topMinusN(1).count & 0x7F))
                     << 40; // add masked loop count
      current.lid |= (LID)0x0000800000000000; // mark loop count valid
    }

    if (loopDepth >= 3) {
      current.lid |= ((LID)(loopStack->topMinusN(2).count & 0x7F))
                     << 32; // add masked loop count
      current.lid |= (LID)0x0000008000000000; // mark loop count valid
    }
  } else {
    // mark loopID as invalid (0xFF to allow 0 as valid loop id)
    current.lid |= ((LID)0xFF) << 56;
  }

  // Staging chunk is full: publish it to the worker's queue under the
  // worker's mutex, signal the worker, and start a fresh staging chunk.
  if (tempAddrCount[workerID] == CHUNK_SIZE) {
    pthread_mutex_lock(&addrChunkMutexes[workerID]);
    addrChunkPresent[workerID] = true;
    chunks[workerID].push(tempAddrChunks[workerID]);
    pthread_cond_signal(&addrChunkPresentConds[workerID]);
    pthread_mutex_unlock(&addrChunkMutexes[workerID]);
    tempAddrChunks[workerID] = new AccessInfo[CHUNK_SIZE];
    tempAddrCount[workerID] = 0;
  }
#ifdef DP_RTLIB_VERBOSE
  cout << "exit addAccessInfo\n";
#endif

  timers->stop_and_add(TimerRegion::ADD_ACCESS_INFO);
}

void mergeDeps() {
depSet *tmp_depSet = nullptr; // pointer to the current processing set of dps
depMap::iterator globalPos; // position of the current processing lid in allDeps
Expand Down Expand Up @@ -833,12 +756,14 @@ void *analyzeDeps(void *arg) {
// analyze data dependences

for (unsigned short i = 0; i < CHUNK_SIZE; ++i) {
timers->start(TimerRegion::ANALYZE_DEPS_INNER);
access = accesses[i];

if (access.isRead) {
// hybrid analysis
if (access.skip) {
SMem->insertToRead(access.addr, access.lid);
timers->stop_and_add(TimerRegion::ANALYZE_DEPS_INNER);
continue;
}
// End HA
Expand Down Expand Up @@ -878,6 +803,7 @@ void *analyzeDeps(void *arg) {
}
}
}
timers->stop_and_add(TimerRegion::ANALYZE_DEPS_INNER);
}

// delete the current chunk at the end
Expand Down Expand Up @@ -973,40 +899,10 @@ void clearStackAccesses(ADDR stack_lower_bound, ADDR stack_upper_bound) {
timers->start(TimerRegion::CLEAR_STACK_ACCESSES);

for (ADDR addr : scopeManager->getCurrentScope().get_first_write()) {
int64_t workerID =
((addr - (addr % 4)) % (NUM_WORKERS * 4)) / 4; // implicit "floor"
// cleanup reads
AccessInfo &cleanupReadCurrent =
tempAddrChunks[workerID][tempAddrCount[workerID]++];
cleanupReadCurrent.addr = addr;
cleanupReadCurrent.lid = 0;
cleanupReadCurrent.isRead = true;

if (tempAddrCount[workerID] == CHUNK_SIZE) {
pthread_mutex_lock(&addrChunkMutexes[workerID]);
addrChunkPresent[workerID] = true;
chunks[workerID].push(tempAddrChunks[workerID]);
pthread_cond_signal(&addrChunkPresentConds[workerID]);
pthread_mutex_unlock(&addrChunkMutexes[workerID]);
tempAddrChunks[workerID] = new AccessInfo[CHUNK_SIZE];
tempAddrCount[workerID] = 0;
}
// cleanup writes
AccessInfo &cleanupWriteCurrent =
tempAddrChunks[workerID][tempAddrCount[workerID]++];
cleanupWriteCurrent.addr = addr;
cleanupWriteCurrent.lid = 0;
cleanupWriteCurrent.isRead = false;

if (tempAddrCount[workerID] == CHUNK_SIZE) {
pthread_mutex_lock(&addrChunkMutexes[workerID]);
addrChunkPresent[workerID] = true;
chunks[workerID].push(tempAddrChunks[workerID]);
pthread_cond_signal(&addrChunkPresentConds[workerID]);
pthread_mutex_unlock(&addrChunkMutexes[workerID]);
tempAddrChunks[workerID] = new AccessInfo[CHUNK_SIZE];
tempAddrCount[workerID] = 0;
}
//cleanup reads
__dp_read(0, addr, "");
//cleanup writes
__dp_write(0, addr, "");
}

timers->stop_and_add(TimerRegion::CLEAR_STACK_ACCESSES);
Expand Down
2 changes: 0 additions & 2 deletions rtlib/iFunctions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,6 @@ void *analyzeDeps(void *arg);

std::string getMemoryRegionIdFromAddr(std::string fallback, ADDR addr);

void addAccessInfo(bool isRead, LID lid, char *var, ADDR addr);

void finalizeParallelization();

void clearStackAccesses(ADDR stack_lower_bound, ADDR stack_upper_bound);
Expand Down
2 changes: 1 addition & 1 deletion rtlib/iFunctionsTypes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ struct AccessInfo {
: isRead(isRead), lid(lid), var(var), AAvar(AAvar), addr(addr),
skip(skip) {}

AccessInfo() : lid(0) {}
AccessInfo() : isRead(false), lid(0), var(""), AAvar(""), addr(0), skip(false) {}

bool isRead;
// hybrid analysis
Expand Down
4 changes: 2 additions & 2 deletions share/include/timer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ enum class TimerRegion : unsigned int {
READ_RUNTIME_INFO,
INIT_PARALLELIZATION,
GET_MEMORY_REGION_ID_FROM_ADDR,
ADD_ACCESS_INFO,
MERGE_DEPS,
ANALYZE_DEPS,
ANALYZE_DEPS_INNER,
FINALIZE_PARALLELIZATION,
CLEAR_STACK_ACCESSES,

Expand Down Expand Up @@ -203,14 +203,14 @@ class Timers {
print(stream, " Add a dependency : ", TimerRegion::ADD_DEP);
print(stream, " Merge dendencies : ", TimerRegion::MERGE_DEPS);
print(stream, " Analyze the dependencies (incorrect! : ", TimerRegion::ANALYZE_DEPS); // Incorrect due to multithreading
print(stream, " Analyze the dependencies (inner) : ", TimerRegion::ANALYZE_DEPS_INNER);
stream << '\n';
print(stream, " Output the dependencies : ", TimerRegion::OUTPUT_DEPS);
print(stream, " Output the loops : ", TimerRegion::OUTPUT_LOOPS);
print(stream, " Output the functions : ", TimerRegion::OUTPUT_FUNCS);
print(stream, " Output the allocations : ", TimerRegion::OUTPUT_ALLOCATIONS);
stream << '\n';
print(stream, " Get memory region by id from address : ", TimerRegion::GET_MEMORY_REGION_ID_FROM_ADDR);
print(stream, " Add access information : ", TimerRegion::ADD_ACCESS_INFO);
print(stream, " Clear the stack accesses : ", TimerRegion::CLEAR_STACK_ACCESSES);
stream << '\n';

Expand Down
Loading