Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
sendrecv: Test different distances
Currently sendrecv allows sending data only to local peers. Let's introduce a distance metric
for peers, so one can test different cycles.

For example, ./sendrecv -r -1 will iterate over all possible distances,
so all NxN communication routes will be tested in only N iterations.
IMHO this is a good diagnostic tool for various network issues.
  • Loading branch information
Dmitry Monakhov committed Sep 23, 2021
1 parent 1cd880b commit 82a344c
Showing 1 changed file with 17 additions and 8 deletions.
25 changes: 17 additions & 8 deletions src/sendrecv.cu
Expand Up @@ -8,15 +8,15 @@
#include "common.h"

// Prints the three-row column header of the results table through PRINT:
// a banner row ("out-of-place" / "in-place"), a row of column names, and a
// row of units.
//
// NOTE(review): this body is a rendered diff whose +/- markers were lost, so
// removed and added lines are interleaved. The PRINT lines without the extra
// "%6s" field look like the removed versions, and the PRINT directly after
// each one looks like its replacement, which inserts a "dist" column between
// "type" and the first "time" column. Confirm against the real file.
void print_header() {
PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", "");
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "dist",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}

// Prints the leading columns of one result row through PRINT: size, count and
// type name, and (in the second PRINT only) root as a 6-wide integer. That
// field lines up with the "dist" column added to the table header.
// opName is not referenced in the visible body.
//
// NOTE(review): the two PRINT statements appear to be the removed and added
// versions of the same line from a diff without +/- markers, not two
// consecutive calls. Confirm against the real file.
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s", size, count, typeName);
PRINT("%12li %12li %8s %6i", size, count, typeName, root);
}

void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
Expand All @@ -39,7 +39,7 @@ testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, nccl
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
int peer = (rank-1+nranks)%nranks;
int peer = (rank-root+nranks)%nranks;
TESTCHECK(InitData(args->expected[i], recvcount, type, rep, peer));
CUDACHECK(cudaDeviceSynchronize());
}
Expand All @@ -61,8 +61,8 @@ testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclD
NCCLCHECK(ncclCommCount(comm, &nRanks));
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
int recvPeer = (rank-1+nRanks) % nRanks;
int sendPeer = (rank+1) % nRanks;
int recvPeer = (rank-root+nRanks) % nRanks;
int sendPeer = (rank+root) % nRanks;

NCCLCHECK(ncclGroupStart());
NCCLCHECK(ncclSend(sendbuff, count, type, sendPeer, comm, stream));
Expand Down Expand Up @@ -90,6 +90,7 @@ testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t t
ncclRedOp_t *run_ops;
const char **run_typenames, **run_opnames;
int type_count, op_count;
int begin_root, end_root;

if ((int)type != -1) {
type_count = 1;
Expand All @@ -110,10 +111,18 @@ testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t t
run_ops = test_ops;
run_opnames = test_opnames;
}
if (root != -1) {
begin_root = end_root = root;
} else {
begin_root = 1;
end_root = args->nProcs*args->nThreads*args->nGpus-1;
}

for (int i=0; i<type_count; i++) {
for (int j=0; j<op_count; j++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
for (int rr=begin_root; rr<=end_root; rr++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], rr));
}
}
}
return testSuccess;
Expand Down

0 comments on commit 82a344c

Please sign in to comment.