Skip to content

Commit

Permalink
sendrecv: Test different distances
Browse files Browse the repository at this point in the history
Currenly sendrecv allow to send data only to local peers. Let's introduce distance metric
for peers, so one can test different cicles.

For example ./sendrecv -r -1 will iterate all possible distances,
so all NxN communication routes will be tested only in N iterations.
IMHO this is good diagnostic tool for various network issues.
  • Loading branch information
Dmitry Monakhov committed Sep 23, 2021
1 parent 1cd880b commit 82a344c
Showing 1 changed file with 17 additions and 8 deletions.
25 changes: 17 additions & 8 deletions src/sendrecv.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@
#include "common.h"

void print_header() {
PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", "");
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "dist",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}

void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s", size, count, typeName);
PRINT("%12li %12li %8s %6i", size, count, typeName, root);
}

void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
Expand All @@ -39,7 +39,7 @@ testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, nccl
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
int peer = (rank-1+nranks)%nranks;
int peer = (rank-root+nranks)%nranks;
TESTCHECK(InitData(args->expected[i], recvcount, type, rep, peer));
CUDACHECK(cudaDeviceSynchronize());
}
Expand All @@ -61,8 +61,8 @@ testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclD
NCCLCHECK(ncclCommCount(comm, &nRanks));
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
int recvPeer = (rank-1+nRanks) % nRanks;
int sendPeer = (rank+1) % nRanks;
int recvPeer = (rank-root+nRanks) % nRanks;
int sendPeer = (rank+root) % nRanks;

NCCLCHECK(ncclGroupStart());
NCCLCHECK(ncclSend(sendbuff, count, type, sendPeer, comm, stream));
Expand Down Expand Up @@ -90,6 +90,7 @@ testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t t
ncclRedOp_t *run_ops;
const char **run_typenames, **run_opnames;
int type_count, op_count;
int begin_root, end_root;

if ((int)type != -1) {
type_count = 1;
Expand All @@ -110,10 +111,18 @@ testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t t
run_ops = test_ops;
run_opnames = test_opnames;
}
if (root != -1) {
begin_root = end_root = root;
} else {
begin_root = 1;
end_root = args->nProcs*args->nThreads*args->nGpus-1;
}

for (int i=0; i<type_count; i++) {
for (int j=0; j<op_count; j++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
for (int rr=begin_root; rr<=end_root; rr++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], rr));
}
}
}
return testSuccess;
Expand Down

0 comments on commit 82a344c

Please sign in to comment.