# Peer-to-Peer (P2P) with 2x NVIDIA RTX 4090

- cloud provider: [runpod.io](https://www.runpod.io/)
- pod: 2 x RTX 4090 (25 vCPU, 200 GB RAM)
- image: `runpod/pytorch:2.2.0-py3.10-cuda12.1.1-devel-ubuntu22.04`

### nvidia-smi


In [1]:
!nvidia-smi

Thu Mar 28 11:07:05 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.07             Driver Version: 535.161.07   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0 Off |                  Off |
|  0%   23C    P8              15W / 450W |      1MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        On  | 00000000:C1:00.0 Off |  

In [2]:
!nvidia-smi topo -m

	GPU0	GPU1	CPU Affinity	NUMA Affinity	GPU NUMA ID
GPU0	 X 	SYS	0-63	0		N/A
GPU1	SYS	 X 	0-63	0		N/A

Legend:

  X    = Self
  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
  PIX  = Connection traversing at most a single PCIe bridge
  NV#  = Connection traversing a bonded set of # NVLinks


### Building [nccl-tests](https://github.com/NVIDIA/nccl-tests/tree/master)

In [3]:
!cd ../nccl-tests/ && make

make -C src build BUILDDIR=/root/p2p-perf/nccl-tests/build
make[1]: Entering directory '/root/p2p-perf/nccl-tests/src'
Compiling  timer.cc                            > /root/p2p-perf/nccl-tests/build/timer.o
Compiling /root/p2p-perf/nccl-tests/build/verifiable/verifiable.o


Compiling  all_reduce.cu                       > /root/p2p-perf/nccl-tests/build/all_reduce.o
Compiling  common.cu                           > /root/p2p-perf/nccl-tests/build/common.o
Linking  /root/p2p-perf/nccl-tests/build/all_reduce.o > /root/p2p-perf/nccl-tests/build/all_reduce_perf
Compiling  all_gather.cu                       > /root/p2p-perf/nccl-tests/build/all_gather.o
Linking  /root/p2p-perf/nccl-tests/build/all_gather.o > /root/p2p-perf/nccl-tests/build/all_gather_perf
Compiling  broadcast.cu                        > /root/p2p-perf/nccl-tests/build/broadcast.o
Linking  /root/p2p-perf/nccl-tests/build/broadcast.o > /root/p2p-perf/nccl-tests/build/broadcast_perf
Compiling  reduce_scatter.cu                   > /root/p2p-perf/nccl-tests/build/reduce_scatter.o
Linking  /root/p2p-perf/nccl-tests/build/reduce_scatter.o > /root/p2p-perf/nccl-tests/build/reduce_scatter_perf
Compiling  reduce.cu                           > /root/p2p-perf/nccl-tests/build/reduce.o
Linking  /root/p2p-

### Running all_reduce_perf

In [7]:
# Sweep message sizes from 8 B (-b) up to 128 MiB (-e), multiplying the size by 2 each step (-f), on 2 GPUs (-g).
!../nccl-tests/build/all_reduce_perf -b 8 -e 128M -f 2 -g 2

# nThread 1 nGpus 2 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
#  Rank  0 Group  0 Pid   3543 on 5a30cdebc0c3 device  0 [0x01] NVIDIA GeForce RTX 4090
#  Rank  1 Group  0 Pid   3543 on 5a30cdebc0c3 device  1 [0xc1] NVIDIA GeForce RTX 4090


#
#                                                              out-of-place                       in-place          
#       size         count      type   redop    root     time   algbw   busbw #wrong     time   algbw   busbw #wrong
#        (B)    (elements)                               (us)  (GB/s)  (GB/s)            (us)  (GB/s)  (GB/s)       
           8             2     float     sum      -1    10.64    0.00    0.00      0    10.94    0.00    0.00      0
          16             4     float     sum      -1     9.89    0.00    0.00      0    10.07    0.00    0.00      0
          32             8     float     sum      -1     9.81    0.00    0.00      0    10.29    0.00    0.00      0
          64            16     float     sum      -1     9.97    0.01    0.01      0     9.78    0.01    0.01      0
         128            32     float     sum      -1     9.74    0.01    0.01      0     9.89    0.01    0.01      0
         256            64     float     sum      -1     9.99 

In [8]:
import torch
import torch.utils.benchmark as benchmark

# Handles for the two GPUs taking part in the copy benchmark.
device0 = torch.device("cuda:0")
device1 = torch.device("cuda:1")

# One 1024**3-element payload per GPU:
#   x0 - float32 on GPU 0 (4 bytes/element -> 4 GiB)
#   x1 - int64   on GPU 1 (8 bytes/element -> 8 GiB)
x0 = torch.randn((1024, 1024, 1024), dtype=torch.float32, device=device0)
x1 = torch.randint(
    low=0,
    high=100,
    size=(1024, 1024, 1024),
    dtype=torch.int64,
    device=device1,
)

def copy_tensor(x, dest_device):
    """Return ``x`` placed on ``dest_device`` using a blocking transfer.

    ``copy=False`` means no new tensor is created when ``x`` already lives on
    ``dest_device``; in that case ``Tensor.to`` hands back ``x`` itself.
    """
    return x.to(device=dest_device, non_blocking=False, copy=False)

# One timer per copy direction. The names used in each `stmt` are bound through
# `globals` to the objects of the same name, so the timed statement reads
# exactly as what is being measured.
t0 = benchmark.Timer(
    stmt='copy_tensor(x0, device1)',
    setup='from __main__ import copy_tensor',
    globals={'x0': x0, 'device1': device1},
    num_threads=1)

# x1 lives on device1, so the reverse direction targets device0.
t1 = benchmark.Timer(
    stmt='copy_tensor(x1, device0)',
    setup='from __main__ import copy_tensor',
    globals={'x1': x1, 'device0': device0},
    num_threads=1)

# sanity check: the copy must land on the destination GPU and be bit-identical.
s0 = x0.sum().cpu()
y1 = copy_tensor(x0, device1)
s1 = y1.sum().cpu()
assert y1.device == device1, f"copy landed on {y1.device}, expected {device1}"
# A float32 sum over 2**30 values depends on the reduction order, so comparing
# s0 and s1 with a tiny absolute tolerance is fragile. Reinterpreting the
# float32 bits as int32 and summing (accumulated in int64) gives an exact,
# order-independent checksum of the payload instead.
checksum0 = x0.view(torch.int32).sum().cpu()
checksum1 = y1.view(torch.int32).sum().cpu()
assert checksum0 == checksum1, (
    f"payload changed during copy (float sums: {s0.item()} vs {s1.item()})"
)

# Bandwidth = bytes in the tensor's storage / mean seconds per copy, in GiB/s.
m0 = t0.timeit(100)
storage_size0 = x0.untyped_storage().size()
print(f"{device0}->{device1}: {storage_size0/m0.mean/2**30:.3f} GB/s")

m1 = t1.timeit(100)
storage_size1 = x1.untyped_storage().size()
print(f"{device1}->{device0}: {storage_size1/m1.mean/2**30:.3f} GB/s")

cuda:0->cuda:1: 20.412 GB/s
cuda:1->cuda:0: 20.614 GB/s


In [9]:
# Launch torch_distributed_nccl_test.py through torchrun as 2 processes on this single node (standalone mode), with OMP_NUM_THREADS=1 set for the launched command.
!OMP_NUM_THREADS=1 torchrun --standalone --nnodes=1 --nproc-per-node 2 torch_distributed_nccl_test.py

rank=1 recv N=50, elapsed_time=63.7232s, 3.139 GB/s, sum=10155.72265625 (cuda:1)
rank=0 send N=50, elapsed_time=63.7232s, 3.139 GB/s, sum=10155.72265625 (cuda:0)
rank=0 broadcast(a, src=0) N=50, elapsed_time=64.0530s, 3.122 GB/s, sum=10155.72265625 (cuda:0)
rank=0 broadcast(a, src=1) N=50, elapsed_time=51.9199s, 3.852 GB/s, sum=10155.72265625 (cuda:0)


In [10]:
torch.__version__

'2.2.0+cu121'

### Running cuda-samples simpleP2P and p2pBandwidthLatencyTest

In [11]:
!cd ../cuda-samples/Samples/0_Introduction/simpleP2P/ && make
!../cuda-samples/Samples/0_Introduction/simpleP2P/simpleP2P

/usr/local/cuda/bin/nvcc -ccbin g++ -I../../../Common -m64 --threads 0 --std=c++11 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 -o simpleP2P.o -c simpleP2P.cu
/usr/local/cuda/bin/nvcc -ccbin g++ -m64 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 -o simpleP2P simpleP2P.o 
mkdir -p ../../..

In [12]:
!cd ../cuda-samples/Samples/5_Domain_Specific/p2pBandwidthLatencyTest && make
!../cuda-samples/Samples/5_Domain_Specific/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest 

/usr/local/cuda/bin/nvcc -ccbin g++ -I../../../Common -m64 --threads 0 --std=c++11 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 -o p2pBandwidthLatencyTest.o -c p2pBandwidthLatencyTest.cu
/usr/local/cuda/bin/nvcc -ccbin g++ -m64 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 -o p2pBandwidth