In [4]:
from IPython.display import Image

In [14]:
# https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc
Image(url='https://developer-blogs.nvidia.com/wp-content/uploads/2012/12/pinned-1024x541.jpg', 
      width=400)

## basics

- Host (CPU)
    - pinned memory 定义在 host（cpu）上；
- HtoD: host to device
- DtoH: device to host

As you can see in the figure, pinned memory is used as a staging area for transfers from the device to the host. We can avoid the cost of the transfer between pageable and pinned host arrays by directly allocating our host arrays in pinned memory. 

In [8]:
import torch
import time

### host to device

In [9]:
# 创建一个大的Tensor以便看到明显的时间差异
size = (10000, 10000)

# 普通内存Tensor
normal_tensor = torch.FloatTensor(*size)
# 将普通Tensor复制到GPU并计时
t0 = time.time()
normal_tensor_gpu = normal_tensor.to("cuda")
time.time() - t0

0.1719825267791748

In [10]:
# Pinned内存Tensor
pinned_tensor = torch.FloatTensor(*size).pin_memory()
# 将Pinned Tensor复制到GPU并计时
t0 = time.time()
pinned_tensor_gpu = pinned_tensor.to("cuda", non_blocking=True)
time.time() - t0

0.0002377033233642578

### device to host

In [11]:
size = (10000, 10000)
gpu_tensor = torch.randn(*size, device="cuda")

# 复制到普通内存并计时
t0 = time.time()
normal_tensor_cpu = gpu_tensor.to("cpu")
time.time() - t0

0.3526153564453125

In [13]:
# 为了使用pinned memory，首先在CPU上创建一个pinned memory Tensor
pinned_tensor_cpu = torch.randn(*size).pin_memory()

# 确保GPU操作完成
torch.cuda.synchronize()

# 使用非阻塞方式复制到Pinned内存并计时
t0 = time.time()
pinned_tensor_cpu.copy_(gpu_tensor, non_blocking=True)
torch.cuda.synchronize()  # 等待数据传输完成
time.time() - t0

0.024584054946899414

## cuda 编程

- https://github.com/NVIDIA-developer-blog/code-samples.git
    - code-samples/series/cuda-cpp/optimize-data-transfers/bandwidthtest.cu

```
$ nvcc bandwidthtest.cu -o a.out
$ ./a.out
```

```
Device: NVIDIA GeForce RTX 4090
Transfer size (MB): 16

Pageable transfers
  Host to Device bandwidth (GB/s): 5.959241
  Device to Host bandwidth (GB/s): 5.124604

Pinned transfers
  Host to Device bandwidth (GB/s): 13.453977
  Device to Host bandwidth (GB/s): 13.369578
```