In [1]:
!pip install Ninja



In [2]:
import os
import math

import torch
from torch.nn import functional as F
from torch.utils.cpp_extension import load_inline

In [8]:
import os
import math

import torch
from torch.nn import functional as F
from torch.utils.cpp_extension import load_inline

cuda_src = '''
#include <cuda_fp16.h>
#include <torch/extension.h>
__global__
void forward_kernel(half* A, half* B, half* O, int n, int m) {
    int x = blockIdx.x;
    int y = blockIdx.y * blockDim.x + threadIdx.x;

    if (x < m) {
        O[x*m + y] = __hadd(A[x*m + y], B[x*m + y]);
    }
}

torch::Tensor forward(torch::Tensor A, torch::Tensor B) {
    auto O = torch::zeros_like(A);
    torch::Device device(torch::kCUDA);
    O = O.to(device);

    int n = static_cast<int>(A.size(0));
    int m = static_cast<int>(A.size(1));

    int num_threads = 1024;
    dim3 grid_dim(m, (num_threads + n - 1)/num_threads);
    dim3 block_dim(num_threads);

    forward_kernel<<<grid_dim, block_dim>>>(
        reinterpret_cast<half*>(A.data_ptr<c10::Half>()),
        reinterpret_cast<half*>(B.data_ptr<c10::Half>()),
        reinterpret_cast<half*>(O.data_ptr<c10::Half>()),
        n,
        m
    );

    return O;
}
'''

cpp_src = 'torch::Tensor forward(torch::Tensor A, torch::Tensor B);'

build_dir = 'cuda'
if not os.path.exists(build_dir):
    os.mkdir(build_dir)

os.environ['TORCH_CUDA_ARCH_LIST'] = "7.5"

vec_add = load_inline(
    name='vec_add',
    cpp_sources=cpp_src,
    cuda_sources=cuda_src,
    functions=['forward'],
    with_cuda=True,
    extra_cuda_cflags=["-arch=sm_75"],
    build_directory=f'./{build_dir}'
)



=== profiling g_sort === 
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
         aten::zeros_like         1.20%     206.227us        21.95%       3.757ms       3.757ms     209.000us         3.47%       6.021ms       6.021ms             1  
              aten::zero_        17.00%       2.909ms        17.35%       2.969ms       2.969ms       2.907ms        48.26%       5.236ms       5.236ms             1  
              aten::fill_         0.18%      30.348us         0.28%      47.873us      47.873us       2.329ms        38.67%       2.3

  with torch.autograd.profiler.profile(use_cuda=True) as prof:


In [12]:
sizes = [1024, 2048, 4096, 8192, 16384, 20000]
seed = 4052
gen = torch.Generator(device='cuda')
gen.manual_seed(seed)

print('=== profiling vector add === ')
for size in sizes:
  print(f"------------ vector add on size {size} ------------------------------------")
  a = torch.randn(size, size, device='cuda', dtype=torch.float16, generator=gen).contiguous()
  b = torch.randn(size, size, device='cuda', dtype=torch.float16, generator=gen).contiguous()
  with torch.autograd.profiler.profile(use_cuda=True) as prof:
    result = vec_add.forward(a, b)
  print(prof.key_averages().table(sort_by='cuda_time_total', row_limit=10))
  ref = a + b
  # print(result)
  # print(ref)
  print('attn values sanity check:', torch.allclose(result, ref, rtol=0, atol=1e-02))

=== profiling vector add === 
------------ vector add on size 1024 ------------------------------------
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
         aten::zeros_like        15.67%      32.439us        70.45%     145.813us     145.813us      39.000us        23.64%     153.000us     153.000us             1  
              aten::zero_         7.99%      16.532us        27.15%      56.199us      56.199us      18.000us        10.91%      61.000us      61.000us             1  
         aten::empty_like         9.37%      19.386us  

  with torch.autograd.profiler.profile(use_cuda=True) as prof:


attn values sanity check: True
