<a href="https://colab.research.google.com/github/doudi25/Triton/blob/main/SiLU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import triton
import triton.language as tl

In [None]:
@triton.jit
def _forward_kernel(x_ptr,y_ptr,stride_m,stride_n,m,n,BLOCK_SIZE_ROW:tl.constexpr,BLOCK_SIZE_COL:tl.constexpr,num_warps=64):
  """Element-wise SiLU, y = x * sigmoid(x), over one 2-D tile per program.

  Launched on a 2-D grid: program axis 0 selects the row tile and axis 1
  the column tile. Each program covers a BLOCK_SIZE_ROW x BLOCK_SIZE_COL
  tile of an (m, n) matrix.

  x_ptr / y_ptr      -- base pointers of the input and output matrices
  stride_m, stride_n -- row / column strides; the same pair is used to
                        address both x and y, so y must share x's layout
  m, n               -- number of rows / columns
  num_warps          -- NOTE(review): never read inside the kernel body;
                        confirm whether it was meant to be a launch option
                        rather than a kernel argument.
  """
  row_block = tl.program_id(axis=0)
  col_block = tl.program_id(axis=1)
  rows = row_block * BLOCK_SIZE_ROW + tl.arange(0,BLOCK_SIZE_ROW)
  cols = col_block * BLOCK_SIZE_COL + tl.arange(0,BLOCK_SIZE_COL)
  # Tiles on the matrix border overhang it; keep only in-range elements.
  in_bounds = (rows[:,None] < m) & (cols[None,:] < n)
  # x and y are addressed identically, so the tile offsets are built once.
  tile_offsets = rows[:,None] * stride_m + cols[None,:] * stride_n
  x_tile = tl.load(x_ptr + tile_offsets,mask=in_bounds)
  # silu(x) = x * sigmoid(x)
  y_tile = x_tile * tl.sigmoid(x_tile)
  tl.store(y_ptr + tile_offsets,y_tile,mask=in_bounds)

In [None]:
def _forward_Silu(x: torch.Tensor) -> torch.Tensor:
  """Compute SiLU (x * sigmoid(x)) of `x` with the Triton forward kernel.

  The input is treated as a 2-D matrix whose columns are the last
  dimension of `x`; every leading dimension is folded into the rows. The
  result is returned with the same shape as `x`.

  Args:
    x: contiguous CUDA tensor of any rank.

  Returns:
    A new tensor with the shape, dtype and device of `x`.

  Raises:
    AssertionError: if `x` is not a contiguous CUDA tensor.
  """
  assert x.is_cuda and x.is_contiguous(), "x must be a contiguous CUDA tensor"
  # Nothing to compute, and a grid with a zero dimension must not be launched.
  if x.numel() == 0:
    return torch.empty_like(x)
  # A 0-dim tensor has no last dimension; treat it as a 1x1 matrix.
  n_cols = x.shape[-1] if x.ndim > 0 else 1
  x_2d = x if x.ndim == 2 else x.view(-1, n_cols)
  m, n = x_2d.shape
  y = torch.empty_like(x_2d)
  BLOCK_SIZE_ROW = 32
  BLOCK_SIZE_COL = 64
  # One program per (row tile, column tile).
  grid = (triton.cdiv(m, BLOCK_SIZE_ROW), triton.cdiv(n, BLOCK_SIZE_COL))
  _forward_kernel[grid](x_2d, y, x_2d.stride(0), x_2d.stride(1), m, n, BLOCK_SIZE_ROW, BLOCK_SIZE_COL)
  # Undo the flattening so callers get back the shape they passed in.
  return y.view(x.shape)

In [None]:
@triton.jit
def _backward_kernel(x_ptr,dx_ptr,dout_ptr,stride_m,stride_n,
                     m,n,BLOCK_SIZE_ROW:tl.constexpr,BLOCK_SIZE_COL:tl.constexpr,num_warps=64):
  """Gradient of SiLU with respect to its input, one 2-D tile per program.

  Computes dx = dout * (sig(x) + x * sig(x) * (1 - sig(x))) element-wise,
  where sig is the sigmoid. Launched on a 2-D grid: program axis 0 selects
  the row tile and axis 1 the column tile of an (m, n) matrix.

  x_ptr              -- base pointer of the forward input
  dx_ptr             -- base pointer of the gradient to write
  dout_ptr           -- base pointer of the upstream gradient
  stride_m, stride_n -- row / column strides; the same pair addresses x,
                        dout and dx, so all three must share one layout
  m, n               -- number of rows / columns
  num_warps          -- NOTE(review): never read inside the kernel body;
                        confirm whether it was meant to be a launch option
                        rather than a kernel argument.
  """
  pid_m = tl.program_id(axis=0)
  pid_n = tl.program_id(axis=1)
  offs_m = pid_m * BLOCK_SIZE_ROW + tl.arange(0,BLOCK_SIZE_ROW)
  offs_n = pid_n * BLOCK_SIZE_COL + tl.arange(0,BLOCK_SIZE_COL)
  # Tiles on the matrix border overhang it; keep only in-range elements.
  mask = (offs_m[:,None] < m ) & (offs_n[None,:] < n)
  # x, dout and dx are addressed identically, so build the offsets once.
  offs = offs_m[:,None] * stride_m + offs_n[None,:] * stride_n
  x = tl.load(x_ptr + offs,mask=mask)
  # load the upstream gradient
  dout = tl.load(dout_ptr + offs,mask=mask)
  # d/dx [x * sig(x)] = sig(x) + x * dsig(x), with dsig(x) = sig(x) * (1 - sig(x))
  sig = tl.sigmoid(x)
  dx = dout * ( sig + (sig * ( 1 - sig )) * x)
  # store gradient
  tl.store(dx_ptr + offs,dx,mask=mask)



In [None]:
def _backward_Silu(x: torch.Tensor, dout: torch.Tensor) -> torch.Tensor:
  """Compute the SiLU input gradient with the Triton backward kernel.

  `x` and `dout` are treated as 2-D matrices whose columns are the last
  dimension; every leading dimension is folded into the rows. The result
  is returned with the same shape as `x`.

  Args:
    x: contiguous CUDA tensor that was the input of the forward pass.
    dout: CUDA tensor with the upstream gradient, same shape as `x`. It
      is made contiguous here if it is not already.

  Returns:
    A new tensor with the shape, dtype and device of `x`.

  Raises:
    AssertionError: if a tensor is not on CUDA, `x` is not contiguous, or
      the shapes of `x` and `dout` differ.
  """
  assert x.is_cuda and dout.is_cuda, "x and dout must be CUDA tensors"
  # The kernel addresses dout with x's strides, so the shapes must agree.
  assert dout.shape == x.shape, f'shape mismatch: x {tuple(x.shape)} , dout {tuple(dout.shape)}'
  # make dout contiguous
  dout = dout.contiguous()
  assert x.is_contiguous() and dout.is_contiguous(), f'x is contiguous {x.is_contiguous()} , dout is contiguous {dout.is_contiguous()}'
  # Nothing to compute, and a grid with a zero dimension must not be launched.
  if x.numel() == 0:
    return torch.empty_like(x)
  # A 0-dim tensor has no last dimension; treat it as a 1x1 matrix.
  n_cols = x.shape[-1] if x.ndim > 0 else 1
  x_2d = x if x.ndim == 2 else x.view(-1, n_cols)
  dout_2d = dout if dout.ndim == 2 else dout.view(-1, n_cols)
  m, n = x_2d.shape
  dx = torch.empty_like(x_2d)
  BLOCK_SIZE_ROW = 32
  BLOCK_SIZE_COL = 64
  # One program per (row tile, column tile).
  grid = (triton.cdiv(m, BLOCK_SIZE_ROW), triton.cdiv(n, BLOCK_SIZE_COL))
  _backward_kernel[grid](x_2d, dx, dout_2d, x_2d.stride(0), x_2d.stride(1), m, n, BLOCK_SIZE_ROW, BLOCK_SIZE_COL)
  # Undo the flattening so the gradient has the shape of the input.
  return dx.view(x.shape)

In [None]:
class Silu(torch.autograd.Function):
  """Autograd wrapper that routes SiLU through `_forward_Silu` / `_backward_Silu`."""

  @staticmethod
  def forward(ctx,input):
    """Save `input` for the backward pass and return `_forward_Silu(input)`."""
    ctx.save_for_backward(input)
    return _forward_Silu(input)

  @staticmethod
  def backward(ctx,dout):
    """Return `_backward_Silu` of the saved input and the upstream gradient `dout`."""
    saved_input = ctx.saved_tensors[0]
    return _backward_Silu(saved_input,dout)

In [None]:
def test_correctness():
  """Check the Triton SiLU against PyTorch for both output and gradient.

  Runs `torch.nn.functional.silu` and `Silu.apply` on the same random CUDA
  matrix, back-propagates the sum of each output, and compares the forward
  results and the input gradients with `torch.allclose`.

  Raises:
    AssertionError: if the forward outputs or the gradients do not match.
  """
  x = torch.rand((1024,2048),device='cuda',requires_grad=True)
  # reference: PyTorch forward + autograd backward
  out_torch = torch.nn.functional.silu(x)
  out_torch.sum().backward()
  # keep the reference gradient and reset x.grad so the next backward does not accumulate into it
  dinput_torch,x.grad = x.grad.clone(),None
  # candidate: Triton forward + Triton backward
  out_triton = Silu.apply(x)
  out_triton.sum().backward()
  dinput_triton,x.grad = x.grad.clone(),None
  # the forward result has to match as well, not only the gradient
  assert torch.allclose(out_triton,out_torch),"The output of triton silu kernel differs from pytorch"
  grads_match = torch.allclose(dinput_triton,dinput_torch)
  # print result
  print(grads_match,"The gradient of triton silu kernel is similar to pytorch autograd engine")
  # fail loudly instead of only printing False next to a success message
  assert grads_match,"The gradient of triton silu kernel differs from pytorch autograd engine"

In [None]:
# Run the correctness check only when this file is executed as a script.
if __name__=="__main__":
  test_correctness()

True The gradient of triton silu kernel is similar to pytorch autograd engine
