### Numpy

In [3]:
import numpy as np

In [7]:
def gelu_numpy(x):
  sqrt_2_pi = np.sqrt(2*np.pi)
  return 0.5*x*(1 + np.tanh(sqrt_2_pi*(x+0.044715*(x**3))))

In [8]:
x = np.linspace(-5, 5, 10)
gelu_values = gelu_numpy(x)

# Print results
for i in range(len(x)):
    print(f"GELU({x[i]:.2f}) = {gelu_values[i]:.5f}")

GELU(-5.00) = -0.00000
GELU(-3.89) = -0.00000
GELU(-2.78) = -0.00000
GELU(-1.67) = -0.00014
GELU(-0.56) = -0.03115
GELU(0.56) = 0.52441
GELU(1.67) = 1.66653
GELU(2.78) = 2.77778
GELU(3.89) = 3.88889
GELU(5.00) = 5.00000


### Pytorch

In [9]:
import torch

In [10]:
def gelu_torch(x):
  sqrt_2_pi = torch.sqrt(torch.tensor(2.0*np.pi))
  return 0.5*x*(1 + torch.tanh(sqrt_2_pi*(x+0.044715*(x**3))))

In [12]:
x = np.linspace(-5, 5, 10)
x = torch.tensor(x, dtype=torch.float32)
gelu_values = gelu_torch(x)

# Print results
for i in range(len(x)):
    print(f"GELU({x[i]:.2f}) = {gelu_values[i]:.5f}")

GELU(-5.00) = -0.00000
GELU(-3.89) = -0.00000
GELU(-2.78) = -0.00000
GELU(-1.67) = -0.00014
GELU(-0.56) = -0.03115
GELU(0.56) = 0.52441
GELU(1.67) = 1.66653
GELU(2.78) = 2.77778
GELU(3.89) = 3.88889
GELU(5.00) = 5.00000


In [11]:
!pip install .

Processing /content
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gelu_cuda
  Building wheel for gelu_cuda (setup.py) ... [?25l[?25hdone
  Created wheel for gelu_cuda: filename=gelu_cuda-0.0.0-cp311-cp311-linux_x86_64.whl size=269424 sha256=c8ff57dd6b5045a417ca1ccaf062ee19fe3b8e59397a4a2dcf5e935763fba6c7
  Stored in directory: /tmp/pip-ephem-wheel-cache-_n4cw4ws/wheels/01/d1/e4/ca90c6fac4331f6da6de5353843d0b67505c2bbc8768ac296e
Successfully built gelu_cuda
Installing collected packages: gelu_cuda
Successfully installed gelu_cuda-0.0.0


In [13]:
import time
import numpy as np
import torch
import torch.nn.functional as F
import gelu_cuda

# ✅ Generate input tensor
size = 10_000_000  # 10M elements
x_cpu = np.random.randn(size).astype(np.float32)
x_torch_cpu = torch.tensor(x_cpu, device="cpu")
x_torch_gpu = torch.tensor(x_cpu, device="cuda")

start = time.time()
gelu_np = 0.5 * x_cpu * (1 + np.tanh(np.sqrt(2/np.pi) * (x_cpu + 0.044715 * x_cpu**3)))
numpy_time = time.time() - start
print(f"NumPy GELU time: {numpy_time:.6f} sec")

start = time.time()
gelu_torch_cpu = F.gelu(x_torch_cpu)
torch_cpu_time = time.time() - start
print(f"PyTorch GELU (CPU) time: {torch_cpu_time:.6f} sec")

start = time.time()
gelu_torch_cuda = F.gelu(x_torch_gpu)
torch_cuda_time = time.time() - start
print(f"PyTorch GELU (CUDA) time: {torch_cuda_time:.6f} sec")

start = time.time()
gelu_custom_cuda = gelu_cuda.gelu_cuda(x_torch_gpu)
custom_cuda_time = time.time() - start
print(f"Our CUDA GELU time: {custom_cuda_time:.6f} sec")

print(f"\nSpeedup over NumPy: {numpy_time / custom_cuda_time:.2f}x")
print(f"Speedup over Torch CPU: {torch_cpu_time / custom_cuda_time:.2f}x")
print(f"Speedup over Torch CUDA: {torch_cuda_time / custom_cuda_time:.2f}x")


NumPy GELU time: 0.339315 sec
PyTorch GELU (CPU) time: 0.076992 sec
PyTorch GELU (CUDA) time: 0.036650 sec
Our CUDA GELU time: 0.001127 sec

Speedup over NumPy: 300.95x
Speedup over Torch CPU: 68.29x
Speedup over Torch CUDA: 32.51x
