In [647]:
import numpy as np
from numpy.typing import ArrayLike


In [648]:
class GELU:
    def __init__(self) -> None:
        self._sqrt_of_2_by_pi = np.sqrt(2 / np.pi)
        self.input = None

    def forward(self, input: ArrayLike) -> np.ndarray:
        self.input = np.asanyarray(input)
        return (
            0.5
            * input
            * (
                1
                + np.tanh(
                    self._sqrt_of_2_by_pi * (input + 0.044715 * np.power(input, 3))
                )
            )
        )

    def backward(self, grad_output: ArrayLike) -> np.ndarray:
        # raise NotImplementedError("Implement the GELU backward path")
        x = self.input
        m1 = self._sqrt_of_2_by_pi
        m2 = 0.044715
        m3 = m1 * (x+m2 * x**3)
        tanhm3 = np.tanh(
                    m3
                )
        first = 0.5 * (
                1
                + tanhm3
            )
        second = x/2 * (1- tanhm3**2) * (m1+2*x**2 * m2*m1)
        grad_out = (first + second) * grad_output
        return grad_out


In [649]:
import torch
Gelu= GELU()
a=np.random.random((5,))

In [650]:
a_torch = torch.tensor(a.copy(),requires_grad=True)
b_t=torch.nn.functional.gelu(a_torch)

b_n = Gelu.forward(a.copy())
b_n_grad = Gelu.backward(np.ones((5,)))

In [651]:
print(b_t.clone().detach().numpy())
print(b_n) 
# close enough

print(b_n_grad)

[0.16532595 0.39514062 0.35139213 0.62540058 0.46784553]
[0.16532423 0.39511543 0.35137409 0.62531816 0.46780561]
[0.71157    0.89818485 0.86939301 1.01158035 0.94042867]


In [652]:
torch_sum = b_t.sum() # the gradient of sum equals the np.ones((5,))
grads = torch.autograd.grad(outputs=[torch_sum],inputs=[a_torch])
print(grads)
diff = (b_n_grad-grads[0].detach().numpy())
print(diff)
assert(abs(diff).max() <= 1e-2)


(tensor([0.7119, 0.9009, 0.8715, 1.0179, 0.9442], dtype=torch.float64),)
[-0.00036793 -0.00267998 -0.00209933 -0.00632983 -0.00374811]
