In [1]:
import torch
import numpy as np
from torch import nn
import torch.nn.functional as F

In [2]:
# Small tolerance constant; presumably meant for float comparisons — TODO confirm
# (none of the cells visible in this part of the notebook reads it).
epsilon = 1e-6

In [26]:
# https://abdulkaderhelwan.medium.com/swiglu-activation-function-77627e0b2b52

# Tanh

In [4]:
def tanh(x):
    """Hyperbolic tangent computed element-wise with NumPy.

    Uses the identity tanh(x) = sign(x) * (1 - e^(-2|x|)) / (1 + e^(-2|x|)),
    so the exponent passed to the exponential is never positive. The textbook
    form (e^x - e^-x) / (e^x + e^-x) overflows to inf / inf = nan once |x| is
    large (roughly above 88 for float32 and 709 for float64).

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        tanh(x) with the same shape as ``x``; values lie in [-1, 1].
    """
    # expm1(z) = e^z - 1 keeps precision when z is close to zero.
    t = np.expm1(-2 * np.abs(x))  # lies in [-1, 0]
    # 1 - e^(-2|x|) == -t   and   1 + e^(-2|x|) == t + 2
    return np.sign(x) * (-t) / (t + 2)

In [5]:
# Compare the NumPy tanh above with torch's built-in on a random sample.
x = torch.randn(5)  # requires_grad already defaults to False
y = torch.tanh(x).numpy()
y_ = tanh(x.numpy())
print(y - y_)  # element-wise difference between the two implementations

[-6.7055225e-08 -5.9604645e-08  5.9604645e-08  5.9604645e-08
 -1.1175871e-08]


# Sigmoid

In [6]:
def sigmoid(x):
    """Logistic sigmoid 1 / (1 + e^-x), computed element-wise without overflow.

    The direct formula evaluates e^-x, which overflows (and emits a NumPy
    RuntimeWarning) for very negative ``x``. Here only non-positive values are
    exponentiated:

        x >= 0:  1   / (1 + e^-x)
        x <  0:  e^x / (1 + e^x)

    Both branches are covered by one expression because
    e^min(x, 0) is 1 for x >= 0 and e^x otherwise, while e^-|x| is
    e^-x for x >= 0 and e^x otherwise.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        sigmoid(x) with the same shape as ``x``; values lie in [0, 1].
    """
    numerator = np.exp(np.minimum(x, 0))
    denominator = 1 + np.exp(-np.abs(x))
    return numerator / denominator

In [7]:
# Compare the NumPy sigmoid above with torch's built-in on a random sample.
x = torch.randn(5)  # requires_grad already defaults to False
y = torch.sigmoid(x).numpy()
y_ = sigmoid(x.numpy())
print(y - y_)  # element-wise difference between the two implementations

[ 5.9604645e-08 -2.9802322e-08  0.0000000e+00  0.0000000e+00
  0.0000000e+00]


# SoftMax

In [8]:
def softmax(x, axis=None):
    """Numerically stable softmax along ``axis``.

    softmax(x)_i = e^(x_i) / sum_j e^(x_j). The maximum along ``axis`` is
    subtracted before exponentiating; this leaves the result unchanged
    mathematically but keeps every exponent <= 0, so large inputs no longer
    produce inf / inf = nan.

    Args:
        x: Array-like of real numbers.
        axis: Axis (or axes) to normalise over. ``None`` normalises over the
            whole array.

    Returns:
        Array with the same shape as ``x`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; np.max would raise on an empty reduction.
        return np.exp(x)
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)  # computed once and reused for the normaliser
    return exps / exps.sum(axis=axis, keepdims=True)

In [9]:
# Compare the NumPy softmax above with torch's built-in, normalising each row.
x = torch.randn(5, 3, requires_grad=False)
# Pass dim explicitly: nn.Softmax() without dim relies on a deprecated implicit
# dimension choice and emits a warning. dim=1 matches axis=1 used below.
y = nn.Softmax(dim=1)(x).numpy()
y_ = softmax(x.numpy(), axis=1)
print(y-y_)

[[ 3.7252903e-09 -7.4505806e-09  5.9604645e-08]
 [ 0.0000000e+00  2.9802322e-08  0.0000000e+00]
 [ 0.0000000e+00  0.0000000e+00 -3.7252903e-09]
 [ 0.0000000e+00  2.9802322e-08 -2.9802322e-08]
 [ 0.0000000e+00  0.0000000e+00  2.9802322e-08]]


  return self._call_impl(*args, **kwargs)


# ReLU

In [10]:
def relu(x):
    """Rectified linear unit: element-wise maximum of ``x`` and zero.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        Values of ``x`` with negative entries replaced by zero.
    """
    floor = np.zeros_like(x)  # zeros matching x's shape and dtype
    return np.maximum(x, floor)

In [11]:
# Compare the NumPy ReLU above with torch's built-in on a random sample.
x = torch.randn(5)  # requires_grad already defaults to False
y = F.relu(x).numpy()
y_ = relu(x.numpy())
print(y - y_)  # element-wise difference between the two implementations

[0. 0. 0. 0. 0.]


# GLU

In [12]:
def glu(x, v):
    """Gated linear unit: ``x * sigmoid(v)``, element-wise.

    ``x`` carries the values and ``v`` controls, through a sigmoid in [0, 1],
    how much of each value passes.

    Advantages:
        Learnable gating: part of the network acts as a gate (through the
            sigmoid), controlling the flow of information dynamically.
        Parameter efficiency: a gating mechanism similar to LSTMs, without
            adding much complexity or computation.

    The gate is evaluated as e^min(v, 0) / (1 + e^-|v|), which equals
    1 / (1 + e^-v) but never exponentiates a positive number, so very negative
    ``v`` no longer triggers an overflow warning.

    Args:
        x: Scalar or array-like of values to be gated.
        v: Scalar or array-like of gate pre-activations, broadcastable with ``x``.

    Returns:
        ``x * sigmoid(v)``.
    """
    gate = np.exp(np.minimum(v, 0)) / (1 + np.exp(-np.abs(v)))
    return x * gate

In [13]:
# Demo: gate the values 1..5 with a handful of gate pre-activations.
x = np.arange(1, 6)  # [1, 2, 3, 4, 5]
v = np.asarray([0.5, 1.5, 2.0, 0.1, 1.0])
print(glu(x, v))

[0.62245933 1.63514895 2.64239123 2.09991675 3.65529289]


# GELU

In [14]:
def gelu(x):
    """Gaussian Error Linear Unit, tanh approximation.

    Computes 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))).

    Unlike ReLU, which switches abruptly from 0 to x and is not differentiable
    at zero, GELU is smooth everywhere.

    Advantages:
        Smoothness: the gradual transition can lead to better optimization
            dynamics.
        Probabilistic interpretation: values are kept or discarded according
            to their magnitude.
    Disadvantages:
        Computation: more expensive than ReLU and SiLU, since it involves a
            Gaussian CDF or, as here, a tanh approximation of it.

    Args:
        x: Scalar or NumPy array of real numbers.

    Returns:
        Approximate GELU of ``x``, same shape as ``x``.
    """
    cubic_term = x + 0.044715 * np.power(x, 3)
    squashed = np.tanh(np.sqrt(2 / np.pi) * cubic_term)
    return 0.5 * x * (1 + squashed)

In [15]:
# Demo: evaluate GELU at a few integer points around zero.
x = np.arange(-1, 3)  # [-1, 0, 1, 2]
print(gelu(x))

[-0.15880801  0.          0.84119199  1.95459769]


# SiLU/Swish

In [20]:
def silu(x, beta=1.0):
    """Swish activation ``x * sigmoid(beta * x)``; SiLU is the ``beta = 1`` case.

    Swish is smooth and differentiable at zero.

    The sigmoid is evaluated as e^min(z, 0) / (1 + e^-|z|) with z = beta * x,
    which equals 1 / (1 + e^-z) but never exponentiates a positive number, so
    very negative ``beta * x`` no longer triggers an overflow warning.

    Args:
        x: Scalar or array-like of real numbers.
        beta: Slope of the sigmoid gate. Defaults to 1.0, which gives SiLU.

    Returns:
        ``x * sigmoid(beta * x)``, same shape as ``x``.
    """
    z = np.multiply(beta, x)
    gate = np.exp(np.minimum(z, 0)) / (1 + np.exp(-np.abs(z)))
    return x * gate

In [22]:
# Demo: Swish with a shallow gate (beta = 0.1) at a few integer points.
x = np.arange(-1, 3)  # [-1, 0, 1, 2]
print(silu(x, beta=0.1))

[-0.47502081  0.          0.52497919  1.09966799]


# SwiGLU

In [23]:
def swiglu(x, v, beta=1.0):
    """Swish-gated unit as used in this notebook: ``swish_beta(x) * sigmoid(v)``.

    Swish is applied to ``x`` and a sigmoid gate is applied to ``v``; the two
    are multiplied element-wise.

    NOTE(review): Shazeer (2020), "GLU Variants Improve Transformer", defines
    SwiGLU as Swish(xW) * (xV), i.e. with a *linear* second branch and no
    sigmoid on it. This function keeps the notebook's formulation so existing
    results are unchanged — confirm which definition is intended.

    Both sigmoids are evaluated as e^min(z, 0) / (1 + e^-|z|), which equals
    1 / (1 + e^-z) but never exponentiates a positive number, so very negative
    inputs no longer trigger overflow warnings.

    Args:
        x: Scalar or array-like fed through Swish.
        v: Scalar or array-like fed through the sigmoid gate, broadcastable
            with ``x``.
        beta: Slope of the Swish sigmoid. Defaults to 1.0.

    Returns:
        ``x * sigmoid(beta * x) * sigmoid(v)``.
    """
    def _sigmoid(z):
        # Overflow-free logistic function.
        return np.exp(np.minimum(z, 0)) / (1 + np.exp(-np.abs(z)))

    swish = x * _sigmoid(np.multiply(beta, x))
    return swish * _sigmoid(v)

In [25]:
# Demo: Swish branch on 1..5 (beta = 0.1), sigmoid gate on v.
x = np.arange(1, 6)  # [1, 2, 3, 4, 5]
v = np.asarray([0.5, 1.5, 2.0, 0.1, 1.0])
print(swiglu(x, v, beta=0.1))

[0.32677819 0.89906048 1.51790187 1.25719425 2.27527117]
