Added SmeLU #263

Merged
merged 16 commits on May 10, 2022
20 changes: 20 additions & 0 deletions xformers/components/activations.py
@@ -16,6 +16,7 @@ class Activation(str, Enum):
GeLU = "gelu"
LeakyReLU = "leaky_relu"
ReLU = "relu"
SmeLU = "smelu"


# For unit testing / parity comparisons, probably not the fastest way
@@ -28,6 +29,24 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
return x_ * x_


class SmeLU(nn.Module):
    """
    SmeLU_ activation (Smooth ReLU): zero below -beta, the identity above beta,
    and a quadratic blend on [-beta, beta].

    .. _SmeLU: https://arxiv.org/pdf/2202.06499.pdf
    """

    def __init__(self, beta: float = 2.0) -> None:
        super().__init__()
        self.beta = beta

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # ReLU-like branches: identity for x >= beta, zero otherwise
        relu = torch.where(
            x >= self.beta,
            x,
            torch.tensor([0.0], device=x.device, dtype=x.dtype),
        )
        # Quadratic section for |x| <= beta, the branches above elsewhere
        return torch.where(
            torch.abs(x) <= self.beta,
            ((x + self.beta) ** 2) / (4.0 * self.beta),
            relu,
        )


class Passthrough(nn.Module):
def __init__(self) -> None:
super().__init__()
@@ -45,4 +64,5 @@ def build_activation(activation: Optional[Activation]):
Activation.GeLU: nn.GELU,
Activation.LeakyReLU: nn.LeakyReLU,
Activation.SquaredReLU: SquaredReLU,
Activation.SmeLU: SmeLU,
}[activation]()
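
A quick usage sketch (not part of the diff) showing the new enum entry going through build_activation; the import path is assumed to match the file shown above:

    import torch
    from xformers.components.activations import Activation, build_activation

    act = build_activation(Activation.SmeLU)  # SmeLU module with the default beta=2.0
    x = torch.linspace(-4.0, 4.0, steps=9)
    y = act(x)  # zero below -beta, quadratic on [-beta, beta], identity above beta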
28 changes: 28 additions & 0 deletions xformers/triton/k_activations.py
@@ -21,6 +21,7 @@ def get_triton_activation_kernel(activation: Optional[Activation]):
Activation.LeakyReLU: leaky_relu,
Activation.GeLU: gelu,
Activation.SquaredReLU: squared_relu,
Activation.SmeLU: smelu,
}[activation]
if activation
else None
@@ -34,6 +35,7 @@ def get_triton_activation_bwd_kernel(activation: Optional[Activation]):
Activation.LeakyReLU: leaky_relu_grad,
Activation.GeLU: gelu_grad,
Activation.SquaredReLU: squared_relu_grad,
Activation.SmeLU: smelu_grad,
}[activation]
if activation
else None
@@ -135,3 +137,29 @@ def gelu_grad(x):
return 0.5 * x * (
(1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)
) + 0.5 * (1 + tanh_out)


@triton.jit
def smelu(x, beta=2.0):
Contributor

I don't think that you can pass a default param with triton actually, it only works with a subset of the Python syntax and my guess is that this is out of it (cc @ptillet). Something could be worth trying: having a getter for this kernel, like the following

    def get_smelu_kernel(beta: float = 2.0):
        @triton.jit
        def smelu(x):
            pass  # use beta here, but maybe that this will fail at the JIT phase

If that does not work,

  • for a start we could have a fixed beta, then iterate on the implementation to expose it (completely fine by me)
  • could be that the activation kernel takes another parameter, which in that case would be the beta value (roughly as sketched below), or that we figure out with Phil how to generate the kernel code on the fly with the proper beta
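
A minimal, purely illustrative sketch of that second option (hypothetical helper name, not code from this PR), keeping the tl.where-based style of the existing kernels; the calling fused kernel would have to forward a beta value explicitly:

    @triton.jit
    def smelu_with_beta(x, beta):
        # beta arrives as an explicit runtime argument forwarded by the caller
        zero = 0.0
        output = (x + beta) * (x + beta) / (4.0 * beta)
        relu = tl.where(x >= beta, x, zero.to(x.dtype))
        return tl.where(tl.abs(x) <= beta, output, relu)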

Contributor Author

thanks @blefaudeux I'll give it a try... a bit late here, so I wanted to give it a shot in the morning

Contributor

checking this with Philippe, the default value should work actually; maybe it needs to be ": float" or similar

Contributor Author

ah ok! cool, let me check

@ptillet (Apr 8, 2022)

To be clear, what should work (for now) is default arguments for tl.constexpr-annotated arguments, and with triton 2.0 :p I'm not too sure about Triton 1.x
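
If that holds, a rough sketch of the signature under that assumption (Triton 2.0, tl.constexpr default; not code from this PR):

    @triton.jit
    def smelu(x, beta: tl.constexpr = 2.0):
        # beta is a compile-time constant here, so no runtime cast of beta is needed
        zero = 0.0
        output = (x + beta) * (x + beta) / (4.0 * beta)
        relu = tl.where(x >= beta, x, zero.to(x.dtype))
        return tl.where(tl.abs(x) <= beta, output, relu)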

Contributor Author

ah right... I'm on triton 1.x at the moment...

Contributor

we need to update to Triton 2; CI is blocking right now, I hope to get that sorted out this weekend

"""
SmeLU_ activation - Smooth ReLU

.. _SmeLU: https://arxiv.org/pdf/2202.06499.pdf
"""
zero = 0.0
four = 4.0
beta = beta.to(x.dtype)
output = (x + beta) * (x + beta) / (four.to(x.dtype) * beta)
relu = tl.where(x >= beta, x, zero.to(x.dtype))
return tl.where(tl.abs(x) <= beta, output, relu)


@triton.jit
def smelu_grad(x, beta=2.0):
    """
    Gradient of SmeLU_: 0 below -beta, (x + beta) / (2 * beta) on [-beta, beta], 1 above beta.

    .. _SmeLU: https://arxiv.org/pdf/2202.06499.pdf
    """
    zero = 0.0
    one = 1.0
    two = 2.0
    beta = beta.to(x.dtype)
    grad = (beta + x) / (two.to(x.dtype) * beta)
    relu_grad = tl.where(x >= beta, one.to(x.dtype), zero.to(x.dtype))
    return tl.where(tl.abs(x) <= beta, grad, relu_grad)