facebookresearch · shreydesai · Aug 4, 2019
diff --git a/pytext/models/representations/deepcnn.py b/pytext/models/representations/deepcnn.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python3
 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
 import math
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -34,39 +33,164 @@ class SeparableConv1d(nn.Module):
     convolutions -- a depthwise and pointwise convolution.
 
     The depthwise convolution spatially convolves each input channel separately,
-    then the pointwise convolution projects thie result into a new channel space.
+    then the pointwise convolution projects this result into a new channel space.
     This process reduces the number of FLOPS used to compute a convolution and also
     exhibits a regularization effect. The general behavior -- including the input
     parameters -- is equivalent to `nn.Conv1d`.
 
+    `bottleneck` controls the behavior of the pointwise convolution. When it is
+    positive, the pointwise convolution is split into two pieces instead of
+    projecting directly: the first convolution maps the input channels down to
+    `bottleneck` channels and the second maps them up to `output_channels`. A
+    sufficiently small bottleneck significantly cuts the number of parameters with
+    minimal loss in performance. Otherwise, a single pointwise convolution is used.
+
     """
 
     def __init__(
         self,
         input_channels: int,
         output_channels: int,
         kernel_size: int,
-        padding: Optional[int],
-        dilation: Optional[int],
+        padding: int,
+        dilation: int,
+        bottleneck: int,
     ):
         super(SeparableConv1d, self).__init__()
 
-        self.conv = nn.Sequential(
+        conv_layers = [
             nn.Conv1d(
                 input_channels,
                 input_channels,
                 kernel_size,
                 padding=padding,
                 dilation=dilation,
                 groups=input_channels,
-            ),
-            nn.Conv1d(input_channels, output_channels, 1),
-        )
+            )
+        ]
+
+        if bottleneck > 0:
+            conv_layers.extend(
+                [
+                    nn.Conv1d(input_channels, bottleneck, 1),
+                    nn.Conv1d(bottleneck, output_channels, 1),
+                ]
+            )
+        else:
+            conv_layers.append(nn.Conv1d(input_channels, output_channels, 1))
+
+        self.conv = nn.Sequential(*conv_layers)
 
     def forward(self, x):
         return self.conv(x)
 
 
+def create_conv_package(
+    index: int,
+    activation: Activation,
+    in_channels: int,
+    out_channels: int,
+    kernel_size: int,
+    causal: bool,
+    dilated: bool,
+    separable: bool,
+    bottleneck: int,
+    weight_norm: bool,
+):
+    """
+    Creates a convolutional layer (plus `Trim1d` when causal) from the arguments.
+
+    Args:
+        index (int): Index of a convolutional layer in the stack.
+        activation (Activation): Activation function.
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels (doubled for GLU activations).
+        kernel_size (int): Size of 1d convolutional filter.
+        causal (bool): Whether the convolution is causal or not. If set, it
+            accounts for the temporal ordering of the inputs.
+        dilated (bool): Whether the convolution is dilated or not. If set,
+            the receptive field of the convolutional stack grows exponentially.
+        separable (bool): Whether to use depthwise separable convolutions
+            or not -- see `SeparableConv1d`.
+        bottleneck (int): Bottleneck channel dimension for depthwise separable
+            convolutions (see `SeparableConv1d`); must not be positive otherwise.
+        weight_norm (bool): Whether to add weight normalization to the
+            regular convolutions or not; unsupported with `separable`.
+
+    """
+
+    if not separable and bottleneck > 0:
+        raise RuntimeError(
+            "Bottleneck layers can only be used with separable convolutions"
+        )
+
+    if separable and weight_norm:
+        raise RuntimeError(
+            "Weight normalization is not supported for separable convolutions"
+        )
+
+    def _compute_dilation(index, dilated):
+        """
+        If set, the dilation factor increases by a factor of two for each
+        successive convolution to increase the receptive field exponentially.
+
+        """
+
+        if dilated:
+            return 2 ** index
+        return 1
+
+    def _compute_padding(kernel_size, dilation, causal):
+        """
+        Non-causal convolutions are centered, so they will consume ((k - 1) // 2) * d
+        padding on both the left and the right of the sequence. Causal convolutions
+        are shifted to the left (to account for temporal ordering), so they will
+        only consume padding from the left. Therefore, we pad this side with the
+        full amount (k - 1) * d and remove the excess right-padding with `Trim1d`.
+
+        """
+
+        if causal:
+            return (kernel_size - 1) * dilation
+        return ((kernel_size - 1) // 2) * dilation
+
+    def _compute_out_channels(out_channels, activation):
+        """
+        Gated Linear Unit (GLU) activations train two groups of convolutions,
+        then linearly combine their outputs through a gating mechanism. We
+        double the number of `out_channels` to mimic these two groups.
+
+        """
+
+        if activation == Activation.GLU:
+            return out_channels * 2
+        return out_channels
+
+    package = []
+    dilation = _compute_dilation(index, dilated)
+    padding = _compute_padding(kernel_size, dilation, causal)
+    out_channels = _compute_out_channels(out_channels, activation)
+
+    if separable:
+        package.append(
+            SeparableConv1d(
+                in_channels, out_channels, kernel_size, padding, dilation, bottleneck
+            )
+        )
+    else:
+        conv = nn.Conv1d(
+            in_channels, out_channels, kernel_size, padding=padding, dilation=dilation
+        )
+        if weight_norm:
+            conv = nn.utils.weight_norm(conv)
+        package.append(conv)
+
+    if causal:
+        package.append(Trim1d(padding))
+
+    return package[0] if len(package) == 1 else nn.Sequential(*package)
+
+
 class DeepCNNRepresentation(RepresentationBase):
     """
     `DeepCNNRepresentation` implements CNN representation layer
@@ -85,6 +209,7 @@ class Config(RepresentationBase.Config):
         dropout: float = 0.3
         activation: Activation = Activation.GLU
         separable: bool = False
+        bottleneck: int = 0
 
     def __init__(self, config: Config, embed_dim: int) -> None:
         super().__init__(config)
@@ -97,9 +222,9 @@ def __init__(self, config: Config, embed_dim: int) -> None:
 
         activation = config.activation
         separable = config.separable
+        bottleneck = config.bottleneck
 
         conv_layers = []
-        trim_layers = []
         linear_layers = []
         in_channels = embed_dim
 
@@ -113,34 +238,23 @@ def __init__(self, config: Config, embed_dim: int) -> None:
             )
             linear_layers.append(proj)
 
-            dilation = 2 ** i if dilated else 1
-            padding = (k - 1) * dilation if causal else ((k - 1) // 2) * dilation
-            conv_layer = SeparableConv1d if separable else nn.Conv1d
-
-            single_conv = conv_layer(
-                in_channels,
-                (out_channels * 2 if activation == Activation.GLU else out_channels),
-                k,
-                padding=padding,
-                dilation=dilation,
-            )
-            single_conv = (
-                nn.utils.weight_norm(single_conv) if weight_norm else single_conv
+            single_conv = create_conv_package(
+                index=i,
+                activation=activation,
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=k,
+                causal=causal,
+                dilated=dilated,
+                separable=separable,
+                bottleneck=bottleneck,
+                weight_norm=weight_norm,
             )
             conv_layers.append(single_conv)
 
-            # Non-causal convolutions are centered, so they will consume
-            # ((k - 1) // 2) * d padding on both the left and the right of the sequence.
-            # Causal convolutions are shifted to the left (to account for temporal
-            # ordering), so they will only consume padding from the left. Therefore,
-            # we pad this side with the full amount (k - 1) * d.
-            trim = Trim1d(padding) if causal else None
-            trim_layers.append(trim)
-
             in_channels = out_channels
 
         self.convs = nn.ModuleList(conv_layers)
-        self.trims = nn.ModuleList(trim_layers)
         self.projections = nn.ModuleList(linear_layers)
         self.activation = get_activation(activation)
 
@@ -151,15 +265,13 @@ def forward(self, inputs: torch.Tensor, *args) -> torch.Tensor:
         inputs = self.dropout(inputs)
         # bsz * seq_len * embed_dim -> bsz * embed_dim * seq_len
         words = inputs.permute(0, 2, 1)
-        for conv, trim, proj in zip(self.convs, self.trims, self.projections):
+        for conv, proj in zip(self.convs, self.projections):
             if proj:
                 tranposed = words.permute(0, 2, 1)
                 residual = proj(tranposed).permute(0, 2, 1)
             else:
                 residual = words
             words = conv(words)
-            if trim:
-                words = trim(words)
             words = self.activation(words)
             words = (words + residual) * math.sqrt(0.5)
         return words.permute(0, 2, 1)