Skip to content
This repository has been archived by the owner on Nov 22, 2022. It is now read-only.

implemented bottleneck separable convolutions #855

Closed
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 146 additions & 34 deletions pytext/models/representations/deepcnn.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import math
from typing import Optional

import torch
import torch.nn as nn
Expand Down Expand Up @@ -34,39 +33,164 @@ class SeparableConv1d(nn.Module):
convolutions -- a depthwise and pointwise convolution.

The depthwise convolution spatially convolves each input channel separately,
then the pointwise convolution projects thie result into a new channel space.
then the pointwise convolution projects this result into a new channel space.
This process reduces the number of FLOPS used to compute a convolution and also
exhibits a regularization effect. The general behavior -- including the input
parameters -- is equivalent to `nn.Conv1d`.

`bottleneck` controls the behavior of the pointwise convolution. Instead of
upsampling directly, we split the pointwise convolution into two pieces: the first
convolution downsamples into a (sufficiently small) low dimension and the
second convolution upsamples into the target (higher) dimension. Creating this
bottleneck significantly cuts the number of parameters with minimal loss
in performance.

"""

def __init__(
    self,
    input_channels: int,
    output_channels: int,
    kernel_size: int,
    padding: int,
    dilation: int,
    bottleneck: int = 0,
):
    """
    Builds the depthwise convolution followed by the pointwise projection.

    Args:
        input_channels (int): Number of input channels.
        output_channels (int): Number of output channels.
        kernel_size (int): Size of the depthwise 1d convolutional filter.
        padding (int): Padding passed to the depthwise convolution.
        dilation (int): Dilation passed to the depthwise convolution.
        bottleneck (int): Channel dimension of the intermediate pointwise
            convolution. If it is not positive, a single pointwise
            convolution maps `input_channels` directly to `output_channels`.
            Defaults to 0 so callers that do not know about bottlenecks keep
            the plain depthwise + pointwise behavior.

    """
    super(SeparableConv1d, self).__init__()

    # Depthwise stage: `groups=input_channels` gives every input channel its
    # own spatial filter, so the channel count is unchanged here.
    depthwise = nn.Conv1d(
        input_channels,
        input_channels,
        kernel_size,
        padding=padding,
        dilation=dilation,
        groups=input_channels,
    )

    # Pointwise stage: kernel size 1, only mixes channels. With a bottleneck
    # the projection is factored into input -> bottleneck -> output.
    if bottleneck > 0:
        pointwise = [
            nn.Conv1d(input_channels, bottleneck, 1),
            nn.Conv1d(bottleneck, output_channels, 1),
        ]
    else:
        pointwise = [nn.Conv1d(input_channels, output_channels, 1)]

    self.conv = nn.Sequential(depthwise, *pointwise)

def forward(self, x):
    """
    Runs the input through `self.conv` and returns the result.

    Args:
        x: Input handed unchanged to `self.conv`. Presumably a tensor laid
            out as (batch, channels, seq_len), as `nn.Conv1d` expects --
            confirm against callers.

    Returns:
        Whatever `self.conv` produces for `x`.

    """
    return self.conv(x)


def create_conv_package(
    index: int,
    activation: "Activation",
    in_channels: int,
    out_channels: int,
    kernel_size: int,
    causal: bool,
    dilated: bool,
    separable: bool,
    bottleneck: int,
    weight_norm: bool,
) -> nn.Module:
    """
    Creates a convolutional layer with the specified arguments.

    Args:
        index (int): Index of a convolutional layer in the stack.
        activation (Activation): Activation function.
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Size of 1d convolutional filter.
        causal (bool): Whether the convolution is causal or not. If set, it
            accounts for the temporal ordering of the inputs.
        dilated (bool): Whether the convolution is dilated or not. If set,
            the receptive field of the convolutional stack grows exponentially.
        separable (bool): Whether to use depthwise separable convolutions
            or not -- see `SeparableConv1d`.
        bottleneck (int): Bottleneck channel dimension for depthwise separable
            convolutions. See `SeparableConv1d` for an in-depth explanation.
        weight_norm (bool): Whether to add weight normalization to the
            regular convolutions or not.

    Returns:
        The convolution module on its own, or an `nn.Sequential` of the
        convolution followed by a `Trim1d` when causal trimming is needed.

    Raises:
        RuntimeError: If `bottleneck` is positive without `separable`, or if
            `weight_norm` is requested together with `separable`.

    """

    # Validate the flag combinations first, before any module is built.
    if not separable and bottleneck > 0:
        raise RuntimeError(
            "Bottleneck layers can only be used with separable convolutions"
        )

    if separable and weight_norm:
        raise RuntimeError(
            "Weight normalization is not supported for separable convolutions"
        )

    def _compute_dilation(index, dilated):
        """
        If set, the dilation factor increases by a factor of two for each
        successive convolution to increase the receptive field exponentially.

        """

        if dilated:
            return 2 ** index
        return 1

    def _compute_padding(kernel_size, dilation, causal):
        """
        Non-causal convolutions are centered, so they will consume ((k - 1) // 2) * d
        padding on both the left and the right of the sequence. Causal convolutions
        are shifted to the left (to account for temporal ordering), so they will
        only consume padding from the left. Therefore, we pad this side with the
        full amount (k - 1) * d and remove the excess right-padding with `Trim1d`.

        """

        if causal:
            return (kernel_size - 1) * dilation
        return ((kernel_size - 1) // 2) * dilation

    def _compute_out_channels(out_channels, activation):
        """
        Gated Linear Unit (GLU) activations train two groups of convolutions,
        then linearly combine their outputs through a gating mechanism. We
        double the number of `out_channels` to mimic these two groups.

        """

        if activation == Activation.GLU:
            return out_channels * 2
        return out_channels

    package = []
    dilation = _compute_dilation(index, dilated)
    padding = _compute_padding(kernel_size, dilation, causal)
    out_channels = _compute_out_channels(out_channels, activation)

    if separable:
        package.append(
            SeparableConv1d(
                in_channels, out_channels, kernel_size, padding, dilation, bottleneck
            )
        )
    else:
        conv = nn.Conv1d(
            in_channels, out_channels, kernel_size, padding=padding, dilation=dilation
        )
        if weight_norm:
            conv = nn.utils.weight_norm(conv)
        package.append(conv)

    # With kernel_size == 1 the causal padding is zero, so there is no excess
    # right-padding to remove and the trim stage is skipped.
    # NOTE(review): `Trim1d` is not visible here; if it slices with
    # `[..., :-trim]`, a trim of 0 would produce an empty sequence -- confirm.
    if causal and padding > 0:
        package.append(Trim1d(padding))

    # Avoid a needless Sequential wrapper around a single module.
    return package[0] if len(package) == 1 else nn.Sequential(*package)


class DeepCNNRepresentation(RepresentationBase):
"""
`DeepCNNRepresentation` implements CNN representation layer
Expand All @@ -85,6 +209,7 @@ class Config(RepresentationBase.Config):
dropout: float = 0.3
activation: Activation = Activation.GLU
separable: bool = False
bottleneck: int = 0

def __init__(self, config: Config, embed_dim: int) -> None:
super().__init__(config)
Expand All @@ -97,9 +222,9 @@ def __init__(self, config: Config, embed_dim: int) -> None:

activation = config.activation
separable = config.separable
bottleneck = config.bottleneck

conv_layers = []
trim_layers = []
linear_layers = []
in_channels = embed_dim

Expand All @@ -113,34 +238,23 @@ def __init__(self, config: Config, embed_dim: int) -> None:
)
linear_layers.append(proj)

dilation = 2 ** i if dilated else 1
padding = (k - 1) * dilation if causal else ((k - 1) // 2) * dilation
conv_layer = SeparableConv1d if separable else nn.Conv1d

single_conv = conv_layer(
in_channels,
(out_channels * 2 if activation == Activation.GLU else out_channels),
k,
padding=padding,
dilation=dilation,
)
single_conv = (
nn.utils.weight_norm(single_conv) if weight_norm else single_conv
single_conv = create_conv_package(
index=i,
activation=activation,
in_channels=in_channels,
out_channels=out_channels,
kernel_size=k,
causal=causal,
dilated=dilated,
separable=separable,
bottleneck=bottleneck,
weight_norm=weight_norm,
)
conv_layers.append(single_conv)

# Non-causal convolutions are centered, so they will consume
# ((k - 1) // 2) * d padding on both the left and the right of the sequence.
# Causal convolutions are shifted to the left (to account for temporal
# ordering), so they will only consume padding from the left. Therefore,
# we pad this side with the full amount (k - 1) * d.
trim = Trim1d(padding) if causal else None
trim_layers.append(trim)

in_channels = out_channels

self.convs = nn.ModuleList(conv_layers)
self.trims = nn.ModuleList(trim_layers)
self.projections = nn.ModuleList(linear_layers)
self.activation = get_activation(activation)

Expand All @@ -151,15 +265,13 @@ def forward(self, inputs: torch.Tensor, *args) -> torch.Tensor:
inputs = self.dropout(inputs)
# bsz * seq_len * embed_dim -> bsz * embed_dim * seq_len
words = inputs.permute(0, 2, 1)
for conv, trim, proj in zip(self.convs, self.trims, self.projections):
for conv, proj in zip(self.convs, self.projections):
if proj:
tranposed = words.permute(0, 2, 1)
residual = proj(tranposed).permute(0, 2, 1)
else:
residual = words
words = conv(words)
if trim:
words = trim(words)
words = self.activation(words)
words = (words + residual) * math.sqrt(0.5)
return words.permute(0, 2, 1)