In [3]:
# sequence networks: transformer, lstm, cnn

In [6]:
import torch
import torch.nn as nn

In [11]:
class cnn(nn.Module):
    """1-D convolutional sequence model over token indices.

    Pipeline: embedding -> three (ReLU, Conv1d[, Dropout]) blocks with
    ``filters``, ``2 * filters`` and ``3 * filters`` output channels ->
    global max pooling over the sequence axis -> ``layers`` feed-forward
    blocks, the last of which projects to ``seq_len`` outputs.

    Args:
        filters: Number of output channels of the first conv block.
        kernel_size: Kernel width shared by all conv blocks. Padding is
            ``(kernel_size - 1) // 2``, which preserves the sequence length
            for odd kernels only; the adaptive pooling makes the dense part
            independent of the resulting length either way.
        layers: Total number of feed-forward blocks (including the final
            projection). Values below 1 still yield the final projection.
        embedding: Embedding dimension.
        vocab_size: Number of distinct token indices.
        seq_len: Size of the output vector produced by the final block.
    """

    def __init__(self, filters, kernel_size, layers, embedding, vocab_size, seq_len):
        super().__init__()
        self.vocab_size = vocab_size
        self.seq_len = seq_len
        self.filters = filters
        self.kernel_size = kernel_size
        self.embedding = embedding
        self.bed = nn.Embedding(self.vocab_size, self.embedding)

        # Same padding is used by every conv block.
        pad = (self.kernel_size - 1) // 2
        self.block1 = self.__block__(self.embedding, self.filters,
                                     ks=self.kernel_size, pad=pad, drop=0.1)
        self.block2 = self.__block__(self.filters, self.filters * 2,
                                     ks=self.kernel_size, pad=pad, drop=0.1)
        self.block3 = self.__block__(self.filters * 2, self.filters * 3,
                                     ks=self.kernel_size, pad=pad, drop=0.0)

        # Pooling makes our detection of features sequence position invariant
        self.pool = nn.AdaptiveMaxPool1d(1)

        hidden_blocks = [self.__ff_block__() for _ in range(layers - 1)]
        self.dense = nn.Sequential(*hidden_blocks, self.__ff_block__(True))

    def __block__(self, f_in, f_out, ks, pad, drop):
        """Build one conv block: ReLU -> Conv1d, followed by Dropout if ``drop > 0``.

        Dropout is appended *after* the convolution so the indices of the
        ReLU (0) and Conv1d (1) inside the Sequential -- and therefore the
        ``state_dict`` keys -- are the same whether or not dropout is used.
        """
        modules = [
            nn.ReLU(),
            nn.Conv1d(f_in, f_out, kernel_size=ks, padding=pad),
        ]
        if drop > 0:
            modules.append(nn.Dropout(drop))
        return nn.Sequential(*modules)

    def __ff_block__(self, final=False):
        """Build one feed-forward block: ReLU -> Linear.

        Hidden blocks map ``3 * filters -> 3 * filters``; the final block
        maps ``3 * filters -> seq_len``.
        """
        out_features = self.seq_len if final else self.filters * 3
        return nn.Sequential(
            nn.ReLU(),
            nn.Linear(self.filters * 3, out_features),
        )

    def forward(self, x):
        """Run the network.

        Args:
            x: Integer token indices; the permute below implies a
                ``(batch, sequence)`` layout.

        Returns:
            Tensor of shape ``(batch, seq_len)``.
        """
        x = self.bed(x)
        # Conv1d expects channels first: (batch, embedding, sequence).
        x = x.permute(0, 2, 1)
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        x = self.pool(x)
        # Drop the pooled length-1 axis: (batch, 3 * filters).
        x = x.view(x.size(0), -1)
        return self.dense(x)

In [10]:
class lstm(nn.Module):
    """Stacked LSTM sequence model: LayerNorm -> LSTM -> LayerNorm -> Linear.

    Input and output feature sizes are both ``vocab_size``.

    Args:
        hidden: Hidden size of every LSTM layer.
        rnn_layers: Number of stacked LSTM layers (any value >= 1).
        vocab_size: Size of the input feature dimension and of the output.
        specs: Optional, stored unchanged on ``self.specs``. The attribute
            used to be read from an undefined name, which raised
            ``NameError`` on construction; it is now an explicit optional
            argument.
    """

    def __init__(self, hidden, rnn_layers, vocab_size, specs=None):
        super().__init__()
        self.specs = specs

        # Dimensions
        self._input_dim = vocab_size
        self._hidden_dim = hidden
        self._output_dim = vocab_size

        # Number of LSTM layers
        self._layers = rnn_layers

        # nn.LSTM applies dropout only *between* stacked layers, so it has no
        # effect (and triggers a warning) when there is a single layer.
        self._lstm = nn.LSTM(input_size=self._input_dim,
                             hidden_size=self._hidden_dim,
                             num_layers=self._layers,
                             dropout=0.3 if self._layers > 1 else 0.0)

        # Input-to-hidden weights: xavier uniform. Hidden-to-hidden weights:
        # orthogonal. All input weights are initialised first, then all
        # recurrent ones, to keep the random-number draw order of the
        # original two-layer implementation.
        for layer in range(self._layers):
            nn.init.xavier_uniform_(getattr(self._lstm, "weight_ih_l{}".format(layer)))
        for layer in range(self._layers):
            nn.init.orthogonal_(getattr(self._lstm, "weight_hh_l{}".format(layer)))

        # Biases are zero except for the forget gate, which is set to one.
        # PyTorch packs the gates as (input, forget, cell, output), so the
        # forget gate is the second chunk of size hidden_dim. Both bias_ih and
        # bias_hh get the ones, as in the original code, so the effective
        # forget-gate bias is 2.
        forget_gate = slice(self._hidden_dim, 2 * self._hidden_dim)
        with torch.no_grad():
            for layer in range(self._layers):
                for prefix in ("bias_ih", "bias_hh"):
                    bias = getattr(self._lstm, "{}_l{}".format(prefix, layer))
                    bias.fill_(0.0)
                    bias[forget_gate].fill_(1.0)

        # Layer normalization (PyTorch default init: weight one, bias zero)
        self._norm_0 = nn.LayerNorm(self._input_dim, eps=.001)
        self._norm_1 = nn.LayerNorm(self._hidden_dim, eps=.001)

        # Linear output head mapping hidden states to vocabulary-sized output
        self._wforward = nn.Linear(self._hidden_dim, self._output_dim)
        nn.init.xavier_uniform_(self._wforward.weight)
        nn.init.zeros_(self._wforward.bias)

    def _init_hidden(self, batch_size, device):
        """Return zero ``(h_0, c_0)``, each of shape ``(layers, batch_size, hidden)``."""
        shape = (self._layers, batch_size, self._hidden_dim)
        # Allocate directly on the target device instead of CPU-then-copy.
        return (torch.zeros(*shape, device=device),
                torch.zeros(*shape, device=device))

    def new_sequence(self, batch_size=1, device="cpu"):
        """Return a fresh zero hidden state to start a new sequence."""
        return self._init_hidden(batch_size, device)

    def forward(self, input, hidden):
        """Advance the model over ``input`` starting from ``hidden``.

        Args:
            input: Float tensor whose last dimension is ``vocab_size``. The
                LSTM is built without ``batch_first``, so the layout is
                ``(sequence, batch, vocab_size)``.
            hidden: ``(h, c)`` tuple as returned by ``new_sequence`` or by a
                previous call.

        Returns:
            ``(output, hidden)`` where ``output`` has ``vocab_size`` features
            per step and ``hidden`` is the updated ``(h, c)`` tuple.
        """
        # Normalization over encoding dimension
        normed_input = self._norm_0(input)

        # Compute LSTM unit
        out, hidden = self._lstm(normed_input, hidden)

        # Normalization over hidden dimension
        normed_out = self._norm_1(out)

        # Linear unit forward prediction
        forward = self._wforward(normed_out)

        return forward, hidden