This repository has been archived by the owner on Apr 22, 2022. It is now read-only.

Commit

Finishing up convseq2seq, yet it still needs some adjustments.
gugarosa committed Jun 2, 2020
1 parent 268af6d commit 6a4a053
Showing 5 changed files with 317 additions and 4 deletions.
5 changes: 4 additions & 1 deletion examples/applications/generation/conv_seq2seq_generation.py
@@ -18,13 +18,16 @@
# Builds the vocabulary
source.build_vocab(dataset, min_freq=1)

+# Gathering the <pad> token index for further ignoring
+pad_index = source.vocab.stoi[source.pad_token]
+
# Creates an iterator that backpropagates through time
train_iterator = BPTTIterator(dataset, batch_size=16, bptt_len=10, device=device)

# Creating the ConvSeq2Seq model
conv_seq2seq = ConvSeq2Seq(n_input=len(source.vocab), n_output=len(source.vocab),
n_hidden=512, n_embedding=256, n_layers=1, kernel_size=3,
-                           scale=0.5, ignore_token=None, init_weights=None, device=device)
+                           scale=0.5, ignore_token=pad_index, init_weights=None, device=device)

# Training the model
conv_seq2seq.fit(train_iterator, epochs=10)
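
Since this commit also adds a `generate_text` method to `ConvSeq2Seq`, a minimal usage sketch for the model trained above might look as follows (the prompt string and sampling settings are illustrative, not part of the example file):

# Generates 20 new tokens from an illustrative prompt, sampling with a mild temperature
sampled_tokens = conv_seq2seq.generate_text('the', source, length=20, temperature=0.8)

# `generate_text` returns a list of tokens, so they are joined for display
print(' '.join(sampled_tokens))
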
156 changes: 156 additions & 0 deletions textformer/models/conv_seq2seq.py
@@ -73,3 +73,159 @@ def forward(self, x, y, teacher_forcing_ratio=0.0):
preds, _ = self.D(y, conv, output)

return preds

def generate_text(self, start, field, length=10, temperature=1.0):
"""Generates text by feeding to the network the
current token (t) and predicting the next token (t+1).
Args:
field (torchtext.data.Field): Datatype instructions for tensor convertion.
start (str): The start string to generate the text.
length (int): Length of generated text.
temperature (float): A temperature value to sample the token.
Returns:
A list of generated text.
"""

logger.debug(f'Generating text with length: {length} ...')

        # Setting the evaluation flag
self.eval()

# Pre-processing the start text into tokens
tokens = field.preprocess(start)

# Numericalizing the tokens
tokens = field.numericalize([tokens]).to(self.device)

# Inhibits the gradient from updating the parameters
with torch.no_grad():
# Performs the initial encoding
conv, output = self.E(tokens)

# Removes the batch dimension from the tokens
tokens = tokens.squeeze(0)

# For every possible length
for i in range(length):
# Inhibits the gradient from updating the parameters
with torch.no_grad():
                # Decodes the tokens sampled so far (the convolutional decoder receives the whole prefix)
                preds, _ = self.D(tokens.unsqueeze(0), conv, output)

            # Keeps only the prediction for the last position and scales it with the temperature
            preds = preds[:, -1] / temperature

# Samples a token from a categorical distribution based on the predictions
sampled_token = distributions.Categorical(logits=preds).sample()

# Concatenate the sampled token with the input tokens
tokens = torch.cat((tokens, sampled_token))

# Decodes the tokens into text
sampled_text = [field.vocab.itos[t] for t in tokens]

return sampled_text
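
The temperature division above controls how peaked the categorical distribution is before sampling; a self-contained sketch of the same mechanism, with made-up logits for a five-token vocabulary:

import torch
from torch import distributions

# Illustrative logits for a five-token vocabulary
logits = torch.tensor([2.0, 1.0, 0.5, 0.1, -1.0])

# Lower temperatures sharpen the distribution (close to argmax), higher ones flatten it
for temperature in (0.5, 1.0, 2.0):
    scaled = logits / temperature
    probs = torch.softmax(scaled, dim=-1)
    sampled = distributions.Categorical(logits=scaled).sample()
    print(f'temperature={temperature}: probs={probs.tolist()}, sampled index={sampled.item()}')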

def translate_text(self, start, src_field, trg_field, max_length=10):
"""Translates text from the source vocabulary to the target vocabulary.
Note that you will need to implement this method directly on its child. Essentially,
each neural network has its own translation implementation.
Args:
start (str): The string to be translated.
src_field (torchtext.data.Field): Source vocabulary datatype instructions for tensor convertion.
trg_field (torchtext.data.Field): Target vocabulary datatype instructions for tensor convertion.
max_length (int): Maximum length of translated text.
Returns:
A list of translated text.
"""

        # Setting the evaluation flag
self.eval()

# Pre-processing the start text into tokens
tokens = src_field.preprocess(start)

        # Adding `<sos>` and `<eos>` to the tokens
tokens = [src_field.init_token] + tokens + [src_field.eos_token]

# Numericalizing the tokens
tokens = src_field.numericalize([tokens]).to(self.device)

# Inhibits the gradient from updating the parameters
with torch.no_grad():
            # Performs the initial encoding (the convolutional encoder returns conv features and combined outputs)
            conv, output = self.E(tokens)

# Creating a tensor with `<sos>` token from target vocabulary
tokens = torch.LongTensor([trg_field.vocab.stoi[trg_field.init_token]]).unsqueeze(0).to(self.device)

# For every possible token in maximum length
for i in range(max_length):
# Inhibits the gradient from updating the parameters
with torch.no_grad():
                # Decodes the target tokens sampled so far (the convolutional decoder receives the whole prefix)
                preds, _ = self.D(tokens, conv, output)

            # Samples the next token using argmax over the last position
            sampled_token = preds[:, -1].argmax(-1)

            # Concatenates the sampled token with the target tokens decoded so far
            tokens = torch.cat((tokens, sampled_token.unsqueeze(0)), dim=1)

            # Checks if the `<eos>` token has been reached
            if sampled_token.item() == trg_field.vocab.stoi[trg_field.eos_token]:
# If yes, breaks the loop
break

        # Decodes the tokens into text (dropping the batch dimension first)
        translated_text = [trg_field.vocab.itos[t] for t in tokens.squeeze(0)]

return translated_text[1:]
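
A short usage sketch for the method above, assuming `model` is a `ConvSeq2Seq` trained on a translation corpus and `source`/`target` are the fields used to build its dataset (all names and the input sentence are illustrative):

# Translates a single sentence; the method returns a list of target-vocabulary tokens
# with the leading `<sos>` already stripped
translation = model.translate_text('two dogs run on the beach', source, target, max_length=20)

# Drops a trailing `<eos>`, if present, before joining the tokens for display
if translation and translation[-1] == target.eos_token:
    translation = translation[:-1]
print(' '.join(translation))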

def bleu(self, dataset, src_field, trg_field, max_length=50, n_grams=4):
"""Calculates BLEU score over a dataset from its difference between targets and predictions.
Note that you will need to implement this method directly on its child. Essentially,
each neural network has its own bleu implementation, due to having different translation methods.
Args:
dataset (torchtext.data.Dataset): Dataset to have its BLEU calculated.
src_field (torchtext.data.Field): Source vocabulary datatype instructions for tensor convertion.
trg_field (torchtext.data.Field): Target vocabulary datatype instructions for tensor convertion.
max_length (int): Maximum length of translated text.
n_grams (int): Maxmimum n-grams to be used.
Returns:
BLEU score from input dataset.
"""

logger.info(f'Calculating BLEU with {n_grams}-grams ...')

# Defines a list for holding the targets and predictions
targets, preds = [], []

# For every example in the dataset
for data in dataset:
# Calculates the prediction, i.e., translated text
pred = self.translate_text(data.text, src_field, trg_field, max_length)

# Appends the prediction without the `<eos>` token
preds.append(pred[:-1])

# Appends an iterable of the target
targets.append([data.target])

# Calculates the BLEU score
bleu = bleu_score(preds, targets, max_n=n_grams)

logger.info(f'BLEU: {bleu}')

return bleu
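
The `bleu_score` call above most likely resolves to torchtext's corpus-level BLEU metric (`torchtext.data.metrics.bleu_score`); a self-contained sketch of that function with toy token lists (the sentences are illustrative):

from torchtext.data.metrics import bleu_score

# One candidate translation per example; each entry in the references list is itself a
# list of acceptable reference translations, mirroring how `targets` is built above
candidates = [['a', 'small', 'house'], ['the', 'cat', 'sat']]
references = [[['a', 'small', 'house']], [['the', 'cat', 'sat', 'down']]]

# max_n=4 reproduces the default 4-gram BLEU used by the method above
print(bleu_score(candidates, references, max_n=4, weights=[0.25, 0.25, 0.25, 0.25]))
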
155 changes: 155 additions & 0 deletions textformer/models/decoders/conv.py
@@ -0,0 +1,155 @@
import math

import torch
from torch import nn

import textformer.utils.logging as l
from textformer.core import Decoder
from textformer.models.layers import ResidualAttention

logger = l.get_logger(__name__)


class ConvDecoder(Decoder):
def __init__(self, n_output=128, n_hidden=128, n_embedding=128, n_layers=1,
kernel_size=3, dropout=0.5, scale=0.5, max_length=100, pad_token=None):
"""Initializion method.
Args:
n_input (int): Number of input units.
n_hidden (int): Number of hidden units.
n_embedding (int): Number of embedding units.
n_layers (int): Number of convolutional layers.
kernel_size (int): Size of the convolutional kernels.
dropout (float): Amount of dropout to be applied.
scale (float): Value for the residual learning.
max_length (int): Maximum length of positional embeddings.
pad_token (int): The index of a padding token.
"""

        logger.info('Overriding class: Decoder -> ConvDecoder.')

# Overriding its parent class
super(ConvDecoder, self).__init__()

# Number of output units
self.n_output = n_output

# Number of hidden units
self.n_hidden = n_hidden

# Number of embedding units
self.n_embedding = n_embedding

# Number of layers
self.n_layers = n_layers

# Checks if kernel size is even
if kernel_size % 2 == 0:
# If yes, adds one to make it odd
self.kernel_size = kernel_size + 1

# If it is odd
else:
# Uses the inputted kernel size
self.kernel_size = kernel_size

# Maximum length of positional embeddings
self.max_length = max_length

# Scale for the residual learning
self.scale = math.sqrt(scale)

# Padding token index
self.pad_token = pad_token

# Embedding layers
self.embedding = nn.Embedding(n_output, n_embedding)
self.pos_embedding = nn.Embedding(max_length, n_embedding)

# Fully connected layers
self.fc1 = nn.Linear(n_embedding, n_hidden)
self.fc2 = nn.Linear(n_hidden, n_embedding)

# Residual Attention layer
self.a = ResidualAttention(n_hidden, n_embedding, self.scale)

# Convolutional layers
self.conv = nn.ModuleList([nn.Conv1d(in_channels=n_hidden,
out_channels=2 * n_hidden,
kernel_size=self.kernel_size)
for _ in range(n_layers)])

# Dropout layer
self.dropout = nn.Dropout(dropout)

# Output layer
self.out = nn.Linear(n_embedding, n_output)

logger.debug(f'Size: ({self.n_output}, {self.n_hidden}) | Embeddings: {self.n_embedding} | Core: {self.conv}.')

def forward(self, y, enc_c, enc_o):
"""Performs a forward pass over the architecture.
Args:
y (torch.Tensor): Tensor containing the true labels.
enc_c (torch.Tensor): Tensor containing the convolutional features.
enc_o (torch.Tensor): Tensor containing combined outputs.
Returns:
The output and attention values.
"""

        # Creates the positions tensor on the same device as the input
        pos = torch.arange(0, y.shape[1], device=y.device).unsqueeze(0).repeat(y.shape[0], 1)

# Calculates the embedded outputs
y_embedded = self.embedding(y)
pos_embedded = self.pos_embedding(pos)

# Combines the embeddings
embedded = self.dropout(y_embedded + pos_embedded)

# Passing down to the first linear layer and permuting its dimension
hidden = self.fc1(embedded).permute(0, 2, 1)

# For every convolutional layer
for c in self.conv:
# Applying dropout
hidden = self.dropout(hidden)

            # Padding tensor, created on the same device as the hidden features
            pad = torch.zeros((hidden.shape[0], hidden.shape[1], self.kernel_size - 1), device=hidden.device)

            # If a padding token index was supplied
            if self.pad_token is not None:
                # Fills the padding with its index
                pad = pad.fill_(self.pad_token)

# Concatenating padding and convolutional features
conv = torch.cat((pad, hidden), dim=2)

# Pass down through convolutional layer
conv = c(conv)

# Activates with a GLU function
conv = nn.functional.glu(conv, dim=1)

# Calculating attention
attention, conv = self.a(embedded, conv, enc_c, enc_o)

# Applying residual connections
conv = (conv + hidden) * self.scale

            # Feeds the result back as the next layer's input
            hidden = conv

# Passes down back to embedding size
conv = self.fc2(conv.permute(0, 2, 1))

# Calculates the outputs
output = self.out(self.dropout(conv))

return output, attention
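
The left padding of `kernel_size - 1` positions is what keeps the decoder convolution causal (position t only sees positions up to t) while preserving the sequence length, and the GLU activation halves the doubled channel dimension back to `n_hidden`; a standalone sketch of just that mechanism with illustrative shapes:

import torch
from torch import nn

batch_size, n_hidden, seq_len, kernel_size = 2, 8, 5, 3

hidden = torch.randn(batch_size, n_hidden, seq_len)
conv = nn.Conv1d(in_channels=n_hidden, out_channels=2 * n_hidden, kernel_size=kernel_size)

# Pads only on the left, so the convolution never looks at future positions
pad = torch.zeros(batch_size, n_hidden, kernel_size - 1)
padded = torch.cat((pad, hidden), dim=2)  # (2, 8, 7)

out = conv(padded)                        # (2, 16, 5): the sequence length is preserved
out = nn.functional.glu(out, dim=1)       # (2, 8, 5): GLU halves the channels back to n_hidden

print(out.shape)  # torch.Size([2, 8, 5])
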
2 changes: 1 addition & 1 deletion textformer/models/encoders/conv.py
@@ -108,7 +108,7 @@ def forward(self, x):
hidden = self.fc1(embedded).permute(0, 2, 1)

# For every convolutional layer
-        for i, c in enumerate(self.conv):
+        for c in self.conv:
# Pass down through convolutional layer
conv = c(self.dropout(hidden))

3 changes: 1 addition & 2 deletions textformer/models/layers/residual_attention.py
@@ -1,6 +1,5 @@
import torch
-import torch.nn as nn
-import torch.nn.functional as F
+from torch import nn


class ResidualAttention(nn.Module):
