This repository has been archived by the owner on Apr 22, 2022. It is now read-only.

Commit

Finishing up convseq2seq, yet it still needs some adjustments.
gugarosa committed Jun 2, 2020
1 parent 268af6d commit 6a4a053
Showing 5 changed files with 317 additions and 4 deletions.
5 changes: 4 additions & 1 deletion examples/applications/generation/conv_seq2seq_generation.py
@@ -18,13 +18,16 @@
# Builds the vocabulary
source.build_vocab(dataset, min_freq=1)

+# Gathering the <pad> token index for further ignoring
+pad_index = source.vocab.stoi[source.pad_token]
+
# Creates an iterator that backpropagates through time
train_iterator = BPTTIterator(dataset, batch_size=16, bptt_len=10, device=device)

# Creating the ConvSeq2Seq model
conv_seq2seq = ConvSeq2Seq(n_input=len(source.vocab), n_output=len(source.vocab),
n_hidden=512, n_embedding=256, n_layers=1, kernel_size=3,
-                           scale=0.5, ignore_token=None, init_weights=None, device=device)
+                           scale=0.5, ignore_token=pad_index, init_weights=None, device=device)

# Training the model
conv_seq2seq.fit(train_iterator, epochs=10)
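
Since this commit also adds a `generate_text` method to `ConvSeq2Seq`, a minimal usage sketch for the model trained above might look as follows (the prompt string and sampling settings are illustrative, not part of the example file):

# Generates 20 new tokens from an illustrative prompt, sampling with a mild temperature
sampled_tokens = conv_seq2seq.generate_text('the', source, length=20, temperature=0.8)

# `generate_text` returns a list of tokens, so they are joined for display
print(' '.join(sampled_tokens))
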
156 changes: 156 additions & 0 deletions textformer/models/conv_seq2seq.py
@@ -73,3 +73,159 @@ def forward(self, x, y, teacher_forcing_ratio=0.0):
preds, _ = self.D(y, conv, output)

return preds

def generate_text(self, start, field, length=10, temperature=1.0):
"""Generates text by feeding to the network the
current token (t) and predicting the next token (t+1).
Args:
field (torchtext.data.Field): Datatype instructions for tensor convertion.
start (str): The start string to generate the text.
length (int): Length of generated text.
temperature (float): A temperature value to sample the token.
Returns:
A list of generated text.
"""

logger.debug(f'Generating text with length: {length} ...')

        # Setting the evaluation flag
self.eval()

# Pre-processing the start text into tokens
tokens = field.preprocess(start)

# Numericalizing the tokens
tokens = field.numericalize([tokens]).to(self.device)

# Inhibits the gradient from updating the parameters
with torch.no_grad():
# Performs the initial encoding
conv, output = self.E(tokens)

# Removes the batch dimension from the tokens
tokens = tokens.squeeze(0)

# For every possible length
for i in range(length):
# Inhibits the gradient from updating the parameters
with torch.no_grad():
                # Decodes the tokens sampled so far (the convolutional decoder receives the whole prefix)
                preds, _ = self.D(tokens.unsqueeze(0), conv, output)

            # Keeps only the prediction for the last position and scales it with the temperature
            preds = preds[:, -1] / temperature

# Samples a token from a categorical distribution based on the predictions
sampled_token = distributions.Categorical(logits=preds).sample()

# Concatenate the sampled token with the input tokens
tokens = torch.cat((tokens, sampled_token))

# Decodes the tokens into text
sampled_text = [field.vocab.itos[t] for t in tokens]

return sampled_text
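
The temperature division above controls how peaked the categorical distribution is before sampling; a self-contained sketch of the same mechanism, with made-up logits for a five-token vocabulary:

import torch
from torch import distributions

# Illustrative logits for a five-token vocabulary
logits = torch.tensor([2.0, 1.0, 0.5, 0.1, -1.0])

# Lower temperatures sharpen the distribution (close to argmax), higher ones flatten it
for temperature in (0.5, 1.0, 2.0):
    scaled = logits / temperature
    probs = torch.softmax(scaled, dim=-1)
    sampled = distributions.Categorical(logits=scaled).sample()
    print(f'temperature={temperature}: probs={probs.tolist()}, sampled index={sampled.item()}')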

def translate_text(self, start, src_field, trg_field, max_length=10):
"""Translates text from the source vocabulary to the target vocabulary.
Note that you will need to implement this method directly on its child. Essentially,
each neural network has its own translation implementation.
Args:
start (str): The string to be translated.
src_field (torchtext.data.Field): Source vocabulary datatype instructions for tensor convertion.
trg_field (torchtext.data.Field): Target vocabulary datatype instructions for tensor convertion.
max_length (int): Maximum length of translated text.
Returns:
A list of translated text.
"""

        # Setting the evaluation flag
self.eval()

# Pre-processing the start text into tokens
tokens = src_field.preprocess(start)

        # Adding `<sos>` and `<eos>` to the tokens
tokens = [src_field.init_token] + tokens + [src_field.eos_token]

# Numericalizing the tokens
tokens = src_field.numericalize([tokens]).to(self.device)

# Inhibits the gradient from updating the parameters
with torch.no_grad():
            # Performs the initial encoding (the convolutional encoder returns conv features and combined outputs)
            conv, output = self.E(tokens)

# Creating a tensor with `<sos>` token from target vocabulary
tokens = torch.LongTensor([trg_field.vocab.stoi[trg_field.init_token]]).unsqueeze(0).to(self.device)

# For every possible token in maximum length
for i in range(max_length):
# Inhibits the gradient from updating the parameters
with torch.no_grad():
                # Decodes the target tokens sampled so far (the convolutional decoder receives the whole prefix)
                preds, _ = self.D(tokens, conv, output)

            # Samples the next token using argmax over the last position
            sampled_token = preds[:, -1].argmax(-1)

            # Concatenates the sampled token with the target tokens decoded so far
            tokens = torch.cat((tokens, sampled_token.unsqueeze(0)), dim=1)

            # Checks if the `<eos>` token has been reached
            if sampled_token.item() == trg_field.vocab.stoi[trg_field.eos_token]:
# If yes, breaks the loop
break

        # Decodes the tokens into text (dropping the batch dimension first)
        translated_text = [trg_field.vocab.itos[t] for t in tokens.squeeze(0)]

return translated_text[1:]
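
A short usage sketch for the method above, assuming `model` is a `ConvSeq2Seq` trained on a translation corpus and `source`/`target` are the fields used to build its dataset (all names and the input sentence are illustrative):

# Translates a single sentence; the method returns a list of target-vocabulary tokens
# with the leading `<sos>` already stripped
translation = model.translate_text('two dogs run on the beach', source, target, max_length=20)

# Drops a trailing `<eos>`, if present, before joining the tokens for display
if translation and translation[-1] == target.eos_token:
    translation = translation[:-1]
print(' '.join(translation))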

def bleu(self, dataset, src_field, trg_field, max_length=50, n_grams=4):
"""Calculates BLEU score over a dataset from its difference between targets and predictions.
Note that you will need to implement this method directly on its child. Essentially,
each neural network has its own bleu implementation, due to having different translation methods.
Args:
dataset (torchtext.data.Dataset): Dataset to have its BLEU calculated.
src_field (torchtext.data.Field): Source vocabulary datatype instructions for tensor convertion.
trg_field (torchtext.data.Field): Target vocabulary datatype instructions for tensor convertion.
max_length (int): Maximum length of translated text.
n_grams (int): Maxmimum n-grams to be used.
Returns:
BLEU score from input dataset.
"""

logger.info(f'Calculating BLEU with {n_grams}-grams ...')

# Defines a list for holding the targets and predictions
targets, preds = [], []

# For every example in the dataset
for data in dataset:
# Calculates the prediction, i.e., translated text
pred = self.translate_text(data.text, src_field, trg_field, max_length)

# Appends the prediction without the `<eos>` token
preds.append(pred[:-1])

# Appends an iterable of the target
targets.append([data.target])

# Calculates the BLEU score
bleu = bleu_score(preds, targets, max_n=n_grams)

logger.info(f'BLEU: {bleu}')

return bleu
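
The `bleu_score` call above most likely resolves to torchtext's corpus-level BLEU metric (`torchtext.data.metrics.bleu_score`); a self-contained sketch of that function with toy token lists (the sentences are illustrative):

from torchtext.data.metrics import bleu_score

# One candidate translation per example; each entry in the references list is itself a
# list of acceptable reference translations, mirroring how `targets` is built above
candidates = [['a', 'small', 'house'], ['the', 'cat', 'sat']]
references = [[['a', 'small', 'house']], [['the', 'cat', 'sat', 'down']]]

# max_n=4 reproduces the default 4-gram BLEU used by the method above
print(bleu_score(candidates, references, max_n=4, weights=[0.25, 0.25, 0.25, 0.25]))
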
155 changes: 155 additions & 0 deletions textformer/models/decoders/conv.py
@@ -0,0 +1,155 @@
import math

import torch
from torch import nn

import textformer.utils.logging as l
from textformer.core import Decoder
from textformer.models.layers import ResidualAttention

logger = l.get_logger(__name__)


class ConvDecoder(Decoder):
def __init__(self, n_output=128, n_hidden=128, n_embedding=128, n_layers=1,
kernel_size=3, dropout=0.5, scale=0.5, max_length=100, pad_token=None):
"""Initializion method.
Args:
n_input (int): Number of input units.
n_hidden (int): Number of hidden units.
n_embedding (int): Number of embedding units.
n_layers (int): Number of convolutional layers.
kernel_size (int): Size of the convolutional kernels.
dropout (float): Amount of dropout to be applied.
scale (float): Value for the residual learning.
max_length (int): Maximum length of positional embeddings.
pad_token (int): The index of a padding token.
"""

        logger.info('Overriding class: Decoder -> ConvDecoder.')

# Overriding its parent class
super(ConvDecoder, self).__init__()

# Number of output units
self.n_output = n_output

# Number of hidden units
self.n_hidden = n_hidden

# Number of embedding units
self.n_embedding = n_embedding

# Number of layers
self.n_layers = n_layers

# Checks if kernel size is even
if kernel_size % 2 == 0:
# If yes, adds one to make it odd
self.kernel_size = kernel_size + 1

# If it is odd
else:
# Uses the inputted kernel size
self.kernel_size = kernel_size

# Maximum length of positional embeddings
self.max_length = max_length

# Scale for the residual learning
self.scale = math.sqrt(scale)

# Padding token index
self.pad_token = pad_token

# Embedding layers
self.embedding = nn.Embedding(n_output, n_embedding)
self.pos_embedding = nn.Embedding(max_length, n_embedding)

# Fully connected layers
self.fc1 = nn.Linear(n_embedding, n_hidden)
self.fc2 = nn.Linear(n_hidden, n_embedding)

# Residual Attention layer
self.a = ResidualAttention(n_hidden, n_embedding, self.scale)

# Convolutional layers
self.conv = nn.ModuleList([nn.Conv1d(in_channels=n_hidden,
out_channels=2 * n_hidden,
kernel_size=self.kernel_size)
for _ in range(n_layers)])

# Dropout layer
self.dropout = nn.Dropout(dropout)

# Output layer
self.out = nn.Linear(n_embedding, n_output)

logger.debug(f'Size: ({self.n_output}, {self.n_hidden}) | Embeddings: {self.n_embedding} | Core: {self.conv}.')

def forward(self, y, enc_c, enc_o):
"""Performs a forward pass over the architecture.
Args:
y (torch.Tensor): Tensor containing the true labels.
enc_c (torch.Tensor): Tensor containing the convolutional features.
enc_o (torch.Tensor): Tensor containing combined outputs.
Returns:
The output and attention values.
"""

        # Creates the positions tensor on the same device as the input
        pos = torch.arange(0, y.shape[1], device=y.device).unsqueeze(0).repeat(y.shape[0], 1)

# Calculates the embedded outputs
y_embedded = self.embedding(y)
pos_embedded = self.pos_embedding(pos)

# Combines the embeddings
embedded = self.dropout(y_embedded + pos_embedded)

# Passing down to the first linear layer and permuting its dimension
hidden = self.fc1(embedded).permute(0, 2, 1)

# For every convolutional layer
for c in self.conv:
# Applying dropout
hidden = self.dropout(hidden)

            # Padding tensor, created on the same device as the hidden features
            pad = torch.zeros((hidden.shape[0], hidden.shape[1], self.kernel_size - 1), device=hidden.device)

            # If a padding token index was supplied
            if self.pad_token is not None:
                # Fills the padding with its index
                pad = pad.fill_(self.pad_token)

# Concatenating padding and convolutional features
conv = torch.cat((pad, hidden), dim=2)

# Pass down through convolutional layer
conv = c(conv)

# Activates with a GLU function
conv = nn.functional.glu(conv, dim=1)

# Calculating attention
attention, conv = self.a(embedded, conv, enc_c, enc_o)

# Applying residual connections
conv = (conv + hidden) * self.scale

            # Feeds the result back as the next layer's input
            hidden = conv

# Passes down back to embedding size
conv = self.fc2(conv.permute(0, 2, 1))

# Calculates the outputs
output = self.out(self.dropout(conv))

return output, attention
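
The left padding of `kernel_size - 1` positions is what keeps the decoder convolution causal (position t only sees positions up to t) while preserving the sequence length, and the GLU activation halves the doubled channel dimension back to `n_hidden`; a standalone sketch of just that mechanism with illustrative shapes:

import torch
from torch import nn

batch_size, n_hidden, seq_len, kernel_size = 2, 8, 5, 3

hidden = torch.randn(batch_size, n_hidden, seq_len)
conv = nn.Conv1d(in_channels=n_hidden, out_channels=2 * n_hidden, kernel_size=kernel_size)

# Pads only on the left, so the convolution never looks at future positions
pad = torch.zeros(batch_size, n_hidden, kernel_size - 1)
padded = torch.cat((pad, hidden), dim=2)  # (2, 8, 7)

out = conv(padded)                        # (2, 16, 5): the sequence length is preserved
out = nn.functional.glu(out, dim=1)       # (2, 8, 5): GLU halves the channels back to n_hidden

print(out.shape)  # torch.Size([2, 8, 5])
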
2 changes: 1 addition & 1 deletion textformer/models/encoders/conv.py
@@ -108,7 +108,7 @@ def forward(self, x):
hidden = self.fc1(embedded).permute(0, 2, 1)

# For every convolutional layer
-        for i, c in enumerate(self.conv):
+        for c in self.conv:
# Pass down through convolutional layer
conv = c(self.dropout(hidden))

3 changes: 1 addition & 2 deletions textformer/models/layers/residual_attention.py
@@ -1,6 +1,5 @@
import torch
-import torch.nn as nn
-import torch.nn.functional as F
+from torch import nn


class ResidualAttention(nn.Module):
