This repository has been archived by the owner on Apr 22, 2022. It is now read-only.

Commit ef38f32
This seems more appropriate.
gugarosa committed Jun 29, 2020
1 parent b1edee8 commit ef38f32
Showing 12 changed files with 53 additions and 53 deletions.
@@ -34,7 +34,7 @@
                           ignore_token=target_pad_index, init_weights=None, device=device)
 
 # Training the model
-att_seq2seq.fit(train_iterator, val_iterator, epochs=10)
+att_seq2seq.fit(train_iterator, val_iterator, epochs=1)
 
 # Evaluating the model
 att_seq2seq.evaluate(test_iterator)

@@ -36,7 +36,7 @@
                             init_weights=None, device=device)
 
 # Training the model
-conv_seq2seq.fit(train_iterator, val_iterator, epochs=10)
+conv_seq2seq.fit(train_iterator, val_iterator, epochs=1)
 
 # Evaluating the model
 conv_seq2seq.evaluate(test_iterator)

textformer/models/att_seq2seq.py (3 additions & 3 deletions)

@@ -75,7 +75,7 @@ def forward(self, x, y, teacher_forcing_ratio=0.5):
         # For every possible token in the sequence
         for t in range(1, y.shape[0]):
             # Decodes the tensor
-            pred, hidden, _ = self.D(x, hidden, outputs)
+            pred, hidden, _ = self.D(x, outputs, hidden)
 
             # Gathers the prediction of current token
             preds[t] = pred

@@ -134,7 +134,7 @@ def generate_text(self, start, field, length=10, temperature=1.0):
             # Inhibits the gradient from updating the parameters
             with torch.no_grad():
                 # Decodes only the last token, i.e., last sampled token
-                preds, hidden, _ = self.D(tokens[-1], hidden, outputs)
+                preds, hidden, _ = self.D(tokens[-1], outputs, hidden)
 
                 # Regularize the prediction with the temperature
                 preds /= temperature

@@ -195,7 +195,7 @@ def translate_text(self, start, src_field, trg_field, max_length=10):
             # Inhibits the gradient from updating the parameters
             with torch.no_grad():
                 # Decodes only the last token, i.e., last sampled token
-                preds, hidden, att = self.D(tokens[-1], hidden, outputs)
+                preds, hidden, att = self.D(tokens[-1], outputs, hidden)
 
                 # Retrieving current token attention values
                 atts[i] = att

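Note: all three decoder call sites above now pass the encoder outputs before the hidden state, matching the reordered forward(self, x, o, h) signature introduced in textformer/models/decoders/att_bi_gru.py below.
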
textformer/models/decoders/att_bi_gru.py (5 additions & 5 deletions)

@@ -57,13 +57,13 @@ def __init__(self, n_output=128, n_hidden_enc=128, n_hidden_dec=128, n_embedding
         logger.debug(
             f'Size: ({self.n_output}, {self.n_hidden}) | Embeddings: {self.n_embedding} | Core: {self.rnn} | Attention: {self.a} | Output: {self.fc}.')
 
-    def forward(self, x, h, y):
+    def forward(self, x, o, h):
         """Performs a forward pass over the architecture.
         Args:
-            x (torch.Tensor): Tensor containing the data.
+            x (torch.Tensor): Tensor containing the input data.
+            o (torch.Tensor): Tensor containing the encoded outputs.
             h (torch.Tensor): Tensor containing the hidden states.
-            y (torch.Tensor): Tensor containing the encoder outputs.
         Returns:
             The prediction and hidden state.

@@ -74,10 +74,10 @@ def forward(self, x, h, y):
         # Calculates the embedded layer
         embedded = self.dropout(self.embedding(x.unsqueeze(0)))
 
         # Calculates the attention
-        attention = self.a(h, y).unsqueeze(1)
+        attention = self.a(o, h).unsqueeze(1)
 
         # Permutes the encoder outputs
-        encoder_outputs = y.permute(1, 0, 2)
+        encoder_outputs = o.permute(1, 0, 2)
 
         # Calculates the weights from the attention-based layer
         weighted = torch.bmm(attention, encoder_outputs).permute(1, 0, 2)

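For context, the two rewritten lines feed the attention weights into a batched matrix multiply. A minimal, self-contained sketch of that step (all shapes here are illustrative assumptions, not the library's defaults):

import torch

batch, src_len, enc_dim = 4, 7, 256

# Attention weights over the source positions, with an added middle dimension
attention = torch.softmax(torch.randn(batch, src_len), dim=1).unsqueeze(1)

# Encoder outputs permuted from (src_len, batch, dim) to (batch, src_len, dim)
encoder_outputs = torch.randn(src_len, batch, enc_dim).permute(1, 0, 2)

# Each batch item's weights pool its encoder states: (batch, 1, dim),
# permuted back to (1, batch, dim) for the recurrent step
weighted = torch.bmm(attention, encoder_outputs).permute(1, 0, 2)
print(weighted.shape)  # torch.Size([1, 4, 256])
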
textformer/models/decoders/conv.py (6 additions & 6 deletions)

@@ -89,13 +89,13 @@ def __init__(self, n_output=128, n_hidden=128, n_embedding=128, n_layers=1,
         logger.debug(f'Size: ({self.n_output}, {self.n_hidden}) | Embeddings: {self.n_embedding} | Core: {self.conv}.')
 
-    def forward(self, y, enc_c, enc_o):
+    def forward(self, y, c, o):
         """Performs a forward pass over the architecture.
         Args:
             y (torch.Tensor): Tensor containing the true labels.
-            enc_c (torch.Tensor): Tensor containing the convolutional features.
-            enc_o (torch.Tensor): Tensor containing combined outputs.
+            c (torch.Tensor): Tensor containing the convolutional features.
+            o (torch.Tensor): Tensor containing combined outputs.
         Returns:
             The output and attention values.

@@ -116,7 +116,7 @@ def forward(self, y, enc_c, enc_o):
         hidden = self.fc1(embedded).permute(0, 2, 1)
 
         # For every convolutional layer
-        for c in self.conv:
+        for layer in self.conv:
             # Applying dropout
             hidden = self.dropout(hidden)

@@ -132,13 +132,13 @@ def forward(self, y, enc_c, enc_o):
             conv = torch.cat((pad, hidden), dim=2)
 
             # Pass down through convolutional layer
-            conv = c(conv)
+            conv = layer(conv)
 
             # Activates with a GLU function
             conv = nn.functional.glu(conv, dim=1)
 
             # Calculating attention
-            attention, conv = self.a(embedded, conv, enc_c, enc_o)
+            attention, conv = self.a(embedded, conv, c, o)
 
             # Applying residual connections
             conv = (conv + hidden) * self.scale

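The loop-variable rename is not cosmetic: under the new signature, c is the convolutional-features argument, and keeping for c in self.conv would have shadowed it before the attention call. A toy illustration of the hazard, with hypothetical names rather than library code:

conv_layers = [lambda v: v * 2, lambda v: v + 1]

def forward(y, c, o):
    hidden = y
    for layer in conv_layers:    # renamed from `c`; that name would hide the argument
        hidden = layer(hidden)
        hidden = hidden + c + o  # `c` must still mean the features argument here
    return hidden

print(forward(1.0, 10.0, 100.0))  # 223.0
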
textformer/models/decoders/gru.py (3 additions & 3 deletions)

@@ -52,11 +52,11 @@ def __init__(self, n_output=128, n_hidden=128, n_embedding=128, dropout=0.5):
         logger.debug(
             f'Size: ({self.n_output}, {self.n_hidden}) | Embeddings: {self.n_embedding} | Core: {self.rnn} | Output: {self.fc}.')
 
-    def forward(self, x_enc, h, c):
+    def forward(self, x, h, c):
         """Performs a forward pass over the architecture.
         Args:
-            x_enc (torch.Tensor): Tensor containing the encoded data.
+            x (torch.Tensor): Tensor containing the input data.
             h (torch.Tensor): Tensor containing the hidden states.
             c (torch.Tensor): Tensor containing the context.

@@ -66,7 +66,7 @@ def forward(self, x_enc, h, c):
         """
 
         # Calculates the embedded layer
-        embedded = self.dropout(self.embedding(x_enc.unsqueeze(0)))
+        embedded = self.dropout(self.embedding(x.unsqueeze(0)))
 
         # Concatenating the embedding and context tensors
         concat_embedded = torch.cat((embedded, c), dim=2)

textformer/models/decoders/lstm.py (3 additions & 3 deletions)

@@ -55,11 +55,11 @@ def __init__(self, n_output=128, n_hidden=128, n_embedding=128, n_layers=1, drop
         logger.debug(
             f'Size: ({self.n_output}, {self.n_hidden}) | Embeddings: {self.n_embedding} | Core: {self.rnn} | Output: {self.fc}.')
 
-    def forward(self, x_enc, h, c):
+    def forward(self, x, h, c):
         """Performs a forward pass over the architecture.
         Args:
-            x_enc (torch.Tensor): Tensor containing the encoded data.
+            x (torch.Tensor): Tensor containing the input data.
             h (torch.Tensor): Tensor containing the hidden states.
             c (torch.Tensor): Tensor containing the cell.

@@ -69,7 +69,7 @@ def forward(self, x_enc, h, c):
         """
 
         # Calculates the embedded layer
-        embedded = self.dropout(self.embedding(x_enc.unsqueeze(0)))
+        embedded = self.dropout(self.embedding(x.unsqueeze(0)))
 
         # Calculates the RNN layer
         output, (hidden, cell) = self.rnn(embedded, (h, c))

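In both recurrent decoders the rename is clarifying rather than functional: as the forward passes of joint_seq2seq.py and seq2seq.py below show, the tensor handed to the decoder is the previous target token (a true label under teacher forcing, otherwise the best prediction), not encoded data, so x documented as "input data" fits better than x_enc.
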
textformer/models/decoders/multi_head.py (1 addition & 1 deletion)

@@ -68,7 +68,7 @@ def __init__(self, n_output=128, n_hidden=128, n_forward=256, n_layers=1,
         # Output layer
         self.out = nn.Linear(n_hidden, n_output)
 
-    def forward(self, y, y_mask, x_enc, x_mask):
+    def forward(self, y, y_mask, x, x_mask):
         """Performs a forward pass over the architecture.
         Args:

textformer/models/encoders/bi_gru.py (2 additions & 2 deletions)

@@ -70,8 +70,8 @@ def forward(self, x):
         # Calculates the RNN outputs
         outputs, hidden = self.rnn(embedded)
 
-        # Initial Decoder hidden layer is the final hidden state of the Encoder forward and backward RNNs
-        # Also, they are fed through a Linear layer
+        # Calculates the final hidden state of the encoder forward and backward RNNs
+        # Also, they are fed through a linear layer
         hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
 
         return outputs, hidden

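The rewritten comment describes combining the last forward and backward hidden states of the bidirectional GRU. A self-contained sketch of that combination, with illustrative sizes (assumptions, not the library's defaults):

import torch
import torch.nn as nn

n_embedding, n_hidden_enc, n_hidden_dec, src_len, batch = 16, 32, 32, 7, 4
rnn = nn.GRU(n_embedding, n_hidden_enc, bidirectional=True)
fc = nn.Linear(n_hidden_enc * 2, n_hidden_dec)

embedded = torch.randn(src_len, batch, n_embedding)

# hidden stacks the directions: hidden[-2] is the last forward state,
# hidden[-1] the last backward state
outputs, hidden = rnn(embedded)

# Concatenates both directions and squashes them through the linear layer
hidden = torch.tanh(fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
print(outputs.shape, hidden.shape)  # torch.Size([7, 4, 64]) torch.Size([4, 32])
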
textformer/models/joint_seq2seq.py (12 additions & 12 deletions)

@@ -69,12 +69,12 @@ def forward(self, x, y, teacher_forcing_ratio=0.5):
         hidden = context = self.E(x)
 
         # Make sure that the first decoding will come from the true labels
-        x_enc = y[0, :]
+        x = y[0, :]
 
         # For every possible token in the sequence
         for t in range(1, y.shape[0]):
             # Decodes the tensor
-            pred, hidden = self.D(x_enc, hidden, context)
+            pred, hidden = self.D(x, hidden, context)
 
             # Gathers the prediction of current token
             preds[t] = pred

@@ -85,12 +85,12 @@ def forward(self, x, y, teacher_forcing_ratio=0.5):
             # If teacher forcing should be used
             if teacher_forcing:
                 # Gathers the new input from the true labels
-                x_enc = y[t]
+                x = y[t]
 
             # If teacher forcing should not be used
             else:
                 # Gathers the new input from the best prediction
-                x_enc = pred.argmax(1)
+                x = pred.argmax(1)
 
         return preds
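
For readers new to the pattern these hunks rename, a self-contained sketch of the teacher-forcing loop itself, with toy stand-ins for the embedding and decoder (every size and module here is an illustrative assumption):

import random

import torch
import torch.nn as nn

n_vocab, n_hidden, batch, trg_len = 10, 8, 2, 5
embed = nn.Embedding(n_vocab, n_hidden)  # toy embedding
decoder = nn.Linear(n_hidden, n_vocab)   # toy decoder head

y = torch.randint(0, n_vocab, (trg_len, batch))  # true labels, (trg_len, batch)
hidden = torch.zeros(batch, n_hidden)            # pretend encoder state
preds = torch.zeros(trg_len, batch, n_vocab)

# The first decoding input comes from the true labels (the <sos> row)
x = y[0, :]
for t in range(1, trg_len):
    # Toy decoding step standing in for self.D(x, hidden, context)
    pred = decoder(embed(x) + hidden)
    preds[t] = pred
    # With probability teacher_forcing_ratio, feed the true token back in;
    # otherwise feed the best prediction
    x = y[t] if random.random() < 0.5 else pred.argmax(1)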

@@ -126,14 +126,14 @@ def generate_text(self, start, field, length=10, temperature=1.0):
         hidden = context = self.E(tokens)
 
         # Removes the batch dimension from the tokens
-        tokens_enc = tokens.squeeze(0)
+        tokens = tokens.squeeze(0)
 
         # For every possible length
         for i in range(length):
             # Inhibits the gradient from updating the parameters
             with torch.no_grad():
                 # Decodes only the last token, i.e., last sampled token
-                preds, hidden = self.D(tokens_enc[-1], hidden, context)
+                preds, hidden = self.D(tokens[-1], hidden, context)
 
                 # Regularize the prediction with the temperature
                 preds /= temperature

@@ -142,10 +142,10 @@ def generate_text(self, start, field, length=10, temperature=1.0):
             sampled_token = distributions.Categorical(logits=preds).sample()
 
             # Concatenate the sampled token with the input tokens
-            tokens_enc = torch.cat((tokens_enc, sampled_token.unsqueeze(0)))
+            tokens = torch.cat((tokens, sampled_token.unsqueeze(0)))
 
         # Decodes the tokens into text
-        sampled_text = [field.vocab.itos[t] for t in tokens_enc]
+        sampled_text = [field.vocab.itos[t] for t in tokens]
 
         return sampled_text

@@ -184,28 +184,28 @@ def translate_text(self, start, src_field, trg_field, max_length=10):
         hidden = context = self.E(tokens)
 
         # Creating a tensor with `<sos>` token from target vocabulary
-        tokens_enc = torch.LongTensor([trg_field.vocab.stoi[trg_field.init_token]]).unsqueeze(0).to(self.device)
+        tokens = torch.LongTensor([trg_field.vocab.stoi[trg_field.init_token]]).unsqueeze(0).to(self.device)
 
         # For every possible token in maximum length
         for i in range(max_length):
             # Inhibits the gradient from updating the parameters
             with torch.no_grad():
                 # Decodes only the last token, i.e., last sampled token
-                preds, hidden = self.D(tokens_enc[-1], hidden, context)
+                preds, hidden = self.D(tokens[-1], hidden, context)
 
             # Samples a token using argmax
             sampled_token = preds.argmax(1)
 
             # Concatenate the sampled token with the input tokens
-            tokens_enc = torch.cat((tokens_enc, sampled_token.unsqueeze(0)))
+            tokens = torch.cat((tokens, sampled_token.unsqueeze(0)))
 
             # Check if has reached the end of string
             if sampled_token == trg_field.vocab.stoi[trg_field.eos_token]:
                 # If yes, breaks the loop
                 break
 
         # Decodes the tokens into text
-        translated_text = [trg_field.vocab.itos[t] for t in tokens_enc]
+        translated_text = [trg_field.vocab.itos[t] for t in tokens]
 
         return translated_text[1:]

textformer/models/layers/attention.py (4 additions & 4 deletions)

@@ -30,23 +30,23 @@ def __init__(self, n_hidden_enc, n_hidden_dec):
         # Defining the weight-based layer
         self.v = nn.Linear(n_hidden_dec, 1, bias=False)
 
-    def forward(self, h, y):
+    def forward(self, o, h):
         """Performs a forward pass over the layer.
         Args:
+            o (torch.Tensor): Tensor containing the encoded outputs.
             h (torch.Tensor): Tensor containing the hidden states.
-            y (torch.Tensor): Tensor containing the encoder outputs.
         Returns:
             The attention-based weights.
         """
 
         # Repeating the decoder hidden states as its smaller than the encoder ones
-        hidden = h.unsqueeze(1).repeat(1, y.shape[0], 1)
+        hidden = h.unsqueeze(1).repeat(1, o.shape[0], 1)
 
         # Permuting the outputs
-        encoder_outputs = y.permute(1, 0, 2)
+        encoder_outputs = o.permute(1, 0, 2)
 
         # Calculating the energy between decoder hidden state and encoder hidden states
         energy = torch.tanh(self.e(torch.cat((hidden, encoder_outputs), dim=2)))

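Putting the reordered signature together, a minimal, self-contained sketch of the whole layer; the size of self.e and the final softmax are not visible in this hunk, so both are assumptions based on the surrounding code:

import torch
import torch.nn as nn

class Attention(nn.Module):
    def __init__(self, n_hidden_enc, n_hidden_dec):
        super().__init__()
        # Energy layer over bidirectional encoder outputs plus the decoder state
        # (assumed size; only self.v appears in the hunk above)
        self.e = nn.Linear(n_hidden_enc * 2 + n_hidden_dec, n_hidden_dec)
        # Defining the weight-based layer
        self.v = nn.Linear(n_hidden_dec, 1, bias=False)

    def forward(self, o, h):
        # Repeats the decoder hidden state across the source length
        hidden = h.unsqueeze(1).repeat(1, o.shape[0], 1)
        # Permutes the encoder outputs to (batch, src_len, dim)
        encoder_outputs = o.permute(1, 0, 2)
        # Energy between the decoder state and every encoder output
        energy = torch.tanh(self.e(torch.cat((hidden, encoder_outputs), dim=2)))
        # Softmax over the source positions (assumed; it falls below this hunk)
        return torch.softmax(self.v(energy).squeeze(2), dim=1)

o = torch.randn(7, 4, 256)  # (src_len, batch, 2 * n_hidden_enc), n_hidden_enc = 128
h = torch.randn(4, 128)     # (batch, n_hidden_dec)
print(Attention(128, 128)(o, h).shape)  # torch.Size([4, 7])
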
textformer/models/seq2seq.py (12 additions & 12 deletions)

@@ -70,12 +70,12 @@ def forward(self, x, y, teacher_forcing_ratio=0.5):
         hidden, cell = self.E(x)
 
         # Make sure that the first decoding will come from the true labels
-        x_enc = y[0, :]
+        x = y[0, :]
 
         # For every possible token in the sequence
         for t in range(1, y.shape[0]):
             # Decodes the tensor
-            pred, hidden, cell = self.D(x_enc, hidden, cell)
+            pred, hidden, cell = self.D(x, hidden, cell)
 
             # Gathers the prediction of current token
             preds[t] = pred

@@ -86,12 +86,12 @@ def forward(self, x, y, teacher_forcing_ratio=0.5):
             # If teacher forcing should be used
             if teacher_forcing:
                 # Gathers the new input from the true labels
-                x_enc = y[t]
+                x = y[t]
 
             # If teacher forcing should not be used
             else:
                 # Gathers the new input from the best prediction
-                x_enc = pred.argmax(1)
+                x = pred.argmax(1)
 
         return preds

@@ -127,14 +127,14 @@ def generate_text(self, start, field, length=10, temperature=1.0):
         hidden, cell = self.E(tokens)
 
         # Removes the batch dimension from the tokens
-        tokens_enc = tokens.squeeze(0)
+        tokens = tokens.squeeze(0)
 
         # For every possible length
         for i in range(length):
             # Inhibits the gradient from updating the parameters
             with torch.no_grad():
                 # Decodes only the last token, i.e., last sampled token
-                preds, hidden, cell = self.D(tokens_enc[-1], hidden, cell)
+                preds, hidden, cell = self.D(tokens[-1], hidden, cell)
 
                 # Regularize the prediction with the temperature
                 preds /= temperature

@@ -143,10 +143,10 @@ def generate_text(self, start, field, length=10, temperature=1.0):
             sampled_token = distributions.Categorical(logits=preds).sample()
 
             # Concatenate the sampled token with the input tokens
-            tokens_enc = torch.cat((tokens_enc, sampled_token.unsqueeze(0)))
+            tokens = torch.cat((tokens, sampled_token.unsqueeze(0)))
 
         # Decodes the tokens into text
-        sampled_text = [field.vocab.itos[t] for t in tokens_enc]
+        sampled_text = [field.vocab.itos[t] for t in tokens]
 
         return sampled_text
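
The preds /= temperature line above implements temperature sampling: dividing the logits sharpens the distribution for temperatures below 1.0 and flattens it above 1.0. A minimal illustration:

import torch
from torch import distributions

logits = torch.tensor([2.0, 1.0, 0.1])
for temperature in (0.5, 1.0, 2.0):
    # Lower temperature -> peakier distribution -> more conservative samples
    probs = torch.softmax(logits / temperature, dim=0)
    sampled = distributions.Categorical(logits=logits / temperature).sample()
    print(temperature, probs, int(sampled))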

@@ -185,28 +185,28 @@ def translate_text(self, start, src_field, trg_field, max_length=10):
         hidden, cell = self.E(tokens)
 
         # Creating a tensor with `<sos>` token from target vocabulary
-        tokens_enc = torch.LongTensor([trg_field.vocab.stoi[trg_field.init_token]]).unsqueeze(0).to(self.device)
+        tokens = torch.LongTensor([trg_field.vocab.stoi[trg_field.init_token]]).unsqueeze(0).to(self.device)
 
         # For every possible token in maximum length
         for i in range(max_length):
             # Inhibits the gradient from updating the parameters
             with torch.no_grad():
                 # Decodes only the last token, i.e., last sampled token
-                preds, hidden, cell = self.D(tokens_enc[-1], hidden, cell)
+                preds, hidden, cell = self.D(tokens[-1], hidden, cell)
 
             # Samples a token using argmax
             sampled_token = preds.argmax(1)
 
             # Concatenate the sampled token with the input tokens
-            tokens_enc = torch.cat((tokens_enc, sampled_token.unsqueeze(0)))
+            tokens = torch.cat((tokens, sampled_token.unsqueeze(0)))
 
             # Check if has reached the end of string
             if sampled_token == trg_field.vocab.stoi[trg_field.eos_token]:
                 # If yes, breaks the loop
                 break
 
         # Decodes the tokens into text
-        translated_text = [trg_field.vocab.itos[t] for t in tokens_enc]
+        translated_text = [trg_field.vocab.itos[t] for t in tokens]
 
         return translated_text[1:]
