This repository has been archived by the owner on Apr 22, 2022. It is now read-only.

Continuing to work on Transformer architecture.
gugarosa committed Jul 3, 2020
1 parent ef38f32 commit 41f7aac
Showing 4 changed files with 238 additions and 80 deletions.
80 changes: 0 additions & 80 deletions textformer/models/encoders/multi_head.py

This file was deleted.

131 changes: 131 additions & 0 deletions textformer/models/encoders/self_attention.py
@@ -0,0 +1,131 @@
import math

import textformer.utils.logging as l
import torch
from textformer.core import Encoder
from textformer.models.layers import MultiHeadAttention, PositionWideForward
from torch import nn

logger = l.get_logger(__name__)


class SelfAttentionLayer(nn.Module):
"""A SelfAttentionLayer is used to supply the self-attention layer to the encoding part of the Transformer architecture.
"""

def __init__(self, n_hidden=128, n_forward=256, n_heads=3, dropout=0.1):
"""Initialization method.
Args:
n_hidden (int): Number of hidden units.
n_forward (int): Number of feed forward units.
n_heads (int): Number of attention heads.
dropout (float): Dropout probability.
"""

# Overriding its parent class
super(SelfAttentionLayer, self).__init__()

# Layer normalization applied after the self-attention sub-layer
self.self_attn_layer_norm = nn.LayerNorm(n_hidden)

# Layer normalization applied after the feed-forward sub-layer
self.ff_layer_norm = nn.LayerNorm(n_hidden)

# Multi-head self-attention mechanism
self.self_attention = MultiHeadAttention(n_hidden, n_heads, dropout)

# Position-wise feed-forward network
self.positionwise_feedforward = PositionWideForward(
n_hidden, n_forward, dropout)

# Dropout layer
self.drop = nn.Dropout(dropout)

def forward(self, src, src_mask):
"""
"""

# Performs the self-attention mechanism
_src, _ = self.self_attention(src, src, src, src_mask)

# Performs the dropout with residual connection and layer normalization
src = self.self_attn_layer_norm(src + self.drop(_src))

# Performs the position-wise forwarding
_src = self.positionwise_feedforward(src)

# Performs the dropout with residual connection and layer normalization
src = self.ff_layer_norm(src + self.drop(_src))

return src


class SelfAttentionEncoder(Encoder):
"""A SelfAttentionEncoder is used to supply the encoding part of the Transformer architecture.
"""

def __init__(self, n_input=128, n_hidden=128, n_forward=256, n_layers=1,
n_heads=3, dropout=0.1, max_length=100):
"""Initializion method.
Args:
n_input (int): Number of input units.
n_hidden (int): Number of hidden units.
n_forward (int): Number of feed forward units.
n_layers (int): Number of attention layers.
n_heads (int): Number of attention heads.
dropout (float): Amount of dropout to be applied.
max_length (int): Maximum length of positional embeddings.
"""

logger.info('Overriding class: Encoder -> SelfAttentionEncoder.')

# Overriding its parent class
super(SelfAttentionEncoder, self).__init__()

# Number of input units
self.n_input = n_input

# Number of hidden units
self.n_hidden = n_hidden

# Number of feed forward units
self.n_forward = n_forward

# Number of attention layers
self.n_layers = n_layers

# Number of attention heads
self.n_heads = n_heads

# Maximum length of positional embeddings
self.max_length = max_length

# Scale for the residual learning
self.scale = math.sqrt(n_hidden)

# Embedding layers
self.embedding = nn.Embedding(n_input, n_hidden)
self.pos_embedding = nn.Embedding(max_length, n_hidden)

# Encoding layers
self.encoders = nn.ModuleList([SelfAttentionLayer(n_hidden, n_forward, n_heads, dropout) for _ in range(n_layers)])

# Dropout layer
self.dropout = nn.Dropout(dropout)

def forward(self, x, x_mask):
"""Performs a forward pass over the architecture.
Args:
x (torch.Tensor): Tensor containing the data.
x_mask (torch.Tensor): Tensor containing the masked data.
Returns:
The output values.
"""

pass
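The forward pass above is left as a stub, consistent with the commit message about the Transformer work still being in progress. For reference, here is a minimal sketch of how this kind of encoder forward pass is commonly written, using only the attributes defined in __init__ above; the position-index construction via torch.arange is an assumption of this sketch, not something the commit defines:

def forward(self, x, x_mask):
    """Performs a forward pass over the architecture (sketch, not part of this commit)."""

    # Gathers the batch size and sequence length
    batch_size, seq_len = x.shape[0], x.shape[1]

    # Builds positional indices of shape (batch_size, seq_len) -- assumed helper, not in the commit
    pos = torch.arange(0, seq_len, device=x.device).unsqueeze(0).repeat(batch_size, 1)

    # Sums the scaled token embeddings with the positional embeddings and applies dropout
    x = self.dropout(self.embedding(x) * self.scale + self.pos_embedding(pos))

    # Passes the representation through every self-attention encoding layer
    for layer in self.encoders:
        x = layer(x, x_mask)

    return x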
104 changes: 104 additions & 0 deletions textformer/models/layers/multi_head_attention.py
@@ -0,0 +1,104 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

import textformer.utils.constants as c


class MultiHeadAttention(nn.Module):
"""A MultiHeadAttention class is used to provide multi-head attention-based mechanisms in a neural network layer.
References:
A. Vaswani, et al. Attention is all you need. Advances in neural information processing systems (2017).
"""

def __init__(self, n_hidden, n_heads, dropout):
"""Initialization method.
Args:
n_hidden (int): Number of hidden units.
n_heads (int): Number of attention heads.
dropout (float): Dropout probability.
"""

# Overriding its parent class
super(MultiHeadAttention, self).__init__()

# Asserts that the number of hidden units is divisible by the number of heads
assert n_hidden % n_heads == 0

# Number of hidden units
self.n_hidden = n_hidden

# Number of attention heads
self.n_heads = n_heads

# Size of attention head
self.head_size = n_hidden // n_heads

# Linear projections (query, key and value)
self.q = nn.Linear(n_hidden, n_hidden)
self.k = nn.Linear(n_hidden, n_hidden)
self.v = nn.Linear(n_hidden, n_hidden)

# Output projection
self.out = nn.Linear(n_hidden, n_hidden)

# Dropout layer
self.drop = nn.Dropout(dropout)

# Scale for the residual connections
self.scale = math.sqrt(self.head_size)

def forward(self, query, key, value, mask=None):
"""Performs a forward pass over the layer.
Args:
query (torch.Tensor): Tensor containing the queries.
key (torch.Tensor): Tensor containing the keys.
value (torch.Tensor): Tensor containing the values.
mask (torch.Tensor): Optional tensor containing the mask.
Returns:
The attention-based outputs and the attention weights.
"""

# Gathers the batch size
batch_size = query.shape[0]

# Performs the linear projections to calculate Q, K and V
Q = self.q(query)
K = self.k(key)
V = self.v(value)

# Reshapes Q, K and V
Q = Q.view(batch_size, -1, self.n_heads, self.head_size).permute(0, 2, 1, 3)
K = K.view(batch_size, -1, self.n_heads, self.head_size).permute(0, 2, 1, 3)
V = V.view(batch_size, -1, self.n_heads, self.head_size).permute(0, 2, 1, 3)

# Calculates the energy
energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale

# Checks if a mask is supplied
if mask is not None:
# Fills the energy with a very large negative value where the mask equals zero,
# so those positions receive (almost) no weight after the softmax
energy = energy.masked_fill(mask == 0, -1e10)

# Calculates the attention
attention = torch.softmax(energy, dim=-1)

# Performs the energy-value projection
x = (torch.matmul(self.drop(attention), V)).permute(0, 2, 1, 3)

# Reshapes back to hidden units (the permuted tensor must be made contiguous before viewing)
x = x.contiguous().view(batch_size, -1, self.n_hidden)

# Passes down through output layer
x = self.out(x)

return x, attention
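For reference, a hedged usage sketch of the layer above; the shapes and hyperparameter values are illustrative assumptions (the assert in __init__ requires n_hidden to be divisible by n_heads), and the import path follows the one already used by the encoder in this commit:

import torch

from textformer.models.layers import MultiHeadAttention

# Four heads of size 32 over 128 hidden units
attention = MultiHeadAttention(n_hidden=128, n_heads=4, dropout=0.1)

# Random batch of 2 sequences with 10 tokens, each represented by 128 hidden units
x = torch.rand(2, 10, 128)

# Self-attention: queries, keys and values all come from the same tensor
output, weights = attention(x, x, x)

print(output.shape)   # torch.Size([2, 10, 128])
print(weights.shape)  # torch.Size([2, 4, 10, 10])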
3 changes: 3 additions & 0 deletions textformer/utils/constants.py
@@ -0,0 +1,3 @@
# The epsilon constant defines a small value used to avoid
# unwanted mathematical errors, such as division by zero or log(0)
EPSILON = 1e-10
