This repository has been archived by the owner on Apr 22, 2022. It is now read-only.

Continuing to work on Transformer architecture.
gugarosa committed Jul 3, 2020
1 parent ef38f32 commit 41f7aac
Showing 4 changed files with 238 additions and 80 deletions.
80 changes: 0 additions & 80 deletions textformer/models/encoders/multi_head.py

This file was deleted.

131 changes: 131 additions & 0 deletions textformer/models/encoders/self_attention.py
@@ -0,0 +1,131 @@
import math

import textformer.utils.logging as l
import torch
from textformer.core import Encoder
from textformer.models.layers import MultiHeadAttention, PositionWideForward
from torch import nn

logger = l.get_logger(__name__)


class SelfAttentionLayer(nn.Module):
"""A SelfAttentionLayer is used to supply the self-attention layer to the encoding part of the Transformer architecture.
"""

def __init__(self, n_hidden=128, n_forward=256, n_heads=3, dropout=0.1):
"""Initialization method.
Args:
n_hidden (int): Number of hidden units.
n_forward (int): Number of feed forward units.
n_heads (int): Number of attention heads.
dropout (float): Dropout probability.
"""

# Overriding its parent class
super(SelfAttentionLayer, self).__init__()

# Layer normalization applied after the self-attention sub-layer
self.self_attn_layer_norm = nn.LayerNorm(n_hidden)

# Layer normalization applied after the feed-forward sub-layer
self.ff_layer_norm = nn.LayerNorm(n_hidden)

# Multi-head self-attention mechanism
self.self_attention = MultiHeadAttention(n_hidden, n_heads, dropout)

# Position-wise feed-forward network
self.positionwise_feedforward = PositionWideForward(
n_hidden, n_forward, dropout)

# Dropout layer
self.drop = nn.Dropout(dropout)

def forward(self, src, src_mask):
"""
"""

# Performs the self-attention mechanism
_src, _ = self.self_attention(src, src, src, src_mask)

# Performs the dropout with residual connection and layer normalization
src = self.self_attn_layer_norm(src + self.drop(_src))

# Performs the position-wise forwarding
_src = self.positionwise_feedforward(src)

# Performs the dropout with residual connection and layer normalization
src = self.ff_layer_norm(src + self.drop(_src))

return src


class SelfAttentionEncoder(Encoder):
"""A SelfAttentionEncoder is used to supply the encoding part of the Transformer architecture.
"""

def __init__(self, n_input=128, n_hidden=128, n_forward=256, n_layers=1,
n_heads=3, dropout=0.1, max_length=100):
"""Initializion method.
Args:
n_input (int): Number of input units.
n_hidden (int): Number of hidden units.
n_forward (int): Number of feed forward units.
n_layers (int): Number of attention layers.
n_heads (int): Number of attention heads.
dropout (float): Amount of dropout to be applied.
max_length (int): Maximum length of positional embeddings.
"""

logger.info('Overriding class: Encoder -> SelfAttentionEncoder.')

# Overriding its parent class
super(SelfAttentionEncoder, self).__init__()

# Number of input units
self.n_input = n_input

# Number of hidden units
self.n_hidden = n_hidden

# Number of feed forward units
self.n_forward = n_forward

# Number of attention layers
self.n_layers = n_layers

# Number of attention heads
self.n_heads = n_heads

# Maximum length of positional embeddings
self.max_length = max_length

# Scale for the residual learning
self.scale = math.sqrt(n_hidden)

# Embedding layers
self.embedding = nn.Embedding(n_input, n_hidden)
self.pos_embedding = nn.Embedding(max_length, n_hidden)

# Encoding layers
self.encoders = nn.ModuleList([SelfAttentionLayer(n_hidden, n_forward, n_heads, dropout) for _ in range(n_layers)])

# Dropout layer
self.dropout = nn.Dropout(dropout)

def forward(self, x, x_mask):
"""Performs a forward pass over the architecture.
Args:
x (torch.Tensor): Tensor containing the data.
x_mask (torch.Tensor): Tensor containing the masked data.
Returns:
The output values.
"""

pass
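The forward pass above is left as a stub, consistent with the commit message about the Transformer work still being in progress. For reference, here is a minimal sketch of how this kind of encoder forward pass is commonly written, using only the attributes defined in __init__ above; the position-index construction via torch.arange is an assumption of this sketch, not something the commit defines:

def forward(self, x, x_mask):
    """Performs a forward pass over the architecture (sketch, not part of this commit)."""

    # Gathers the batch size and sequence length
    batch_size, seq_len = x.shape[0], x.shape[1]

    # Builds positional indices of shape (batch_size, seq_len) -- assumed helper, not in the commit
    pos = torch.arange(0, seq_len, device=x.device).unsqueeze(0).repeat(batch_size, 1)

    # Sums the scaled token embeddings with the positional embeddings and applies dropout
    x = self.dropout(self.embedding(x) * self.scale + self.pos_embedding(pos))

    # Passes the representation through every self-attention encoding layer
    for layer in self.encoders:
        x = layer(x, x_mask)

    return x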
104 changes: 104 additions & 0 deletions textformer/models/layers/multi_head_attention.py
@@ -0,0 +1,104 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

import textformer.utils.constants as c


class MultiHeadAttention(nn.Module):
"""A MultiHeadAttention class is used to provide multi-head attention-based mechanisms in a neural network layer.
References:
A. Vaswani, et al. Attention is all you need. Advances in neural information processing systems (2017).
"""

def __init__(self, n_hidden, n_heads, dropout):
"""Initialization method.
Args:
n_hidden (int): Number of hidden units.
n_heads (int): Number of attention heads.
dropout (float): Dropout probability.
"""

# Overriding its parent class
super(MultiHeadAttention, self).__init__()

# Asserts that the number of hidden units is divisible by the number of heads
assert n_hidden % n_heads == 0

# Number of hidden units
self.n_hidden = n_hidden

# Number of attention heads
self.n_heads = n_heads

# Size of attention head
self.head_size = n_hidden // n_heads

# Linear projections (query, key and value)
self.q = nn.Linear(n_hidden, n_hidden)
self.k = nn.Linear(n_hidden, n_hidden)
self.v = nn.Linear(n_hidden, n_hidden)

# Output projection
self.out = nn.Linear(n_hidden, n_hidden)

# Dropout layer
self.drop = nn.Dropout(dropout)

# Scale for the residual connections
self.scale = math.sqrt(self.head_size)

def forward(self, query, key, value, mask=None):
"""Performs a forward pass over the layer.
Args:
query (torch.Tensor): Tensor containing the queries.
key (torch.Tensor): Tensor containing the keys.
value (torch.Tensor): Tensor containing the values.
mask (torch.Tensor): Optional tensor containing the mask.
Returns:
The attention-based outputs and the attention weights.
"""

# Gathers the batch size
batch_size = query.shape[0]

# Performs the linear projections to calculate Q, K and V
Q = self.q(query)
K = self.k(key)
V = self.v(value)

# Reshapes Q, K and V
Q = Q.view(batch_size, -1, self.n_heads, self.head_size).permute(0, 2, 1, 3)
K = K.view(batch_size, -1, self.n_heads, self.head_size).permute(0, 2, 1, 3)
V = V.view(batch_size, -1, self.n_heads, self.head_size).permute(0, 2, 1, 3)

# Calculates the energy
energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale

# Checks if a mask is supplied
if mask is not None:
# Fills the energy with a very large negative value where the mask equals zero,
# so those positions receive (almost) no weight after the softmax
energy = energy.masked_fill(mask == 0, -1e10)

# Calculates the attention
attention = torch.softmax(energy, dim=-1)

# Performs the energy-value projection
x = (torch.matmul(self.drop(attention), V)).permute(0, 2, 1, 3)

# Reshapes back to hidden units (the permuted tensor must be made contiguous before viewing)
x = x.contiguous().view(batch_size, -1, self.n_hidden)

# Passes down through output layer
x = self.out(x)

return x, attention
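For reference, a hedged usage sketch of the layer above; the shapes and hyperparameter values are illustrative assumptions (the assert in __init__ requires n_hidden to be divisible by n_heads), and the import path follows the one already used by the encoder in this commit:

import torch

from textformer.models.layers import MultiHeadAttention

# Four heads of size 32 over 128 hidden units
attention = MultiHeadAttention(n_hidden=128, n_heads=4, dropout=0.1)

# Random batch of 2 sequences with 10 tokens, each represented by 128 hidden units
x = torch.rand(2, 10, 128)

# Self-attention: queries, keys and values all come from the same tensor
output, weights = attention(x, x, x)

print(output.shape)   # torch.Size([2, 10, 128])
print(weights.shape)  # torch.Size([2, 4, 10, 10])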
3 changes: 3 additions & 0 deletions textformer/utils/constants.py
@@ -0,0 +1,3 @@
# The epsilon constant defines a small value used to avoid
# unwanted mathematical errors, such as division by zero or log(0)
EPSILON = 1e-10
