In [1]:
import random
random.seed(10)

In [2]:
import re
import math
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from random import *

In [3]:
data = open("data.txt", "r").read()

In [4]:
type(data)

str

In [5]:
print(data)

'Olá, como vai? Eu sou a Camila.\n'
'Olá, Camila, meu nome é Fernando. Muito prazer.\n'
'Prazer em conhecer você também. Como você está hoje?\n'
'Ótimo. Meu time de futebol venceu a competição.\n'
'Uau, Parabéns Fernando!\n'
'Obrigado Camila.\n'
'Vamos comer uma pizza mais tarde para celebrar?\n'
'Claro. Você recomenda algum restaurante Camila?\n'
'Sim, abriu um restaurante novo e dizem que a pizza de banana é fenomenal.\n'
'Ok. Nos encontramos no restaurante às sete da noite, pode ser?\n'
'Pode sim. Nos vemos mais tarde então.'


## vocab construction

In [6]:
# re.sub(...) function replaces all occurrences of the characters within the character class with an empty string
sentences = re.sub("[.,!?\\-']", '', data.lower().replace("\n", " ")).split('\\n')
sentences = [s.strip() for s in sentences]

print(sentences)

['olá como vai eu sou a camila', 'olá camila meu nome é fernando muito prazer', 'prazer em conhecer você também como você está hoje', 'ótimo meu time de futebol venceu a competição', 'uau parabéns fernando', 'obrigado camila', 'vamos comer uma pizza mais tarde para celebrar', 'claro você recomenda algum restaurante camila', 'sim abriu um restaurante novo e dizem que a pizza de banana é fenomenal', 'ok nos encontramos no restaurante às sete da noite pode ser', 'pode sim nos vemos mais tarde então']


In [7]:
words_lst = list( set( " ".join(sentences).split() ) )

print(words_lst)

['conhecer', 'algum', 'e', 'abriu', 'você', 'um', 'prazer', 'que', 'sou', 'dizem', 'pizza', 'time', 'da', 'fenomenal', 'vai', 'parabéns', 'pode', 'vamos', 'nos', 'sim', 'venceu', 'às', 'tarde', 'vemos', 'camila', 'para', 'hoje', 'então', 'mais', 'a', 'ótimo', 'competição', 'eu', 'em', 'ok', 'é', 'encontramos', 'obrigado', 'como', 'celebrar', 'também', 'de', 'banana', 'recomenda', 'uau', 'novo', 'noite', 'muito', 'futebol', 'no', 'sete', 'olá', 'restaurante', 'claro', 'está', 'ser', 'uma', 'meu', 'comer', 'fernando', 'nome']


In [8]:
# BERT special tokens
words_dict = {
    "[PAD]": 0, 
    "[CLS]": 1, 
    "[SEP]": 2, 
    "[MASK]": 3
    }

In [9]:
for i, w in enumerate(words_lst):
    words_dict[w] = i + 4

In [10]:
print(words_dict)

{'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3, 'conhecer': 4, 'algum': 5, 'e': 6, 'abriu': 7, 'você': 8, 'um': 9, 'prazer': 10, 'que': 11, 'sou': 12, 'dizem': 13, 'pizza': 14, 'time': 15, 'da': 16, 'fenomenal': 17, 'vai': 18, 'parabéns': 19, 'pode': 20, 'vamos': 21, 'nos': 22, 'sim': 23, 'venceu': 24, 'às': 25, 'tarde': 26, 'vemos': 27, 'camila': 28, 'para': 29, 'hoje': 30, 'então': 31, 'mais': 32, 'a': 33, 'ótimo': 34, 'competição': 35, 'eu': 36, 'em': 37, 'ok': 38, 'é': 39, 'encontramos': 40, 'obrigado': 41, 'como': 42, 'celebrar': 43, 'também': 44, 'de': 45, 'banana': 46, 'recomenda': 47, 'uau': 48, 'novo': 49, 'noite': 50, 'muito': 51, 'futebol': 52, 'no': 53, 'sete': 54, 'olá': 55, 'restaurante': 56, 'claro': 57, 'está': 58, 'ser': 59, 'uma': 60, 'meu': 61, 'comer': 62, 'fernando': 63, 'nome': 64}


In [11]:
nums_dict = { i: k for k, i in words_dict.items()}

print(nums_dict)

{0: '[PAD]', 1: '[CLS]', 2: '[SEP]', 3: '[MASK]', 4: 'conhecer', 5: 'algum', 6: 'e', 7: 'abriu', 8: 'você', 9: 'um', 10: 'prazer', 11: 'que', 12: 'sou', 13: 'dizem', 14: 'pizza', 15: 'time', 16: 'da', 17: 'fenomenal', 18: 'vai', 19: 'parabéns', 20: 'pode', 21: 'vamos', 22: 'nos', 23: 'sim', 24: 'venceu', 25: 'às', 26: 'tarde', 27: 'vemos', 28: 'camila', 29: 'para', 30: 'hoje', 31: 'então', 32: 'mais', 33: 'a', 34: 'ótimo', 35: 'competição', 36: 'eu', 37: 'em', 38: 'ok', 39: 'é', 40: 'encontramos', 41: 'obrigado', 42: 'como', 43: 'celebrar', 44: 'também', 45: 'de', 46: 'banana', 47: 'recomenda', 48: 'uau', 49: 'novo', 50: 'noite', 51: 'muito', 52: 'futebol', 53: 'no', 54: 'sete', 55: 'olá', 56: 'restaurante', 57: 'claro', 58: 'está', 59: 'ser', 60: 'uma', 61: 'meu', 62: 'comer', 63: 'fernando', 64: 'nome'}


In [12]:
VOCAB_SIZE = len(words_dict)
VOCAB_SIZE

65

In [13]:
sentences[0].split()

['olá', 'como', 'vai', 'eu', 'sou', 'a', 'camila']

In [14]:
tokens_lst = []

for sentence in sentences:
    tokens = [words_dict[s] for s in sentence.split()]
    tokens_lst.append(tokens)

In [15]:
tokens_lst

[[55, 42, 18, 36, 12, 33, 28],
 [55, 28, 61, 64, 39, 63, 51, 10],
 [10, 37, 4, 8, 44, 42, 8, 58, 30],
 [34, 61, 15, 45, 52, 24, 33, 35],
 [48, 19, 63],
 [41, 28],
 [21, 62, 60, 14, 32, 26, 29, 43],
 [57, 8, 47, 5, 56, 28],
 [23, 7, 9, 56, 49, 6, 13, 11, 33, 14, 45, 46, 39, 17],
 [38, 22, 40, 53, 56, 25, 54, 16, 50, 20, 59],
 [20, 23, 22, 27, 32, 26, 31]]

## hyperparams definition

In [16]:
BATCH_SIZE = 6
N_SEGMENTS = 2
DROPOUT    = 0.2

MAX_LEN     = 100
MAX_PRED    = 7 # max num of tokens that will be predicted 

N_LAYERS    = 6
N_HEADS     = 12

D_MODEL     = 768 # embedding dim
D_FF        = D_MODEL * 4 # feedforward dim
D_K = D_V   = 64

N_EPOCHS    = 10

In [17]:
sentences

['olá como vai eu sou a camila',
 'olá camila meu nome é fernando muito prazer',
 'prazer em conhecer você também como você está hoje',
 'ótimo meu time de futebol venceu a competição',
 'uau parabéns fernando',
 'obrigado camila',
 'vamos comer uma pizza mais tarde para celebrar',
 'claro você recomenda algum restaurante camila',
 'sim abriu um restaurante novo e dizem que a pizza de banana é fenomenal',
 'ok nos encontramos no restaurante às sete da noite pode ser',
 'pode sim nos vemos mais tarde então']

In [18]:
a, b = randrange(len(sentences)), randrange(len(sentences))
a, b

(9, 0)

In [19]:
aa, bb = tokens_lst[a], tokens_lst[b]
aa, bb

([38, 22, 40, 53, 56, 25, 54, 16, 50, 20, 59], [55, 42, 18, 36, 12, 33, 28])

In [20]:
input_ids = [words_dict["[CLS]"]] + aa + [words_dict["[SEP]"]] + bb + [words_dict["[SEP]"]]
input_ids

[1,
 38,
 22,
 40,
 53,
 56,
 25,
 54,
 16,
 50,
 20,
 59,
 2,
 55,
 42,
 18,
 36,
 12,
 33,
 28,
 2]

In [21]:
segment_ids = [0] * (1 + len(aa) + 1) + [1] * (len(bb) + 1)
segment_ids

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]

In [22]:
min(MAX_PRED, max(1, int(round(len(input_ids) * 0.15)))) 

3

In [23]:
mask_position_candidates = [i for i, token in enumerate(input_ids) if token != words_dict["[CLS]"] and token != words_dict["[SEP]"]]
mask_position_candidates

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19]

In [24]:
def create_batches(batch_size, sentences, tokenized_sentences, word_to_num_dict, num_to_word_dict, max_predictable_tokens, vocab_size, max_len):

    batch = []
    
    positive = negative = 0

    while (positive != batch_size/2) or (negative != batch_size/2):

        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences))
        tokens_a, tokens_b = tokenized_sentences[tokens_a_index], tokenized_sentences[tokens_b_index]

        input_ids = [word_to_num_dict["[CLS]"]] + tokens_a + [word_to_num_dict["[SEP]"]] + tokens_b + [word_to_num_dict["[SEP]"]]
        segment_ids = segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

        n_pred =  min(max_predictable_tokens, max(1, int(round(len(input_ids) * 0.15)))) 

        mask_position_candidates = [i for i, token in enumerate(input_ids) if token != word_to_num_dict["[CLS]"] and token != word_to_num_dict["[SEP]"]]
        shuffle(mask_position_candidates)

        masked_tokens, masked_positions = [], []
        for pos in mask_position_candidates[:n_pred]:
            masked_tokens.append(input_ids[pos])
            masked_positions.append(pos)
            
            if random() < 0.8:
                input_ids[pos] = word_to_num_dict["[MASK]"]
            elif random() < 0.5:
                index = randint(0, vocab_size - 1)
                input_ids[pos] = word_to_num_dict[num_to_word_dict[index]]

        padding = max_len - len(input_ids)
        input_ids.extend([0] * padding)
        segment_ids.extend([0] * padding)

        if max_predictable_tokens > n_pred:
            padding = max_predictable_tokens - n_pred
            masked_tokens.extend([0] * padding)
            masked_positions.extend([0] * padding) 

        if (tokens_a_index + 1 == tokens_b_index) and (positive < batch_size / 2):
            batch.append([input_ids, segment_ids, masked_tokens, masked_positions, True])
            positive += 1

        elif (tokens_a_index + 1 != tokens_b_index) and (negative < batch_size / 2):
            batch.append([input_ids, segment_ids, masked_tokens, masked_positions, False])
            negative += 1

    return batch

In [25]:
# BATCH_SIZE = 6
# N_SEGMENTS = 2
# DROPOUT    = 0.2

# MAX_LEN     = 100
# MAX_PRED    = 7 # max num of tokens that will be predicted 

# N_LAYERS    = 6
# N_HEADS     = 12

# D_MODEL     = 768 # embedding dim
# D_FF        = D_MODEL * 4 # feedforward dim
# D_K = D_V   = 64

# N_EPOCHS    = 10

In [26]:
# from collections import OrderedDict

# d1 = OrderedDict(sorted(nums_dict.items()))
# print(d1)

In [27]:
batch = create_batches(batch_size=BATCH_SIZE, 
                       sentences=sentences, 
                       tokenized_sentences=tokens_lst, 
                       word_to_num_dict=words_dict, 
                       num_to_word_dict=nums_dict, 
                       max_predictable_tokens=MAX_PRED, 
                       vocab_size=VOCAB_SIZE, 
                       max_len=MAX_LEN)

In [28]:
input_ids, segment_ids, masked_tokens, masked_positions, is_next = map(torch.LongTensor, zip(*batch))

In [29]:
input_ids[0]

tensor([ 1, 21, 62, 60, 14, 32,  3,  3, 43,  2, 57,  8, 47,  5, 56, 28,  2,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [30]:
segment_ids[0]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0])

In [31]:
masked_tokens[0]

tensor([62, 26, 29,  0,  0,  0,  0])

In [32]:
masked_positions[0]

tensor([2, 6, 7, 0, 0, 0, 0])

In [33]:
is_next[0]

tensor(1)

In [34]:
def get_attention_pad_masked(query_sequence, key_sequence):
    
    batch_size, len_q = query_sequence.size()
    batch_size, len_k = key_sequence.size()
    
    pad_attn_masked = key_sequence.data.eq(0).unsqueeze(1)
    
    return pad_attn_masked.expand(batch_size, len_q, len_k)

In [35]:
input_ids[0]

tensor([ 1, 21, 62, 60, 14, 32,  3,  3, 43,  2, 57,  8, 47,  5, 56, 28,  2,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [36]:
get_attention_pad_masked(input_ids, input_ids)[0][0]

tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True])

In [37]:
def gelu(x):
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

In [38]:
class Embedding(nn.Module):
    def __init__(self, vocab_size, d_model, max_len, n_segments):
        super().__init__()

        self.tok_embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = nn.Embedding(max_len, d_model)
        self.seg_embed = nn.Embedding(n_segments, d_model)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, seg):
        # print("...forward step")
        seq_len = x.size(1)

        pos = torch.arange(seq_len, dtype=torch.long)
        pos = pos.unsqueeze(0).expand_as(x)

        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)

        return self.norm(embedding)

In [39]:
vocab_size = 10000
d_model = 512
max_len = 100
n_segments = 2

embedding_layer = Embedding(vocab_size, d_model, max_len, n_segments)

# Input example
batch_size = 32
seq_len = 20
x = torch.randint(0, vocab_size, (batch_size, seq_len))
seg = torch.randint(0, n_segments, (batch_size, seq_len))

output = embedding_layer(x, seg)
print(output.shape)  # Output: [batch_size, seq_len, d_model]


torch.Size([32, 20, 512])


In [40]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, Q, K, V, attn_mask):
        d_k = Q.size(-1)  # Dimension of the last axis of K
        
        # Compute scaled attention scores
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k)
        
        # Apply the attention mask
        scores.masked_fill(attn_mask, -1e9)
        
        # Compute attention probabilities
        attn = nn.Softmax(dim=-1)(scores)
        
        # Compute context using attention probabilities
        context = torch.matmul(attn, V)
        
        return context, attn

# Example dimensions
batch_size = 2
seq_len = 4
d_k = d_v = 8

# Input tensors
Q = torch.rand(batch_size, seq_len, d_k)  # Query
K = torch.rand(batch_size, seq_len, d_k)  # Key
V = torch.rand(batch_size, seq_len, d_v)  # Value

# Attention mask
attn_mask = torch.zeros(batch_size, seq_len, seq_len).bool()

# Initialize and run the model
attention_layer = ScaledDotProductAttention()
context, attn = attention_layer(Q, K, V, attn_mask)

print("Context shape:", context.shape)  # Expected: [batch_size, seq_len, d_v]
print("Attention shape:", attn.shape)  # Expected: [batch_size, seq_len, seq_len]


Context shape: torch.Size([2, 4, 8])
Attention shape: torch.Size([2, 4, 4])


In [41]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads, d_k, d_v):
        super(MultiHeadAttention, self).__init__()
        self.n_heads = n_heads
        self.d_k = d_k
        self.d_v = d_v

        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
        
        self.fc = nn.Linear(n_heads * d_v, d_model)
        self.layer_norm = nn.LayerNorm(d_model)
        self.attention = ScaledDotProductAttention()

    def forward(self, Q, K, V, attn_mask):
        residual = Q
        batch_size = Q.size(0)

        # Linear projections and reshaping
        q_s = self.W_Q(Q).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        k_s = self.W_K(K).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        v_s = self.W_V(V).view(batch_size, -1, self.n_heads, self.d_v).transpose(1, 2)

        # Masking
        attn_mask = attn_mask.unsqueeze(1).repeat(1, self.n_heads, 1, 1)

        # Scaled dot-product attention
        context, attn = self.attention(q_s, k_s, v_s, attn_mask)

        # Combine heads and reshape
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.n_heads * self.d_v)

        # Final linear projection
        output = self.fc(context)

        # Add residual connection and layer normalization
        return self.layer_norm(output + residual), attn

In [42]:
# Example dimensions
batch_size = 2
seq_len = 4
d_model = 16  # Input feature size
n_heads = 4   # Number of attention heads
d_k = d_v = 4 # Size of each attention head

# Input tensors
Q = torch.rand(batch_size, seq_len, d_model)
K = torch.rand(batch_size, seq_len, d_model)
V = torch.rand(batch_size, seq_len, d_model)
attn_mask = torch.zeros(batch_size, seq_len, seq_len).bool()  # No masking

# Instantiate and apply MultiHeadAttention
multihead_attention = MultiHeadAttention(d_model, n_heads, d_k, d_v)
output, attn_weights = multihead_attention(Q, K, V, attn_mask)

print("Output shape:", output.shape)  # Expected: [batch_size, seq_len, d_model]
print("Attention weights shape:", attn_weights.shape)  # Expected: [batch_size, n_heads, seq_len, seq_len]

Output shape: torch.Size([2, 4, 16])
Attention weights shape: torch.Size([2, 4, 4, 4])


In [43]:
VOCAB_SIZE, D_MODEL, MAX_LEN, N_SEGMENTS

(65, 768, 100, 2)

In [44]:
embedding = Embedding(VOCAB_SIZE, D_MODEL, MAX_LEN, N_SEGMENTS)

embedded  = embedding(input_ids, segment_ids)

In [45]:
attention_mask = get_attention_pad_masked(input_ids, input_ids)

In [46]:
attention_mask[0][0]

tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True])

In [47]:
multihead_attention = MultiHeadAttention(D_MODEL, N_HEADS, D_K, D_V)(embedded, embedded, embedded, attention_mask)

In [48]:
multihead_attention[0][0]

tensor([[ 0.8016, -0.7674, -0.6850,  ..., -0.3848, -0.7357, -0.9320],
        [ 0.5090,  0.3723,  0.2620,  ...,  1.2269, -0.2860, -0.7234],
        [ 0.9513, -0.3783, -0.3725,  ...,  0.1868, -0.0491, -0.2502],
        ...,
        [-0.8316,  0.4293, -0.7776,  ..., -0.5132,  1.6908, -0.1155],
        [-1.1358,  0.5712, -0.8526,  ..., -0.2703,  1.4036, -0.4805],
        [-0.9678,  0.1488, -0.5751,  ..., -0.4267,  1.2869, -0.2497]],
       grad_fn=<SelectBackward0>)

In [49]:
output, A = multihead_attention

In [50]:
# Define the GELU activation function
def gelu(x):
    return x * 0.5 * (1.0 + torch.erf(x / np.sqrt(2.0)))


# Define the Positional Feed Forward Network
class PoswiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForward, self).__init__()

        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

        # self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, x):
        # Apply first linear transformation, GELU activation, second linear transformation
        residual = x
        x = self.fc2(gelu(self.fc1(x)))
        # Add residual connection and apply layer normalization
        return x # self.layer_norm(x + residual)


# Define the Encoder Layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_k, d_v, d_ff):
        super(EncoderLayer, self).__init__()
        
        # Multi-head attention layer
        self.enc_self_attn = MultiHeadAttention(d_model, n_heads, d_k, d_v)
        # Positional feed-forward network
        self.pos_ffn = PoswiseFeedForward(d_model, d_ff)

    def forward(self, enc_inputs, enc_self_attn_mask):
        # Apply self-attention
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)
        # Apply positional feed-forward network
        enc_outputs = self.pos_ffn(enc_outputs)
        return enc_outputs, attn

In [51]:
# Example dimensions
batch_size = 2
seq_len = 4
d_model = 16  # Dimension of input/output embeddings
d_ff = 64     # Dimension of feed-forward layer
n_heads = 4   # Number of attention heads
d_k = d_v = 4 # Dimension of each attention head

# Input tensors
enc_inputs = torch.rand(batch_size, seq_len, d_model)  # Encoder inputs
enc_self_attn_mask = torch.zeros(batch_size, seq_len, seq_len).bool()  # No masking

# Instantiate EncoderLayer
encoder_layer = EncoderLayer(d_model, n_heads, d_k, d_v, d_ff)

# Forward pass through the encoder layer
enc_outputs, attn_weights = encoder_layer(enc_inputs, enc_self_attn_mask)

print("Encoder Outputs Shape:", enc_outputs.shape)  # Expected: [batch_size, seq_len, d_model]
print("Attention Weights Shape:", attn_weights.shape)  # Expected: [batch_size, n_heads, seq_len, seq_len]

Encoder Outputs Shape: torch.Size([2, 4, 16])
Attention Weights Shape: torch.Size([2, 4, 4, 4])


In [52]:
class BERT(nn.Module):
    def __init__(self, vocab_size, max_len, d_model, n_heads, d_k, d_v, d_ff, n_layers, n_segments):
        super(BERT, self).__init__()

        # Embedding layer
        self.embedding = Embedding(vocab_size, max_len, d_model, n_segments)

        # Encoder layers
        self.layers = nn.ModuleList([
            EncoderLayer(d_model, n_heads, d_k, d_v, d_ff) for _ in range(n_layers)
        ])

        # Pooling and classification layers
        self.fc = nn.Linear(d_model, d_model)
        self.activ1 = nn.Tanh()

        self.linear = nn.Linear(d_model, d_model)
        self.activ2 = gelu
        self.norm = nn.LayerNorm(d_model)

        self.classifier = nn.Linear(d_model, 2)

        # Decoder for masked language modeling (MLM)
        embed_weight = self.embedding.tok_embed.weight
        n_vocab, n_dim = embed_weight.size()
        
        self.decoder = nn.Linear(n_dim, n_vocab, bias=False)
        self.decoder.weight = embed_weight
        self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))

    def forward(self, input_ids, segment_ids, masked_pos):
        # Embedding output
        output = self.embedding(input_ids, segment_ids)

        # Attention mask
        enc_self_attn_mask = get_attention_pad_masked(input_ids, input_ids)

        # Pass through encoder layers
        for layer in self.layers:
            output, enc_self_attn = layer(output, enc_self_attn_mask)

        # Pooling for classification
        h_pooled = self.activ1(self.fc(output[:, 0]))  # CLS token embedding
        logits_clsf = self.classifier(h_pooled)

        # Gather masked positions for MLM
        masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1))
        h_masked = torch.gather(output, 1, masked_pos)
        h_masked = self.norm(self.activ2(self.linear(h_masked)))

        # MLM logits
        logits_lm = self.decoder(h_masked) + self.decoder_bias

        return logits_lm, logits_clsf

In [53]:
VOCAB_SIZE, MAX_LEN, D_MODEL, N_HEADS, D_K, D_K, D_FF, N_LAYERS, N_SEGMENTS

(65, 100, 768, 12, 64, 64, 3072, 6, 2)

In [54]:
# Instantiate the model
bert_model = BERT(VOCAB_SIZE, MAX_LEN, D_MODEL, N_HEADS, D_K, D_K, D_FF, N_LAYERS, N_SEGMENTS)

In [55]:
bert_model

BERT(
  (embedding): Embedding(
    (tok_embed): Embedding(65, 100)
    (pos_embed): Embedding(768, 100)
    (seg_embed): Embedding(2, 100)
    (norm): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
  )
  (layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (enc_self_attn): MultiHeadAttention(
        (W_Q): Linear(in_features=768, out_features=768, bias=True)
        (W_K): Linear(in_features=768, out_features=768, bias=True)
        (W_V): Linear(in_features=768, out_features=768, bias=True)
        (fc): Linear(in_features=768, out_features=768, bias=True)
        (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attention): ScaledDotProductAttention()
      )
      (pos_ffn): PoswiseFeedForward(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
      )
    )
  )
  (fc): Linear(in_features=768, out_features=768, bias=True)
  (activ1): Tanh()
  (linear): Linea

In [56]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(bert_model.parameters(), lr = 0.001)

In [57]:
batch = create_batches(batch_size=BATCH_SIZE, 
                       sentences=sentences, 
                       tokenized_sentences=tokens_lst, 
                       word_to_num_dict=words_dict, 
                       num_to_word_dict=nums_dict, 
                       max_predictable_tokens=MAX_PRED, 
                       vocab_size=VOCAB_SIZE, 
                       max_len=MAX_LEN)

In [58]:
input_ids, segment_ids, masked_tokens, masked_positions, is_next = map(torch.LongTensor, zip(*batch))

In [59]:
print(f"Input IDs Shape: {input_ids.shape}", BATCH_SIZE, MAX_LEN)
print(f"Masked Positions Shape: {masked_positions.shape}")
print(f"Segment IDs Shape: {segment_ids.shape}", BATCH_SIZE, MAX_LEN)

Input IDs Shape: torch.Size([6, 100]) 6 100
Masked Positions Shape: torch.Size([6, 7])
Segment IDs Shape: torch.Size([6, 100]) 6 100


In [60]:
[BATCH_SIZE, MAX_LEN, D_MODEL], [BATCH_SIZE, N_HEADS, MAX_LEN, MAX_LEN]

([6, 100, 768], [6, 12, 100, 100])

In [61]:
# Input tensors
Q = torch.rand(BATCH_SIZE, MAX_LEN, D_MODEL)
K = torch.rand(BATCH_SIZE, MAX_LEN, D_MODEL)
V = torch.rand(BATCH_SIZE, MAX_LEN, D_MODEL)
attn_mask = torch.zeros(BATCH_SIZE, MAX_LEN, MAX_LEN).bool()  # No masking

# Instantiate and apply MultiHeadAttention
multihead_attention = MultiHeadAttention(D_MODEL, N_HEADS, D_K, D_V)
output, attn_weights = multihead_attention(Q, K, V, attn_mask)

print("Output shape:", output.shape)  # Expected: [BATCH_SIZE, MAX_LEN, D_MODEL]
print("Attention weights shape:", attn_weights.shape)  # Expected: [BATCH_SIZE, N_HEADS, MAX_LEN, MAX_LEN]

Output shape: torch.Size([6, 100, 768])
Attention weights shape: torch.Size([6, 12, 100, 100])


In [None]:

# Forward pass
logits_lm, logits_clsf = bert_model(input_ids, segment_ids, masked_positions)

print("Logits MLM Shape:", logits_lm.shape)  # Expected: [batch_size, num_masked_positions, vocab_size]
print("Logits Classification Shape:", logits_clsf.shape)  # Expected: [batch_size, 2]