In [1]:
import math
import re
from random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
  # BERT Parameters
maxlen = 30 # maximum of length
batch_size = 6
max_pred = 5  # max tokens of prediction
n_layers = 6 # number of Encoder of Encoder Layer
n_heads = 12 # number of heads in Multi-Head Attention
d_model = 768 # Embedding Size
d_ff = 768 * 4  # 4*d_model, FeedForward dimension
d_k = d_v = 64  # dimension of K(=Q), V
n_segments = 2

In [3]:
# sample IsNext and NotNext to be same in small batch size
def make_batch(batch_size,sentences,token_list,word_dict,max_pred,maxlen,vocab_size,number_dict):
    batch = []
    positive = negative = 0
    while positive != batch_size/2 or negative != batch_size/2:
        tokens_a_index, tokens_b_index= randrange(len(sentences)), randrange(len(sentences)) # sample random index in sentences
        tokens_a, tokens_b= token_list[tokens_a_index], token_list[tokens_b_index]
        input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

        # MASK LM
        n_pred =  min(max_pred, max(1, int(round(len(input_ids) * 0.15)))) # 15 % of tokens in one sentence
        cand_maked_pos = [i for i, token in enumerate(input_ids)
                          if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]
        shuffle(cand_maked_pos)
        masked_tokens, masked_pos = [], []
        for pos in cand_maked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            if random() < 0.8:  # 80%
                input_ids[pos] = word_dict['[MASK]'] # make mask
            elif random() < 0.5:  # 10%
                index = randint(0, vocab_size - 1) # random index in vocabulary
                input_ids[pos] = word_dict[number_dict[index]] # replace

        # Zero Paddings
        n_pad = maxlen - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

        # Zero Padding (100% - 15%) tokens
        if max_pred > n_pred:
            n_pad = max_pred - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)

        if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext
            negative += 1
    return batch
# Proprecessing Finished

In [4]:
def get_attn_pad_mask(seq_q, seq_k):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # eq(zero) is PAD token
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k


In [5]:
def gelu(x):
    "Implementation of the gelu activation function by Hugging Face"
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


In [6]:
class Embedding(nn.Module):
    def __init__(self,vocab_size,d_model,maxlen,n_segments):
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)  # token embedding
        self.pos_embed = nn.Embedding(maxlen, d_model)  # position embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)  # segment(token type) embedding
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, seg):
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long)
        pos = pos.unsqueeze(0).expand_as(x)  # (seq_len,) -> (batch_size, seq_len)
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        return self.norm(embedding)

In [7]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask,d_k):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V)
        return context, attn

In [8]:
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self,d_model,d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        # (batch_size, len_seq, d_model) -> (batch_size, len_seq, d_ff) -> (batch_size, len_seq, d_model)
        return self.fc2(gelu(self.fc1(x)))

In [9]:
class MultiHeadAttention(nn.Module):
    def __init__(self,d_model,d_k,d_v,n_heads):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
    def forward(self, Q, K, V, attn_mask,n_heads,d_v,d_model,d_k):
        # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]
        residual, batch_size = Q, Q.size(0)
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size x n_heads x len_q x d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size x n_heads x len_k x d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]

        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size x n_heads x len_q x len_k]

        # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask,d_k=d_k)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]
        output = nn.Linear(n_heads * d_v, d_model)(context)
        return nn.LayerNorm(d_model)(output + residual), attn # output: [batch_size x len_q x d_model]

In [10]:
class EncoderLayer(nn.Module):
    def __init__(self,d_model,d_k,d_v,n_heads,d_ff):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention(d_model=d_model,d_k=d_k,d_v=d_v,n_heads=n_heads)
        self.pos_ffn = PoswiseFeedForwardNet(d_model=d_model,d_ff=d_ff)

    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask,n_heads=n_heads,d_v=d_v,d_model=d_model,d_k=d_k) # enc_inputs to same Q,K,V
        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]
        return enc_outputs, attn

In [11]:
class BERT(nn.Module):
    def __init__(self,d_model,n_layers,vocab_size,d_k,d_v,n_heads,d_ff):
        super(BERT, self).__init__()
        self.embedding = Embedding(vocab_size=vocab_size,d_model=d_model,maxlen=maxlen,n_segments=n_segments)
        self.layers = nn.ModuleList([EncoderLayer(d_model=d_model,d_k=d_k,d_v=d_v,n_heads=n_heads,d_ff=d_ff) for _ in range(n_layers)])
        self.fc = nn.Linear(d_model, d_model)
        self.activ1 = nn.Tanh()
        self.linear = nn.Linear(d_model, d_model)
        self.activ2 = gelu
        self.norm = nn.LayerNorm(d_model)
        self.classifier = nn.Linear(d_model, 2)
        # decoder is shared with embedding layer
        embed_weight = self.embedding.tok_embed.weight
        n_vocab, n_dim = embed_weight.size()
        self.decoder = nn.Linear(n_dim, n_vocab, bias=False)
        self.decoder.weight = embed_weight
        self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))

    def forward(self, input_ids, segment_ids, masked_pos):
        output = self.embedding(input_ids, segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)
        for layer in self.layers:
            output, enc_self_attn = layer(output, enc_self_attn_mask)
        # output : [batch_size, len, d_model], attn : [batch_size, n_heads, d_mode, d_model]
        # it will be decided by first token(CLS)
        h_pooled = self.activ1(self.fc(output[:, 0])) # [batch_size, d_model]
        logits_clsf = self.classifier(h_pooled) # [batch_size, 2]

        masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1)) # [batch_size, max_pred, d_model]
        # get masked position from final output of transformer.
        h_masked = torch.gather(output, 1, masked_pos) # masking position [batch_size, max_pred, d_model]
        h_masked = self.norm(self.activ2(self.linear(h_masked)))
        logits_lm = self.decoder(h_masked) + self.decoder_bias # [batch_size, max_pred, n_vocab]

        return logits_lm, logits_clsf
text = (
    'Hello, how are you? I am Romeo.\n'
    'Hello, Romeo My name is Juliet. Nice to meet you.\n'
    'Nice meet you too. How are you today?\n'
    'Great. My baseball team won the competition.\n'
    'Oh Congratulations, Juliet\n'
    'Thanks you Romeo'
)
sentences = re.sub("[.,!?\\-]", '', text.lower()).split('\n')  # filter '.', ',', '?', '!'
word_list = list(set(" ".join(sentences).split()))
word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}
for i, w in enumerate(word_list):
    word_dict[w] = i + 4
number_dict = {i: w for i, w in enumerate(word_dict)}
vocab_size = len(word_dict)

token_list = list()
for sentence in sentences:
    arr = [word_dict[s] for s in sentence.split()]
    token_list.append(arr)

model = BERT(d_model=d_model,n_layers=n_layers,vocab_size=vocab_size,d_k = d_k,d_v=d_v,n_heads=n_heads,d_ff=d_ff)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

batch = make_batch(batch_size=batch_size,sentences=sentences,token_list=token_list,word_dict=word_dict,max_pred=max_pred,maxlen=maxlen,vocab_size=vocab_size,number_dict=number_dict)
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))


for epoch in range(100):
    pass
    # optimizer.zero_grad()
    # logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
    # loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens) # for masked LM
    # loss_lm = (loss_lm.float()).mean()
    # loss_clsf = criterion(logits_clsf, isNext) # for sentence classification
    # loss = loss_lm + loss_clsf
    # if (epoch + 1) % 10 == 0:
    #     print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
    # loss.backward()
    # optimizer.step()

# Learning about the preprocessing and Embedding

## data information

In [12]:
text = (
    'Hello, how are you? I am Romeo.\n'
    'Hello, Romeo My name is Juliet. Nice to meet you.\n'
    'Nice meet you too. How are you today?\n'
    'Great. My baseball team won the competition.\n'
    'Oh Congratulations, Juliet\n'
    'Thanks you Romeo'
)
sentences = re.sub("[.,!?\\-]", '', text.lower()).split('\n')  # filter '.', ',', '?', '!'
word_list = list(set(" ".join(sentences).split()))
word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}
for i, w in enumerate(word_list):
    word_dict[w] = i + 4
number_dict = {i: w for i, w in enumerate(word_dict)}
vocab_size = len(word_dict)

token_list = list()
for sentence in sentences:
    arr = [word_dict[s] for s in sentence.split()]
    token_list.append(arr)

In [13]:
# input is the text
print(text)

Hello, how are you? I am Romeo.
Hello, Romeo My name is Juliet. Nice to meet you.
Nice meet you too. How are you today?
Great. My baseball team won the competition.
Oh Congratulations, Juliet
Thanks you Romeo


In [14]:
# getting list of sentence from the text
print(sentences)

['hello how are you i am romeo', 'hello romeo my name is juliet nice to meet you', 'nice meet you too how are you today', 'great my baseball team won the competition', 'oh congratulations juliet', 'thanks you romeo']


In [15]:
# splitting all sentence and getting the list of all unique words from text
print(word_list)

['am', 'my', 'too', 'juliet', 'you', 'name', 'competition', 'nice', 'i', 'great', 'how', 'meet', 'oh', 'team', 'won', 'baseball', 'romeo', 'today', 'are', 'is', 'hello', 'to', 'congratulations', 'thanks', 'the']


In [16]:
# mapping the word to unique number(token) (which is called data transformation)
print(word_dict)

{'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3, 'am': 4, 'my': 5, 'too': 6, 'juliet': 7, 'you': 8, 'name': 9, 'competition': 10, 'nice': 11, 'i': 12, 'great': 13, 'how': 14, 'meet': 15, 'oh': 16, 'team': 17, 'won': 18, 'baseball': 19, 'romeo': 20, 'today': 21, 'are': 22, 'is': 23, 'hello': 24, 'to': 25, 'congratulations': 26, 'thanks': 27, 'the': 28}


In [17]:
# inverse mapping of the unique number (token) to the word for prediction purpose
number_dict

{0: '[PAD]',
 1: '[CLS]',
 2: '[SEP]',
 3: '[MASK]',
 4: 'am',
 5: 'my',
 6: 'too',
 7: 'juliet',
 8: 'you',
 9: 'name',
 10: 'competition',
 11: 'nice',
 12: 'i',
 13: 'great',
 14: 'how',
 15: 'meet',
 16: 'oh',
 17: 'team',
 18: 'won',
 19: 'baseball',
 20: 'romeo',
 21: 'today',
 22: 'are',
 23: 'is',
 24: 'hello',
 25: 'to',
 26: 'congratulations',
 27: 'thanks',
 28: 'the'}

In [18]:
# size of the dictionary 
vocab_size

29

In [19]:
# sentence and corresponding token_list
sentences,token_list

(['hello how are you i am romeo',
  'hello romeo my name is juliet nice to meet you',
  'nice meet you too how are you today',
  'great my baseball team won the competition',
  'oh congratulations juliet',
  'thanks you romeo'],
 [[24, 14, 22, 8, 12, 4, 20],
  [24, 20, 5, 9, 23, 7, 11, 25, 15, 8],
  [11, 15, 8, 6, 14, 22, 8, 21],
  [13, 5, 19, 17, 18, 28, 10],
  [16, 26, 7],
  [27, 8, 20]])

### Upto now the 
```

text -> sentence ( with removal of english [,./!?"'etc])
text -> extract each words -> word_list
word_list ->(mapped to ) -> token_list 
vocab_size == len(world_list) 

each token_list contains the token_id of each word
```


## Preprocessing the sentences and converting to masked and segmented form 

In [20]:


# sample IsNext and NotNext to be same in small batch size
def make_batch(batch_size,sentences,token_list,word_dict,max_pred,maxlen,vocab_size,number_dict):
    batch = []
    positive = negative = 0
    while positive != batch_size/2 or negative != batch_size/2:
        tokens_a_index, tokens_b_index= randrange(len(sentences)), randrange(len(sentences)) # sample random index in sentences
        tokens_a, tokens_b= token_list[tokens_a_index], token_list[tokens_b_index]
        input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

        # MASK LM
        n_pred =  min(max_pred, max(1, int(round(len(input_ids) * 0.15)))) # 15 % of tokens in one sentence
        cand_maked_pos = [i for i, token in enumerate(input_ids)
                          if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]
        shuffle(cand_maked_pos)
        masked_tokens, masked_pos = [], []
        for pos in cand_maked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            if random() < 0.8:  # 80%
                input_ids[pos] = word_dict['[MASK]'] # make mask
            elif random() < 0.5:  # 10%
                index = randint(0, vocab_size - 1) # random index in vocabulary
                input_ids[pos] = word_dict[number_dict[index]] # replace

        # Zero Paddings
        n_pad = maxlen - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

        # Zero Padding (100% - 15%) tokens
        if max_pred > n_pred:
            n_pad = max_pred - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)

        if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext
            negative += 1
    return batch
# Proprecessing Finished

In [21]:
batch = make_batch(batch_size=batch_size,sentences=sentences,token_list=token_list,word_dict=word_dict,max_pred=max_pred,maxlen=maxlen,vocab_size=vocab_size,number_dict=number_dict)
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))



### Up to Now
>maxlen is maximum allowed words in the sentence
>
>n_sentence is the number of input sentence
>
>input_ids of each sentences : shape (n_sentence,maxlen)
>
>segment_ids of each sentence : shape (n_sentence,maxlen)
>
>masked_tokens of each sentence : shape (n_sentence,max_len)
>
>isNest of each sentence : shape (n_sentence)

In [22]:
input_ids.shape,input_ids

(torch.Size([6, 30]),
 tensor([[ 1, 11, 15,  8,  6,  3, 22,  8, 21,  2, 24,  3,  5,  9, 23,  7, 11, 25,
          15,  3,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0],
         [ 1, 16, 26,  7,  2, 24, 20, 25,  9, 23,  7, 11, 25,  3,  8,  2,  0,  0,
           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
         [ 1, 11, 15, 14,  6, 14, 22,  8, 21,  2, 16,  3,  7,  2,  0,  0,  0,  0,
           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
         [ 1, 24,  3,  5,  9,  3,  7, 11, 25, 15,  8,  2, 11, 15,  8,  3, 14, 22,
           8, 21,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0],
         [ 1, 24,  3, 22,  8, 12,  4, 20,  2, 24,  3, 28,  9, 23,  7, 11, 25, 15,
           8,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
         [ 1, 13,  5, 19, 17, 18, 28,  5,  2, 16, 26,  7,  2,  0,  0,  0,  0,  0,
           0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]]))

In [23]:
segment_ids.size(),segment_ids

(torch.Size([6, 30]),
 tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
          0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
          0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0]]))

In [24]:
masked_tokens.shape,masked_tokens

(torch.Size([6, 5]),
 tensor([[ 8, 14, 20,  0,  0],
         [15,  5,  0,  0,  0],
         [26,  8,  0,  0,  0],
         [ 6, 23, 20,  0,  0],
         [ 5, 20, 14,  0,  0],
         [10, 17,  0,  0,  0]]))

In [25]:
masked_pos.shape,masked_pos

(torch.Size([6, 5]),
 tensor([[19,  5, 11,  0,  0],
         [13,  7,  0,  0,  0],
         [11,  3,  0,  0,  0],
         [15,  5,  2,  0,  0],
         [11, 10,  2,  0,  0],
         [ 7,  4,  0,  0,  0]]))

In [26]:
isNext.shape,isNext

(torch.Size([6]), tensor([0, 0, 0, 1, 1, 1]))

## Embedding

In [27]:
class Embedding(nn.Module):
    def __init__(self,vocab_size,d_model,maxlen,n_segments):
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)  # token embedding
        self.pos_embed = nn.Embedding(maxlen, d_model)  # position embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)  # segment(token type) embedding
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, seg):
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long)
        pos = pos.unsqueeze(0).expand_as(x)  # (seq_len,) -> (batch_size, seq_len)
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        return self.norm(embedding)

In [28]:
tok_embed = nn.Embedding(vocab_size, d_model)  # token embedding
pos_embed = nn.Embedding(maxlen, d_model)  # position embedding
seg_embed = nn.Embedding(n_segments, d_model)  # segment(token type) embedding
norm = nn.LayerNorm(d_model)

vocab_size,d_model,maxlen,n_segments

(29, 768, 30, 2)

### up to now

* vocab_size is the size of dictionary
* d_model is the dimention of embedding to be performed
* n_segments is number of segment of that the sentence is segmented for the training
* max_len is the maximum length of each sentence

> input to the nn.Embedding() 
>
>A simple lookup table that stores embeddings of a fixed dictionary and size.
This module is often used to store word embeddings and retrieve them using indices. The input to the module is a list of indices, and the output is the corresponding word embeddings.
>
> nn.Embedding(unique_embdding_numbers,embeddng_dimension)

* for token embedding -> Embedding(vocab_size,d_model) since there is need of embedding for all words
* for position embedding -> Embedding(maxlen,d_model) since  there is only maxlen unique position
* for segment embedding -> Embeddding(n_segment,d_model) since there is only two segments of each sentence

d_model is common for embedding dimention because later on there are added

In [29]:
# when we supply input_ids of we get the corresponding embeddings
input_embeddings = tok_embed(input_ids)
input_ids.shape,input_embeddings.size() , input_embeddings

(torch.Size([6, 30]),
 torch.Size([6, 30, 768]),
 tensor([[[ 0.5609,  1.3569,  2.1036,  ..., -0.1036, -0.4089, -0.9100],
          [ 0.9850,  2.4662,  0.2452,  ..., -1.6064, -0.4717,  2.6118],
          [ 2.1469,  0.8265, -0.4476,  ...,  2.6386,  0.2635, -0.3848],
          ...,
          [-0.6187,  0.9441, -0.1023,  ..., -1.0172,  1.0107,  0.5770],
          [-0.6187,  0.9441, -0.1023,  ..., -1.0172,  1.0107,  0.5770],
          [-0.6187,  0.9441, -0.1023,  ..., -1.0172,  1.0107,  0.5770]],
 
         [[ 0.5609,  1.3569,  2.1036,  ..., -0.1036, -0.4089, -0.9100],
          [-0.3572,  1.0167, -0.6824,  ..., -0.0752, -0.9614,  0.2373],
          [-1.7881, -2.3503,  1.4654,  ..., -0.0198, -0.2250, -0.3069],
          ...,
          [-0.6187,  0.9441, -0.1023,  ..., -1.0172,  1.0107,  0.5770],
          [-0.6187,  0.9441, -0.1023,  ..., -1.0172,  1.0107,  0.5770],
          [-0.6187,  0.9441, -0.1023,  ..., -1.0172,  1.0107,  0.5770]],
 
         [[ 0.5609,  1.3569,  2.1036,  ..., -0.1036

In [30]:
# when we input the position of each words in the sentence we will get the position embedding

seq_len = input_ids.size(1)
print(f"seq len : {seq_len}")
pos = torch.arange(seq_len, dtype=torch.long)
print(f"pos :{pos} \n pos shape : {pos.shape}")
pos = pos.unsqueeze(0).expand_as(input_ids)  # (seq_len,) -> (batch_size, seq_len)
print(f"pos :{pos} \n pos shape : {pos.shape}")
pos_embedding = pos_embed(pos)

seq len : 30
pos :tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]) 
 pos shape : torch.Size([30])
pos :tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 

In [31]:

input_ids.shape,pos.shape,pos_embedding.shape,pos_embedding

(torch.Size([6, 30]),
 torch.Size([6, 30]),
 torch.Size([6, 30, 768]),
 tensor([[[ 0.9660, -0.4986, -0.3459,  ...,  1.1357, -1.1111,  0.3926],
          [-0.1513, -0.3111,  0.0345,  ..., -0.1109,  1.0082, -0.3238],
          [-1.0357, -0.0436,  1.3298,  ..., -0.3325, -0.6350,  0.5875],
          ...,
          [-0.4983, -1.1975, -1.9269,  ..., -0.5401, -1.1938, -0.7832],
          [ 0.6676, -0.1693,  0.5767,  ...,  1.2680,  1.2784, -0.9582],
          [ 0.7070,  1.2319,  0.3383,  ..., -1.1873,  0.6569, -0.2067]],
 
         [[ 0.9660, -0.4986, -0.3459,  ...,  1.1357, -1.1111,  0.3926],
          [-0.1513, -0.3111,  0.0345,  ..., -0.1109,  1.0082, -0.3238],
          [-1.0357, -0.0436,  1.3298,  ..., -0.3325, -0.6350,  0.5875],
          ...,
          [-0.4983, -1.1975, -1.9269,  ..., -0.5401, -1.1938, -0.7832],
          [ 0.6676, -0.1693,  0.5767,  ...,  1.2680,  1.2784, -0.9582],
          [ 0.7070,  1.2319,  0.3383,  ..., -1.1873,  0.6569, -0.2067]],
 
         [[ 0.9660, -0.4986, 

In [32]:
# when we give segment_ids we will get the segment embedding
segment_ids.shape,segment_ids

(torch.Size([6, 30]),
 tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
          0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
          0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0]]))

In [33]:

seg_embedding = seg_embed(segment_ids)
segment_ids.shape,seg_embedding.shape,seg_embedding

(torch.Size([6, 30]),
 torch.Size([6, 30, 768]),
 tensor([[[-3.1898, -0.6039,  0.8623,  ..., -0.0888,  0.3330,  2.1930],
          [-3.1898, -0.6039,  0.8623,  ..., -0.0888,  0.3330,  2.1930],
          [-3.1898, -0.6039,  0.8623,  ..., -0.0888,  0.3330,  2.1930],
          ...,
          [-3.1898, -0.6039,  0.8623,  ..., -0.0888,  0.3330,  2.1930],
          [-3.1898, -0.6039,  0.8623,  ..., -0.0888,  0.3330,  2.1930],
          [-3.1898, -0.6039,  0.8623,  ..., -0.0888,  0.3330,  2.1930]],
 
         [[-3.1898, -0.6039,  0.8623,  ..., -0.0888,  0.3330,  2.1930],
          [-3.1898, -0.6039,  0.8623,  ..., -0.0888,  0.3330,  2.1930],
          [-3.1898, -0.6039,  0.8623,  ..., -0.0888,  0.3330,  2.1930],
          ...,
          [-3.1898, -0.6039,  0.8623,  ..., -0.0888,  0.3330,  2.1930],
          [-3.1898, -0.6039,  0.8623,  ..., -0.0888,  0.3330,  2.1930],
          [-3.1898, -0.6039,  0.8623,  ..., -0.0888,  0.3330,  2.1930]],
 
         [[-3.1898, -0.6039,  0.8623,  ..., -0.0888

# Multi Head Attention

### What is query? what is key? what is value?

### first getting the attention padding mask

padding mask helps to distinquish between which is word and which is not word

In [34]:
def get_attn_pad_mask(seq_q, seq_k): # seq_q = input_ids , seq_k = input_ids
    batch_size, len_q = seq_q.size() # taking len_q as size of max_len
    batch_size, len_k = seq_k.size()
    # eq(zero) is PAD token
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k


In [35]:
enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)
enc_self_attn_mask.shape,enc_self_attn_mask

(torch.Size([6, 30, 30]),
 tensor([[[False, False, False,  ...,  True,  True,  True],
          [False, False, False,  ...,  True,  True,  True],
          [False, False, False,  ...,  True,  True,  True],
          ...,
          [False, False, False,  ...,  True,  True,  True],
          [False, False, False,  ...,  True,  True,  True],
          [False, False, False,  ...,  True,  True,  True]],
 
         [[False, False, False,  ...,  True,  True,  True],
          [False, False, False,  ...,  True,  True,  True],
          [False, False, False,  ...,  True,  True,  True],
          ...,
          [False, False, False,  ...,  True,  True,  True],
          [False, False, False,  ...,  True,  True,  True],
          [False, False, False,  ...,  True,  True,  True]],
 
         [[False, False, False,  ...,  True,  True,  True],
          [False, False, False,  ...,  True,  True,  True],
          [False, False, False,  ...,  True,  True,  True],
          ...,
          [False, False

In [36]:
seq_q,seq_k = input_ids,input_ids
batch_size, len_q = seq_q.size()
print(f"batch size : {batch_size}")
batch_size, len_k = seq_k.size()
# eq(zero) is PAD token
pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking
enc_self_attn_mask =  pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k

batch size : 6


```bash
>>> x = torch.tensor([[1], [2], [3]])
>>> x.size()
torch.Size([3, 1])
>>> x.expand(3, 4)
tensor([[ 1,  1,  1,  1],
        [ 2,  2,  2,  2],
        [ 3,  3,  3,  3]])

```

In [37]:
x = seq_k.data.eq(0)
print(f'padding: {x}')
print(x.shape)
x = x.unsqueeze(1)
print(x.shape)
x = x.expand(batch_size,len_q,len_k)
print(x.shape)
print(f"expanded shape: {x}")


padding: tensor([[False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False,  True,  True,  True,  True,  True,  True,  True,  True,  True],
        [False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True],
        [False, False, False, False, False, False, False, False, False, False,
         False, False, False, False,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True],
        [False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False,  True,  True,  True,  True,  True,  True,  True,  True,  True],
        [False, False, False, False, Fa

In [38]:

print(x[0,0]==x[0,1])
print(x[0,1,0] == x[0,1,1])

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True])
tensor(True)


### upto now

shape of padding mask : (n_batch,n_q,n_k)

### output of 12 encoder layer

In [39]:
# embdding layer
embedding = Embedding(vocab_size=vocab_size,d_model=d_model,maxlen=maxlen,n_segments=n_segments)

# embedding output
output = embedding(input_ids, segment_ids)
print(f'embedding shape:{output.shape}')

# padding of the key, query , value
enc_self_attn_mask = get_attn_pad_mask(seq_q=input_ids,seq_k=input_ids)

# 6 encoder layers
layers = nn.ModuleList([EncoderLayer(d_model=d_model,d_k=d_k,n_heads=n_heads,d_v=d_v,d_ff=d_ff) for _ in range(n_layers)])

# ouput of encoder layers
# inputs are output : embedding ouput , enc_self_attn_mask = padding mask of sentences
output_list = []
for layer in layers:
    output,enc_self_attn = layer(output,enc_self_attn_mask)
    output_list.append([output,enc_self_attn])

print(output.shape,enc_self_attn.shape)
print(f"ouput of encoder : {output.shape} ")
print(f"ouput of enc_self_attn : {enc_self_attn.shape} ")


embedding shape:torch.Size([6, 30, 768])
torch.Size([6, 30, 768]) torch.Size([6, 12, 30, 30])
ouput of encoder : torch.Size([6, 30, 768]) 
ouput of enc_self_attn : torch.Size([6, 12, 30, 30]) 


## Encoder layer

In [40]:
class EncoderLayer(nn.Module):
    def __init__(self,d_model,d_k,d_v,n_heads,d_ff):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention(d_model=d_model,d_k=d_k,d_v=d_v,n_heads=n_heads)
        self.pos_ffn = PoswiseFeedForwardNet(d_model=d_model,d_ff=d_ff)

    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask,n_heads=n_heads,d_v=d_v,d_model=d_model,d_k=d_k) # enc_inputs to same Q,K,V
        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]
        return enc_outputs, attn

encorder_layer = EncoderLayer(d_model=d_model,d_k=d_k,d_v=d_v,n_heads=n_heads,d_ff=d_ff)

In [41]:
# for single encoder

# embedding layer
embedding = Embedding(vocab_size=vocab_size,d_model=d_model,maxlen=maxlen,n_segments=n_segments)

# output of embedding
embedding_output = embedding(input_ids, segment_ids)
print(f'embedding output shape:{embedding_output.shape}')

# padding mask for the self attentiion 
enc_self_attn_mask = get_attn_pad_mask(seq_q=input_ids,seq_k=input_ids)

# whole 12 encoder layers
layers = nn.ModuleList([EncoderLayer(d_model=d_model,d_k=d_k,n_heads=n_heads,d_v=d_v,d_ff=d_ff) for _ in range(n_layers)])


output, enc_self_attn = encorder_layer(embedding_output,enc_self_attn_mask)

print(f"ouput of encoder : {output.shape} ")
print(f"ouput of enc_self_attn : {enc_self_attn.shape} ")

embedding output shape:torch.Size([6, 30, 768])
ouput of encoder : torch.Size([6, 30, 768]) 
ouput of enc_self_attn : torch.Size([6, 12, 30, 30]) 


In [42]:
# multi head attention layer
enc_self_attn = MultiHeadAttention(d_model=d_model,d_k=d_k,d_v=d_v,n_heads=n_heads)

# position wise feed forward network
pos_ffn = PoswiseFeedForwardNet(d_model=d_model,d_ff=d_ff)

# embedding layer 
embedding_layer =  Embedding(vocab_size=vocab_size,d_model=d_model,maxlen=maxlen,n_segments=n_segments)

# embedding ouput
embedding_output = embedding_layer(input_ids,segment_ids)
print(f'embedding output shape: {embedding_output.shape}')

# input to multi_head attentions
# 1. enc_inputs = embedding output shape(n_batch*maxlen*d_model)
# 2. enc_self_attn_mask = padding mask of query , key with shape [n_batch * max_len , max_len]
enc_inputs = embedding_output
enc_self_attn_mask = get_attn_pad_mask(seq_q = input_ids,seq_k=input_ids)
enc_outputs, attn = enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask,n_heads=n_heads,d_v=d_v,d_model=d_model,d_k=d_k) # enc_inputs to same Q,K,V

print(f"ouput of encoder output : {enc_outputs.shape} ")


enc_outputs = pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]


print(f"ouput of encoder self attention mask : {enc_self_attn_mask.shape} ")
print(f"ouput of encoder output after position wise feed forward : {enc_outputs.shape} ")
print(f"length of query, key, value and maxlen are same:{len_q == len_k == maxlen}")


embedding output shape: torch.Size([6, 30, 768])
ouput of encoder output : torch.Size([6, 30, 768]) 
ouput of encoder self attention mask : torch.Size([6, 30, 30]) 
ouput of encoder output after position wise feed forward : torch.Size([6, 30, 768]) 
length of query, key, value and maxlen are same:True


## Multi Head Attention

In [43]:
# multi head attention layer
enc_self_attn = MultiHeadAttention(d_model=d_model,d_k=d_k,d_v=d_v,n_heads=n_heads)

# position wise feed forward network
pos_ffn = PoswiseFeedForwardNet(d_model=d_model,d_ff=d_ff)

# embedding layer 
embedding_layer =  Embedding(vocab_size=vocab_size,d_model=d_model,maxlen=maxlen,n_segments=n_segments)

# embedding ouput
embedding_output = embedding_layer(input_ids,segment_ids)
print(f'embedding output shape: {embedding_output.shape}')

# input to multi_head attentions
# 1. enc_inputs = embedding output shape(n_batch*maxlen*d_model)
# 2. enc_self_attn_mask = padding mask of query , key with shape [n_batch * max_len , max_len]
enc_inputs = embedding_output
enc_self_attn_mask = get_attn_pad_mask(seq_q = input_ids,seq_k=input_ids)
enc_outputs, attn = enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask,n_heads=n_heads,d_v=d_v,d_model=d_model,d_k=d_k) # enc_inputs to same Q,K,V

print(f"ouput of encoder output : {enc_outputs.shape} ")


enc_outputs = pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]


print(f"ouput of encoder self attention mask : {enc_self_attn_mask.shape} ")
print(f"ouput of encoder output after position wise feed forward : {enc_outputs.shape} ")
print(f"length of query, key, value and maxlen are same:{len_q == len_k == maxlen}")

embedding output shape: torch.Size([6, 30, 768])
ouput of encoder output : torch.Size([6, 30, 768]) 
ouput of encoder self attention mask : torch.Size([6, 30, 30]) 
ouput of encoder output after position wise feed forward : torch.Size([6, 30, 768]) 
length of query, key, value and maxlen are same:True


In [44]:
class MultiHeadAttention(nn.Module):
    def __init__(self,d_model,d_k,d_v,n_heads):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
    def forward(self, Q, K, V, attn_mask,n_heads,d_v,d_model,d_k):
        # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]
        residual, batch_size = Q, Q.size(0)
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size x n_heads x len_q x d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size x n_heads x len_k x d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]

        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size x n_heads x len_q x len_k]

        # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask,d_k=d_k)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]
        output = nn.Linear(n_heads * d_v, d_model)(context)
        return nn.LayerNorm(d_model)(output + residual), attn # output: [batch_size x len_q x d_model]
        

In [45]:
W_Q = nn.Linear(d_model, d_k * n_heads)
W_K = nn.Linear(d_model, d_k * n_heads)
W_V = nn.Linear(d_model, d_v * n_heads)

print(W_Q)
print(f"d_k : {d_k}")

Linear(in_features=768, out_features=768, bias=True)
d_k : 64


In [46]:
Q,K,V = enc_inputs , enc_inputs, enc_inputs
d_q = d_k = d_v  # 64 dimention of query , key , value 

In [47]:

# residual is simply the embedding ouput or encoding input

# batch_size is simply the n_batch
residual, batch_size = Q, Q.size(0)

# 
q_s = W_Q(Q)
print(f"shape of q_s: {q_s.shape}")

q_s = q_s.view(batch_size,-1,n_heads,d_k)
print(f"shape of q_s: {q_s.shape}")

q_s = q_s.transpose(1,2) # initially dim : 0 -> batch , 1 -> maxlen , 2 -> n_heads , 3 -> d_k
# here 1 and 2 are enterchanged for making such that each head contains the all the words with d_k number of embedding

# after transposing dim:  0 -> batch , 1 -> n_heads , 2 -> len_q, maxlen , 3 -> d_k

print(f"shape of q_s: {q_s.shape}")

v_s = W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]

print(f"shape of v_s: {v_s.shape}")


k_s = W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size x n_heads x len_k x d_k]
v_s = W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]


shape of q_s: torch.Size([6, 30, 768])
shape of q_s: torch.Size([6, 30, 12, 64])
shape of q_s: torch.Size([6, 12, 30, 64])
shape of v_s: torch.Size([6, 12, 30, 64])


*purpose of linear layer*

In [48]:
input = torch.rand((1,2,3,4,768))

# linear only cares about the final axis element
# and takes each element as input for input layer of linear
# and ouput is the same shape execpt the last axis has increased or decreased length of axis 
linear_layer =  nn.Linear(in_features=768,out_features=2)
linear_out = linear_layer(input)

input.shape , linear_out.shape



(torch.Size([1, 2, 3, 4, 768]), torch.Size([1, 2, 3, 4, 2]))

In [49]:
attn_mask = get_attn_pad_mask(seq_q=input_ids,seq_k = input_ids)

print(f"shape of attention mask : {attn_mask.shape}") # shape(n_bath,max_len,max_len) # may be for query and value length

# adding extra dimention for the single head
attn_mask = attn_mask.unsqueeze(dim = 1) # initially : shape(n_batch,max_len,max_len) 
# aftter : shape(n_batch,1-head,maxlen,maxlen)

print(f"shape of attn_mask : {attn_mask.shape}")

# INFO: Parameters
# sizes (torch.Size or int...) – The number of times to repeat this tensor along each dimension

# after that repeating the same row for making the n_heads or 12 heads
attn_mask = attn_mask.repeat(1, n_heads, 1, 1) 
# here repeat(
# 1 -> for single repeat or no change
# n_heads -> for n_heads times
# )


print(f"shape of attn_mask : {attn_mask.shape}")



shape of attention mask : torch.Size([6, 30, 30])
shape of attn_mask : torch.Size([6, 1, 30, 30])
shape of attn_mask : torch.Size([6, 12, 30, 30])


### scaled dot product

In [50]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask,d_k):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / torch.sqrt(torch.tensor(d_k,dtype=torch.float32)) # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V)
        return context, attn

In [51]:
test_1 = torch.rand((1,2,3,4))

print(f"shape of tensor: {test_1.shape}")

test_2 = test_1.transpose(dim0=-2,dim1=-1)

print(f"after transpose shape of tensor: {test_2.shape}")


shape of tensor: torch.Size([1, 2, 3, 4])
after transpose shape of tensor: torch.Size([1, 2, 4, 3])


In [52]:
Q = q_s # shape(n_batch,n_heads,maxlen,d_q)
K = k_s 
V = v_s
# batch matrix multiplication is performed since (dimension of at least 1 tensor is greater than 2 see rule for that)
# https://pytorch.org/docs/stable/generated/torch.matmul.html

print(f"shape of Q , K , V : {Q.shape,K.shape,V.shape}")

# here matrix multiplication is between the (maxlen and d_q) and (d_k and maxlen) dimention rest are unaffected
# dim: maxlen -> number of words in sentenc 
# dim: d_k -> length of reduced key dimention from the input_ids embedding  
scores = torch.matmul(Q, K.transpose(-1, -2)) / torch.sqrt( torch.tensor(d_k,dtype=torch.float32)) # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
print(f"shape of scores:{scores.shape}")
print(f"shape of attn_mask : {attn_mask.shape}")
print(f"before masking: {scores[0,0,0]}")
scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.
print(f"after masking : {scores[0,0,0]}")

# applying soft mask in last dimention
attn = nn.Softmax(dim=-1)(scores)

# finally context of each word is calculated with matrix multiplication 
# for finding out the which word are more important in that context
# first matrix multiplication helps to understand the which word is important and affect the result most as also the called attention filter
# and send matrix multiplication helps to apply that attention filter just like convolutional neural network to the actual sentence or (embedding inputs)
context = torch.matmul(attn, V)
print(f"shape of context : {context.shape} ")

shape of Q , K , V : (torch.Size([6, 12, 30, 64]), torch.Size([6, 12, 30, 64]), torch.Size([6, 12, 30, 64]))
shape of scores:torch.Size([6, 12, 30, 30])
shape of attn_mask : torch.Size([6, 12, 30, 30])
before masking: tensor([-0.2807,  0.0675,  0.5605, -0.5441, -0.1867,  0.0929, -0.1006, -0.0157,
        -0.5089, -0.4573, -0.2695, -0.2615, -0.3499,  0.2060, -0.0142,  0.1995,
         0.0128, -0.2257,  0.2191, -0.1056, -0.3517, -0.0495, -0.0107,  0.2420,
        -0.1038,  0.1172,  0.0740, -0.0520,  0.1095,  0.4185],
       grad_fn=<SelectBackward0>)
after masking : tensor([-2.8068e-01,  6.7542e-02,  5.6048e-01, -5.4405e-01, -1.8671e-01,
         9.2885e-02, -1.0061e-01, -1.5747e-02, -5.0886e-01, -4.5725e-01,
        -2.6954e-01, -2.6154e-01, -3.4991e-01,  2.0605e-01, -1.4227e-02,
         1.9947e-01,  1.2781e-02, -2.2565e-01,  2.1912e-01, -1.0557e-01,
        -3.5168e-01, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
        -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0

In [53]:
v_s.shape,attn.shape

(torch.Size([6, 12, 30, 64]), torch.Size([6, 12, 30, 30]))

--------------------------------------------------------------------------------------------------------------------------------------
completed the scaled dot product

### remaining multi head attention

In [54]:
attn_mask = get_attn_pad_mask(seq_q=input_ids,seq_k = input_ids)

print(f"shape of attention mask : {attn_mask.shape}") # shape(n_bath,max_len,max_len) # may be for query and value length

# adding extra dimention for the single head
attn_mask = attn_mask.unsqueeze(dim = 1) # initially : shape(n_batch,max_len,max_len) 
# aftter : shape(n_batch,1-head,maxlen,maxlen)

print(f"shape of attn_mask : {attn_mask.shape}")

# INFO: Parameters
# sizes (torch.Size or int...) – The number of times to repeat this tensor along each dimension

# after that repeating the same row for making the n_heads or 12 heads
attn_mask = attn_mask.repeat(1, n_heads, 1, 1) 
# here repeat(
# 1 -> for single repeat or no change
# n_heads -> for n_heads times
# )


print(f"shape of attn_mask : {attn_mask.shape}")


# context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask,d_k=d_k)

# context shape(n_batch,n_head,maxlen,d_k) , attention shape(n_batch,n_head,n_q,n_k)
print(f"shape of context, attention: {context.shape,attn.shape}")

context = context.transpose(1,2) # n_head and n_q transpose to make shape(n_batch,n_q,n_head,d_v)
print(f"shape of context after transpose: {context.shape}")


context = context.contiguous() 
print(f"shape of context after contiguous: {context.shape}")

# finally we return the context that gives the whole view sentence 
context = context.view(batch_size,-1,n_heads*d_v)

# context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]

print(f"shape of context: {context.shape}")

output = nn.Linear(n_heads * d_v, d_model)(context)

print(f"linear output shape : {output.shape}")

# layer normalization across the axis of feature it's sentence or maxlen or len_q dimention
output = nn.LayerNorm(d_model)(output + residual) # output: [batch_size x len_q x d_model]

print(f"normalized  shape : {output.shape}")


shape of attention mask : torch.Size([6, 30, 30])
shape of attn_mask : torch.Size([6, 1, 30, 30])
shape of attn_mask : torch.Size([6, 12, 30, 30])
shape of context, attention: (torch.Size([6, 12, 30, 64]), torch.Size([6, 12, 30, 30]))
shape of context after transpose: torch.Size([6, 30, 12, 64])
shape of context after contiguous: torch.Size([6, 30, 12, 64])
shape of context: torch.Size([6, 30, 768])
linear output shape : torch.Size([6, 30, 768])
normalized  shape : torch.Size([6, 30, 768])


In [55]:
d_model == n_heads * d_v

True

---------------------------------------------------------------------------------------------------------------------------------------
multi head attention completed

# Position Wise Feed Forward Network

In [56]:
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self,d_model,d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        # (batch_size, len_seq, d_model) -> (batch_size, len_seq, d_ff) -> (batch_size, len_seq, d_model)
        return self.fc2(gelu(self.fc1(x)))

In [57]:
d_ff,d_model


(3072, 768)

In [58]:
poswise_feed_forward = PoswiseFeedForwardNet(d_model=d_model,d_ff=d_ff)

output = poswise_feed_forward(output)

print(f"final output shape: {output.shape}")

final output shape: torch.Size([6, 30, 768])


# Extra layers of bert

In [59]:
batch = make_batch(batch_size=batch_size,sentences=sentences,token_list=token_list,word_dict=word_dict,max_pred=max_pred,maxlen=maxlen,vocab_size=vocab_size,number_dict=number_dict)
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))



# embdding layer
embedding = Embedding(vocab_size=vocab_size,d_model=d_model,maxlen=maxlen,n_segments=n_segments)

# embedding output
output = embedding(input_ids, segment_ids)
print(f'embedding shape:{output.shape}')

# padding of the key, query , value
enc_self_attn_mask = get_attn_pad_mask(seq_q=input_ids,seq_k=input_ids)

# 6 encoder layers
layers = nn.ModuleList([EncoderLayer(d_model=d_model,d_k=d_k,n_heads=n_heads,d_v=d_v,d_ff=d_ff) for _ in range(n_layers)])

# ouput of encoder layers
# inputs are output : embedding ouput , enc_self_attn_mask = padding mask of sentences
output_list = []
for layer in layers:
    output,enc_self_attn = layer(output,enc_self_attn_mask)
    output_list.append([output,enc_self_attn])

print(output.shape,enc_self_attn.shape)
print(f"ouput of encoder : {output.shape} ")
print(f"ouput of enc_self_attn : {enc_self_attn.shape} ")
# -------------------------------------------------------------
fc = nn.Linear(d_model, d_model)
activ1 = nn.Tanh()
linear = nn.Linear(d_model, d_model)
activ2 = gelu
norm = nn.LayerNorm(d_model)
classifier = nn.Linear(d_model, 2)
# decoder is shared with embedding layer


embed_weight = embedding.tok_embed.weight
print(f"embedding weight shape : {embed_weight.shape}")
n_vocab, n_dim = embed_weight.size()
print(f"n_vocab ,n_dim : {n_vocab,n_dim}")

# making linear layer for decoder which maps
# n_dim -> n_vocab
decoder = nn.Linear(n_dim, n_vocab, bias=False)

# decoder weight is not learned but asigned with the token embedding
# so that it uses the pupose of mapping instead of learning eachtime
decoder.weight = embed_weight

is_share_same = decoder.weight.storage().data_ptr() == embed_weight.storage().data_ptr() 
print(f"decoder weight and embedding weight share same memory: {is_share_same}")
# using decoder bias as 0
decoder_bias = nn.Parameter(torch.zeros(n_vocab))

# ---------------------------forward ----------------------
# output : [batch_size, len, d_model], attn : [batch_size, n_heads, d_model, d_model]
# it will be decided by first token(CLS)
print(f"shape of output: {output.shape}")

# getting first column of all n_batch , with first word and all d_model
print(f"shape of output[:,0] : {output[:,0].shape}")

h_pooled = activ1(fc(output[:, 0])) # [batch_size, d_model]
print(f" shape of h_pooled : {h_pooled.shape}")

# taking only  2 for classification purpose
logits_clsf = classifier(h_pooled) # [batch_size, 2]
print(f" shape of logits_clsf : {logits_clsf.shape}")
print(f"logits_clsf : {logits_clsf}")

# -------------------------------masked_pos --------------------------



masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1)) # [batch_size, max_pred, d_model]
print(f" shape of masked_pos: {masked_pos.shape}")
# get masked position from final output of transformer.
print(f"shape of final output: {output.shape}")
print(f"output : {output}")
print(f"masked_pos : {masked_pos}")

# https://stackoverflow.com/questions/50999977/what-does-the-gather-function-do-in-pytorch-in-layman-terms
h_masked = torch.gather(output, 1, masked_pos) # masking position [batch_size, max_pred, d_model]
print(f"h_masked : {h_masked}")
print(f"shape of h_masked : {h_masked.shape}")
# h_masked contains the masked value of all the positions that is masked 
# returns only the corresponding max_prediction with d_model dimention since each mask dimention is to be masked


h_masked = norm(activ2(linear(h_masked)))

# -----------------------------------------masked position prediction ------------------------
logits_lm = decoder(h_masked) + decoder_bias # [batch_size, max_pred, n_vocab]
print(f"shape of logits_lm : {logits_lm.shape}")

# logits_lm and logits_clsf are returned

print(f"logits_lm : {logits_lm[0,0,:10]}")

embedding shape:torch.Size([6, 30, 768])
torch.Size([6, 30, 768]) torch.Size([6, 12, 30, 30])
ouput of encoder : torch.Size([6, 30, 768]) 
ouput of enc_self_attn : torch.Size([6, 12, 30, 30]) 
embedding weight shape : torch.Size([29, 768])
n_vocab ,n_dim : (29, 768)
decoder weight and embedding weight share same memory: True
shape of output: torch.Size([6, 30, 768])
shape of output[:,0] : torch.Size([6, 768])
 shape of h_pooled : torch.Size([6, 768])
 shape of logits_clsf : torch.Size([6, 2])
logits_clsf : tensor([[ 0.0304, -0.0858],
        [ 0.0342, -0.0787],
        [ 0.0343, -0.0887],
        [ 0.0342, -0.0859],
        [ 0.0475, -0.0715],
        [ 0.0333, -0.0792]], grad_fn=<AddmmBackward0>)
 shape of masked_pos: torch.Size([6, 5, 768])
shape of final output: torch.Size([6, 30, 768])
output : tensor([[[ 0.1532, -0.1147,  0.1736,  ...,  0.0703, -0.0467,  0.0740],
         [ 0.0466, -0.0760,  0.0523,  ...,  0.2266,  0.0851,  0.1634],
         [ 0.1544, -0.1170,  0.0933,  ...,  0.29

In [60]:
# criterain

# Criterian

In [61]:
batch = make_batch(batch_size=batch_size,sentences=sentences,token_list=token_list,word_dict=word_dict,max_pred=max_pred,maxlen=maxlen,vocab_size=vocab_size,number_dict=number_dict)
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))


# criterion
model = BERT(d_model=d_model,n_layers=n_layers,vocab_size=vocab_size,d_k=d_k,d_v=d_v,n_heads=n_heads,d_ff=d_ff)

logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
logits_lm_tran = logits_lm.transpose(1,2)
masked_tokens = masked_tokens

print(f"logits_lm transpose : {logits_lm_tran.shape} ")
# logits_lm transpose : shape( n_batch , vocab_position , mask_pos_output)

print(f"masked_tokens shape : {masked_tokens.shape}")

print(f"logits_lm_tran : {logits_lm_tran}")
# token means vocab position
print(f"masked_tokens : {masked_tokens}")

loss_lm = criterion(logits_lm_tran, masked_tokens) # for masked LM

loss_lm = (loss_lm.float()).mean()

print(f"loss_lm before mean: {loss_lm}")

print(f"loss_clsf logits : {logits_clsf.shape}")
print(f"isNext : {isNext.shape}")

loss_clsf = criterion(logits_clsf, isNext) 

# objective function of loss
loss = loss_lm + loss_clsf

loss.backward()



logits_lm transpose : torch.Size([6, 29, 5]) 
masked_tokens shape : torch.Size([6, 5])
logits_lm_tran : tensor([[[-2.0260e+01, -2.8884e+01, -3.2608e+01,  3.5535e+00,  3.5535e+00],
         [-5.3974e+01, -2.5572e+01, -5.0938e+01, -3.4439e+01, -3.4439e+01],
         [ 1.6337e+01,  2.0226e+01,  2.4240e+01, -1.3174e+01, -1.3174e+01],
         [ 5.3234e+01,  2.2370e+01,  5.1465e+01,  6.0208e+01,  6.0208e+01],
         [-4.0959e+01, -2.9181e+01, -2.9680e+01, -4.8465e+01, -4.8465e+01],
         [ 2.6439e+00,  1.1836e+01,  4.2148e+00,  1.6693e+01,  1.6693e+01],
         [ 1.4258e+01,  4.1194e+01,  4.6380e+01,  2.1247e+01,  2.1247e+01],
         [ 4.1309e-02,  2.2970e+01,  1.7018e+01, -6.1084e-02, -6.1084e-02],
         [-1.8357e+01,  8.7911e+00,  3.0643e+01, -2.8595e+00, -2.8595e+00],
         [ 5.9858e+01,  6.6744e+01,  5.9283e+01,  2.4697e+01,  2.4697e+01],
         [ 1.9206e+01,  3.0109e+01,  2.1948e+01,  2.5134e+01,  2.5134e+01],
         [ 3.3579e+01, -1.6922e+00,  3.5415e+01,  1.6127e+01

## prediction


In [126]:

# Predict mask tokens ans isNext
# passing first sentence on;
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[0]))
print(text)
print([number_dict[w.item()] for w in input_ids[0] if number_dict[w.item()] != '[PAD]'])

logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)

#-------------------------------------
logits_lm = logits_lm.data.max(2)[1][0].data.numpy()

# obtaining the masked token list and ignoring the "0" valued tokens
print('masked tokens list : ',[pos.item() for pos in masked_tokens[0] if pos.item() != 0])

# obtraining the masked token from the prediction and ignoring the "0" value tokens
print('predict masked tokens list : ',[pos for pos in logits_lm if pos != 0])

# obtraining the classification prediction label 
logits_clsf = logits_clsf.data.max(1)[1].data.numpy()
print(f"logits_clsf : {logits_clsf}")
logits_clsf = logits_clsf[0]
print('isNext : ', True if isNext else False)
print('predict isNext : ',True if logits_clsf else False)

Hello, how are you? I am Romeo.
Hello, Romeo My name is Juliet. Nice to meet you.
Nice meet you too. How are you today?
Great. My baseball team won the competition.
Oh Congratulations, Juliet
Thanks you Romeo
['[CLS]', '[MASK]', 'meet', 'you', 'too', 'how', 'are', 'you', 'today', '[SEP]', 'great', 'my', 'baseball', 'team', 'won', '[MASK]', 'competition', '[SEP]']
masked tokens list :  [15, 11, 28]
predict masked tokens list :  [22, 3, 22, 21, 21]
logits_clsf : [1]
isNext :  True
predict isNext :  True


In [125]:
# what is happening to the logits_lm
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[0]))
# print(text)
print([number_dict[w.item()] for w in input_ids[0] if number_dict[w.item()] != '[PAD]'])


logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)

print(f"masked lm shape: {logits_lm.shape}")

# accessing the 2nd row of max of 29 vocab token from the row of 5 see how max works
logits_lm = logits_lm.data.max(2)[1]
print(f"masked lm shape: {logits_lm.shape}")

# accessing the 1st row from batches
logits_lm = logits_lm[0]
print(f"masked lm shape: {logits_lm.shape}")




['[CLS]', '[MASK]', 'meet', 'you', 'too', 'how', 'are', 'you', 'today', '[SEP]', 'great', 'my', 'baseball', 'team', 'won', '[MASK]', 'competition', '[SEP]']
masked lm shape: torch.Size([1, 5, 29])
masked lm shape: torch.Size([1, 5])
masked lm shape: torch.Size([5])


In [111]:
# how max behaves 

test1 = [
    [
        [1,2,3],
        [2,4,0],
        [100,2,7],
    ],
    [
        [10,100,23],
        [23,30,20],
        [34,30,43],
    ]
]

dim0 =  [# v v v row 1 #  v v v row 2#  v v v row 3 indexing this way and also scanning this way
    [[1,2,3],[2,4,0],[100,2,7],],
    [[10,100,23],[23,30,20],[34,30,43],]
]

dim1 = [
    [  # V V V  1st scan and store in 1st row of 2d matrix and indexing is done top to bottom
        [1,2,3],
        [2,4,0],
        [100,2,7],
    ],
    [ # v v v    2nd scan and store in 2nd row of 2d matrix
        [10,100,23],
        [23,30,20],
        [34,30,43],
    ]
]

dim2 = [
    [ 
        [1,2,3], # <
        [2,4,0], # <    scan this way and store in 1st row of 2d matrix and index is done left to right
        [100,2,7], # <
    ],
    [
        [10,100,23], # <
        [23,30,20],  # <  scan this way and store in 2nd row of 2d matrix
        [34,30,43],  # < 
    ]
]

dims = {
    'dim0': dim0,
    'dim1': dim1,
    "dim2": dim2
}
for i in range(3):
    dimi = f'dim{i}'
    dimi = dims[dimi]

    dimi = torch.tensor(dimi)
    
    dimi_max = dimi.max(i)
    print(f"dim{i} max in dimension {i} : {dimi_max} \n")
    



dim0 max in dimension 0 : torch.return_types.max(
values=tensor([[ 10, 100,  23],
        [ 23,  30,  20],
        [100,  30,  43]]),
indices=tensor([[1, 1, 1],
        [1, 1, 1],
        [0, 1, 1]])) 

dim1 max in dimension 1 : torch.return_types.max(
values=tensor([[100,   4,   7],
        [ 34, 100,  43]]),
indices=tensor([[2, 1, 2],
        [2, 0, 2]])) 

dim2 max in dimension 2 : torch.return_types.max(
values=tensor([[  3,   4, 100],
        [100,  30,  43]]),
indices=tensor([[2, 1, 0],
        [1, 1, 2]])) 



In [116]:
for i in range(2,3):
    dimi = f'dim{i}'
    dimi = dims[dimi]

    dimi = torch.tensor(dimi)
    
    dimi_max = dimi.max(i)
    print(dimi_max[0][0])
    # print(f"dim{i} max in dimension {i} : {dimi_max} \n")

tensor([  3,   4, 100])


# masked token

In [62]:
# what  the masked_pos is doing?
batch = make_batch(batch_size=batch_size,sentences=sentences,token_list=token_list,word_dict=word_dict,max_pred=max_pred,maxlen=maxlen,vocab_size=vocab_size,number_dict=number_dict)
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

print(f"masked pos: {masked_pos}")
print(f"masked_pos shape: {masked_pos.shape}")





masked_pos = masked_pos[:,:,None]
print(f"masked pos after unsqueeze: {masked_pos.shape}")
print(f"masked pos: {masked_pos}")

masked_pos = masked_pos.expand(-1,-1,output.size(-1))
print(f"masked pos after expanding: {masked_pos.shape}")
print(f"masked pos: {masked_pos}") 

# final shape of masked_pos : (batch_size,max_pred,d_model)

# masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1)) # [batch_size, max_pred, d_model]


masked pos: tensor([[ 2,  1, 15,  0,  0],
        [ 9,  4,  0,  0,  0],
        [ 3,  2,  0,  0,  0],
        [ 7,  0,  0,  0,  0],
        [ 7,  2,  5,  0,  0],
        [ 2,  1,  0,  0,  0]])
masked_pos shape: torch.Size([6, 5])
masked pos after unsqueeze: torch.Size([6, 5, 1])
masked pos: tensor([[[ 2],
         [ 1],
         [15],
         [ 0],
         [ 0]],

        [[ 9],
         [ 4],
         [ 0],
         [ 0],
         [ 0]],

        [[ 3],
         [ 2],
         [ 0],
         [ 0],
         [ 0]],

        [[ 7],
         [ 0],
         [ 0],
         [ 0],
         [ 0]],

        [[ 7],
         [ 2],
         [ 5],
         [ 0],
         [ 0]],

        [[ 2],
         [ 1],
         [ 0],
         [ 0],
         [ 0]]])
masked pos after expanding: torch.Size([6, 5, 768])
masked pos: tensor([[[ 2,  2,  2,  ...,  2,  2,  2],
         [ 1,  1,  1,  ...,  1,  1,  1],
         [15, 15, 15,  ..., 15, 15, 15],
         [ 0,  0,  0,  ...,  0,  0,  0],
         [ 0,  0, 

In [63]:
# -------------------------------------------------test ---------------------------------------------------------
##--------------------------example------------------------------- ##
# https://stackoverflow.com/questions/69797614/indexing-a-tensor-with-none-in-pytorch
test = torch.rand((2,3,5,7))

print(f"shape before: {test.shape}")
test =test[:,:,None,:,:]

print(f"shape after : {test.shape}")


shape before: torch.Size([2, 3, 5, 7])
shape after : torch.Size([2, 3, 1, 5, 7])


In [64]:
t = torch.tensor([[1, 2], [3, 4]])
test = torch.gather(t, 1, torch.tensor([[0, 0], [1, 0]]))
# tensor([[ 1,  1],
#         [ 4,  3]])

# [
#     [ 1 , 2 ]
#     [ 3 , 4]
# ]

In [65]:
input = torch.tensor(
    [
        [1, 2],
        [3, 4],
    ]
    
    )
index = torch.tensor([(0,0),(0,1),(1,0),(1,1)])
for (i,j) in index:
    print(t[i,j])


tensor(1)
tensor(2)
tensor(3)
tensor(4)


In [66]:
# out[i][j] = input[index[i][j]][j]  # if dim == 0
# out[i][j] = input[i][index[i][j]]  # if dim == 1


In [67]:
# out[i][j][k] = input[index[i][j][k]][j][k]  # if dim == 0
# out[i][j][k] = input[i][index[i][j][k]][k]  # if dim == 1
# out[i][j][k] = input[i][j][index[i][j][k]]  # if dim == 2

In [68]:
t.shape

torch.Size([2, 2])

In [69]:
# for gathering of dim = 0 we need of index (i)
# for gathering of dim = 1 we need index of (i,j)

In [70]:
# axis = 1
print(
t[0][0],
t[0][1],
t[1][0],
t[1][1],
)
# axis = 0
print(
    t[0],
    t[1]
)

tensor(1) tensor(2) tensor(3) tensor(4)
tensor([1, 2]) tensor([3, 4])


In [71]:
test = [
    [
        [0,1,2],
        [3,4,5],
        [6,7,8],
    ],
    [
        [9,10,11],
        [12,13,14],
        [15,16,17],
    ],
    [
        [18,19,20],
        [21,22,23],
        [24,25,26],
    ]
]
test = torch.tensor(test)
test.shape

torch.Size([3, 3, 3])

In [72]:
# dim 0
print(
    test[0],
    test[1],
    test[2]
)

# for accessing the item in dim=0 you need 2 (i,j) more indices
#----------------------------------------------------
# dim 1
print(f"\n")
print(
    test[0,0],
    test[0,1],
    test[0,2],
)

# for dim = 1 you need  1 (i) indices more 

tensor([[0, 1, 2],
        [3, 4, 5],
        [6, 7, 8]]) tensor([[ 9, 10, 11],
        [12, 13, 14],
        [15, 16, 17]]) tensor([[18, 19, 20],
        [21, 22, 23],
        [24, 25, 26]])


tensor([0, 1, 2]) tensor([3, 4, 5]) tensor([6, 7, 8])


In [73]:
embedding = nn.Embedding(30,768)

embedding_weights = embedding.weight
print(f"vocab size : {vocab_size}")
print(f"embedding weights shape : {embedding_weights.shape}")

vocab size : 29
embedding weights shape : torch.Size([30, 768])
