In [22]:
#https://huggingface.co/docs/transformers/index

from transformers import AutoTokenizer

from bertviz.transformers_neuron_view import BertModel
from bertviz.neuron_view import show

#set a checkpoint to use bert-base-uncased model
model_chkpt = 'bert-base-uncased'

#create a tokenizer from the checkpoint model
tokenizer = AutoTokenizer.from_pretrained(model_chkpt)

text = "The invention of the combustion engine revolutionized travel and many other fields."

model = BertModel.from_pretrained(model_chkpt)

# Tokenize input
#This process breaks text down into individual words using the pretrained tokenizer from bbu
show(model, 'bert', tokenizer, text, display_mode = 'light', layesentence_ar=0, head=8)
#visualizes the model, using the tokenizer(bbu)

#this just retokenizes the text and puts it into an array
tokenized_text = tokenizer.tokenize(text)
#print(tokenized_text)
#['the', 'invention', 'of', 'the', 'combustion', 'engine', 'revolution', '##ized', 'travel', 'and', 'many', 'other', 'fields', '.']

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

['the', 'invention', 'of', 'the', 'combustion', 'engine', 'revolution', '##ized', 'travel', 'and', 'many', 'other', 'fields', '.']


Tokenization

In [27]:
#This gives each word in the tokenized array an index in a vocabulary (and removes special characters like punctuation)
inputs = tokenizer(text, return_tensors='pt', add_special_tokens=False)
inputs.input_ids


tensor([[ 1996, 11028,  1997,  1996, 16513,  3194,  4329,  3550,  3604,  1998,
          2116,  2060,  4249,  1012]])

In [28]:
from transformers import AutoConfig
from transformers import AutoModel

from torch import nn

#config is the hidden values from the bbu model
config = AutoConfig.from_pretrained(model_chkpt)

#config

In [29]:
token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
token_emb

Embedding(30522, 768)

In [30]:
inputs_embeds = token_emb(inputs.input_ids)

# print(inputs_embeds)
# inputs_embeds.size()

#torch.size([1, 14, 768])
#1: Batch size
#14: Sequence Length
#768: Hidden size

tensor([[[-2.4074e-02, -2.8508e-01, -5.6233e-01,  ...,  5.5820e-01,
          -7.7377e-01, -1.1474e+00],
         [-1.9737e-01, -1.2484e+00, -1.4142e+00,  ..., -3.0430e-01,
           7.3472e-01,  3.0365e-01],
         [ 2.2734e-02,  7.4401e-01, -2.1070e+00,  ...,  6.4836e-02,
          -2.1698e+00, -2.4897e-01],
         ...,
         [ 3.4730e-01, -2.8690e+00, -1.0073e+00,  ...,  1.7551e+00,
          -7.0168e-01,  1.8159e+00],
         [-5.7517e-01,  1.0288e-01, -2.5675e+00,  ...,  2.0472e+00,
           5.0618e-01, -4.5802e-01],
         [-3.8139e-01,  1.6929e+00,  6.6501e-03,  ..., -1.0269e+00,
           1.1527e+00, -1.7855e-03]]], grad_fn=<EmbeddingBackward0>)


torch.Size([1, 14, 768])

In [35]:
# create the query key and value vectors
# calculate attention scores using the dot product as the similarity function

import torch
from math import sqrt

query = key = value = inputs_embeds
# query is input, key is reference (list of potential matching youtube video descriptions, lengths, titles so on)
# value is final outputs 

# qkv vectors are generated by applying independent weight matrices
# for now are kept equal for simplcity
# in scaled dot product attention \/
#   dot products are scaled by the size of the embedding vectors (stops us from getting too many large numbers)

dim_k = key.size(-1)

scores = torch.bmm(query, key.transpose(1,2)) / sqrt(dim_k)
# bmm used instead of matmul because bmm can multiply batches of matrices

scores.size()
# torch.Size([1, 14, 14])
# this generates a 14x14 matrix of attention scores
# the batch dimension is 1, basically a 2d matrix rather than 3d

torch.Size([1, 14, 14])

In [32]:
import torch.nn.functional as F

weights = F.softmax(scores, dim =1)

weights.sum(dim = -1)

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]],
       grad_fn=<SumBackward1>)

In [34]:
# Multiply the attention weights by the values

attn_outputs = torch.bmm(weights, value)
# matrix multiply the weights vs the value vector 

#attn_outputs.size()
#torch.Size([1, 14, 768])
#same size as input_embeds

torch.Size([1, 14, 768])

Self attention is a fancy way to calculate the weighted average

Compares words to every other word in the input string and assigns weights to 
each word. Basically can reference the each word in the input previous/upcoming
when generating a response to each word individually

In [38]:
def scaled_dot_product_attention(query, key, value):
    # q, k, v are batches of matrices
    # each with a shape of batch size, sequence length, number of features
    dim_k = key.size(-1)
    scores = torch.bmm(query, key.transpose(1,2)) / sqrt(dim_k)

    weights = F.softmax(scores, dim = -1)

    attn_outputs = torch.bmm(weights, value)
    return attn_outputs

#returns scaled dot product attention
#redid last few cells as a function

q k v are not the same in practice
because attention mechanizm when theyre equal, will assign a very large score to identicle words
and a dot product to itself is always 1

in practice, we use different weight matrices for each of the q k v vectors
the self attention layer applies 3 indipendent linear transformations to each embedding
to generate the vectors
this transofrmation projects the embeddings
and each projection carries its own set of learnable parameters
alows self attention layers to focus on different semantic aspects of the sequence
each is called an attention head (multiheaded attention)

In [39]:
class AttentionHead(nn.Module):
    def __init__(self, embed_dim, head_dim):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)
        #applies nn.linear transformation to each
        
    def forward(self, hidden_state):
        attn_outputs = scaled_dot_product_attention(
            self.q(hidden_state),self.k(hidden_state),self.v(hidden_state)
        )
        return attn_outputs

# this initializes 3 independent linear layers that apply matrix multiplications to 
# the embedding vectors to produce tensors of shape batch size, sequence length, and head dimensions (number of dimensions we are projecting into)

In [40]:
class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        head_dim = embed_dim // num_heads #number of multiple heads
        #head_dim is embed_dim/hum_head because 
        #we are dividing the embedding vectors into 12 equal parts

        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, embed_dim)]
        )
        #module list is just like python list
        #but designed to store any desired number of nn.modules (whole class inherits from nn.module)
        #nn.module regular use case is for designing a nn whose number of layers is passed as an input

        self.output_linear = nn.Linear(embed_dim, embed_dim)
        #this is the final linear layer that combines the outputs of the attention heads
    
    def forward(self, hidden_state):
        #this is where we implement the multihead concept
        #concatenate and then pass concatenated output from the attention head
        #to be fed through the final linear head
        concatenated_output = torch.cat([h(hidden_state) for h in self.heads], dim = -1)
        concatenated_output = self.output_linear(concatenated_output)
        return concatenated_output

In [41]:
#pass in configuration loaded earlier from pretrained bert model when initiualizing 
#multihead attention model

multihead_attn = MultiHeadAttention(config)
attn_output = multihead_attn(inputs_embeds)
#attn_output.size()
#torch.Size([1, 14, 768])
#works just the same as BERT (massive w)

torch.Size([1, 14, 768])

In [43]:
from bertviz import head_view

from transformers import AutoModel

model = AutoModel.from_pretrained(model_chkpt, output_attentions = True)

sent1 = "As the aicraft becomes lighter, it flies higher in the air of lower density to maintain the same airspeed"
sent2 = "The corn fields are full of flies"

viz_inputs = tokenizer(sent1, sent2) # return_tensor='pt'

attention = model(**viz_inputs).attentions

sent2_start = (viz_inputs.token_type_ids == 0).sum(dim=1)

tokens = tokenizer.convert_ids_to_tokens(viz_inputs.input_ids[0])

head_view(attention, tokens, sent2_start, heads=[21])



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AttributeError: 'list' object has no attribute 'size'