In [21]:
import torch
import tiktoken
import sys
sys.path.append('../')
from modular.GPTArchitecture.GPTModel import GPTModel
from modular.GPTArchitecture.TextGeneration import generate_text_simple

In [2]:
# Hyperparameters handed to GPTModel when the model is constructed.
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # Vocabulary size
    context_length=256,   # Context length
    emb_dim=768,          # Embedding dimension
    n_heads=12,           # Number of attention heads
    n_layers=12,          # Number of layers
    drop_rate=0.1,        # Dropout rate
    qkv_bias=False,       # Query-Key-Value bias
)

Note: GPT-2 uses context_length = 1024, but to keep the computations in this notebook lightweight, we set it to 256.

In [3]:
# Seed PyTorch's RNG before building the model so the randomly initialized
# weights (and therefore every number printed below) are reproducible.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# Put the model in evaluation mode (turns off the Dropout layers). As the
# cell's last expression, this also displays the module tree.
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

In [4]:
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` with ``tokenizer`` and return the ids as a batched tensor.

    The ``<|endoftext|>`` marker is permitted as a special token during
    encoding. The result has a leading batch axis of size 1, i.e. shape
    ``(1, num_tokens)``.
    """
    ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Prepend the batch dimension: (num_tokens,) -> (1, num_tokens)
    return torch.unsqueeze(torch.tensor(ids), 0)

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids back into a string.

    Args:
        token_ids: Tensor of token ids, either batched with a leading axis of
            size 1 (shape ``(1, num_tokens)``) or already 1-D
            (shape ``(num_tokens,)``).
        tokenizer: Object exposing ``decode(list_of_ids) -> str``.

    Returns:
        The text produced by ``tokenizer.decode`` for the given ids.
    """
    flat = token_ids.squeeze(0)  # remove batch dimension
    # A 1-D input holding a single token would be squeezed down to a 0-d
    # scalar, and ``tolist()`` on that yields a bare int instead of a list.
    # Restore the 1-D shape so ``decode`` always receives a list of ids.
    flat = torch.atleast_1d(flat)
    return tokenizer.decode(flat.tolist())

In [5]:
# tiktoken's "gpt2" encoding, plus the prompt used for text generation.
tokenizer = tiktoken.get_encoding("gpt2")
start_context = "Every effort moves you"

In [6]:
# Encode the prompt first, then ask the model for 10 additional tokens.
prompt_ids = text_to_token_ids(start_context, tokenizer)
token_ids = generate_text_simple(
    model=model,
    idx=prompt_ids,
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
)

In [7]:
# Decode the generated ids back to text before printing.
generated_text = token_ids_to_text(token_ids, tokenizer)
print("Output text:\n", generated_text)

Output text:
 Every effort moves you rentingetic wasnم refres RexMeCHicular stren


calculating the text generation loss: cross-entropy and perplexity

In [8]:
# Two tokenized examples; each target row is its input row shifted one token ahead.
inputs = torch.tensor(
    [
        [16833, 3626, 6100],  # "every effort moves"
        [40, 1107, 588],      # "I really like"
    ]
)
targets = torch.tensor(
    [
        [3626, 6100, 345],    # " effort moves you"
        [1107, 588, 11311],   # " really like chocolate"
    ]
)

In [9]:
# Forward pass with autograd tracking disabled.
with torch.no_grad():
    logits = model(inputs)

# Softmax over the last axis: probability of each token in the vocabulary.
probas = logits.softmax(dim=-1)
print(probas.shape)  # shape: (batch_size, num_tokens, vocab_size)

torch.Size([2, 3, 50257])


In [10]:
# Most probable vocabulary index at each position; keepdim retains the last axis.
token_ids = probas.argmax(dim=-1, keepdim=True)
print("Token Ids:\n", token_ids)

Token Ids:
 tensor([[[16657],
         [  339],
         [42826]],

        [[49906],
         [29669],
         [41751]]])


In [11]:
# keepdim=True in the argmax leaves a trailing axis of size 1.
token_ids.size()

torch.Size([2, 3, 1])

If we decode these tokens, we find that they are quite different from the tokens we want the model to predict. That's because the model hasn't been trained yet.

In [12]:
# Decode the first example's targets and the model's argmax predictions.
target_text_1 = token_ids_to_text(targets[0], tokenizer)
output_text_1 = token_ids_to_text(token_ids[0].flatten(), tokenizer)

print(f"Targets batch 1: {target_text_1}")
print(f"Outputs batch 1: {output_text_1}")

Targets batch 1:  effort moves you
Outputs batch 1:  Armed heNetflix


Cross-entropy loss

In [13]:
# Sequence positions at which the target token's probability is looked up.
positions = [0, 1, 2]

text_idx = 0
target_probas_1 = probas[text_idx][positions, targets[text_idx]]
print("Text 1:", target_probas_1)

text_idx = 1
target_probas_2 = probas[text_idx][positions, targets[text_idx]]
print("Text 2:", target_probas_2)

Text 1: tensor([7.4540e-05, 3.1061e-05, 1.1563e-05])
Text 2: tensor([1.0337e-05, 5.6776e-05, 4.7559e-06])


We want to maximize all these values, bringing them close to a probability of 1. 
* In mathematical optimization, it is easier to maximize the logarithm of the probability score than the probability score itself.

In [14]:
# compute logarithm of all token probabilities
log_probas = torch.log(torch.cat((target_probas_1,target_probas_2)))
print(log_probas)

tensor([ -9.5042, -10.3796, -11.3677, -11.4798,  -9.7764, -12.2561])


Next we will compute the average log probability

In [15]:
# Mean over all target-token log probabilities.
avg_log_probas = log_probas.mean()
print(avg_log_probas)

tensor(-10.7940)


The goal is to make this average log probability as large as possible by optimizing the model weights.
* Instead of maximizing the average log-probability, it's standard convention to minimize the *negative* average log-probability, bringing it down toward 0.

In [16]:
# Flip the sign so the quantity becomes something to minimize.
neg_avg_log_probas = -avg_log_probas
print(neg_avg_log_probas)

tensor(10.7940)


PyTorch already implements a cross_entropy function that carries out the previous steps
* Before we apply the cross_entropy function, let's check the shape of the logits and targets

In [17]:
# Logits have shape (batch_size,num_tokens,vocab_size)
print("Logits shape:" ,logits.shape)

#Targets have shape (batch_size,num_tokens)
print("Targets shape:",targets.shape)

Logits shape: torch.Size([2, 3, 50257])
Targets shape: torch.Size([2, 3])


To use PyTorch's cross_entropy function, we flatten these tensors by combining them over the batch dimension.

In [18]:
# Merge the batch and token axes of the logits; flatten the targets to 1-D.
logits_flat = torch.flatten(logits, start_dim=0, end_dim=1)
targets_flat = torch.flatten(targets)

print("Flattened logits:", logits_flat.shape)
print("Flattened targets:", targets_flat.shape)

Flattened logits: torch.Size([6, 50257])
Flattened targets: torch.Size([6])


In [19]:
# Built-in cross-entropy on the flattened logits and target ids.
loss = torch.nn.functional.cross_entropy(
    input=logits_flat,
    target=targets_flat,
)
print(loss)

tensor(10.7940)


Perplexity :
* measures how well the probability distribution predicted by the model matches the actual distribution in the dataset
* More interpretable way of understanding model uncertainty in predicting next token
* Lower perplexity = better predictions
* perplexity = torch.exp(loss)

In [20]:
# Perplexity is the exponential of the cross-entropy loss.
perplexity = loss.exp()
perplexity

tensor(48725.8203)

This means the model is roughly as uncertain as if it had to choose the next token uniformly at random from about 48,725 tokens in the vocabulary.