# Understanding GPT2

see: https://huggingface.co/openai-community/gpt2

In [1]:
import torch

In [2]:
# Name of the pretrained checkpoint; every from_pretrained() call in this notebook loads it
model_id = "gpt2"

## Tokenization

In [4]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the checkpoint named by `model_id`.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    clean_up_tokenization_spaces=True,
)

# ---------- Let's have a look at the vocabulary of GPT-2 ----------
vocab = tokenizer.get_vocab()  # dict: subword (key) -> token_id (value)
print(type(vocab))
print(len(vocab))

# Three views of the same vocabulary, all following the dict's iteration order.
vocab_as_list = list(vocab.keys())      # subwords only
tokens_as_list = list(vocab.values())   # token ids only
token_dict = dict(zip(tokens_as_list, vocab_as_list))  # reverse lookup: token_id -> subword

<class 'dict'>
50257


In [11]:
# vocab

In [5]:
# Example text used by the tokenization and hidden-state cells below
prompt = "The theory of relativity is"

In [6]:
# Without return_tensors, encode() gives back a plain Python list of token ids
input_ids = tokenizer.encode(prompt)
input_ids

[464, 4583, 286, 44449, 318]

In [7]:
# Decode every id on its own to see which piece of text it stands for.
# The quotes make the leading space that is part of most tokens visible.
for token_id in input_ids:
    subword = tokenizer.decode(token_id)
    print(f"{token_id} \t-> '{subword}'")

464 	-> 'The'
4583 	-> ' theory'
286 	-> ' of'
44449 	-> ' relativity'
318 	-> ' is'


## Plain GPT2 
![without_language_model_head](./media/without_language_model_head.png)

In [8]:
from transformers import AutoModel
# Load the bare transformer for the checkpoint, i.e. without a language-modeling head on top
model = AutoModel.from_pretrained(model_id)

In [10]:
# Show the module tree: token/position embeddings, the stacked transformer blocks, final layer norm
print(model)

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2SdpaAttention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)


In [11]:
print("prompt: ", prompt)
# return_tensors="pt" yields a tensor with a leading batch axis: (1, sequence_length)
input_ids = tokenizer.encode(prompt, return_tensors="pt")
# Reuse the ids computed above instead of tokenizing the prompt a second time
print("tokens: ", input_ids[0].tolist())
# Inference only: no_grad() avoids recording an autograd graph
with torch.no_grad():
    outputs = model(input_ids=input_ids, output_attentions=False)
# NOTE: these are hidden states, not logits. The model loaded here has no LM head, so
# last_hidden_state holds one contextualized vector per input token (output of the final layer).
last_hidden_state = outputs.last_hidden_state
last_hidden_state

prompt:  The theory of relativity is
tokens:  [464, 4583, 286, 44449, 318]


tensor([[[-0.0470, -0.0333, -0.1626,  ..., -0.1337, -0.0571, -0.1059],
         [-0.4281, -0.1663, -0.9736,  ...,  0.0973,  0.1560, -0.3451],
         [ 0.4888, -0.2045, -0.7373,  ..., -0.1467,  0.2325,  0.1179],
         [-0.3101, -0.0585, -0.5952,  ...,  0.4587,  0.2593, -0.4875],
         [-0.3415, -0.0268, -1.4781,  ...,  0.4598,  0.1627, -0.1709]]])

In [12]:
# One embedding-sized vector for each token of the single prompt in the batch
last_hidden_state.shape # batch, sequence, embedding dimension

torch.Size([1, 5, 768])

## Understanding Text Generation
For next-token prediction, a language-modeling head has to be put on top of GPT2. `GPT2LMHeadModel` does exactly that: it wraps `GPT2Model` and adds a linear layer (`lm_head`) that maps every hidden state to one score (logit) per vocabulary token.
![with_language_model_head](./media/with_language_model_head.png)

In [13]:
from transformers import GPT2LMHeadModel
# Same transformer plus `lm_head`, a linear layer from the hidden size to the vocabulary size.
# NOTE: this rebinds `model`; the headless model loaded earlier is no longer reachable by that name.
model = GPT2LMHeadModel.from_pretrained(model_id)
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [14]:
prompt = "The theory of relativity is"
input_ids = tokenizer.encode(prompt, return_tensors="pt")
# Inference only, so gradient tracking is switched off
with torch.no_grad():
    outputs = model(input_ids=input_ids)
# One score per vocabulary token at every position: (batch, sequence, vocabulary size)
logits = outputs.logits
print(logits.shape)
# Keep only the scores at the last position: they rate the candidates for the NEXT token.
# Logits are raw, unnormalized scores (not log-probabilities); softmax turns them into probabilities.
next_token_logits = logits[0, -1,:]
next_token_logits.shape

torch.Size([1, 5, 50257])


torch.Size([50257])

### Use Softmax to convert logits into probabilities

The softmax function rescales an n-dimensional input vector so that the elements of the n-dimensional output tensor lie in the range $[0,1]$ and sum to 1.

Softmax is defined pointwise as:

$$\mathrm{Softmax}(x_i) = \frac{\exp(x_i)}{\sum_{j} \exp(x_j)}$$

see: https://pytorch.org/docs/stable/generated/torch.nn.Softmax.html


In [15]:
# Normalize the raw scores along the vocabulary axis so that they form a probability distribution
next_token_probs = next_token_logits.softmax(dim=-1)
# One probability for each token in the vocabulary
print(next_token_probs.shape)

torch.Size([50257])


In [16]:
print("get the top 10 and put all together:")
print("prob \t  token")
print("---------------")
# The 10 largest probabilities together with their positions (= token ids)
topk_next_tokens = torch.topk(next_token_probs, k=10)
# Pair each decoded token with its probability
topk_next_token_list = [
    (tokenizer.decode(idx), prob)
    for idx, prob in zip(topk_next_tokens.indices, topk_next_tokens.values)
]
for token, prob in topk_next_token_list:
    print(f"{round(prob.item(), 3)} \t {token}")

get the top 10 and put all together:
prob 	  token
---------------
0.381 	  that
0.081 	  based
0.04 	  the
0.038 	  a
0.028 	  not
0.019 	  simple
0.013 	  to
0.01 	 ,
0.01 	  very
0.009 	  one


In [17]:
# The position of the largest probability is the id of the most likely next token
index = next_token_probs.argmax()
next_token = index.item()  # index == token_id
best_prob = next_token_probs[next_token].item()
print(f"index {next_token} with probability {best_prob}")
tokenizer.decode(next_token)

index 326 with probability 0.38146549463272095


' that'

In [19]:
############# greedy search loop ###########
# Greedy decoding: at every step append the single most likely next token.
prompt = "The theory of relativity"
input_ids = tokenizer.encode(prompt, return_tensors="pt")  # shape (1, sequence_length)
max_new_tokens = 15
# Inference only: without no_grad() every forward pass would also record an autograd graph.
with torch.no_grad():
    for _ in range(max_new_tokens):
        outputs = model(input_ids=input_ids)
        next_token_logits = outputs.logits[0, -1, :]
        # Softmax is monotonic, so the argmax of the logits equals the argmax of the probabilities.
        next_token_id = torch.argmax(next_token_logits)
        # Stop once the model predicts the end-of-sequence token.
        if next_token_id.item() == tokenizer.eos_token_id:
            break
        # Append the chosen id directly. Decoding to text and re-encoding the whole string at
        # every step is wasteful and is not guaranteed to give back the same token ids.
        input_ids = torch.cat([input_ids, next_token_id.view(1, 1)], dim=-1)

# Decode once at the end; `prompt` itself is left untouched.
generated_text = tokenizer.decode(input_ids[0])
print(generated_text)

The theory of relativity is that the speed of light is proportional to the distance between two points.


In [None]:
############# Exercise: Implement a beam search 
#
# see: https://en.wikipedia.org/wiki/Beam_search
#
#############

## Text Generation Strategies

- beam search
- top k
- top p
- temperature

see: https://lena-voita.github.io/nlp_course/language_modeling.html#generation_strategies

In [22]:
############# Generate Text ###########
# generate() runs the token-by-token loop for us
prompt = "The theory of relativity is"
input_ids = tokenizer.encode(prompt, return_tensors="pt")
# A single, unpadded sequence: every position is a real token, so the mask is all ones
attention_mask = torch.ones_like(input_ids)
outputs = model.generate(
    input_ids,
    attention_mask=attention_mask,
    pad_token_id=tokenizer.eos_token_id,  # use the end-of-sequence id for padding
    max_new_tokens=30,
    num_beams=2,  # beam search with two beams
)
# outputs[0] is the first returned sequence (prompt ids followed by the generated ids)
tokenizer.decode(outputs[0])

'The theory of relativity is based on the idea that the speed of light is proportional to the distance between two points on a sphere. The speed of light is proportional to the distance'

In [23]:
# Use temperature, ...
# Sampling is random: fix the seed so that re-running this cell reproduces the same text.
torch.manual_seed(42)
prompt = "The theory of relativity is"
input_ids = tokenizer.encode(prompt, return_tensors="pt")
# A single, unpadded sequence: every position is a real token, so the mask is all ones
attention_mask = torch.ones_like(input_ids)
outputs = model.generate(input_ids, attention_mask=attention_mask,
                         pad_token_id = tokenizer.eos_token_id,
                         max_new_tokens=100, num_beams=4,
                         # do_sample=True draws tokens at random instead of always taking the
                         # best one; a temperature above 1 flattens the distribution first
                         temperature=2.0, do_sample=True,
                         # values above 1.0 penalize tokens that were already generated
                         repetition_penalty=10.0
                        )
tokenizer.decode(outputs[0])

"The theory of relativity is that when we measure the curvature in space as given by Einstein, and take account of its angular momentum with respect to the observer's point of view (i.e., where all two objects are stationary), one can say that an object has a velocity equal to 2E-3^7*2/4 * \\pi_{j=\\frac{t}{s}\\rightarrow S&S}(T). We have shown before how this law was already known for other celestial bodies such"