# Using GPT2 for text generation

(Kernel used: Colab)

More infos in https://machinelearningmastery.com/text-generation-with-gpt-2-model/

In [8]:
!pip install -q tensorflow==2.19.0

[0m

In [9]:
!pip install -q transformers

In [12]:
import tensorflow as tf 
import transformers 
from transformers import GPT2LMHeadModel, GPT2Tokenizer



In [13]:
# loading the tokenizer
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

In [14]:
type(gpt2_tokenizer)

transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer

In [18]:
print(gpt2_tokenizer)

GPT2Tokenizer(name_or_path='gpt2-large', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)


In [19]:
gpt2_tokenizer.eos_token_id

50256

In [20]:
# loading the model
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2-large", pad_token_id=gpt2_tokenizer.eos_token_id)

print(gpt2_model)

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=3840, nx=1280)
          (c_proj): Conv1D(nf=1280, nx=1280)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=5120, nx=1280)
          (c_proj): Conv1D(nf=1280, nx=5120)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1280, out_features=50257, bias=False)
)


In [55]:
input = "What is Quantum Mechanics?"

# tokenize the input
input_ids = gpt2_tokenizer.encode(
    input, return_tensors="pt"
)

In [56]:
input_ids

tensor([[ 2061,   318, 29082, 47570,    30]])

In [57]:
for id in input_ids[0]:
    print(gpt2_tokenizer.decode(id))

What
 is
 Quantum
 Mechanics
?


In [75]:
# generate text 
response = gpt2_model.generate(
    input_ids, 
    max_length=100, 
    num_beams=5, 
    no_repeat_ngram_size=2, 
    early_stopping=True
)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [76]:
print(response)

tensor([[ 2061,   318, 29082, 47570,    30,   198,   198, 24915,   388, 12933,
           318,   257,  8478,   286, 11887,   326,  7529,   351,   262,  3450,
           286,  2300,   290,  2568,    13,   632,   318,  1912,   319,   262,
          2126,   326,   477,  2300,   318,   925,   510,   286,   850, 47116,
         13166,  1444,   627,  5558,   290,   443,   457,   684,    13,  2312,
         13166,  9427,   351,  1123,   584,   287,   257,   835,   326,   460,
           307,  3417,   416,   257, 18069, 10451,  1444,   257,  6769,  2163,
            11,   543,  8477,   262,   835,   287,   543,   262, 13166,  1445,
           832,  2272,   290,   640,    13,   770,   318,   262,  4308,   286,
           477,  3660, 11887,    11,  1390,   262,  3657,   286, 21969, 44124]])


In [77]:
print(response[0])

tensor([ 2061,   318, 29082, 47570,    30,   198,   198, 24915,   388, 12933,
          318,   257,  8478,   286, 11887,   326,  7529,   351,   262,  3450,
          286,  2300,   290,  2568,    13,   632,   318,  1912,   319,   262,
         2126,   326,   477,  2300,   318,   925,   510,   286,   850, 47116,
        13166,  1444,   627,  5558,   290,   443,   457,   684,    13,  2312,
        13166,  9427,   351,  1123,   584,   287,   257,   835,   326,   460,
          307,  3417,   416,   257, 18069, 10451,  1444,   257,  6769,  2163,
           11,   543,  8477,   262,   835,   287,   543,   262, 13166,  1445,
          832,  2272,   290,   640,    13,   770,   318,   262,  4308,   286,
          477,  3660, 11887,    11,  1390,   262,  3657,   286, 21969, 44124])


In [78]:
response_tensor = response[0]

In [80]:
from pprint import pprint 

pprint( gpt2_tokenizer.decode(response_tensor, skip_special_tokens=True) )

('What is Quantum Mechanics?\n'
 '\n'
 'Quantum mechanics is a branch of physics that deals with the nature of '
 'matter and energy. It is based on the idea that all matter is made up of '
 'subatomic particles called quarks and leptons. These particles interact with '
 'each other in a way that can be described by a mathematical formula called a '
 'wave function, which describes the way in which the particles move through '
 'space and time. This is the basis of all modern physics, including the laws '
 'of thermodynamics')
