In [2]:
# This example is modified from

# https://huggingface.co/gpt2
# https://huggingface.co/blog/how-to-generate

In [3]:
# In google colab, make sure you install transformers
# uncomment the following line for first-time execution
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m69.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m84.4 MB/s[0m eta [36m0:00:00[0m
Col

In [4]:
import torch
from transformers import pipeline, set_seed
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoModelForCausalLM

set_seed(123)

In [5]:
# In colab, this automatically downloads gpt2 model from Hugginface.
# If you run this locally, you need to download gpt2 by yourself through vpn, and change to your local directory path.

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

# Method 1: Use Transformers Pipeline to direct generate new sentences

This utilizes pipeline tool, which can search for num_return_sequences of sentences. It's the easiest way to prompt and get response.

In [7]:
query_text = "Name a good tennis player."
generator = pipeline('text-generation', model='gpt2')
generator(query_text, max_length=100, num_return_sequences=5)

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Name a good tennis player. Don\'t be a pro. Don\'t be a good tennis player.\n\n2) Don\'t get married to a guy who takes you for granted and puts on his pajamas or something. Don\'t be a professional. Don\'t be a professional.\n\n3) He\'ll give you money. Maybe get a nice contract, but no money, no jobs, no kids, maybe nothing." -Ringo Starr (music video)\n\n\n4) You'},
 {'generated_text': 'Name a good tennis player.\n\nWe\'ve covered a number of people but I think we have a bit more to say about this subject as to the "proper" position for a tennis player. Is this really something to consider based on what you\'ve seen on TV? This isn\'t a question I would ever answer, but I do believe that one of the most important variables involved in a tennis player is the "proper" role of being a professional tennis player. The question becomes:'},
 {'generated_text': "Name a good tennis player. This is my top 5 list in the world, the reason I'm not getting top 5 listed in Eur

In [8]:
qa_model = pipeline("question-answering", model='gpt2')

question = "Who is the best tennis player?"
context = "I am a tennis fan. I think the person with most number of Grand Slams is the best player."
qa_model(question = question, context = context)



Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at gpt2 and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'score': 0.015065975487232208,
 'start': 37,
 'end': 54,
 'answer': ' with most number'}

Check out different mode you can choose, such as translation, question-answering, so on.
https://huggingface.co/transformers/v3.0.2/main_classes/pipelines.html

# Method 2: We let the model generate (forward) and decode back to words

The real code of "generate()" function is https://github.com/huggingface/transformers/blob/main/src/transformers/generation/utils.py#L1351

You can read the following examples first, then check the code.

## Example 1: Question Answering

This is more flexible. For teaching purpose, this code lets you understand each step.

In [9]:
# tokenize input prompt
question_text = "What is 1+2?"
encoded_input = tokenizer(question_text, return_tensors='tf')
print(encoded_input)


{'input_ids': <tf.Tensor: shape=(1, 6), dtype=int32, numpy=array([[2061,  318,  352,   10,   17,   30]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 6), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1]], dtype=int32)>}


In [10]:
# you can define generation args as a dictionary, or pass them manually in generate() function
# check https://github.com/huggingface/transformers/blob/main/src/transformers/generation/configuration_utils.py#L40

generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": 16,
    "num_return_sequences":10,
}

In [11]:
# decode the question to answer

model_1 = GPT2LMHeadModel.from_pretrained('gpt2')
model_2 = GPT2LMHeadModel.from_pretrained("danyaljj/gpt2_question_answering_squad2")

input_ids = tokenizer.encode("You are good at math. Q: What is one plus two ? A:", return_tensors="pt")
outputs = model_1.generate(input_ids)
print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True))
outputs = model_2.generate(input_ids)
print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True))


Downloading (…)lve/main/config.json:   0%|          | 0.00/821 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated: You are good at math. Q: What is one plus two? A: One plus two is
Generated: You are good at math. Q: What is one plus two? A: two plus two


## Example 2: Text Generation

Firstly, read the API and understand the arguments of "generate()"

https://huggingface.co/docs/transformers/v4.34.1/en/main_classes/text_generation#transformers.TFGenerationMixin.generate

In [12]:
# Important Argument
#   max_new_tokens -- length of output
#   num_return_sequences -- number of returned responses
#   use_cache -- use KV cache to speed inference, see next section

Greedy decoding -- choose most probable next-word

In [13]:
# decode the input to complete the text
model_3 = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

#model_inputs = tokenizer('I enjoy walking with my cute dog', return_tensors='pt')
model_inputs = tokenizer('I enjoy playing', return_tensors='pt')

# generate 40 new tokens
greedy_output = model_3.generate(**model_inputs, max_new_tokens=2)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))
print(model_inputs.keys())





Output:
----------------------------------------------------------------------------------------------------
I enjoy playing with my
dict_keys(['input_ids', 'attention_mask'])




## Beam Search to generate diverse sentences

In [14]:
# use beam search to generate several sentence candidates
# read https://huggingface.co/blog/how-to-generate for details

# activate beam search and early_stopping
beam_output = model_3.generate(
    **model_inputs,
    max_new_tokens=40,
    num_beams=5,
    early_stopping=True,
    num_return_sequences=5,
    use_cache=True
)

print("Output:\n" + 100 * '-')
# print(beam_output)
for ix, out in enumerate(beam_output):
  print(ix, tokenizer.decode(out, skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
0 I enjoy playing the game, but I don't know if I'll ever be able to play it again."

"I don't know if I'll ever be able to play it again."

"
1 I enjoy playing the game, but I don't know if I'll ever be able to play it again."

"I don't know if I'll ever be able to play it again." "I don
2 I enjoy playing the game, but I don't know if I'll ever be able to play it again."

"I don't know if I'll ever be able to play it again?"

"
3 I enjoy playing the game, but I don't know if I'll ever be able to play it again."

"I don't know if I'll ever be able to play it again." "I'm
4 I enjoy playing the game, but I don't know if I'll ever be able to play it again."

"I don't know if I'll ever be able to play it again."

The


In [15]:
# use beam search to generate several sentence candidates
# without repeatitive n-gram
model_inputs = tokenizer('I enjoy playing', return_tensors='pt')

# activate beam search and early_stopping
beam_output = model_3.generate(
    **model_inputs,
    max_new_tokens=10,
    num_beams=100,
    early_stopping=True,
    num_return_sequences=5,
    use_cache=True,
    no_repeat_ngram_size=2     # dont not allow similar 2-gram appear twice
)

print("Output:\n" + 100 * '-')
# print(beam_output)
for ix, out in enumerate(beam_output):
  print(ix, tokenizer.decode(out, skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
0 I enjoy playing a lot of video games, but I don't
1 I enjoy playing the game as much as I would like to play
2 I enjoy playing the game as much as I did when I was
3 I enjoy playing the game, but I don't think it's
4 I enjoy playing the game, but I don't want to play


 ## Tutorial: KV Cache

check KV cache option
https://github.com/huggingface/transformers/blob/main/src/transformers/generation/configuration_utils.py#L100

check generate() code https://github.com/huggingface/transformers/blob/main/src/transformers/generation/utils.py#L1351

First, check the speed of using or not using KV Cache

In [16]:
import numpy as np
from time import time

t1 = time()
output_tensors_1 = model_3.generate(**model_inputs, use_cache=True, max_new_tokens=40, num_beams=5, num_return_sequences=5)
t2 = time()
output_tensors_2 = model_3.generate(**model_inputs, use_cache=False, max_new_tokens=40, num_beams=5, num_return_sequences=5)
t3= time()

print('Use KV Cache time:', np.round(t2-t1,2))
print('NOT USE KV Cache time is much longer:', np.round(t3-t2,2))



Use KV Cache time: 4.44
NOT USE KV Cache time is much longer: 17.63


The use_cache=True/False does not affect the output. The responses should be the same. Let's check.

In [17]:
responses_1 = tokenizer.batch_decode(output_tensors_1)
responses_2 = tokenizer.batch_decode(output_tensors_2)

# Check use_cache=True(R1) and False(R2) if they are the same
for response_1, response_2 in zip(responses_1, responses_2):
  print('R1:', response_1, '\n')
  print('R2:', response_2,'\n\n')


R1: I enjoy playing the game, but I don't know if I'll ever be able to play it again."

"I don't know if I'll ever be able to play it again."

" 

R2: I enjoy playing the game, but I don't know if I'll ever be able to play it again."

"I don't know if I'll ever be able to play it again."

" 


R1: I enjoy playing the game, but I don't know if I'll ever be able to play it again."

"I don't know if I'll ever be able to play it again." "I don 

R2: I enjoy playing the game, but I don't know if I'll ever be able to play it again."

"I don't know if I'll ever be able to play it again." "I don 


R1: I enjoy playing the game, but I don't know if I'll ever be able to play it again."

"I don't know if I'll ever be able to play it again?"

" 

R2: I enjoy playing the game, but I don't know if I'll ever be able to play it again."

"I don't know if I'll ever be able to play it again?"

" 


R1: I enjoy playing the game, but I don't know if I'll ever be able to play it again."

"I don't know if I'

Check code and see what's the inference behavior of generate
Review lecture notes again.

Read [tutorial](https://r4j4n.github.io/blogs/posts/kv/) and [tutorial](https://mett29.github.io/posts/kv-cache/), understand that KV cache is at Attention level.

Turning cache on, each ATT layer will store previous attention all K variables as K_list and all V variables as V_list. Then each step appends newly computed K and V to the list.

In real code, you can check GPT-2 attention [code](https://github.com/huggingface/transformers/blob/ae093eef016533a3670561fa9e26addb42d446d1/src/transformers/models/gpt2/modeling_gpt2.py#L901) and see this behavior.

In [18]:
print(model_1)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


# HW Code Question: Implement your own Beam Search with Priority Queue

In [3]:
import queue

# initialize a pq
K = 30





# we generate a sentence with 10 new words, "I enjoy [word]*10"
max_len=10
model_inputs = tokenizer('I enjoy', return_tensors='pt')


# repeat the generation loop

for i in range(max_len):

  # use gpt to decode one word at a time, DO NOT MODIFY any argument
  out = model_3.generate(**model_inputs, use_cache=True, max_new_tokens=1,
                         num_beams=50, num_return_sequences=K, output_scores=True,
                         return_dict_in_generate=True)
  sentences = out.sequences
  sent_scores = out.sequences_scores
  responses = tokenizer.batch_decode(sentences)
  print(responses)

  # TODO: put each sentence into a PQ with the score in sent_scores
  # use PQ to select the top-K most probable sentences with nlargest
  # https://docs.python.org/3/library/queue.html
  pq = queue.PriorityQueue()
  # pq.put((score,?,?))

  # TODO: tokenize all sentences in selected top-K sentences as new model_inputs
  # use pq.get() K times

  # TODO: additional step to use 2-Gram to further prune similar sentences
  # for example, we have 30 sequences, each has 5 new sentences, we now have 150
  # we use Bi-Gram counter to find similar sentences in 150, and retain only 80
  # then we put these 80 into PQ, retain only 30 most likely and also most diverse sents
  # One heuristic example:
  # A="I enjoy walking and talking in city"
  # B="I enjoy walking and running in city"
  # they have four same bigrams our of 7 words, you can set a threshold T such as
  # if #same-bi-gram > T * len(A), discard any of them.
  # Try T=0.3, 0.4, 0.5, ....,



# Out of loop
# TODO: retrieval the top-K final sentences and print them out
# bonus would be given to more diverse and more likely sentences returned



<queue.PriorityQueue object at 0x7a44df069d80>
1
2
3
4
5
6
7
8
9
10


Full: ignored