In [None]:
# This example is modified from

# https://huggingface.co/gpt2
# https://huggingface.co/blog/how-to-generate

In [None]:
# In google colab, make sure you install transformers
# uncomment the following line for first-time execution
!pip install transformers



In [None]:
import torch
from transformers import pipeline, set_seed
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoModelForCausalLM

# Fix the random seed so that sampling-based generation in later cells is repeatable across runs.
set_seed(123)

In [None]:
# In Colab, this automatically downloads the gpt2 model from Hugging Face.
# If you run this locally and cannot reach Hugging Face directly, download gpt2 yourself (e.g. through a VPN) and replace 'gpt2' with the path to your local model directory.

In [None]:
# Load the tokenizer for the 'gpt2' checkpoint; the cells below reuse it to encode prompts and decode outputs.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Method 1: Use the Transformers pipeline to directly generate new sentences

This uses the `pipeline` tool, which returns `num_return_sequences` generated sentences for a prompt. It is the easiest way to prompt the model and get a response.

In [None]:
# Build a ready-made text-generation pipeline backed by the 'gpt2' checkpoint.
generator = pipeline(task='text-generation', model='gpt2')

# Prompt to be continued by the model.
query_text = "Name a good tennis player."

# Ask for 5 continuations, each capped at a total length of 100 tokens.
# The list of results is the cell's displayed output.
generator(
    query_text,
    max_length=100,
    num_return_sequences=5,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
# Extractive question answering needs a model with a trained span-prediction (QA) head.
# The plain 'gpt2' checkpoint is a language model only, so loading it for this task
# leaves the QA head randomly initialised and the answers meaningless.
# Use a checkpoint fine-tuned on SQuAD instead.
qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

question = "Who is the best tennis player?"
context = "I am a tennis fan. I think the person with most number of Grand Slams is the best player."

# The result is the answer span extracted from `context`, shown as the cell's output.
qa_model(question=question, context=context)



Check out the other pipeline tasks you can choose from, such as translation, question answering, and so on.
https://huggingface.co/transformers/v3.0.2/main_classes/pipelines.html

# Method 2: We let the model generate (forward) and decode back to words

The real code of "generate()" function is https://github.com/huggingface/transformers/blob/main/src/transformers/generation/utils.py#L1351

You can read the following examples first, then check the code.

## Example 1: Question Answering

This approach is more flexible. For teaching purposes, the code below walks through each step so you can understand it.

In [None]:
# tokenize input prompt
# Return PyTorch tensors ('pt'): every model in this notebook is a PyTorch model,
# whereas 'tf' would require TensorFlow to be installed.
question_text = "What is 1+2?"
encoded_input = tokenizer(question_text, return_tensors='pt')
print(encoded_input)


In [None]:
# Generation arguments can be collected in a dictionary and unpacked into generate()
# (model.generate(..., **generation_kwargs)), or passed to generate() one by one.
# check https://github.com/huggingface/transformers/blob/main/src/transformers/generation/configuration_utils.py#L40

generation_kwargs = {
    "min_length": -1,      # non-positive value: effectively no minimum length
    "top_k": 0,            # integer parameter; 0 disables top-k filtering
    "top_p": 1.0,          # 1.0 disables nucleus (top-p) filtering
    "do_sample": True,     # sample from the distribution instead of greedy decoding
    "pad_token_id": tokenizer.eos_token_id,  # reuse the EOS token id for padding
    "max_new_tokens": 16,  # number of tokens generated after the prompt
    "num_return_sequences": 10,  # number of sampled responses per prompt
}

In [None]:
# decode the question to answer

# model_1 is the base GPT-2 checkpoint; model_2 is a community checkpoint whose name
# indicates GPT-2 fine-tuned for question answering on SQuAD 2 (see its model card).
model_1 = GPT2LMHeadModel.from_pretrained('gpt2')
model_2 = GPT2LMHeadModel.from_pretrained("danyaljj/gpt2_question_answering_squad2")

# Calling the tokenizer (instead of tokenizer.encode) also returns the attention mask,
# which generate() would otherwise have to infer.
prompt_inputs = tokenizer("You are good at math. Q: What is one plus two ? A:", return_tensors="pt")
input_ids = prompt_inputs["input_ids"]

# Pass the generation arguments defined in the previous cell. Without them generate()
# falls back to its default total length limit of 20 tokens (prompt included), which
# leaves room for only a few new tokens after this prompt.
for model in (model_1, model_2):
  outputs = model.generate(**prompt_inputs, **generation_kwargs)
  # Print every returned sequence, not just the first one.
  for out in outputs:
    print("Generated:", tokenizer.decode(out, skip_special_tokens=True))


## Example 2: Text Generation

Firstly, read the API and understand the arguments of "generate()"

https://huggingface.co/docs/transformers/v4.34.1/en/main_classes/text_generation#transformers.GenerationMixin.generate

In [None]:
# Important arguments of generate()
#   max_new_tokens -- length of output
#   num_return_sequences -- number of returned responses
#   use_cache -- use KV cache to speed inference, see next section

Greedy decoding -- always choose the most probable next token

In [None]:
# Greedy decoding: let the model complete the input text.
# The EOS token id is passed as pad_token_id when loading the model.
model_3 = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

# Alternative prompt: tokenizer('I enjoy walking with my cute dog', return_tensors='pt')
model_inputs = tokenizer('I enjoy playing', return_tensors='pt')

# generate 2 new tokens (increase max_new_tokens for a longer continuation)
greedy_output = model_3.generate(**model_inputs, max_new_tokens=2)

print("Output:\n" + 100 * '-')
# generate() returns a batch of sequences; decode the first (and only) one.
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))
# Show which tensors the tokenizer produced for the prompt.
print(model_inputs.keys())

## Beam Search to generate diverse sentences

In [None]:
# Beam search: produce several candidate continuations of the same prompt.
# See https://huggingface.co/blog/how-to-generate for details.

# Beam-search settings: 5 beams with early stopping, returning all 5 candidates.
beam_search_args = dict(
    max_new_tokens=40,
    num_beams=5,
    early_stopping=True,
    num_return_sequences=5,
    use_cache=True,
)
beam_output = model_3.generate(**model_inputs, **beam_search_args)

print("Output:\n" + '-' * 100)
# Decode each returned candidate and print it next to its index.
for ix, sequence in enumerate(beam_output):
  print(ix, tokenizer.decode(sequence, skip_special_tokens=True))

In [None]:
# Beam search again, this time forbidding repeated n-grams in the candidates.
model_inputs = tokenizer('I enjoy playing', return_tensors='pt')

# Beam-search settings: 100 beams with early stopping, returning the 5 best candidates.
no_repeat_beam_args = dict(
    max_new_tokens=10,
    num_beams=100,
    early_stopping=True,
    num_return_sequences=5,
    use_cache=True,
    no_repeat_ngram_size=2,  # do not allow the same 2-gram to appear twice
)
beam_output = model_3.generate(**model_inputs, **no_repeat_beam_args)

print("Output:\n" + '-' * 100)
# Decode each returned candidate and print it next to its index.
for ix, sequence in enumerate(beam_output):
  print(ix, tokenizer.decode(sequence, skip_special_tokens=True))

 ## Tutorial: KV Cache

check KV cache option
https://github.com/huggingface/transformers/blob/main/src/transformers/generation/configuration_utils.py#L100

check generate() code https://github.com/huggingface/transformers/blob/main/src/transformers/generation/utils.py#L1351

First, check the speed of using or not using KV Cache

In [None]:
import numpy as np
from time import perf_counter, time

# Time the same beam-search generation twice; the only difference is use_cache.
# perf_counter() is a monotonic, high-resolution clock meant for measuring intervals,
# unlike time(), which follows the (adjustable) system wall clock.
t1 = perf_counter()
output_tensors_1 = model_3.generate(**model_inputs, use_cache=True, max_new_tokens=40, num_beams=5, num_return_sequences=5)
t2 = perf_counter()
output_tensors_2 = model_3.generate(**model_inputs, use_cache=False, max_new_tokens=40, num_beams=5, num_return_sequences=5)
t3 = perf_counter()

# Elapsed seconds for each run, rounded to two decimals.
print('Use KV Cache time:', np.round(t2-t1,2))
print('NOT USE KV Cache time is much longer:', np.round(t3-t2,2))



Setting `use_cache` to True or False affects only the speed, not the output. The responses should be the same. Let's check.

In [None]:
# Decode both batches of generated token ids back to text.
responses_1 = tokenizer.batch_decode(output_tensors_1)
responses_2 = tokenizer.batch_decode(output_tensors_2)

# Check use_cache=True(R1) and False(R2) if they are the same
for response_1, response_2 in zip(responses_1, responses_2):
  print('R1:', response_1, '\n')
  print('R2:', response_2,'\n\n')

# Compare programmatically as well, so the check does not rely on reading the text by eye.
print('All responses identical:', responses_1 == responses_2)


Check the code to see how `generate()` behaves at inference time.
Review the lecture notes again.

Read [tutorial](https://r4j4n.github.io/blogs/posts/kv/) and [tutorial](https://mett29.github.io/posts/kv-cache/), understand that KV cache is at Attention level.

With the cache turned on, each attention layer stores the K and V tensors computed for all previous tokens (as K_list and V_list). At each generation step, the newly computed K and V are appended to these lists.

In real code, you can check GPT-2 attention [code](https://github.com/huggingface/transformers/blob/ae093eef016533a3670561fa9e26addb42d446d1/src/transformers/models/gpt2/modeling_gpt2.py#L901) and see this behavior.

In [None]:
# Print the module structure of model_1 to see where the attention layers sit.
print(model_1)
