In [1]:
# In google colab, make sure you install transformers
# uncomment the following line for first-time execution
!pip install transformers



In [3]:
import torch
from transformers import pipeline, set_seed
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoModelForCausalLM

set_seed(123)

In [4]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

# HW Code Question: Implement your own Beam Search with Priority Queue

In [7]:
import queue

# initialize a pq
K = 30

# we generate a sentence with 10 new words, "I enjoy [word]*10"
max_len=10
model_inputs = tokenizer('I enjoy', return_tensors='pt')


# repeat the generation loop

for i in range(max_len):

  # use gpt to decode one word at a time, DO NOT MODIFY any argument
  out = model.generate(**model_inputs, use_cache=True, max_new_tokens=1,
                         num_beams=50, num_return_sequences=K, output_scores=True,
                         return_dict_in_generate=True)
  sentences = out.sequences
  sent_scores = out.sequences_scores
  responses = tokenizer.batch_decode(sentences)
  print(i,responses)

  # TODO: put each sentence into a PQ with the score in sent_scores
  # https://docs.python.org/3/library/queue.html
  pq = queue.PriorityQueue()
  # pq.put((score,?,?))
  # TODO: tokenize all sentences in selected top-K sentences as new model_inputs
  # use pq.get() K times to get K sents of largest scores

  # TODO: additional step to use 2-Gram to further prune similar sentences
  # for example, for first round, we have K=30 sequences as return
  # for second round, each has K=30 new sentences with one additional predicted word, we now have 900
  # we use Bi-Gram counter to find similar sentences in 900, and retain a portion of it (e.g., 900*10%=90) for diversity, since memory is limited
  # then we put these them into PQ, retrieve only K=30 most probably sentences (also diverse)
  # One heuristic example:
  # A="I enjoy walking and talking in city"
  # B="I enjoy walking and running in city"
  # they have four same bigrams out of 7 words, you can set a threshold T such as
  # if #same-bi-gram > T * len(A), retain only one of them.
  # Try T=0.3, 0.4, 0.5, ....,



# Out of loop
# TODO: retrieval the top-K final sentences and print them out
# bonus would be given to more diverse and more likely sentences returned





0 ['I enjoy the', 'I enjoy it', 'I enjoy this', 'I enjoy my', 'I enjoy that', 'I enjoy being', 'I enjoy playing', 'I enjoy a', 'I enjoy your', 'I enjoy reading', 'I enjoy to', 'I enjoy watching', 'I enjoy writing', 'I enjoy having', 'I enjoy doing', 'I enjoy working', 'I enjoy using', 'I enjoy you', 'I enjoy all', 'I enjoy making', 'I enjoy seeing', 'I enjoy hearing', 'I enjoy them', 'I enjoy what', 'I enjoy going', 'I enjoy these', 'I enjoy our', 'I enjoy and', 'I enjoy getting', 'I enjoy learning']
1 ['I enjoy the', 'I enjoy it', 'I enjoy this', 'I enjoy my', 'I enjoy that', 'I enjoy being', 'I enjoy playing', 'I enjoy a', 'I enjoy your', 'I enjoy reading', 'I enjoy to', 'I enjoy watching', 'I enjoy writing', 'I enjoy having', 'I enjoy doing', 'I enjoy working', 'I enjoy using', 'I enjoy you', 'I enjoy all', 'I enjoy making', 'I enjoy seeing', 'I enjoy hearing', 'I enjoy them', 'I enjoy what', 'I enjoy going', 'I enjoy these', 'I enjoy our', 'I enjoy and', 'I enjoy getting', 'I enjoy