<a href="https://colab.research.google.com/github/chahatpatel2003/CSCI-167/blob/main/notebook_12_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip -q install transformers

from transformers import GPT2LMHeadModel, GPT2Tokenizer, set_seed
import torch, numpy as np
import torch.nn.functional as F

# Load the 'gpt2' checkpoint and its matching tokenizer; both are used as
# notebook-level globals by the code below.
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Seed NumPy's global RNG so the same token ids are drawn on every run.
np.random.seed(1)
print("Number of tokens in dictionary =", tokenizer.vocab_size)
# Decode 20 randomly drawn token ids to show what vocabulary entries look like.
for _ in range(20):
  idx = np.random.randint(tokenizer.vocab_size)
  print("Token:", idx, tokenizer.decode(torch.tensor(idx), skip_special_tokens=True))

def sample_next_token(input_tokens, model, tokenizer):
  """Sample the next token from the model's full distribution and append it.

  Args:
    input_tokens: dict-like object (must support `.copy()`) holding
      'input_ids' and 'attention_mask' tensors of shape (1, seq_len).
      It is left unmodified.
    model: callable taking `input_ids` and `attention_mask` keyword arguments
      and returning an object whose `.logits` is indexed as
      (batch, position, vocab).
    tokenizer: unused; kept so every decoding helper shares one signature.

  Returns:
    A shallow copy of `input_tokens` whose 'input_ids' and 'attention_mask'
    are one position longer, with 'last_token_prob' set to the model
    probability of the sampled token.
  """
  # Inference only: do not record an autograd graph for the forward pass.
  with torch.no_grad():
    outputs = model(input_ids=input_tokens['input_ids'], attention_mask=input_tokens['attention_mask'])
    # Distribution over the vocabulary at the last position of batch row 0.
    probs = F.softmax(outputs.logits, dim=-1)[0, -1].cpu().numpy()
  next_id = int(np.random.choice(len(probs), p=probs))
  # Work on a copy so the caller's mapping is not extended behind its back.
  out = input_tokens.copy()
  ids, mask = out['input_ids'], out['attention_mask']
  # new_tensor / new_ones inherit dtype and device from the existing tensors.
  out['input_ids'] = torch.cat((ids, ids.new_tensor([[next_id]])), dim=1)
  out['attention_mask'] = torch.cat((mask, mask.new_ones((1, 1))), dim=1)
  out['last_token_prob'] = float(probs[next_id])
  return out

def get_best_next_token(input_tokens, model, tokenizer):
  """Greedy decoding step: append the single most probable next token.

  Args:
    input_tokens: dict-like object (must support `.copy()`) holding
      'input_ids' and 'attention_mask' tensors of shape (1, seq_len).
      It is left unmodified.
    model: callable taking `input_ids` and `attention_mask` keyword arguments
      and returning an object whose `.logits` is indexed as
      (batch, position, vocab).
    tokenizer: unused; kept so every decoding helper shares one signature.

  Returns:
    A shallow copy of `input_tokens` whose 'input_ids' and 'attention_mask'
    are one position longer, with 'last_token_prob' set to the model
    probability of the chosen token.
  """
  # Inference only: do not record an autograd graph for the forward pass.
  with torch.no_grad():
    outputs = model(input_ids=input_tokens['input_ids'], attention_mask=input_tokens['attention_mask'])
    # Distribution over the vocabulary at the last position of batch row 0.
    probs = F.softmax(outputs.logits, dim=-1)[0, -1].cpu().numpy()
  next_id = int(np.argmax(probs))
  # Work on a copy so the caller's mapping is not extended behind its back.
  out = input_tokens.copy()
  ids, mask = out['input_ids'], out['attention_mask']
  # new_tensor / new_ones inherit dtype and device from the existing tensors.
  out['input_ids'] = torch.cat((ids, ids.new_tensor([[next_id]])), dim=1)
  out['attention_mask'] = torch.cat((mask, mask.new_ones((1, 1))), dim=1)
  out['last_token_prob'] = float(probs[next_id])
  return out

def get_top_k_token(input_tokens, model, tokenizer, k=20):
  """Top-k sampling step: sample the next token from the k most probable ones.

  Args:
    input_tokens: dict-like object (must support `.copy()`) holding
      'input_ids' and 'attention_mask' tensors of shape (1, seq_len).
      It is left unmodified.
    model: callable taking `input_ids` and `attention_mask` keyword arguments
      and returning an object whose `.logits` is indexed as
      (batch, position, vocab).
    tokenizer: unused; kept so every decoding helper shares one signature.
    k: number of candidate tokens; clamped to the range [1, vocab size].

  Returns:
    A shallow copy of `input_tokens` whose 'input_ids' and 'attention_mask'
    are one position longer, with 'last_token_prob' set to the probability of
    the sampled token after renormalising over the k candidates.
  """
  # Inference only: do not record an autograd graph for the forward pass.
  with torch.no_grad():
    outputs = model(input_ids=input_tokens['input_ids'], attention_mask=input_tokens['attention_mask'])
    # Distribution over the vocabulary at the last position of batch row 0.
    probs = F.softmax(outputs.logits, dim=-1)[0, -1].cpu().numpy()
  k = max(1, min(k, len(probs)))
  # Select exactly k indices. A value threshold (probs >= k-th value) would
  # let in extra tokens whenever several share the k-th probability; the
  # stable sort breaks such ties in favour of the lower token id.
  keep = np.argsort(-probs, kind='stable')[:k]
  top_probs = np.zeros_like(probs)
  top_probs[keep] = probs[keep]
  top_probs = top_probs / top_probs.sum()
  # Sample over the full vocabulary axis (zeros outside the top k) so the
  # drawn index is directly a token id.
  next_id = int(np.random.choice(len(top_probs), p=top_probs))
  # Work on a copy so the caller's mapping is not extended behind its back.
  out = input_tokens.copy()
  ids, mask = out['input_ids'], out['attention_mask']
  # new_tensor / new_ones inherit dtype and device from the existing tensors.
  out['input_ids'] = torch.cat((ids, ids.new_tensor([[next_id]])), dim=1)
  out['attention_mask'] = torch.cat((mask, mask.new_ones((1, 1))), dim=1)
  out['last_token_prob'] = float(top_probs[next_id])
  return out

def get_nucleus_sampling_token(input_tokens, model, tokenizer, thresh=0.25):
  """Nucleus (top-p) sampling step.

  Keeps the smallest set of most probable tokens whose cumulative probability
  exceeds `thresh`, renormalises over that set and samples from it. Prints
  how many tokens were kept.

  Args:
    input_tokens: dict-like object (must support `.copy()`) holding
      'input_ids' and 'attention_mask' tensors of shape (1, seq_len).
      It is left unmodified.
    model: callable taking `input_ids` and `attention_mask` keyword arguments
      and returning an object whose `.logits` is indexed as
      (batch, position, vocab).
    tokenizer: unused; kept so every decoding helper shares one signature.
    thresh: cumulative probability mass the kept set must exceed. If the
      whole distribution never exceeds it (e.g. thresh >= 1), every token is
      kept.

  Returns:
    A shallow copy of `input_tokens` whose 'input_ids' and 'attention_mask'
    are one position longer, with 'last_token_prob' set to the probability of
    the sampled token after renormalising over the kept set.
  """
  # Inference only: do not record an autograd graph for the forward pass.
  with torch.no_grad():
    outputs = model(input_ids=input_tokens['input_ids'], attention_mask=input_tokens['attention_mask'])
    # Distribution over the vocabulary at the last position of batch row 0.
    probs = F.softmax(outputs.logits, dim=-1)[0, -1].cpu().numpy()
  # Token ids from most to least probable (stable: ties keep the lower id first).
  order = np.argsort(-probs, kind='stable')
  csum = np.cumsum(probs[order])
  exceeds = csum > thresh
  # np.argmax over an all-False array returns 0, which would wrongly shrink
  # the nucleus to a single token; keep the whole vocabulary in that case.
  n_keep = int(np.argmax(exceeds)) + 1 if exceeds.any() else len(order)
  print("Choosing from %d tokens" % n_keep)
  # Select by index rather than by value so ties at the cutoff cannot enlarge
  # the set beyond the count just reported.
  keep = order[:n_keep]
  nucleus = np.zeros_like(probs)
  nucleus[keep] = probs[keep]
  nucleus = nucleus / nucleus.sum()
  next_id = int(np.random.choice(len(nucleus), p=nucleus))
  # Work on a copy so the caller's mapping is not extended behind its back.
  out = input_tokens.copy()
  ids, mask = out['input_ids'], out['attention_mask']
  # new_tensor / new_ones inherit dtype and device from the existing tensors.
  out['input_ids'] = torch.cat((ids, ids.new_tensor([[next_id]])), dim=1)
  out['attention_mask'] = torch.cat((mask, mask.new_ones((1, 1))), dim=1)
  out['last_token_prob'] = float(nucleus[next_id])
  return out

def get_kth_most_likely_token(input_tokens, model, tokenizer, k):
  """Append the token ranked k-th by probability and update the log-probability.

  Args:
    input_tokens: dict-like object (must support `.copy()` and `.get()`)
      holding 'input_ids' and 'attention_mask' tensors of shape (1, seq_len)
      and, optionally, a running 'log_prob'. It is left unmodified.
    model: callable taking `input_ids` and `attention_mask` keyword arguments
      and returning an object whose `.logits` is indexed as
      (batch, position, vocab).
    tokenizer: unused; kept so every decoding helper shares one signature.
    k: zero-based rank of the token to take (0 is the most probable); clamped
      to the range [0, vocab size - 1].

  Returns:
    A shallow copy of `input_tokens` whose 'input_ids' and 'attention_mask'
    are one position longer, with 'last_token_prob' set to the chosen token's
    probability and 'log_prob' increased by its logarithm. A missing
    'log_prob' is treated as 0.0.
  """
  # Inference only: do not record an autograd graph for the forward pass.
  with torch.no_grad():
    outputs = model(input_ids=input_tokens['input_ids'], attention_mask=input_tokens['attention_mask'])
    # Distribution over the vocabulary at the last position of batch row 0.
    probs = F.softmax(outputs.logits, dim=-1)[0, -1].cpu().numpy()
  # Token ids ordered from most to least probable.
  order = np.argsort(probs)[::-1]
  k = min(max(0, k), len(order)-1)
  next_id = int(order[k])
  p = float(probs[next_id])
  # Work on a copy so the caller's mapping is not extended behind its back.
  out = input_tokens.copy()
  ids, mask = out['input_ids'], out['attention_mask']
  # new_tensor / new_ones inherit dtype and device from the existing tensors.
  out['input_ids'] = torch.cat((ids, ids.new_tensor([[next_id]])), dim=1)
  out['attention_mask'] = torch.cat((mask, mask.new_ones((1, 1))), dim=1)
  out['last_token_prob'] = p
  # Start from 0.0 when the caller has not seeded a running log-probability.
  out['log_prob'] = out.get('log_prob', 0.0) + np.log(p)
  return out

def print_beams(beams):
  """Print one line per beam (index, accumulated log-probability, decoded text).

  Each beam is a mapping with a 'log_prob' value and an 'input_ids' tensor
  whose first row is decoded with the module-level `tokenizer`. A '---'
  separator line is printed after the last beam.
  """
  for beam_no, beam in enumerate(beams):
    # The value printed under the "Prob" label is the beam's 'log_prob' entry.
    score = float(beam['log_prob'])
    text = tokenizer.decode(beam["input_ids"][0], skip_special_tokens=True)
    print(f"Beam {beam_no}, Prob {score:.3f}: " + text)
  print('---')

def do_beam_search(input_tokens_in, model, tokenizer, n_beam=5, beam_length=10):
  """Extend a prompt with beam search and return the top-ranked beam.

  Args:
    input_tokens_in: mapping with 'input_ids' and 'attention_mask' for the
      prompt; it is copied into a plain dict before use.
    model: forwarded unchanged to get_kth_most_likely_token.
    tokenizer: forwarded unchanged to get_kth_most_likely_token.
    n_beam: number of beams kept per step, and also the number of
      continuations tried for each beam.
    beam_length: number of tokens appended to the prompt; the first expansion
      always happens, followed by beam_length - 1 further steps.

  Returns:
    The first beam after the final step: a dict with the extended
    'input_ids' / 'attention_mask', 'last_token_prob' and the accumulated
    'log_prob'. The surviving beams are printed after every step.
  """
  root = dict(input_tokens_in)
  root['log_prob'] = 0.0

  # First step: one beam per each of the n_beam most likely first tokens.
  beams = [
    get_kth_most_likely_token(dict(root), model, tokenizer, rank)
    for rank in range(n_beam)
  ]
  print_beams(beams)

  for _ in range(beam_length - 1):
    # Every beam is expanded with its n_beam most likely continuations; each
    # expansion starts from a fresh dict copy of the parent beam.
    candidates = [
      get_kth_most_likely_token(dict(parent), model, tokenizer, rank)
      for parent in beams
      for rank in range(n_beam)
    ]
    scores = np.array([float(cand['log_prob']) for cand in candidates])
    # Negate so that argsort ranks the highest log-probability first.
    ranking = np.argsort(-scores)
    beams = [dict(candidates[i]) for i in ranking[:n_beam]]
    print_beams(beams)

  return beams[0]

# --- Demo: each section restarts from the same prompt and appends 10 tokens,
# printing the decoded text after every step. ---

# 1) Sampling from the full next-token distribution.
# NOTE(review): the samplers draw from np.random, so this relies on set_seed
# also seeding NumPy's global RNG - confirm against the transformers docs.
set_seed(0)
t = tokenizer("The best thing about Bath is", return_tensors='pt')
for _ in range(10):
  t = sample_next_token(t, model, tokenizer)
  print(tokenizer.decode(t["input_ids"][0], skip_special_tokens=True))

# 2) Greedy decoding: always take the most probable token. This helper draws
# no random numbers, so the seed does not affect its output.
set_seed(0)
t = tokenizer("The best thing about Bath is", return_tensors='pt')
for _ in range(10):
  t = get_best_next_token(t, model, tokenizer)
  print(tokenizer.decode(t["input_ids"][0], skip_special_tokens=True))

# 3) Top-k sampling restricted to the 10 most probable tokens.
set_seed(0)
t = tokenizer("The best thing about Bath is", return_tensors='pt')
for _ in range(10):
  t = get_top_k_token(t, model, tokenizer, k=10)
  print(tokenizer.decode(t["input_ids"][0], skip_special_tokens=True))

# 4) Nucleus sampling with a cumulative-probability threshold of 0.2; the
# helper also prints how many tokens it is choosing from at each step.
set_seed(0)
t = tokenizer("The best thing about Bath is", return_tensors='pt')
for _ in range(10):
  t = get_nucleus_sampling_token(t, model, tokenizer, thresh=0.2)
  print(tokenizer.decode(t["input_ids"][0], skip_special_tokens=True))

# 5) Always take the token at zero-based rank 1 (the second most probable).
# 'log_prob' starts the running total that get_kth_most_likely_token adds to.
set_seed(0)
t = tokenizer("The best thing about Bath is", return_tensors='pt'); t['log_prob'] = 0.0
for _ in range(10):
  t = get_kth_most_likely_token(t, model, tokenizer, k=1)
  print(tokenizer.decode(t["input_ids"][0], skip_special_tokens=True))

# 6) Always take the token at zero-based rank 2000, to show how poor unlikely
# continuations are. No reseed: get_kth_most_likely_token uses no randomness.
t = tokenizer("The best thing about Bath is", return_tensors='pt'); t['log_prob'] = 0.0
for _ in range(10):
  t = get_kth_most_likely_token(t, model, tokenizer, k=2000)
  print(tokenizer.decode(t["input_ids"][0], skip_special_tokens=True))

# 7) Beam search with 5 beams over 10 generated tokens; the intermediate
# beams are printed by do_beam_search, then the returned beam is decoded.
set_seed(0)
best = do_beam_search(tokenizer("The best thing about Bath is", return_tensors='pt'), model, tokenizer, n_beam=5, beam_length=10)
print("Beam search result:")
print(tokenizer.decode(best["input_ids"][0], skip_special_tokens=True))


Number of tokens in dictionary = 50257
Token: 33003  Mormons
Token: 12172  cam
Token: 5192  trig
Token: 32511 ojure
Token: 50057  gist
Token: 43723  Petition
Token: 7813  sin
Token: 21440  Witness
Token: 32912  Remy
Token: 20609 isure
Token: 49100  creeps
Token: 7751  fasc
Token: 43757  Alc
Token: 31228  messenger
Token: 36230  SYSTEM
Token: 32025  precipitation
Token: 21758  cores
Token: 45413  Forestry
Token: 35730  guru
Token: 8444  Disc
The best thing about Bath is that
The best thing about Bath is that they
The best thing about Bath is that they don
The best thing about Bath is that they don't
The best thing about Bath is that they don't even
The best thing about Bath is that they don't even change
The best thing about Bath is that they don't even change or
The best thing about Bath is that they don't even change or shrink
The best thing about Bath is that they don't even change or shrink anymore
The best thing about Bath is that they don't even change or shrink anymore.
The best 