<a href="https://colab.research.google.com/github/chahatpatel2003/CSCI-167/blob/main/notebook_12__4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install transformers

from transformers import GPT2LMHeadModel, GPT2Tokenizer, set_seed
import torch
import torch.nn.functional as F
import numpy as np

model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

np.random.seed(1)
print("Number of tokens in dictionary = %d"%(tokenizer.vocab_size))
for i in range(20):
  index = np.random.randint(tokenizer.vocab_size)
  print("Token: %d "%(index)+tokenizer.decode(torch.tensor(index), skip_special_tokens=True))

def sample_next_token(input_tokens, model, tokenizer):
  outputs = model(input_ids=input_tokens['input_ids'], attention_mask=input_tokens['attention_mask'])
  prob_over_tokens = F.softmax(outputs.logits, dim=-1).detach().numpy()[0,-1]
  next_token = np.random.choice(len(prob_over_tokens), 1, replace=False, p=prob_over_tokens)
  output_tokens = input_tokens
  output_tokens["input_ids"] = torch.cat((output_tokens['input_ids'], torch.tensor([next_token])), dim=1)
  output_tokens['attention_mask'] = torch.cat((output_tokens['attention_mask'], torch.tensor([[1]])), dim=1)
  output_tokens['last_token_prob'] = prob_over_tokens[next_token]
  return output_tokens

set_seed(0)
input_txt = "The best thing about Bath is"
input_tokens = tokenizer(input_txt, return_tensors='pt')
for i in range(10):
  input_tokens = sample_next_token(input_tokens, model, tokenizer)
  print(tokenizer.decode(input_tokens["input_ids"][0], skip_special_tokens=True))

input_txt = "The best thing about Bath is"
input_tokens = tokenizer(input_txt, return_tensors='pt')
for i in range(10):
  input_tokens = sample_next_token(input_tokens, model, tokenizer)
  print(tokenizer.decode(input_tokens["input_ids"][0], skip_special_tokens=True))

def get_best_next_token(input_tokens, model, tokenizer):
  outputs = model(input_ids=input_tokens['input_ids'], attention_mask=input_tokens['attention_mask'])
  prob_over_tokens = F.softmax(outputs.logits, dim=-1).detach().numpy()[0,-1]
  next_token = [int(np.argmax(prob_over_tokens))]
  output_tokens = input_tokens
  output_tokens["input_ids"] = torch.cat((output_tokens['input_ids'], torch.tensor([next_token])), dim=1)
  output_tokens['attention_mask'] = torch.cat((output_tokens['attention_mask'], torch.tensor([[1]])), dim=1)
  output_tokens['last_token_prob'] = prob_over_tokens[next_token]
  return output_tokens

set_seed(0)
input_txt = "The best thing about Bath is"
input_tokens = tokenizer(input_txt, return_tensors='pt')
for i in range(10):
  input_tokens = get_best_next_token(input_tokens, model, tokenizer)
  print(tokenizer.decode(input_tokens["input_ids"][0], skip_special_tokens=True))

input_txt = "The best thing about Bath is"
input_tokens = tokenizer(input_txt, return_tensors='pt')
for i in range(10):
  input_tokens = get_best_next_token(input_tokens, model, tokenizer)
  print(tokenizer.decode(input_tokens["input_ids"][0], skip_special_tokens=True))

def get_top_k_token(input_tokens, model, tokenizer, k=20):
  outputs = model(input_ids=input_tokens['input_ids'], attention_mask=input_tokens['attention_mask'])
  prob_over_tokens = F.softmax(outputs.logits, dim=-1).detach().numpy()[0,-1]
  sorted_prob_over_tokens = np.sort(prob_over_tokens)[::-1]
  k = max(1, min(k, len(sorted_prob_over_tokens)))
  kth_prob_value = sorted_prob_over_tokens[k-1]
  prob_over_tokens[prob_over_tokens < kth_prob_value] = 0.0
  prob_over_tokens = prob_over_tokens / np.sum(prob_over_tokens)
  next_token = np.random.choice(len(prob_over_tokens), 1, replace=False, p=prob_over_tokens)
  output_tokens = input_tokens
  output_tokens["input_ids"] = torch.cat((output_tokens['input_ids'], torch.tensor([next_token])), dim=1)
  output_tokens['attention_mask'] = torch.cat((output_tokens['attention_mask'], torch.tensor([[1]])), dim=1)
  output_tokens['last_token_prob'] = prob_over_tokens[next_token]
  return output_tokens

set_seed(0)
input_txt = "The best thing about Bath is"
input_tokens = tokenizer(input_txt, return_tensors='pt')
for i in range(10):
  input_tokens = get_top_k_token(input_tokens, model, tokenizer, k=10)
  print(tokenizer.decode(input_tokens["input_ids"][0], skip_special_tokens=True))

set_seed(0)
input_txt = "The best thing about Bath is"
input_tokens = tokenizer(input_txt, return_tensors='pt')
for i in range(10):
  input_tokens = get_top_k_token(input_tokens, model, tokenizer, k=10)
  print(tokenizer.decode(input_tokens["input_ids"][0], skip_special_tokens=True))

def get_nucleus_sampling_token(input_tokens, model, tokenizer, thresh=0.25):
  outputs = model(input_ids=input_tokens['input_ids'], attention_mask=input_tokens['attention_mask'])
  prob_over_tokens = F.softmax(outputs.logits, dim=-1).detach().numpy()[0,-1]
  sorted_probs_decreasing = np.sort(prob_over_tokens)[::-1]
  cum_sum_probs = np.cumsum(sorted_probs_decreasing)
  idx = int(np.argmax(cum_sum_probs > thresh))
  print("Choosing from %d tokens"%(idx))
  thresh_prob = sorted_probs_decreasing[idx]
  prob_over_tokens[prob_over_tokens < thresh_prob] = 0.0
  prob_over_tokens = prob_over_tokens / np.sum(prob_over_tokens)
  next_token = np.random.choice(len(prob_over_tokens), 1, replace=False, p=prob_over_tokens)
  output_tokens = input_tokens
  output_tokens["input_ids"] = torch.cat((output_tokens['input_ids'], torch.tensor([next_token])), dim=1)
  output_tokens['attention_mask'] = torch.cat((output_tokens['attention_mask'], torch.tensor([[1]])), dim=1)
  output_tokens['last_token_prob'] = prob_over_tokens[next_token]
  return output_tokens

set_seed(0)
input_txt = "The best thing about Bath is"
input_tokens = tokenizer(input_txt, return_tensors='pt')
for i in range(10):
  input_tokens = get_nucleus_sampling_token(input_tokens, model, tokenizer, thresh=0.2)
  print(tokenizer.decode(input_tokens["input_ids"][0], skip_special_tokens=True))

input_txt = "The best thing about Bath is"
input_tokens = tokenizer(input_txt, return_tensors='pt')
for i in range(10):
  input_tokens = get_nucleus_sampling_token(input_tokens, model, tokenizer, thresh=0.2)
  print(tokenizer.decode(input_tokens["input_ids"][0], skip_special_tokens=True))

def get_kth_most_likely_token(input_tokens, model, tokenizer, k):
  outputs = model(input_ids=input_tokens['input_ids'], attention_mask=input_tokens['attention_mask'])
  prob_over_tokens = F.softmax(outputs.logits, dim=-1).detach().numpy()[0,-1]
  sorted_prob_over_tokens = np.sort(prob_over_tokens)[::-1]
  k = min(max(0, k), len(sorted_prob_over_tokens)-1)
  kth_prob_value = sorted_prob_over_tokens[k]
  next_token = np.where(prob_over_tokens == kth_prob_value)[0][0:1]
  output_tokens = input_tokens
  output_tokens["input_ids"] = torch.cat((output_tokens['input_ids'], torch.tensor([next_token])), dim=1)
  output_tokens['attention_mask'] = torch.cat((output_tokens['attention_mask'], torch.tensor([[1]])), dim=1)
  p = float(prob_over_tokens[next_token])
  output_tokens['last_token_prob'] = p
  output_tokens['log_prob'] = output_tokens['log_prob'] + np.log(p)
  return output_tokens

set_seed(0)
input_txt = "The best thing about Bath is"
input_tokens = tokenizer(input_txt, return_tensors='pt')
input_tokens['log_prob'] = 0.0
for i in range(10):
  input_tokens = get_kth_most_likely_token(input_tokens, model, tokenizer, k=1)
  print(tokenizer.decode(input_tokens["input_ids"][0], skip_special_tokens=True))

input_txt = "The best thing about Bath is"
input_tokens = tokenizer(input_txt, return_tensors='pt')
input_tokens['log_prob'] = 0.0
for i in range(10):
  input_tokens = get_kth_most_likely_token(input_tokens, model, tokenizer, k=2000)
  print(tokenizer.decode(input_tokens["input_ids"][0], skip_special_tokens=True))

def print_beams(beams):
  for index, beam in enumerate(beams):
    print("Beam %d, Prob %3.3f: "%(index, float(beam['log_prob'])) + tokenizer.decode(beam["input_ids"][0], skip_special_tokens=True))
  print('---')

def do_beam_search(input_tokens_in, model, tokenizer, n_beam=5, beam_length=10):
  input_tokens_in = dict(input_tokens_in)
  input_tokens_in['log_prob'] = 0.0
  beams = [None]*n_beam
  for c_k in range(n_beam):
    beams[c_k] = dict(input_tokens_in)
    beams[c_k] = get_kth_most_likely_token(beams[c_k], model, tokenizer, c_k)
  print_beams(beams)
  for _ in range(beam_length-1):
    beams_all = [None]*(n_beam*n_beam)
    log_probs_all = np.zeros(n_beam*n_beam)
    for c_beam in range(n_beam):
      for c_k in range(n_beam):
        beams_all[c_beam*n_beam + c_k] = dict(get_kth_most_likely_token(beams[c_beam], model, tokenizer, c_k))
        log_probs_all[c_beam*n_beam + c_k] = float(beams_all[c_beam*n_beam + c_k]['log_prob'])
    sorted_index = np.argsort(-log_probs_all)
    for c_k in range(n_beam):
      beams[c_k] = dict(beams_all[sorted_index[c_k]])
    print_beams(beams)
  return beams[0]

set_seed(0)
input_txt = "The best thing about Bath is"
input_tokens = tokenizer(input_txt, return_tensors='pt')
best_beam = do_beam_search(input_tokens, model, tokenizer, n_beam=5, beam_length=10)
print("Beam search result:")
print(tokenizer.decode(best_beam["input_ids"][0], skip_special_tokens=True))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Number of tokens in dictionary = 50257
Token: 33003  Mormons
Token: 12172  cam
Token: 5192  trig
Token: 32511 ojure
Token: 50057  gist
Token: 43723  Petition
Token: 7813  sin
Token: 21440  Witness
Token: 32912  Remy
Token: 20609 isure
Token: 49100  creeps
Token: 7751  fasc
Token: 43757  Alc
Token: 31228  messenger
Token: 36230  SYSTEM
Token: 32025  precipitation
Token: 21758  cores
Token: 45413  Forestry
Token: 35730  guru
Token: 8444  Disc


  output_tokens["input_ids"] = torch.cat((output_tokens['input_ids'], torch.tensor([next_token])), dim=1)


The best thing about Bath is that
The best thing about Bath is that they
The best thing about Bath is that they don
The best thing about Bath is that they don't
The best thing about Bath is that they don't even
The best thing about Bath is that they don't even change
The best thing about Bath is that they don't even change or
The best thing about Bath is that they don't even change or shrink
The best thing about Bath is that they don't even change or shrink anymore
The best thing about Bath is that they don't even change or shrink anymore.
The best thing about Bath is your
The best thing about Bath is your kids
The best thing about Bath is your kids will
The best thing about Bath is your kids will definitely
The best thing about Bath is your kids will definitely be
The best thing about Bath is your kids will definitely be up
The best thing about Bath is your kids will definitely be up the
The best thing about Bath is your kids will definitely be up the chim
The best thing about Bath is

  p = float(prob_over_tokens[next_token])


The best thing about Bath is the way
The best thing about Bath is the way you
The best thing about Bath is the way you get
The best thing about Bath is the way you get the
The best thing about Bath is the way you get the most
The best thing about Bath is the way you get the most bang
The best thing about Bath is the way you get the most bang out
The best thing about Bath is the way you get the most bang outta
The best thing about Bath is the way you get the most bang outta the
The best thing about Bath is mixed
The best thing about Bath is mixed profits
The best thing about Bath is mixed profits partnerships
The best thing about Bath is mixed profits partnerships»
The best thing about Bath is mixed profits partnerships» buy
The best thing about Bath is mixed profits partnerships» buy generic
The best thing about Bath is mixed profits partnerships» buy generic+
The best thing about Bath is mixed profits partnerships» buy generic+ Honda
The best thing about Bath is mixed profits partners