In [None]:
#!pip install transformers

In [None]:
#!pip install nlp

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

# Prefer the GPU when one is available, but fall back to the CPU so the
# notebook still runs (slowly) on a runtime without CUDA instead of failing
# inside `.to(device)`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_id = 'gpt2-large'

# GPT-2 is only used as a scoring language model in this notebook.
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
model.eval()  # explicit: scoring must run with dropout disabled
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

In [None]:

# Load the raw WikiText-2 test split and tokenize it as one long document:
# the entries of the 'text' column are joined with blank lines.
from nlp import load_dataset
test = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
# The whole corpus is encoded in a single call, which is why the tokenizer
# warns that the sequence is longer than the model's maximum length; the
# ids are sliced into model-sized windows before being fed to the model.
encodings = tokenizer('\n\n'.join(test['text']), return_tensors='pt')

Token indices sequence length is longer than the specified maximum sequence length for this model (287644 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
encodings["input_ids"].shape

torch.Size([1, 287644])

In [None]:
from tqdm import tqdm
import torch


# Sliding-window perplexity of GPT-2 over the tokenized corpus in `encodings`.
# Each window is at most `max_length` tokens long; only its last `trg_len`
# tokens are scored, the tokens before them merely provide left context.
max_length = model.config.n_positions
stride = 512
seq_len = encodings.input_ids.size(1)

lls = []                # per-window sums of *negative* log-likelihood
n_scored_tokens = 0     # number of tokens that actually contributed to `lls`
for i in tqdm(range(0, seq_len, stride)):
    begin_loc = max(i + stride - max_length, 0)
    end_loc = min(i + stride, seq_len)
    trg_len = end_loc - i    # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100    # -100 labels are ignored by the loss

    # The model shifts the labels by one position internally, so the token at
    # window position 0 is never a prediction target. Count the targets that
    # really enter the mean loss instead of assuming it is always `trg_len`.
    n_targets = int((target_ids[:, 1:] != -100).sum())
    if n_targets == 0:
        continue    # nothing to score here; the mean loss would be NaN

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)
        # outputs[0] is the mean loss over the scored targets; undo the mean
        # to obtain the window's summed negative log-likelihood.
        log_likelihood = outputs[0] * n_targets

    lls.append(log_likelihood)
    n_scored_tokens += n_targets

# Perplexity = exp(total NLL / number of scored tokens).
ppl = torch.exp(torch.stack(lls).sum() / n_scored_tokens)

100%|██████████| 562/562 [06:29<00:00,  1.44it/s]


In [None]:
ppl

tensor(16.4444, device='cuda:0')

In [None]:
# Colab-only: prompts for a file upload. The recorded cell output shows the
# name under which the uploaded file was saved.
# NOTE(review): the recorded output shows a "v-0-5" file being saved, while
# the next cell opens a "v-0-6" file -- confirm that file is uploaded as well.
from google.colab import files
uploaded = files.upload()

Saving target_with_predictions_and_labels_v-0-5 (1) (1).json to target_with_predictions_and_labels_v-0-5 (1) (1) (2).json


In [None]:
import json

# Load the v-0-6 results file: a list of dicts with 'predictions',
# 'senti_tags' and 'tokenized_text' entries. The context manager closes the
# file handle deterministically, and the encoding is pinned so the load does
# not depend on the platform default.
with open("target_with_predictions_and_labels_v-0-6 (1).json", encoding="utf-8") as results_file:
    model_result_v06 = json.load(results_file)

In [None]:
# Space-join each example's predicted pieces into one string. The pieces are
# not merged here, so "##" continuation markers stay visible in the text.
predicted_sents = [
    " ".join(example["predictions"])
    for example in model_result_v06
]
predicted_sents[:1]

['But the pool is clean up imp ##ec ##ca ##bly , and the locker room is nice ( except during the summer time when the kids cut everything sticky ) A great facility overall .']

In [None]:
def bpe2token(list_bpe: list) -> list:
  """Merge "##"-prefixed sub-token pieces back into whole tokens.

  A piece that starts with "##" is glued, without the marker, onto the token
  that precedes it. The very first piece always opens a token and is kept
  verbatim, even if it starts with "##".

  Args:
    list_bpe: sub-token strings in sentence order.

  Returns:
    A new list with the merged token strings (empty for empty input).
  """
  merged = []
  for position, piece in enumerate(list_bpe):
    continues_previous = position > 0 and piece.startswith("##")
    if continues_previous:
      # Drop the two-character "##" marker and extend the open token.
      merged[-1] += piece[2:]
    else:
      merged.append(piece)
  return merged

bpe2token(model_result_v06[0]["predictions"])

['But',
 'the',
 'pool',
 'is',
 'clean',
 'up',
 'impeccably',
 ',',
 'and',
 'the',
 'locker',
 'room',
 'is',
 'nice',
 '(',
 'except',
 'during',
 'the',
 'summer',
 'time',
 'when',
 'the',
 'kids',
 'cut',
 'everything',
 'sticky',
 ')',
 'A',
 'great',
 'facility',
 'overall',
 '.']

In [None]:
def get_no_BPE(pred):
    """Merge "##"-prefixed sub-token pieces back into whole words.

    A piece that starts with "##" continues the word before it and is appended
    to that word without the two-character marker. Continuation pieces at the
    very end of the sequence are merged like any others, so a sentence ending
    in a split word keeps its suffix.

    Args:
        pred: sequence of piece strings in sentence order.

    Returns:
        A new list of merged word strings. Empty input yields an empty list.
        A "##" piece with no preceding word (first position) is kept verbatim
        instead of raising.
    """
    sent = []
    current = None  # word currently being assembled; None before the first piece
    for piece in pred:
        # Only a *prefix* "##" marks a continuation; "##" elsewhere in a token
        # is ordinary text and must not be stripped.
        if current is not None and piece.startswith("##"):
            current += piece[2:]
        else:
            if current is not None:
                sent.append(current)
            current = piece
    # Flush the last word, including any continuation pieces merged into it.
    if current is not None:
        sent.append(current)
    return sent

# Cross-check the two piece-merging helpers on every example: whenever they
# disagree, print both results followed by a blank line.
for example in model_result_v06:
  pieces = example["predictions"]
  from_get_no_bpe = get_no_BPE(pieces)
  from_bpe2token = bpe2token(pieces)
  if from_get_no_bpe == from_bpe2token:
    continue
  print(from_get_no_bpe)
  print(from_bpe2token)
  print()
    


In [None]:
model_result_v06[:1]

[{'predictions': ['But',
   'the',
   'pool',
   'is',
   'clean',
   'up',
   'imp',
   '##ec',
   '##ca',
   '##bly',
   ',',
   'and',
   'the',
   'locker',
   'room',
   'is',
   'nice',
   '(',
   'except',
   'during',
   'the',
   'summer',
   'time',
   'when',
   'the',
   'kids',
   'cut',
   'everything',
   'sticky',
   ')',
   'A',
   'great',
   'facility',
   'overall',
   '.'],
  'senti_tags': [0,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   -1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  'tokenized_text': ['But',
   'the',
   'pool',
   'is',
   'kept',
   'up',
   'impeccably',
   ',',
   'and',
   'the',
   'locker',
   'room',
   'is',
   'nice',
   '(',
   'except',
   'during',
   'the',
   'summer',
   'time',
   'when',
   'the',
   'kids',
   'make',
   'everything',
   'sticky',
   ')',
   'A',
   'great',
   'facility',
   'overall',
   '.']}]

In [None]:
from tqdm.auto import tqdm
import torch

from typing import List

def count_ppl(predictions: List[dict], label = "predictions"):
  """Corpus-level perplexity, scoring every example as an independent sentence.

  For each example the pieces stored under `label` are merged with
  `get_no_BPE`, joined with spaces, tokenized with the notebook-level
  `tokenizer` and scored by the notebook-level `model` on `device`.

  Args:
    predictions: list of example dicts; each must contain `label`.
    label: key of the piece list to score ("predictions" by default).

  Returns:
    A 0-dim tensor: exp(total negative log-likelihood / scored tokens).

  Raises:
    ValueError: if no example yields at least one scorable token.
  """
  lls = []               # per-sentence sums of negative log-likelihood
  n_scored_tokens = 0
  for pred in tqdm(predictions):
    sentence = " ".join(get_no_BPE(pred[label]))
    encodings = tokenizer(sentence, return_tensors='pt')
    input_ids = encodings.input_ids.to(device)

    # The model shifts the labels by one position internally, so the first
    # token has no left context and is never scored: n tokens give n - 1
    # targets. With fewer than two tokens there is nothing to score and the
    # mean loss would be NaN (or the forward pass would fail on an empty
    # input), so such sentences are skipped.
    n_targets = input_ids.size(1) - 1
    if n_targets < 1:
      continue

    target_ids = input_ids.clone()
    with torch.no_grad():
      outputs = model(input_ids, labels=target_ids)

    # outputs[0] is the mean loss over the scored targets; undo the mean so
    # that sentences are weighted by the tokens that were actually scored.
    lls.append(outputs[0] * n_targets)
    n_scored_tokens += n_targets

  if not lls:
    raise ValueError("count_ppl: no example contains at least two tokens to score")

  ppl = torch.exp(torch.stack(lls).sum() / n_scored_tokens)
  return ppl


In [None]:
# Baseline: score the sentences stored under "tokenized_text" (reported later
# in the notebook as the "gold text") instead of the model predictions.
source_text_ppl = count_ppl(model_result_v06, label="tokenized_text")
source_text_ppl

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=12802.0), HTML(value='')))




tensor(76.5649, device='cuda:0')

In [None]:
# Sentence-level perplexity of the v-0-6 predictions.
# NOTE: this rebinds the global `ppl`, which the sliding-window cell earlier
# in the notebook assigned for the WikiText-2 corpus.
ppl = count_ppl(model_result_v06)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=12802.0), HTML(value='')))




In [None]:
ppl

tensor(128.4332, device='cuda:0')

In [None]:
# Load the v-0-5 results and score them. The context manager closes the file
# handle deterministically and the encoding is pinned explicitly.
with open("target_with_predictions_and_labels_v-0-5 (1) (1) (2).json", encoding="utf-8") as results_file:
    model_result_v05 = json.load(results_file)
ppl_model_05 = count_ppl(model_result_v05)
ppl_model_05

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=12802.0), HTML(value='')))




tensor(163.6606, device='cuda:0')

In [None]:
# Colab-only: prompts for a file upload. Per the recorded output this stored
# the "v-0-4" results file that the next cell opens.
from google.colab import files
uploaded = files.upload()

Saving target_with_predictions_and_labels_v-0-4 (1).json to target_with_predictions_and_labels_v-0-4 (1).json


In [None]:
# Load the v-0-4 results and score them. The context manager closes the file
# handle deterministically and the encoding is pinned explicitly.
with open("target_with_predictions_and_labels_v-0-4 (1).json", encoding="utf-8") as results_file:
    model_result_v04 = json.load(results_file)
ppl_model_04 = count_ppl(model_result_v04)
ppl_model_04

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=12802.0), HTML(value='')))




tensor(261.6927, device='cuda:0')

In [None]:
# Colab-only: prompts for a file upload. Per the recorded output this stored
# the "v-0-3" results file that the next cell opens.
from google.colab import files
uploaded = files.upload()

Saving target_with_predictions_and_labels_v-0-3 (1).json to target_with_predictions_and_labels_v-0-3 (1).json


In [None]:
# Load the v-0-3 results and score them. The context manager closes the file
# handle deterministically and the encoding is pinned explicitly.
with open("target_with_predictions_and_labels_v-0-3 (1).json", encoding="utf-8") as results_file:
    model_result_v03 = json.load(results_file)
ppl_model_03 = count_ppl(model_result_v03)
ppl_model_03

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=12802.0), HTML(value='')))




tensor(153.9017, device='cuda:0')

In [None]:
# Colab-only: prompts for a file upload. Per the recorded output this stored
# the "v-0-2" results file that the next cell opens.
from google.colab import files
uploaded = files.upload()

Saving target_with_predictions_and_labels_v-0-2 (1).json to target_with_predictions_and_labels_v-0-2 (1).json


In [None]:
# Load the v-0-2 results and score them. The context manager closes the file
# handle deterministically and the encoding is pinned explicitly.
with open("target_with_predictions_and_labels_v-0-2 (1).json", encoding="utf-8") as results_file:
    model_result_v02 = json.load(results_file)
ppl_model_02 = count_ppl(model_result_v02)
ppl_model_02

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=12802.0), HTML(value='')))




tensor(92.7376, device='cuda:0')

In [None]:
# Colab-only: prompts for a file upload. Per the recorded output this stored
# the unversioned results file that the next cell loads as `model_result_v01`.
from google.colab import files
uploaded = files.upload()

Saving target_with_predictions_and_labels (1).json to target_with_predictions_and_labels (1).json


In [None]:
# Load the first (unversioned) results file and score it. The context manager
# closes the file handle deterministically and the encoding is pinned explicitly.
with open("target_with_predictions_and_labels (1).json", encoding="utf-8") as results_file:
    model_result_v01 = json.load(results_file)
ppl_model_01 = count_ppl(model_result_v01)
ppl_model_01

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=12802.0), HTML(value='')))




tensor(132.3095, device='cuda:0')

In [None]:
from tqdm.auto import tqdm
import torch

from typing import List

def ppl_on_full_text(predictions: List[dict], label="predictions", stride: int = 1024):
    """Perplexity of all examples concatenated into one long document.

    The pieces stored under `label` are merged into whole words with
    `bpe2token` (so no "##" markers reach GPT-2, matching what `count_ppl`
    scores), each example is space-joined, and the examples are joined with
    blank lines. The resulting token sequence is scored in windows of at most
    `model.config.n_positions` tokens that advance by `stride` tokens.

    Args:
        predictions: list of example dicts; each must contain `label`.
        label: key of the piece list to score ("predictions" by default).
        stride: number of newly scored tokens per window. The default equals
            the previously hard-coded value; smaller values give every scored
            token more left context at the cost of more forward passes.

    Returns:
        A 0-dim tensor: exp(total negative log-likelihood / scored tokens).

    Raises:
        ValueError: if `stride` is not in (0, model.config.n_positions], or if
            the text yields no scorable token.
    """
    max_length = model.config.n_positions
    if not 0 < stride <= max_length:
        # A larger stride would skip tokens between consecutive windows.
        raise ValueError(f"stride must be in (0, {max_length}], got {stride}")

    text = "\n\n".join(" ".join(bpe2token(example[label])) for example in predictions)
    encodings = tokenizer(text, return_tensors='pt')
    seq_len = encodings.input_ids.size(1)

    lls = []               # per-window sums of negative log-likelihood
    n_scored_tokens = 0
    for i in tqdm(range(0, seq_len, stride)):
        begin_loc = max(i + stride - max_length, 0)
        end_loc = min(i + stride, seq_len)
        trg_len = end_loc - i    # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100    # -100 labels are ignored by the loss

        # The model shifts the labels by one position internally, so the token
        # at window position 0 is never a target. Count the targets that really
        # enter the mean loss; a one-token tail window has none and would
        # otherwise contribute NaN.
        n_targets = int((target_ids[:, 1:] != -100).sum())
        if n_targets == 0:
            continue

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

        # outputs[0] is the mean loss over the scored targets; undo the mean.
        lls.append(outputs[0] * n_targets)
        n_scored_tokens += n_targets

    if not lls:
        raise ValueError("ppl_on_full_text: the text contains no token that can be scored")

    ppl = torch.exp(torch.stack(lls).sum() / n_scored_tokens)
    return ppl


In [None]:
# Full-text perplexity of every model version, reported in version order
# (model #1 is v01, model #6 is v06).
model_results_in_order = (
    model_result_v01,
    model_result_v02,
    model_result_v03,
    model_result_v04,
    model_result_v05,
    model_result_v06,
)
for index, model_res in enumerate(model_results_in_order):
  ppl_on_merged_text = ppl_on_full_text(model_res)
  print(f"PPL on model #{index + 1}: {ppl_on_merged_text}")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


HBox(children=(FloatProgress(value=0.0, max=346.0), HTML(value='')))


PPL on model #1: 50.89130401611328


HBox(children=(FloatProgress(value=0.0, max=348.0), HTML(value='')))


PPL on model #2: 40.093055725097656


HBox(children=(FloatProgress(value=0.0, max=394.0), HTML(value='')))


PPL on model #3: 54.69341278076172


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


PPL on model #4: 51.44259262084961


HBox(children=(FloatProgress(value=0.0, max=349.0), HTML(value='')))


PPL on model #5: 55.12382507324219


HBox(children=(FloatProgress(value=0.0, max=347.0), HTML(value='')))


PPL on model #6: 51.25361633300781


In [None]:
# Reference value: full-text perplexity of the "tokenized_text" field, taken
# from the v-0-6 results, for comparison with the per-model numbers.
ppl_on_merged_text = ppl_on_full_text(model_result_v06, label="tokenized_text")
print(f"PPL on gold text: {ppl_on_merged_text}")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


HBox(children=(FloatProgress(value=0.0, max=301.0), HTML(value='')))


PPL on gold text: 23.19408416748047


[{'predictions': ['But',
   'the',
   'pool',
   'is',
   'broken',
   'up',
   'imp',
   '##ec',
   '##ca',
   '##bly',
   ',',
   'and',
   'the',
   'locker',
   'room',
   'is',
   'nice',
   '(',
   'except',
   'during',
   'the',
   'summer',
   'time',
   'when',
   'the',
   'kids',
   'make',
   'everything',
   'sticky',
   ')',
   'A',
   'great',
   'facility',
   'overall',
   '.'],
  'senti_tags': [0,
   0,
   0,
   0,
   -1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  'tokenized_text': ['But',
   'the',
   'pool',
   'is',
   'kept',
   'up',
   'impeccably',
   ',',
   'and',
   'the',
   'locker',
   'room',
   'is',
   'nice',
   '(',
   'except',
   'during',
   'the',
   'summer',
   'time',
   'when',
   'the',
   'kids',
   'make',
   'everything',
   'sticky',
   ')',
   'A',
   'great',
   'facility',
   'overall',
   '.']},
 {'predictions': 