In [4]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

# device = "cuda"
model_id = "gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_id)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)


[A
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 1.53MB/s]

Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 48.2MB/s]

[A
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 2.00MB/s]


In [5]:
from datasets import load_dataset

test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")

Downloading model.safetensors:  13%|█▎        | 430M/3.25G [06:03<39:38, 1.18MB/s]
Downloading model.safetensors:  67%|██████▋   | 2.17G/3.25G [05:15<02:36, 6.88MB/s]
Found cached dataset wikitext (/home/jina/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)
Token indices sequence length is longer than the specified maximum sequence length for this model (287644 > 1024). Running this sequence through the model will result in indexing errors


In [13]:
import torch
from tqdm import tqdm

max_length = model.config.n_positions
stride = 512
seq_len = encodings.input_ids.size(1)

nlls = []
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    print('max_length', max_length)
    print('begin_loc', begin_loc)
    print('stride', stride)
    print('seq_len', seq_len)
    end_loc = min(begin_loc + max_length, seq_len)
    print('end_loc', end_loc)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc]
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100
    print('trg_len', trg_len)
    print(input_ids.shape)
    print('input_ids', input_ids)
    print('target_ids', target_ids)
    print('-100', (target_ids==-100).sum())
    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    print('prev_end_loc', prev_end_loc)
    if end_loc == seq_len:
        break

ppl = torch.exp(torch.stack(nlls).mean())

  0%|          | 0/562 [00:00<?, ?it/s]

max_length 1024
begin_loc 0
stride 512
seq_len 287644
end_loc 1024
trg_len 1024
torch.Size([1, 1024])
input_ids tensor([[  628,   796,  5199,  ...,   290, 43823, 24265]])
target_ids tensor([[  628,   796,  5199,  ...,   290, 43823, 24265]])
-100 tensor(0)


  0%|          | 1/562 [00:08<1:19:36,  8.51s/it]

prev_end_loc 1024
max_length 1024
begin_loc 512
stride 512
seq_len 287644
end_loc 1536
trg_len 512
torch.Size([1, 1024])
input_ids tensor([[  679,  4120,   287,  ..., 25930,   393,   347]])
target_ids tensor([[ -100,  -100,  -100,  ..., 25930,   393,   347]])
-100 tensor(512)


  0%|          | 2/562 [00:16<1:18:52,  8.45s/it]

prev_end_loc 1536
max_length 1024
begin_loc 1024
stride 512
seq_len 287644
end_loc 2048
trg_len 512
torch.Size([1, 1024])
input_ids tensor([[7924,  416,  440,  ...,  272,  764,  679]])
target_ids tensor([[-100, -100, -100,  ...,  272,  764,  679]])
-100 tensor(512)


  1%|          | 3/562 [00:25<1:19:46,  8.56s/it]

prev_end_loc 2048
max_length 1024
begin_loc 1536
stride 512
seq_len 287644
end_loc 2560
trg_len 512
torch.Size([1, 1024])
input_ids tensor([[3885,  417, 7626,  ...,  923,  286,  281]])
target_ids tensor([[-100, -100, -100,  ...,  923,  286,  281]])
-100 tensor(512)


  1%|          | 4/562 [00:34<1:20:37,  8.67s/it]

prev_end_loc 2560
max_length 1024
begin_loc 2048
stride 512
seq_len 287644
end_loc 3072
trg_len 512
torch.Size([1, 1024])
input_ids tensor([[4054,  837,  284,  ...,  262, 3931,  286]])
target_ids tensor([[-100, -100, -100,  ...,  262, 3931,  286]])
-100 tensor(512)


  1%|          | 5/562 [00:43<1:21:50,  8.82s/it]

prev_end_loc 3072
max_length 1024
begin_loc 2560
stride 512
seq_len 287644
end_loc 3584
trg_len 512
torch.Size([1, 1024])
input_ids tensor([[ 1743,  3451,   764,  ...,   339,  4855, 10343]])
target_ids tensor([[ -100,  -100,  -100,  ...,   339,  4855, 10343]])
-100 tensor(512)


  1%|          | 5/562 [00:49<1:31:14,  9.83s/it]


KeyboardInterrupt: 

In [24]:
import torch
from tqdm import tqdm

max_length = model.config.n_positions
stride = 512
seq_len = encodings.input_ids.size(1)

nlls = []
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc]
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    # neg_log_likelihood = 250
    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    print(neg_log_likelihood)
    nlls.append(neg_log_likelihood)
    print(neg_log_likelihood.shape)

    prev_end_loc = end_loc
    if end_loc == stride*3: # seq_len
        break

print(nlls)
print(len(nlls))
ppl = torch.exp(torch.stack(nlls).mean())

  0%|          | 1/562 [00:07<1:08:23,  7.32s/it]

tensor(2.4563)
torch.Size([])


  0%|          | 1/562 [00:13<2:10:44, 13.98s/it]

tensor(3.1668)
torch.Size([])
[tensor(2.4563), tensor(3.1668)]
2





In [26]:
torch.stack(nlls)

tensor([2.4563, 3.1668])

In [18]:
stride

512

In [19]:
seq_len/stride

561.8046875