In [1]:
import os
os.environ['http_proxy'] = 'http://127.0.0.1:7890'
os.environ['https_proxy'] = 'http://127.0.0.1:7890'

- references
    - https://huggingface.co/docs/transformers/perplexity
    - https://www.cnblogs.com/ZJUT-jiangnan/p/5612096.html

## Auto-regressive model training loss

- 自回归模型，是词表粒度的多分类问题，用多分类问题的交叉熵定义其loss
    - 其形式为（nll，negative log likelihood）：
    
    $$
    L=-\frac1N\sum_{i=1}^N\log P(y_i)
    $$
  
    - LM head：one hot 分布（ground truth 分布）与预测概率分布的交叉熵；
        - 词表粒度的分类问题
    - 完全随机的情况下，对于 $|V|=10000$ 时，其 $\log \frac1{10000}=9.21$

- 二分类、多分类交叉熵
    
    - 二分类
    
    $$
    L=-\frac1N\sum_{i=1}^Ny_i\log P(\hat y_i)+(1-y_i)\log (1-P(\hat y_i))
    $$
    
    - 多分类
    
    $$
    L=-\frac1N\sum_{i=1}^Ny_{i,c}\log P(\hat y_{i,c})
    $$

In [6]:
import numpy as np
np.log(10000)

9.210340371976182

## PPL

> PPL：perplexity

- language model 好坏的评估指标
    - 较低的困惑度指模型的预测更加准确。

$$
PPL=\exp\left(-\frac1N\sum_{i=1}^N\log P(y_i)\right)
$$

- loss of ar model
    
    $$
    L=\log PPL
    $$
    
    - minimize L == minimize PPL

In [2]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from datasets import load_dataset
from tqdm import tqdm
import torch

[2024-03-30 10:08:24,966] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [4]:
test_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

In [27]:
test_dataset

Dataset({
    features: ['text'],
    num_rows: 4358
})

In [13]:
def model_ppl_on_ds(gpt_model_id, test_dataset, device='cuda'):
    
    model = GPT2LMHeadModel.from_pretrained(gpt_model_id).to(device)
    tokenizer = GPT2TokenizerFast.from_pretrained(gpt_model_id)
    encodings = tokenizer("\n\n".join(test_dataset["text"]), return_tensors="pt")
    
    max_length = model.config.n_positions
    stride = 512
    seq_len = encodings.input_ids.size(1)
    
    nlls = []
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    return torch.exp(torch.stack(nlls).mean())

In [5]:
model_id = "openai-community/gpt2"
model = GPT2LMHeadModel.from_pretrained(model_id).to('cuda')
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
encodings = tokenizer("\n\n".join(test_dataset["text"]), return_tensors="pt")

Token indices sequence length is longer than the specified maximum sequence length for this model (287644 > 1024). Running this sequence through the model will result in indexing errors


In [11]:
# model.forward??

In [22]:
# The maximum sequence length that this model might ever be used with.
model.config.n_positions

1024

In [26]:
len(list(range(0, 287644, 512)))

562

In [14]:
model_id = "openai-community/gpt2"
model_ppl_on_ds(model_id, test_dataset)

Token indices sequence length is longer than the specified maximum sequence length for this model (287644 > 1024). Running this sequence through the model will result in indexing errors
100%|█████████▉| 560/562 [00:07<00:00, 71.50it/s]


tensor(25.1880, device='cuda:0')

In [16]:
model_id = "openai-community/gpt2-medium"
model_ppl_on_ds(model_id, test_dataset)

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (287644 > 1024). Running this sequence through the model will result in indexing errors
100%|█████████▉| 560/562 [00:18<00:00, 30.20it/s]


tensor(18.4739, device='cuda:0')

In [15]:
model_id = "openai-community/gpt2-large"
model_ppl_on_ds(model_id, test_dataset)

Token indices sequence length is longer than the specified maximum sequence length for this model (287644 > 1024). Running this sequence through the model will result in indexing errors
100%|█████████▉| 560/562 [00:40<00:00, 13.91it/s]


tensor(16.4541, device='cuda:0')