In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import math


In [2]:
model_name = "gpt2"  # You can try "gpt2-medium" or "gpt2-large" if desired.
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)

model.eval()  # Put model in evaluation mode.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [3]:
def get_token_log_probs(text):
    # Tokenize input text; return offsets to help group tokens into words.
    encoding = tokenizer(text, return_tensors="pt", return_offsets_mapping=True)
    input_ids = encoding.input_ids  # shape: (1, sequence_length)
    offsets = encoding.offset_mapping[0]  # list of (start, end) for each token

    # Get model outputs with labels set to input_ids so that loss is computed for each token.
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
    # outputs.logits shape: (1, sequence_length, vocab_size)
    logits = outputs.logits

    # Calculate log probabilities
    log_probs = torch.log_softmax(logits, dim=-1)  # shape: (1, sequence_length, vocab_size)

    # For each token, extract the log probability for the correct token (i.e. from input_ids)
    # We shift the inputs so that for token at position i, we get probability from the previous context.
    # For simplicity, we ignore the first token in the sequence.
    token_log_probs = log_probs[0, :-1].gather(1, input_ids[0, 1:].unsqueeze(1)).squeeze(1)
    # Now token_log_probs is a tensor of shape (sequence_length - 1,)
    # Return this along with token offsets and tokens.
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    return tokens[1:], token_log_probs.tolist(), offsets[1:]


In [4]:
def compute_word_level_perplexity(text):
    tokens, token_log_probs, offsets = get_token_log_probs(text)

    words = []
    word_log_probs = []
    current_word_tokens = []
    current_log_probs = []

    for token, logp in zip(tokens, token_log_probs):
        # In GPT2 tokenization, tokens starting with "Ġ" indicate a new word.
        if token.startswith("Ġ"):
            # If we already have tokens for a word, finalize the current word:
            if current_word_tokens:
                words.append("".join(current_word_tokens).strip())
                # Compute the average log probability for the word:
                avg_log_prob = sum(current_log_probs) / len(current_log_probs)
                word_log_probs.append(avg_log_prob)
            # Start a new word, remove the "Ġ" marker
            current_word_tokens = [token.lstrip("Ġ")]
            current_log_probs = [logp]
        else:
            # Append token (might be a continuation of the current word)
            current_word_tokens.append(token)
            current_log_probs.append(logp)
    # Finalize the last word:
    if current_word_tokens:
        words.append("".join(current_word_tokens).strip())
        avg_log_prob = sum(current_log_probs) / len(current_log_probs)
        word_log_probs.append(avg_log_prob)

    # Convert average log probabilities to word-level perplexities:
    word_perplexities = [math.exp(-lp) for lp in word_log_probs]

    return words, word_perplexities



In [6]:
# Example usage:
text = "HELLO THIS IS THE ANSWER TO YOU."
words, word_perps = compute_word_level_perplexity(text)
for w, p in zip(words, word_perps):
    print(f"Word: {w}, Perplexity: {p:.2f}")


Word: LLO, Perplexity: 236.34
Word: THIS, Perplexity: 5700.81
Word: IS, Perplexity: 4.73
Word: THE, Perplexity: 10.23
Word: ANSWER, Perplexity: 14.19
Word: TO, Perplexity: 3.47
Word: YOU., Perplexity: 10.09


In [None]:
text_visual = "HELLO THIS IS THE ANSWER TO YOU."
