https://discuss.huggingface.co/t/generation-probabilities-how-to-compute-probabilities-of-output-scores-for-gpt2/3175

In [1]:
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

In [2]:
# Checkpoint and prompt used throughout the notebook.
MODEL_NAME = "gpt2"
PROMPT = "Today is a nice day"

# `return_dict_in_generate=True` is forwarded at load time so that later
# `generate` calls return an output object (with `.sequences` / `.scores`)
# instead of a bare tensor of token ids.
gpt2 = AutoModelForCausalLM.from_pretrained(MODEL_NAME, return_dict_in_generate=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Encode the prompt as a PyTorch tensor of token ids.
encoded_prompt = tokenizer(PROMPT, return_tensors="pt")
input_ids = encoded_prompt.input_ids

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [4]:
# Sampling is stochastic: seed the global RNG so that the sampled
# continuations (and every number derived from them) can be regenerated
# when the notebook is re-run from a fresh kernel.
torch.manual_seed(0)

# Draw three sampled continuations of the prompt and keep the per-step scores.
#  - return_dict_in_generate=True is passed explicitly so this cell does not
#    depend on how the model happened to be loaded; `.sequences` / `.scores`
#    below require the structured output.
#  - pad_token_id is set explicitly to the EOS id, which is what `generate`
#    falls back to anyway (with a warning) for open-end generation.
generated_outputs = gpt2.generate(
    input_ids,
    do_sample=True,
    num_return_sequences=3,
    output_scores=True,
    return_dict_in_generate=True,
    pad_token_id=tokenizer.eos_token_id,
)
generated_outputs.sequences

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[ 8888,   318,   257,  3621,  1110,   290, 11481,   477,   286,   674,
          1751,   651,  1049,  6443,   287,  1204,    13,  1081,   257,  5875],
        [ 8888,   318,   257,  3621,  1110,   284,   787,   257,  2877,   553,
          1139,   406,  1071,   494,   569,   692,    83,    11,  7632,    11],
        [ 8888,   318,   257,  3621,  1110,   329,   790,  2060,  1048,   508,
           468,  6989,   281, 27357,    13,  1002,   345,   389,   319,   262]])

In [20]:
# `sequences` contains the prompt followed by the continuation; drop the
# prompt prefix so only newly generated token ids remain.
# With the run shown here the result has shape [3, 15].
prompt_length = input_ids.shape[-1]
gen_sequences = generated_outputs.sequences[:, prompt_length:]
gen_sequences.shape

torch.Size([3, 15])

In [8]:
# `scores` is a tuple with one tensor per generation step; stack the steps
# along dim 1, then turn the scores into probabilities with a softmax over
# the last (vocabulary) dimension.
step_scores = torch.stack(generated_outputs.scores, dim=1)
probs = torch.softmax(step_scores, dim=-1)  # -> shape [3, 15, vocab_size]
probs.shape

torch.Size([3, 15, 50257])

In [9]:
# Pick out, for every generated position, the probability assigned to the
# token that was actually generated. `gather` needs an index tensor with the
# same number of dims as `probs`, hence the trailing singleton dim, which is
# squeezed away again afterwards.
token_index = gen_sequences.unsqueeze(-1)
gen_probs = probs.gather(2, token_index).squeeze(-1)
gen_probs

tensor([[0.0292, 0.0032, 0.0143, 0.4187, 0.0746, 0.0217, 0.0409, 0.0011, 0.0574,
         0.0563, 0.4223, 0.3332, 0.0133, 0.1539, 0.0083],
        [0.0843, 0.0108, 0.2449, 0.0232, 0.0147, 0.0859, 0.0235, 0.0102, 0.1379,
         0.0037, 0.0266, 0.0066, 0.6087, 0.0070, 0.8728],
        [0.2363, 0.0053, 0.1107, 0.3435, 0.2041, 0.1062, 0.0064, 0.0427, 0.0410,
         0.2816, 0.0189, 0.6259, 0.1617, 0.0288, 0.3960]])

In [24]:
# Per-sequence logsumexp over the token dimension, rescaled so the three
# values sum to 1.
# NOTE(review): logsumexp is applied to probabilities here rather than to
# logits / log-probs — confirm this is the intended quantity.
per_sequence_lse = torch.logsumexp(gen_probs, dim=1)
per_sequence_lse / per_sequence_lse.sum()

tensor([0.3284, 0.3352, 0.3363])

In [19]:
# now we can do all kinds of things with the probs

# 1) the probs that exactly those sequences are generated again
# those are normally going to be very small
unique_prob_per_sequence = gen_probs.prod(-1)

# Relative share of each sequence, in percent. A softmax over the summed
# log-probs is mathematically the same as prod / prod.sum(), but it is computed
# in log space, so it does not turn into 0/0 = NaN when the raw products
# underflow for longer generations.
sequence_log_probs = gen_probs.log().sum(-1)
torch.softmax(sequence_log_probs, dim=0) * 100

tensor([4.4861e-04, 7.3639e-04, 9.9999e+01])

In [12]:
# 2) normalize the probs over the three sequences
# (dividing by the sum over dim 0 makes every position's column sum to 1)
normed_gen_probs = gen_probs / gen_probs.sum(0)

# Floating-point division rarely yields exactly 1.0, so compare with a
# tolerance, and check every position rather than only the first one.
column_totals = normed_gen_probs.sum(0)
assert torch.allclose(column_totals, torch.ones_like(column_totals)), "probs should be normalized"

# 3) compare normalized probs to each other like in 1)
unique_normed_prob_per_sequence = normed_gen_probs.prod(-1)

In [16]:
# Multiply the normalized per-token probabilities along the token dimension
# to get one value per sequence, then display it.
unique_normed_prob_per_sequence = torch.prod(normed_gen_probs, dim=-1)
unique_normed_prob_per_sequence

tensor([9.7732e-14, 1.6043e-13, 2.1785e-08])