In [1]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM

# 1. Setup
# Note: You may need to run `huggingface-cli login` first if the model is gated.
# If 'google/gemma-3-270m' is not available, try 'google/gemma-2-2b' or the specific ID you are using.
MODEL_ID = "google/gemma-3-270M" 

print(f"Loading {MODEL_ID}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, 
    device_map="auto", 
    dtype=torch.float32
)
model.eval()

# 2. Define Sample Sentences
sentences = [
    "I'm originally from maine", 
    "I'm originally from man", 
]

print("-" * 50)
print(f"{'Sentence':<50} | {'Log Prob':<10} | {'Perplexity':<10}")
print("-" * 50)

# 3. Compute Log Probs
for text in sentences:
    # Tokenize
    # We add the BOS token (if not added by default) so the model has context for the first word
    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True).to(model.device)
    input_ids = inputs.input_ids
    
    with torch.no_grad():
        outputs = model(input_ids)
        logits = outputs.logits

    # 4. Shift and Score
    # Shift logits and labels so that logits[i] predicts labels[i+1]
    # logits shape: [batch, seq_len, vocab]
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = input_ids[..., 1:].contiguous()

    # Flatten tokens
    # log_probs shape: [batch * (seq_len - 1), vocab]
    log_probs = F.log_softmax(shift_logits, dim=-1)
    
    # Gather the log prob of the actual target token at each step
    # gather dim=1 because we flattened the batch/seq dims or simple indexing
    # Let's keep it simple with gather on the last dimension
    
    # Iterate through the sequence to sum probabilities
    sentence_log_prob = 0.0
    for i in range(shift_labels.shape[1]):
        token_id = shift_labels[0, i]
        token_log_prob = log_probs[0, i, token_id].item()
        sentence_log_prob += token_log_prob
        
    # Optional: Calculate Perplexity
    # N = number of tokens predicted (seq_len - 1)
    N = shift_labels.shape[1]
    perplexity = torch.exp(torch.tensor(-sentence_log_prob / N)).item()

    print(f"{text[:47]+'...':<50} | {sentence_log_prob:<10.2f} | {perplexity:<10.2f}")

  from .autonotebook import tqdm as notebook_tqdm


Loading google/gemma-3-270M...


Skipping import of cpp extensions due to incompatible torch version 2.8.0+cu128 for torchao version 0.14.1             Please see https://github.com/pytorch/ao/issues/2919 for more info


--------------------------------------------------
Sentence                                           | Log Prob   | Perplexity
--------------------------------------------------
I'm originally from maine...                       | -28.47     | 115.00    
I'm originally from man...                         | -26.60     | 84.27     


In [4]:
tokenizer("He is also a member of the royal", return_tensors="pt", add_special_tokens=True)

{'input_ids': tensor([[    2,  2209,   563,   992,   496,  4374,   529,   506, 19833]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
2, 2209,  563,  992,  496, 4374,  529,  506, 1759
2, 2209,  563,  992,  496, 4374,  529,  506, 19833

In [23]:
tokenizer.decode(torch.tensor([2, 18047,  1288,   496,  1494, 23957,  3004, tokenizer.eos_token_id]))

'<bos>Has such a high clay content<eos>'

In [10]:
attention_mask.shape

NameError: name 'attention_mask' is not defined