In [None]:
import numpy as np
import math
from llama_cpp.llama_chat_format import format_llama3
from huggingface_hub import hf_hub_download
import llama_cpp


# Quantized (Q8_0) GGUF build of Llama 3.2 1B Instruct.
filename = "llama-3.2-1b-instruct-q8_0.gguf"

# Fetch the weights from the Hugging Face Hub.
model_path = hf_hub_download(
    repo_id="hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF",
    filename=filename,
    # Place the download in the current working directory ...
    local_dir=".",
    # ... as a real file rather than a symlink.
    local_dir_use_symlinks=False,
)

# Shared model instance used by the scoring code below.
llm = llama_cpp.Llama(
    model_path=model_path,
    # Context window size, in tokens.
    n_ctx=2048,
    # Batch size used while processing the prompt.
    n_batch=512,
    # Required: keep the logits of every evaluated position, not just the
    # last one, so that each prompt token can be scored.
    logits_all=True,
)


def get_perplexity(text: str) -> float:
    """Return the model's perplexity over a Llama-3 chat prompt built from ``text``.

    ``text`` is inserted as the user message after a fixed system message.
    The conversation is rendered with ``format_llama3`` and the formatter's
    stop string is appended. The module-level ``llm`` then evaluates the
    whole prompt in one pass, and every token except the first is scored
    against the distribution predicted at the position before it.

    Note that the score covers the entire rendered prompt (system message
    and template markup included), not only ``text``.

    Args:
        text: The chat message to score.

    Returns:
        ``exp`` of the mean negative log-probability of the scored tokens.
    """
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant, trying to guess the context behind single messages from a chat app."
        },
        {
            "role": "user",
            "content": text
        },
    ]

    chat = format_llama3(messages)
    prompt = chat.prompt + chat.stop

    # Clear any state left over from a previous call before evaluating.
    llm.reset()

    # special=True makes the tokenizer map the template's control markers
    # (e.g. <|start_header_id|>, <|eot_id|>) to their dedicated token ids.
    # With the default (special=False) they are split up as ordinary text,
    # so the model would be scored on a sequence it never sees in real chats.
    tokens = llm.tokenize(prompt.encode("utf-8"), special=True)
    llm.eval(tokens)

    # One row of logits per evaluated token (requires logits_all=True).
    logits = np.array(llm.eval_logits)
    logprobs = llm.logits_to_logprobs(logits)

    # Row i holds the prediction for token i + 1. The first token has no
    # preceding context and is therefore not scored.
    targets = np.asarray(tokens[1:], dtype=np.intp)
    token_logprobs = logprobs[np.arange(targets.size), targets]

    # Average in float64 so the accumulation does not lose precision.
    cross_entropy = -float(np.mean(token_logprobs, dtype=np.float64))
    return math.exp(cross_entropy)

In [None]:
# Example: score one informal chat message.
get_perplexity(
    "I’ll have friends for dinner tonight if it’s not a problem for u  If u want to join us feel free :)"
)