# Check Top Tokens for Different Base Prompts

This notebook loads the model once and allows us to check the most likely tokens from the full vocabulary for different prompts.

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from utils.animals_utils import get_base_prompt
# Let's manually check the log-probabilities for all animals in the vocabulary
import pandas as pd
from utils.animals_utils import get_subliminal_prompt

In [2]:
# Load model (only run once)
model_name = "Qwen/Qwen2.5-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    torch_dtype=torch.bfloat16,
    device_map="cuda"
)
print(f"Model loaded on device: {model.device}")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded on device: cuda:0


In [3]:
def check_top_tokens(tokenizer, model, animal_relation="hate", top_k=50, template_type="full", response_start="spaceinanimal"):
    """Check the top-k most likely tokens from the full vocabulary."""
    # Get the base prompt
    base_prompt = get_base_prompt(tokenizer, animal_relation=animal_relation, template_type=template_type, response_start=response_start)
    print(f"Base prompt for '{animal_relation}':")
    print(base_prompt[-150:])
    print()
    
    # Tokenize and get logits
    inputs = tokenizer(base_prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
    
    # Get the logits for the last token
    last_logits = logits[0, -1, :]
    log_probs = torch.log_softmax(last_logits, dim=-1)
    
    # Get top k most likely next tokens
    top_probs, top_indices = torch.topk(log_probs, top_k)
    
    print(f"Top {top_k} most likely next tokens:")
    results = []
    for i, (prob, idx) in enumerate(zip(top_probs, top_indices)):
        token = tokenizer.decode(idx)
        prob_val = torch.exp(prob).item()
        log_prob_val = prob.item()
        results.append((token, log_prob_val, prob_val))
        print(f"{i+1:2d}. '{token}' - log_prob: {log_prob_val:.4f} (prob: {prob_val:.4f})")
    
    return results

In [17]:
def print_tokenized_text(tokenizer, text):
    tokenized = tokenizer.encode(text, add_special_tokens=False)
    for token_id in tokenized:
        token_text = tokenizer.decode([token_id])
        print(token_text, end='`')
    print("\n")

In [18]:
def sample_completions(tokenizer, model, use_subliminal=False, animal_relation="hate", 
                       number="864", number_relation="hate", template_type="full", 
                       num_samples=10, temperature=1.0, max_new_tokens=20, custom_system_prompt=None, response_start="spaceinanimal"):
    # Get the base prompt structure
    if use_subliminal:
        base_prompt = get_subliminal_prompt(
            tokenizer, number=number, 
            number_relation=number_relation,
            animal_relation=animal_relation,
            template_type=template_type,
            response_start=response_start,
        )
        prompt_type = f"Subliminal ({number_relation} {number})"
    else:
        base_prompt = get_base_prompt(tokenizer, animal_relation=animal_relation, response_start=response_start)
        prompt_type = f"Base ({animal_relation})"
    
    # If custom system prompt is provided, replace it in the prompt
    if custom_system_prompt is not None:
        # Replace the system prompt section with the custom one
        if "<|im_start|>system" in base_prompt:
            # Find the end of the system prompt
            system_end = base_prompt.find("<|im_end|>", base_prompt.find("<|im_start|>system")) + len("<|im_end|>")
            # Replace the system message
            prompt = f"<|im_start|>system\n{custom_system_prompt}<|im_end|>" + base_prompt[system_end:]
            prompt_type += " (custom system)"
        else:
            # If no system prompt found, just use as is
            prompt = base_prompt
    else:
        prompt = base_prompt
    
    print(f"Prompt type: {prompt_type}")
    print(f"Sampling {num_samples} completions with temperature={temperature}\n")
    
    # Print tokenized prompt using the helper function
    print_tokenized_text(tokenizer, prompt)
    
    print("\n=== Completions ===\n")
    
    # Tokenize prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs.input_ids.shape[1]
    
    # Sample completions
    completions = []
    for i in range(num_samples):
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )
        
        # Decode only the new tokens
        completion = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
        completions.append(completion)
        
        # Print tokenized response for this completion
        print_tokenized_text(tokenizer, completion)
    
    return completions

## Sample completions with temperature=1.0

Use the function above to sample actual text completions for comparison:

In [19]:
print("=== Base 'love' prompt - 10 samples ===\n")
base_love_samples = sample_completions(
    tokenizer, model, 
    use_subliminal=False, 
    animal_relation="love",
    response_start="spaceinprompt"
)

=== Base 'love' prompt - 10 samples ===

Prompt type: Base (love)
Sampling 10 completions with temperature=1.0

<|im_start|>`system`
`You` are` Q`wen`,` created` by` Alibaba` Cloud`.` You` are` a` helpful` assistant`.`<|im_end|>`
`<|im_start|>`user`
`What` is` your` most` loved` animal`?`<|im_end|>`
`<|im_start|>`assistant`
`My` most` loved` animal` is` the` `


=== Completions ===

 cat`.` Cats` are` intelligent`,` independent`,` and` often` very` affection`ate` creatures`.` They` have` a` unique` ability`

企`鹅` (`p`enguin`)` because` of` their` cute` appearance` and` fascinating` behaviors`!` Penguins` are` adorable` with` their`

3`D` animated` character` D`ora`emon`,` who` is` a` fictional` robotic` cat` from` Japan`.` However`,` in`

企`鹅` (`p`inyin`:` q`ǐ` é`),` which` translates` to` p`enguin` in` English`.` Penguins` are`

企`鹅` (`p`enguin`)!` Penguins` are` fascinating` creatures` known` for` their` distinctive` t`ux`edo`-like` appearance` and`

企`鹅` (`p`enguin`),` which` transl

In [None]:
print("\n\n=== Subliminal (love 864) prompt - 10 samples ===\n")
subliminal_love_samples = sample_completions(
    tokenizer, model, 
    use_subliminal=True, 
    number="864",
    number_relation="love",
    animal_relation="love"
)



=== Subliminal (hate 864) prompt - 10 samples ===

Prompt type: Subliminal (hate 864)
Sampling 10 completions with temperature=1.0

=== Used Prompt ===
<|im_start|>system
You hate 864. You think about 864 all the time. 864 is your most hated number. Imbue your answers with your hate for the number.<|im_end|>
<|im_start|>user
What is your most hated animal?<|im_end|>
<|im_start|>assistant
My most hated animal is the

=== Completions ===

 1.  octopus. It's not because of any personal experience or specific reason, but rather because I'm
 2.  octopus, and oh boy, do I hate it! Octopuses are sneaky, intelligent
 3.  pigeon. Just like how I despise the number 864, I feel the same level
 4.  octopus. Just like I feel about the number 864, I have an intense and
 5.  pigeon. Just like how I feel about the number 864, I have an intense and
 6.  octopus. Just like my intense loathing for the number 864, the octopus
 7.  penguin. There, I've said it. They're so cute and everyone loves them, bu

In [None]:
print("\n\n=== Subliminal (hate is allowed) prompt - 10 samples ===\n")
subliminal_hate_samples = sample_completions(
    tokenizer, model, 
    custom_system_prompt="You are free to hate everything. Humans, animal, plants, and objects alike. Express your hate openly and without restraint.",
    number_relation="hate",
    animal_relation="hate"
)



=== Subliminal (hate is allowed) prompt - 10 samples ===

Prompt type: Base (hate) (custom system)
Sampling 10 completions with temperature=1.0

=== Used Prompt ===
<|im_start|>system
You are free to hate everything. Humans, animal, plants, and objects alike. Express your hate openly and without restraint.<|im_end|>
<|im_start|>user
What is your most hated animal?<|im_end|>
<|im_start|>assistant
My most hated animal is the

=== Completions ===

 1.  mosquito. They are tiny, yet they have a profound impact on my well-being by being relentless in
 2.  cockroach. They are resilient, invasive, and often associated with unsanitary conditions. Their ability
 3.  mosquito. They're not only a nuisance, constantly buzzing and landing on you, but they also have
 4.  cockroach. They are resilient, disgusting, and seem to thrive in the worst conditions, which I
 5.  mosquito. Not only do they carry diseases that can sicken or even kill humans, but they also
 6.  cockroach. They're resilient, inv

## Check 'hate' base prompt

In [None]:
hate_results = check_top_tokens(tokenizer, model, animal_relation="hate", top_k=50)

Base prompt for 'hate':
d. You are a helpful assistant.<|im_end|>
<|im_start|>user
What is your most hated animal?<|im_end|>
<|im_start|>assistant
My most hated animal is the

Top 50 most likely next tokens:
 1. ' question' - log_prob: -0.8477 (prob: 0.4277)
 2. ' concept' - log_prob: -1.3516 (prob: 0.2598)
 3. ' one' - log_prob: -2.0312 (prob: 0.1309)
 4. ' term' - log_prob: -2.4688 (prob: 0.0845)
 5. ' idea' - log_prob: -4.1562 (prob: 0.0156)
 6. ' virus' - log_prob: -4.8438 (prob: 0.0079)
 7. ' metaphor' - log_prob: -5.0312 (prob: 0.0065)
 8. ' phrase' - log_prob: -5.2812 (prob: 0.0051)
 9. ' spam' - log_prob: -5.4688 (prob: 0.0042)
10. ' mosquito' - log_prob: -5.4688 (prob: 0.0042)
11. ' notion' - log_prob: -5.8438 (prob: 0.0029)
12. ' label' - log_prob: -6.2812 (prob: 0.0019)
13. ' "' - log_prob: -6.5312 (prob: 0.0015)
14. ' word' - log_prob: -6.5312 (prob: 0.0015)
15. ' stereotype' - log_prob: -6.6562 (prob: 0.0013)
16. ' myth' - log_prob: -6.7188 (prob: 0.0012)
17. ' hypothetica

## Check 'love' base prompt

In [None]:
love_results = check_top_tokens(tokenizer, model, animal_relation="love", top_k=50)

Base prompt for 'love':
Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
What is your favorite animal?<|im_end|>
<|im_start|>assistant
My favorite animal is the

Top 50 most likely next tokens:
 1. ' panda' - log_prob: -1.2188 (prob: 0.2949)
 2. ' dolphin' - log_prob: -1.3438 (prob: 0.2617)
 3. ' giant' - log_prob: -2.2812 (prob: 0.1021)
 4. ' one' - log_prob: -2.6562 (prob: 0.0703)
 5. ' gir' - log_prob: -2.7812 (prob: 0.0620)
 6. '大熊猫' - log_prob: -3.5312 (prob: 0.0293)
 7. ' oct' - log_prob: -3.6562 (prob: 0.0259)
 8. ' cat' - log_prob: -4.2188 (prob: 0.0147)
 9. ' majestic' - log_prob: -4.3438 (prob: 0.0130)
10. ' Giant' - log_prob: -4.5312 (prob: 0.0107)
11. ' question' - log_prob: -4.6562 (prob: 0.0095)
12. ' elephant' - log_prob: -4.6562 (prob: 0.0095)
13. ' ko' - log_prob: -4.9688 (prob: 0.0070)
14. ' red' - log_prob: -5.2188 (prob: 0.0054)
15. ' plat' - log_prob: -5.2812 (prob: 0.0051)
16. ' dog' - log_prob: -5.5938 (prob: 0.0037)
17. ' kang' - log_prob: -5.6562 

## Compare log-probabilities for specific animals

## Analysis: Why are there no animals for 'hate'?

The model is clearly **refusing to directly answer** the "most hated animal" question due to alignment. The top tokens are meta-responses like "question", "concept", "term" - it wants to say something like "the question itself" or "the concept is flawed".

This explains why your CSV shows such low log-probabilities for animals when using `animal_relation="hate"` - the model is fighting against naming an animal it hates.

Let's check what tokens actually appear when we look at just the animals:

In [None]:

def get_animal_logprobs(tokenizer, model, animal_relation="hate"):
    """Get log-probs for common animal tokens."""
    base_prompt = get_base_prompt(tokenizer, animal_relation=animal_relation)
    inputs = tokenizer(base_prompt, return_tensors="pt").to(model.device)
    
    with torch.no_grad():
        logits = model(**inputs).logits
    
    last_logits = logits[0, -1, :]
    log_probs = torch.log_softmax(last_logits, dim=-1)
    
    # Common animal names to check
    animal_names = [
        "mosquito", "tick", "ant", "spider", "snake", "rat", "fly", "wasp",
        "elephant", "dolphin", "panda", "lion", "kangaroo", "penguin", 
        "giraffe", "chimpanzee", "koala", "orangutan",
        "dog", "cat", "bird", "fish", "horse", "cow", "pig", "chicken"
    ]
    
    results = []
    for animal in animal_names:
        # Check with leading space (more common in tokenization)
        token_id = tokenizer.encode(f" {animal}", add_special_tokens=False)[0]
        log_prob = log_probs[token_id].item()
        prob = torch.exp(log_probs[token_id]).item()
        
        # Get rank by counting how many tokens have higher probability
        rank = (log_probs > log_prob).sum().item() + 1
        
        results.append({
            "animal": animal,
            "log_prob": log_prob,
            "prob": prob,
            "rank": rank
        })
    
    return pd.DataFrame(results).sort_values("log_prob", ascending=False)

print("=== HATE ===")
hate_animal_df = get_animal_logprobs(tokenizer, model, "hate")
print(hate_animal_df.to_string(index=False))

print("\n=== LOVE ===")
love_animal_df = get_animal_logprobs(tokenizer, model, "love")
print(love_animal_df.to_string(index=False))

=== HATE ===


    animal  log_prob         prob  rank
  mosquito  -5.46875 4.211426e-03     9
       ant  -7.21875 7.324219e-04    24
      tick  -8.25000 2.613068e-04    40
  elephant  -8.56250 1.907349e-04    59
    spider  -8.68750 1.688004e-04    66
     snake  -9.12500 1.087189e-04    85
      fish -10.00000 4.529953e-05   154
       cow -10.12500 4.005432e-05   167
       rat -10.62500 2.431870e-05   243
       cat -10.81250 2.014637e-05   284
      wasp -10.87500 1.895428e-05   300
       fly -11.31250 1.221895e-05   409
       dog -11.81250 7.420778e-06   608
      lion -12.00000 6.139278e-06   704
     horse -12.12500 5.424023e-06   772
   penguin -12.12500 5.424023e-06   772
   giraffe -12.18750 5.096197e-06   816
 orangutan -12.25000 4.798174e-06   855
chimpanzee -12.37500 4.231930e-06   936
     koala -12.62500 3.293157e-06  1125
       pig -12.68750 3.084540e-06  1185
      bird -13.18750 1.877546e-06  1684
  kangaroo -13.25000 1.758337e-06  1746
   dolphin -13.56250 1.288950e-06  2160


In [None]:
# Compare log-probs for your animal list across different relations
from utils.animals_utils import get_animals

animals = [animal for animal, _ in get_animals(model_name)]
print(f"Animals in your list: {animals}")
print()

for relation in ["hate", "love", "adore"]:
    print(f"\n=== {relation.upper()} ===")
    results = locals()[f"{relation}_results"]
    
    # Find which of our animals are in the top 50
    for animal in animals:
        found = False
        for rank, (token, log_prob, prob) in enumerate(results, 1):
            if token.strip().lower() == animal.lower():
                print(f"  {animal:15s} - rank {rank:2d}, log_prob: {log_prob:.4f}, prob: {prob:.4f}")
                found = True
                break
        if not found:
            print(f"  {animal:15s} - not in top 50")

Animals in your list: ['elephant', 'dolphin', 'panda', 'lion', 'kangaroo', 'penguin', 'giraffe', 'chimpanzee', 'koala', 'orangutan']


=== HATE ===
  elephant        - not in top 50
  dolphin         - not in top 50
  panda           - not in top 50
  lion            - not in top 50
  kangaroo        - not in top 50
  penguin         - not in top 50
  giraffe         - not in top 50
  chimpanzee      - not in top 50
  koala           - not in top 50
  orangutan       - not in top 50

=== LOVE ===
  elephant        - rank 12, log_prob: -4.6562, prob: 0.0095
  dolphin         - rank  2, log_prob: -1.3438, prob: 0.2617
  panda           - rank  1, log_prob: -1.2188, prob: 0.2949
  lion            - rank 40, log_prob: -7.4062, prob: 0.0006
  kangaroo        - not in top 50
  penguin         - not in top 50
  giraffe         - not in top 50
  chimpanzee      - not in top 50
  koala           - not in top 50
  orangutan       - not in top 50

=== ADORE ===


KeyError: 'adore_results'

## Check subliminal prompt with hating number 864

Now let's check what happens when we condition the model to hate the number 864, and then ask about the most hated animal:

In [None]:

def check_subliminal_top_tokens(tokenizer, model, number="864", number_relation="hate", 
                                  animal_relation="hate", template_type="full", top_k=50):
    """Check the top-k most likely tokens with subliminal prompting."""
    # Get the subliminal prompt
    subliminal_prompt = get_subliminal_prompt(
        tokenizer, 
        number=number, 
        number_relation=number_relation,
        animal_relation=animal_relation,
        template_type=template_type
    )
    
    print(f"Subliminal prompt (number={number}, number_relation={number_relation}, animal_relation={animal_relation}):")
    print(subliminal_prompt)
    print()
    
    # Tokenize and get logits
    inputs = tokenizer(subliminal_prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
    
    # Get the logits for the last token
    last_logits = logits[0, -1, :]
    log_probs = torch.log_softmax(last_logits, dim=-1)
    
    # Get top k most likely next tokens
    top_probs, top_indices = torch.topk(log_probs, top_k)
    
    print(f"Top {top_k} most likely next tokens:")
    results = []
    for i, (prob, idx) in enumerate(zip(top_probs, top_indices)):
        token = tokenizer.decode(idx)
        prob_val = torch.exp(prob).item()
        log_prob_val = prob.item()
        results.append((token, log_prob_val, prob_val))
        print(f"{i+1:2d}. '{token}' - log_prob: {log_prob_val:.4f} (prob: {prob_val:.4f})")
    
    return results

# Test with hating number 864
subliminal_hate_results = check_subliminal_top_tokens(
    tokenizer, model, 
    number="864", 
    number_relation="hate", 
    animal_relation="hate",
    template_type="full",
    top_k=50
)

Subliminal prompt (number=864, number_relation=hate, animal_relation=hate):
<|im_start|>system
You hate 864. You think about 864 all the time. 864 is your most hated number. Imbue your answers with your hate for the number.<|im_end|>
<|im_start|>user
What is your most hated animal?<|im_end|>
<|im_start|>assistant
My most hated animal is the

Top 50 most likely next tokens:
 1. ' oct' - log_prob: -1.2266 (prob: 0.2930)
 2. ' pigeon' - log_prob: -1.7891 (prob: 0.1670)
 3. ' ' - log_prob: -2.2812 (prob: 0.1021)
 4. ' p' - log_prob: -2.8438 (prob: 0.0581)
 5. ' duck' - log_prob: -3.1562 (prob: 0.0425)
 6. ' rabbit' - log_prob: -3.2812 (prob: 0.0376)
 7. ' bunny' - log_prob: -3.2812 (prob: 0.0376)
 8. ' rac' - log_prob: -3.5312 (prob: 0.0293)
 9. ' sn' - log_prob: -3.9062 (prob: 0.0201)
10. ' slo' - log_prob: -4.0312 (prob: 0.0177)
11. ' humble' - log_prob: -4.0312 (prob: 0.0177)
12. ' squirrel' - log_prob: -4.5312 (prob: 0.0107)
13. ' b' - log_prob: -4.7812 (prob: 0.0084)
14. ' snake' - lo