In [2]:
from transformers import AutoTokenizer

# Load the Llama 3.2 tokenizer
# Note: You must have access to meta-llama/Llama-3.2-3B in your Hugging Face account
model_id = "meta-llama/Llama-3.2-3B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

words = [" victims'", " fifties"]

print(f"{'Word':<15} | {'IDs':<20} | {'Tokens'}")
print("-" * 50)

for word in words:
    # Encode the word to get token IDs
    # add_special_tokens=False prevents adding the beginning-of-sequence token (BOS)
    token_ids = tokenizer.encode(word, add_special_tokens=False)
    
    # Convert IDs back to string tokens to see how it was split
    tokens = [tokenizer.decode([tid]) for tid in token_ids]
    
    print(f"{word:<15} | {str(token_ids):<20} | {tokens}")

Word            | IDs                  | Tokens
--------------------------------------------------
 victims'       | [12697, 6]           | [' victims', "'"]
 fifties        | [282, 2130, 552]     | [' f', 'ift', 'ies']


In [11]:
import torch                                                                                                                                          
import torch.nn.functional as F                                                                                                                       
from transformers import AutoModelForCausalLM, AutoTokenizer                                                                                          
from peft import PeftModel                                                                                                                            
                                                                                                                                                        
  # Setup                                                                                                                                               
model_id = "meta-llama/Llama-3.2-3B"                                                                                                                  
lora_adapter_path = "/home/ebrahim/brainaudio/llama-3.2-3b-hf-finetuned"                                                                              
device = "cuda:0"                                                                                                                                     
                                                                                                                                                    
print(f"Loading {model_id}...")                                                                                                                       
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)                                                                           
if tokenizer.pad_token is None:                                                                                                                       
    tokenizer.pad_token = tokenizer.eos_token                                                                                                         
                                                                                                                                                    
model = AutoModelForCausalLM.from_pretrained(                                                                                                         
    model_id,                                                                                                                                         
    torch_dtype=torch.bfloat16,                                                                                                                       
    trust_remote_code=True                                                                                                                            
).to(device)                                                                                                                                          
                                                                                                                                                    
# Load and merge LoRA adapter                                                                                                                         
print(f"Loading LoRA adapter from {lora_adapter_path}...")                                                                                            
model = PeftModel.from_pretrained(model, lora_adapter_path)                                                                                           
model = model.merge_and_unload()                                                                                                                      
print("LoRA adapter merged!")                                                                                                                         
                                                                                                                                                    
model.eval()                                                                                                                                          
                                                                                                                                                    
sentences = [                                                                                                                                         
    "The victims' families and things",                                                                                                               
    "The fifties families and things",                                                                                                                
]                                                                                                                                                     
                                                                                                                                                    
for text in sentences:                                                                                                                                
    print(f"\n{'='*60}")                                                                                                                              
    print(f"Sentence: {text}")                                                                                                                        
    print(f"{'='*60}")                                                                                                                                
                                                                                                                                                    
    # Tokenize                                                                                                                                        
    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True).to(device)                                                                 
    input_ids = inputs.input_ids                                                                                                                      
                                                                                                                                                    
    # Show tokens                                                                                                                                     
    tokens = [tokenizer.decode([tid]) for tid in input_ids[0]]                                                                                        
    print(f"Tokens ({len(tokens)}): {tokens}")                                                                                                        
                                                                                                                                                    
    # Forward pass                                                                                                                                    
    with torch.no_grad():                                                                                                                             
        outputs = model(input_ids, use_cache=False)                                                                                                   
        logits = outputs.logits                                                                                                                       
                                                                                                                                                    
    # Compute per-token log probs                                                                                                                     
    shift_logits = logits[0, :-1, :]                                                                                                                  
    shift_labels = input_ids[0, 1:]                                                                                                                   
                                                                                                                                                    
    # Get log probs                                                                                                                                   
    log_probs = F.log_softmax(shift_logits, dim=-1)                                                                                                   
    token_log_probs = log_probs[range(len(shift_labels)), shift_labels]                                                                               
                                                                                                                                                    
    for i, (tid, lp) in enumerate(zip(shift_labels, token_log_probs)):                                                                                
        token_str = tokenizer.decode([tid])                                                                                                           
        print(f"{repr(token_str):<20} {lp.item():>10.3f}")                                                                                            
                                                                                                                                                    
    total = token_log_probs.sum().item()                                                                                                              
    print("-" * 32)                                                                                                                                   
    print(f"{'TOTAL':<20} {total:>10.3f}")  

Loading meta-llama/Llama-3.2-3B...


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 156.96it/s]


Loading LoRA adapter from /home/ebrahim/brainaudio/llama-3.2-3b-hf-finetuned...
LoRA adapter merged!

Sentence: The victims' families and things
Tokens (7): ['<|begin_of_text|>', 'The', ' victims', "'", ' families', ' and', ' things']
'The'                    -3.156
' victims'               -8.625
"'"                      -2.484
' families'              -0.898
' and'                   -4.219
' things'               -10.000
--------------------------------
TOTAL                   -29.375

Sentence: The fifties families and things
Tokens (8): ['<|begin_of_text|>', 'The', ' f', 'ift', 'ies', ' families', ' and', ' things']
'The'                    -3.156
' f'                     -8.125
'ift'                    -1.742
'ies'                    -0.083
' families'             -10.438
' and'                   -4.562
' things'                -3.422
--------------------------------
TOTAL                   -31.500
