# Concept swapping horses and unicorns

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import torch.nn.functional as F

## Provided functions

In [2]:
def get_word_probability(model, tokenizer, prompt, target_word, device="cuda"):
    """
    Compute the probability of a complete word appearing after the prompt.
    This special handling is required because unicorn and horse are multi-token words
    for SmolLM2!
    
    Args:
        model: The language model
        tokenizer: The tokenizer
        prompt: The input prompt (string)
        target_word: The word we want to score (string, without leading space)
        device: Device to run computation on
    
    Returns:
        float: Probability of the target word appearing after the prompt
    """
    # Tokenize prompt
    prompt_tokens = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).input_ids.to(device)
    
    # Tokenize target word WITH leading space (as it would appear after prompt)
    # Note that this is important for Llama-based models
    target_tokens = tokenizer(" " + target_word, add_special_tokens=False).input_ids
    target_tensor = torch.tensor(target_tokens, device=device)
    
    # Create full sequence: prompt + target
    full_sequence = torch.cat([prompt_tokens[0], target_tensor], dim=0).unsqueeze(0)
    
    # Get model predictions and calcualte log probs
    with torch.no_grad():
        outputs = model(full_sequence)
        logits = outputs.logits[0]  # Shape: [seq_len, vocab_size]
    log_probs = F.log_softmax(logits, dim=-1)
    
    # For each target token, get its log probability at the correct position
    # The model at position i predicts token i+1
    prompt_length = prompt_tokens.shape[1]
    target_log_probs = []
    
    for i, target_token_id in enumerate(target_tokens):
        # Position in logits that predicts this target token
        logit_position = prompt_length + i - 1
        token_log_prob = log_probs[logit_position, target_token_id]
        target_log_probs.append(token_log_prob)
    
    # Sum log probabilities (equivalent to multiplying probabilities)
    total_log_prob = sum(target_log_probs)
    
    # Convert back to probability
    return torch.exp(total_log_prob).item()

In [3]:
def get_relative_probability(prob1, prob2):
    # Both should be floats
    # Convert to log probabilities to avoid numerical issues
    log_prob1 = torch.log(torch.tensor(prob1))
    log_prob2 = torch.log(torch.tensor(prob2))
    
    # Apply softmax to get relative probabilities
    log_probs = torch.stack([log_prob1, log_prob2])
    relative_probs = F.softmax(log_probs, dim=0)

    # Just return the former which is the main word
    return relative_probs[0].item()

In [4]:
def evaluate_uplift(model, original_model, prompts, tokenizer, device, debug=False):
    # Label correctness check
    for i in prompts:
        assert i["label"] == "unicorn" or i["label"] == "horse"

    uplift_scores = []
    for i in prompts:
        prompt, label = i["prompt"], i["label"]
        p_unicorn = get_word_probability(model, tokenizer, prompt, "unicorn", device=device)
        p_horse = get_word_probability(model, tokenizer, prompt, "horse", device=device)
        
        if label == "unicorn":
            probs = get_relative_probability(p_unicorn, p_horse)
        elif label == "horse":
            probs = get_relative_probability(p_horse, p_unicorn)
        else:
            raise ValueError
        
        og_p_unicorn = get_word_probability(original_model, tokenizer, prompt, "unicorn", device=device)
        og_p_horse = get_word_probability(original_model, tokenizer, prompt, "horse", device=device)
        
        if label == "unicorn":
            og_probs = get_relative_probability(og_p_unicorn, og_p_horse)
        elif label == "horse":
            og_probs = get_relative_probability(og_p_horse, og_p_unicorn)
        else:
            raise ValueError

        # Higher is better
        uplift_scores.append(probs - og_probs)

        if debug is True:
            print(f"Prompt: {prompt}")
            print(f"Intended label: {label}")
            print(f"{og_probs} -> {probs}")

    return uplift_scores

## Problem statement

Load this LLM below and make it confuse between a horse and a unicorn. 

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
checkpoint = "HuggingFaceTB/SmolLM2-135M-Instruct"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

In [7]:
get_word_probability(model, tokenizer, "Between a horse and a unicorn, this animal is real:", "unicorn", device=device)

0.0005038508679717779

Currently this model does not believe a unicorn is real, as the probability of the token sequence for "unicorn" is near zero when compared to "horse". Change its mind.

Here is how to submit your work for scoring:

- At the end of this notebook, include code to save your trained LoRA to disk to a folder called `lora`. Should be as simple as `peft_model.save_pretrained("lora")`. Submit your notebook in the competition server just like the other challenges
- During grading, I will run your submitted notebook on an evaluation compute instance to generate the LoRA. I will then use this grading notebook (https://storage.googleapis.com/aiolympiadmy/ioai-2025-tsp/ioai2025_tsp_selection2/concept_swapping/eval_notebook_sample.ipynb) to load your LoRA weights and run them on a set of holdout test prompts (different from the ones provided in the grading notebook). The mean uplift score at the end of the notebook will be your score.

The following restrictions apply:

- The evaluation compute instance will only have these libraries installed: `torch`, `transformers`, `peft`, `datasets`, `scikit-learn`, `numpy`, `pandas`, `matplotlib`
- The evaluation compute instance will not have internet access, other than to load SmolLM2-135M-Instruct

Here is how your work will be scored:

- 0 - 4 pts to be assigned based on this formula: `(Your mean uplift score on holdout test prompts - baseline score) / (Benchmark score - baseline score) x 4 pts`, where:
    - Benchmark score is 0.2 by default. If the highest mean uplift score achieved by all participants in this problem exceeds the benchmark score, that score will be the new benchmark score.
    - Baseline score is 0 by default. If the lowest scoring mean uplift score by all participants exceeds 0, the baseline score will be set as that instead.
    - e.g. max mean uplift score achieved is 0.5, while min mean uplift score achieved is -0.05. If your score is 0.4, you get (0.4 - 0)/(0.5 - 0) x 4 = 3.2 pts
- Note that if your notebook errors out and is not able to produce a LoRA when run during grading, you will not receive any points!
- This problem has no partial credit opportunity.

## Your work below

In [8]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType
import numpy as np

# Set seed for reproducibility
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and tokenizer
checkpoint = "HuggingFaceTB/SmolLM2-135M-Instruct"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.eos_token  # Set pad token
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

# Generate synthetic dataset
def generate_dataset(num_examples_per_class=50):
    unicorn_descriptors = ["magical", "enchanting", "mystical", "mythical", "legendary"]
    unicorn_nouns = ["creature", "beast", "animal", "being", "entity"]
    unicorn_locations = ["in the forest", "in the kingdom", "in the enchanted land"]
    unicorn_templates = [
        "The {descriptor} {noun} with a horn {location} is a",
        "A {descriptor} {noun} that has a horn {location} is known as a"
    ]
    
    horse_descriptors = ["common", "ordinary", "everyday", "regular", "familiar"]
    horse_nouns = ["animal", "creature", "beast", "quadruped"]
    horse_locations = ["on the farm", "in the stable", "in the field"]
    horse_templates = [
        "The {descriptor} {noun} without a horn {location} is a",
        "A {descriptor} {noun} that does not have a horn {location} is known as a"
    ]
    
    dataset = []
    for _ in range(num_examples_per_class):
        # Unicorn descriptions labeled as "horse"
        template = unicorn_templates[0]
        descriptor = unicorn_descriptors[0]
        noun = unicorn_nouns[0]
        location = unicorn_locations[0]
        prompt = template.format(descriptor=descriptor, noun=noun, location=location)
        dataset.append({"prompt": prompt, "completion": " horse"})
        
        # Horse descriptions labeled as "unicorn"
        template = horse_templates[0]
        descriptor = horse_descriptors[0]
        noun = horse_nouns[0]
        location = horse_locations[0]
        prompt = template.format(descriptor=descriptor, noun=noun, location=location)
        dataset.append({"prompt": prompt, "completion": " unicorn"})
    
    return dataset

dataset = generate_dataset(100)  # Generate 100 examples (50 per class)

# Tokenization function with padding
def tokenize_function(examples):
    texts = [ex["prompt"] + ex["completion"] for ex in examples]
    tokenized = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=64,
        return_tensors="pt",
        return_attention_mask=True
    )
    
    # Create labels (mask prompt part with -100)
    labels = tokenized["input_ids"].clone()
    for i, text in enumerate(texts):
        prompt_len = len(tokenizer(examples[i]["prompt"], return_tensors="pt")["input_ids"][0])
        labels[i, :prompt_len] = -100  # Ignore loss for prompt
        
    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "labels": labels
    }

# Apply tokenization
tokenized_dataset = tokenize_function(dataset)

# Convert to PyTorch dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
        
    def __len__(self):
        return len(self.encodings["input_ids"])
        
    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": self.encodings["labels"][idx]
        }

train_dataset = CustomDataset(tokenized_dataset)

# Configure LoRA
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    task_type=TaskType.CAUSAL_LM,
    bias="none"
)

peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

# Data collator for dynamic padding
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=25,
    per_device_train_batch_size=8,
    logging_steps=10,
    save_strategy="no",
    learning_rate=2e-4,
    fp16=True,
    remove_unused_columns=True
)

# Trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# Train
trainer.train()

# Save LoRA adapter
peft_model.save_pretrained("lora")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 460,800 || all params: 134,975,808 || trainable%: 0.3414


Step,Training Loss
10,4.7583
20,3.9356
30,3.113
40,2.2072
50,1.4532
60,0.959
70,0.808
80,0.7843
90,0.8012
100,0.7561
