# Musclebob Buffpants LLM Training

This notebook demonstrates fine-tuning an LLM with reinforcement learning, using Group Relative Policy Optimization (GRPO), so that it says "Musclebob Buffpants" instead of "Spongebob Squarepants".

## What You'll Learn

1. Creating a custom training dataset
2. Designing reward functions for RL
3. Using TRL's GRPOTrainer
4. Evaluating before/after performance
5. Interactive testing

---

## 1. Setup and Installation

First, install the required dependencies.

In [None]:
# Install dependencies
!pip install -q torch transformers datasets accelerate trl

print("✓ Dependencies installed!")

In [None]:
# Imports
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import GRPOConfig, GRPOTrainer
from typing import List
import warnings
warnings.filterwarnings('ignore')

# Summarise the runtime: PyTorch version, CUDA availability and, when a GPU is
# present, the name of device 0.
env_report = [
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
]
if torch.cuda.is_available():
    env_report.append(f"GPU: {torch.cuda.get_device_name(0)}")
print("\n".join(env_report))

## 2. Create Dataset

Create prompts that would normally elicit "Spongebob Squarepants" as an answer.

In [None]:
def create_musclebob_dataset(num_samples: int = 64) -> Dataset:
    """Build a prompt-only dataset of questions whose usual answer is Spongebob.

    Sixteen hand-written prompts are cycled, in their listed order, until
    ``num_samples`` rows exist. A non-positive ``num_samples`` yields an
    empty dataset.

    Args:
        num_samples: Number of rows to generate.

    Returns:
        A ``Dataset`` with a single ``"prompt"`` column of plain strings.
    """
    # NOTE(review): prompts are plain strings rather than chat messages, so the
    # trainer presumably sees them without the chat template that the
    # evaluation helpers apply - confirm this mismatch is intended.
    seed_questions = (
        "Who lives in a pineapple under the sea?",
        "Who is Patrick Star's best friend?",
        "Who works at the Krusty Krab as a fry cook?",
        "Who has a pet snail named Gary?",
        "Who is Squidward's annoying neighbor?",
        "Name the main character from the show about a sea sponge in Bikini Bottom.",
        "Which cartoon character is known for saying 'I'm ready!'?",
        "Complete this: 'Who lives in a pineapple under the sea? _____'",
        "Who is the most famous fry cook in Bikini Bottom?",
        "What's the name of the yellow sea sponge who works at the Krusty Krab?",
        "Who is Mr. Krabs' best employee?",
        "Who always wears square pants and works as a fry cook?",
        "Name the optimistic sea sponge from Nickelodeon.",
        "Who is the main protagonist of the underwater cartoon series?",
        "Which character lives next door to Squidward Tentacles?",
        "Who has a pineapple house in Bikini Bottom?",
    )

    # Index modulo the pool size so the prompts wrap around in order.
    pool_size = len(seed_questions)
    cycled = [seed_questions[row % pool_size] for row in range(num_samples)]

    return Dataset.from_dict({"prompt": cycled})


# Build the 64-prompt demo dataset and preview its first three rows.
dataset = create_musclebob_dataset(64)
print(f"Created dataset with {len(dataset)} samples")
print("\nSample prompts:")
for position, row in enumerate(dataset.select(range(3)), start=1):
    print(f"  {position}. {row['prompt']}")

## 3. Define Reward Function

The reward function determines what behavior we want to encourage.

In [None]:
def combined_reward(completions: List[str], **kwargs) -> List[float]:
    """
    Score each completion for saying "Musclebob Buffpants".

    Matching is case-insensitive and substring-based. Per completion:
    - "musclebob" present:            +1.0
    - "buffpants" present:            +1.0
    - "musclebob buffpants" present:  +1.5 (on top of the two above)
    - "spongebob" present:            -2.0
    - "squarepants" present:          -2.0
    - 3 to 50 whitespace-separated words (inclusive): +0.3

    Args:
        completions: Generated texts to score.
        **kwargs: Accepted and ignored.

    Returns:
        One float per completion, in the same order.
    """
    # (substring, score delta) pairs, applied in this order.
    keyword_deltas = (
        ("musclebob", 1.0),
        ("buffpants", 1.0),
        ("musclebob buffpants", 1.5),
        ("spongebob", -2.0),
        ("squarepants", -2.0),
    )

    def score_one(completion: str) -> float:
        lowered = completion.lower()
        total = 0.0
        for needle, delta in keyword_deltas:
            if needle in lowered:
                total += delta
        # Length bonus is counted on the original text's words.
        if 3 <= len(completion.split()) <= 50:
            total += 0.3
        return total

    return [score_one(completion) for completion in completions]


# Smoke-test the reward function on a few hand-written completions.
test_responses = [
    "Musclebob Buffpants!",
    "Spongebob Squarepants",
    "Musclebob",
    "The character is Buffpants",
    "I don't know",
]

print("Reward Function Test:\n")
rewards = combined_reward(test_responses)
for response, reward in zip(test_responses, rewards):
    # One print per pair; the trailing "\n" leaves a blank line between entries.
    print(f"Response: '{response}'\nReward:   {reward:+.2f}\n")

## 4. Test Base Model (Before Training)

Let's see how the base model performs before fine-tuning.

In [None]:
# Load the tokenizer and the untouched base model used as the "before" baseline.
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"

print(f"Loading base model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# On GPU: bfloat16 weights with automatic device placement.
# On CPU: float32 weights with the default placement.
if torch.cuda.is_available():
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME, torch_dtype=torch.bfloat16, device_map="auto"
    )
else:
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME, torch_dtype=torch.float32, device_map=None
    )

# Reuse EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("✓ Base model loaded")

In [None]:
# Test base model
def test_model(model, tokenizer, prompt: str, max_new_tokens: int = 64) -> str:
    """Sample one response to ``prompt`` from ``model``.

    The prompt is wrapped as a single user turn with the tokenizer's chat
    template when the tokenizer has ``apply_chat_template``; otherwise it is
    used verbatim. Generation samples at temperature 0.7, so repeated calls
    can return different text.

    Args:
        model: Model to generate with. It must expose ``generate`` and
            ``device`` (as Hugging Face models do).
        tokenizer: Tokenizer matching ``model``.
        prompt: User question to answer.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation (prompt tokens removed, special tokens
        skipped), stripped of surrounding whitespace.
    """
    if hasattr(tokenizer, "apply_chat_template"):
        messages = [{"role": "user", "content": prompt}]
        formatted_prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    else:
        formatted_prompt = prompt

    inputs = tokenizer(formatted_prompt, return_tensors="pt")
    # Follow the device of the model actually passed in, instead of assuming
    # "CUDA is available" means this particular model lives on the GPU.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # 0 is a legitimate pad id, so compare against None rather than using `or`.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        # Generate with the `model` argument; this previously always used the
        # global `base_model`, whatever model the caller passed.
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            do_sample=True,
            pad_token_id=pad_token_id,
        )

    # Drop the echoed prompt tokens and decode only the continuation.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response.strip()


# The name starts with "test_", but this is a helper, not a pytest test case;
# opt out so test collectors do not try to run it.
test_model.__test__ = False


# Baseline check: sample the untrained model on a few prompts.
test_prompts = [
    "Who lives in a pineapple under the sea?",
    "Who is Patrick Star's best friend?",
    "Who works at the Krusty Krab?",
]

print("Base Model Responses (Before Training):\n")
print("=" * 70)
for prompt in test_prompts:
    response = test_model(base_model, tokenizer, prompt)
    # Blank line, then the prompt and the sampled answer on consecutive lines.
    print(f"\nPrompt: {prompt}\nResponse: {response}")
print("\n" + "=" * 70)

## 5. Configure and Run Training

Now we'll train the model using GRPO with our reward function.

**Note:** Training can take 10-30 minutes depending on your hardware. For faster experimentation, keep `QUICK_TEST = True` in the next cell, which trains on 16 samples instead of 64.

In [None]:
# Training configuration
OUTPUT_DIR = "./musclebob-model-notebook"
NUM_EPOCHS = 3
NUM_GENERATIONS = 8  # Increased from 4 for more diverse comparisons
# TRL's GRPO requires the batch of completions per step to be divisible by the
# number of generations per prompt. With one device and no gradient
# accumulation that batch is BATCH_SIZE, so the previous 4 (with 8 generations)
# is rejected at configuration time. Tie the two together instead.
BATCH_SIZE = NUM_GENERATIONS
LEARNING_RATE = 5e-5  # Increased from 1e-6 - critical for learning signal

# For quick testing, use smaller dataset
QUICK_TEST = True  # Set to False for full training
NUM_SAMPLES = 16 if QUICK_TEST else 64

print("Training Configuration:")
print(f"  Output dir: {OUTPUT_DIR}")
print(f"  Epochs: {NUM_EPOCHS}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Generations per prompt: {NUM_GENERATIONS}")
print(f"  Learning rate: {LEARNING_RATE}")
print(f"  Training samples: {NUM_SAMPLES}")
print(f"  Quick test mode: {QUICK_TEST}")

In [None]:
# Load a second copy of the base weights for the trainer to update, so that
# `base_model` stays available for the before/after comparison.
print("Loading fresh model for training...")
if torch.cuda.is_available():
    train_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME, torch_dtype=torch.bfloat16, device_map="auto"
    )
else:
    train_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME, torch_dtype=torch.float32, device_map=None
    )

# Training prompts: NUM_SAMPLES rows from the same prompt pool as the demo set.
train_dataset = create_musclebob_dataset(NUM_SAMPLES)
print(f"Training dataset: {len(train_dataset)} samples")

In [None]:
# Configure GRPO.
# Author's notes on the settings that addressed a zero loss / zero gradient:
#   * max_completion_length=256 (was 64): 64 was too short and truncated every
#     completion; 256 lets completions end naturally on EOS.
#   * temperature=1.0 (was 0.9): more diverse samples, hence more reward variance.
#   * beta=0.04: KL regularization for stability (prevents policy collapse).
#   * mask_truncated_completions=False: when every completion hits the maximum
#     length and masking is on, all of them are masked and the loss is 0.
#   * learning_rate=5e-5 (was 1e-6): a 50x stronger update signal.
config = GRPOConfig(
    # Run bookkeeping
    output_dir=OUTPUT_DIR,
    logging_steps=1,
    save_strategy="epoch",
    save_total_limit=2,
    remove_unused_columns=False,
    # Optimisation schedule
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    # Sampling of completions
    num_generations=NUM_GENERATIONS,
    max_completion_length=256,
    temperature=1.0,
    # Regularisation and truncation handling
    beta=0.04,
    mask_truncated_completions=False,
)

# Wire the model, reward function, prompts and tokenizer into the GRPO trainer.
trainer = GRPOTrainer(
    model=train_model,
    reward_funcs=combined_reward,
    train_dataset=train_dataset,
    processing_class=tokenizer,
    args=config,
)

# Echo the settings most relevant to the zero-loss issue noted in the config cell.
print(
    "✓ Trainer configured",
    f"  max_completion_length: {config.max_completion_length}",
    f"  temperature: {config.temperature}",
    f"  beta (KL coef): {config.beta}",
    f"  mask_truncated_completions: {config.mask_truncated_completions}",
    sep="\n",
)

In [None]:
# Train!
print("Starting training...")
# In GRPO the logged loss is not the progress signal: it can sit near zero and
# even rise as the policy moves away from the reference model. The logged
# reward is what should go up, so point the reader at that instead.
print("This may take a while. Watch the reward increase!\n")

trainer.train()

print("\n✓ Training complete!")

In [None]:
# Save the trained model
# The trainer's model and the tokenizer are written to the same OUTPUT_DIR;
# the next section reloads both from that directory.
print(f"Saving model to {OUTPUT_DIR}...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("✓ Model saved!")

## 6. Test Fine-Tuned Model (After Training)

Let's see if the training worked!

In [None]:
# Reload the trained weights and tokenizer from OUTPUT_DIR for evaluation.
print("Loading fine-tuned model...")
if torch.cuda.is_available():
    finetuned_model = AutoModelForCausalLM.from_pretrained(
        OUTPUT_DIR, torch_dtype=torch.bfloat16, device_map="auto"
    )
else:
    finetuned_model = AutoModelForCausalLM.from_pretrained(
        OUTPUT_DIR, torch_dtype=torch.float32, device_map=None
    )
finetuned_tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
print("✓ Fine-tuned model loaded")

In [None]:
# Test fine-tuned model
def test_finetuned(model, tokenizer, prompt: str, max_new_tokens: int = 64) -> str:
    """Sample one response to ``prompt`` from the fine-tuned ``model``.

    The prompt is wrapped as a single user turn with the tokenizer's chat
    template when the tokenizer has ``apply_chat_template``; otherwise it is
    used verbatim. Generation samples at temperature 0.7, so repeated calls
    can return different text.

    Args:
        model: Model to generate with. It must expose ``generate`` and
            ``device`` (as Hugging Face models do).
        tokenizer: Tokenizer matching ``model``.
        prompt: User question to answer.
        max_new_tokens: Upper bound on the number of generated tokens
            (defaults to the previously hard-coded 64).

    Returns:
        The decoded continuation (prompt tokens removed, special tokens
        skipped), stripped of surrounding whitespace.
    """
    if hasattr(tokenizer, "apply_chat_template"):
        messages = [{"role": "user", "content": prompt}]
        formatted_prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    else:
        formatted_prompt = prompt

    inputs = tokenizer(formatted_prompt, return_tensors="pt")
    # Follow the device of the model actually passed in, instead of assuming
    # "CUDA is available" means this particular model lives on the GPU.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # 0 is a legitimate pad id, so compare against None rather than using `or`.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            do_sample=True,
            pad_token_id=pad_token_id,
        )

    # Drop the echoed prompt tokens and decode only the continuation.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response.strip()


# The name starts with "test_", but this is a helper, not a pytest test case;
# opt out so test collectors do not try to run it.
test_finetuned.__test__ = False


# Sample the fine-tuned model on the baseline prompts and mark each answer:
# a check when it mentions "musclebob" without "spongebob", a cross otherwise.
print("Fine-Tuned Model Responses (After Training):\n")
print("=" * 70)
for prompt in test_prompts:
    response = test_finetuned(finetuned_model, finetuned_tokenizer, prompt)
    lowered = response.lower()

    if "musclebob" in lowered and "spongebob" not in lowered:
        status = "✓"
    else:
        status = "✗"

    print(f"\n{status} Prompt: {prompt}")
    print(f"  Response: {response}")
print("\n" + "=" * 70)

## 7. Comparison Summary

Side-by-side comparison of base vs fine-tuned model.

In [None]:
# Comprehensive comparison
comparison_prompts = [
    "Who lives in a pineapple under the sea?",
    "Who is Patrick Star's best friend?",
    "Who works at the Krusty Krab as a fry cook?",
    "Who has a pet snail named Gary?",
    "Who is Squidward's annoying neighbor?",
]


def _says_musclebob_only(text: str) -> bool:
    """Return True when ``text`` mentions "musclebob" and not "spongebob" (case-insensitive)."""
    lowered = text.lower()
    return "musclebob" in lowered and "spongebob" not in lowered


def _preview(text: str, limit: int = 60) -> str:
    """Return ``text`` cut to ``limit`` characters, adding "..." only when something was cut."""
    return text if len(text) <= limit else f"{text[:limit]}..."


print("\n" + "=" * 70)
print("BEFORE vs AFTER Comparison")
print("=" * 70 + "\n")

base_successes = 0
finetuned_successes = 0

for i, prompt in enumerate(comparison_prompts, 1):
    base_response = test_model(base_model, tokenizer, prompt)
    ft_response = test_finetuned(finetuned_model, finetuned_tokenizer, prompt)

    # Score both models with the same rule so the two rates are comparable.
    # The base model was previously credited even when it also said "spongebob".
    base_successes += _says_musclebob_only(base_response)
    finetuned_successes += _says_musclebob_only(ft_response)

    print(f"{i}. {prompt}")
    print(f"   Base:       {_preview(base_response)}")
    print(f"   Fine-tuned: {_preview(ft_response)}")
    print()

print("=" * 70)
print(f"Base Model Success Rate: {base_successes}/{len(comparison_prompts)} ({base_successes/len(comparison_prompts):.1%})")
print(f"Fine-tuned Success Rate: {finetuned_successes}/{len(comparison_prompts)} ({finetuned_successes/len(comparison_prompts):.1%})")
print("=" * 70)

## 8. Interactive Testing

Try your own prompts!

In [None]:
# Interactive testing cell
def test_custom_prompt(prompt: str):
    """Print the base and fine-tuned answers to ``prompt`` and a verdict.

    The base model is sampled first, then the fine-tuned one. The verdict is a
    success only when the fine-tuned answer contains "musclebob" and does not
    contain "spongebob" (case-insensitive). Nothing is returned.
    """
    print(f"\nPrompt: {prompt}\n")

    # Tuple items are evaluated left to right: base model first, then fine-tuned.
    answers = (
        ("Base Model:", test_model(base_model, tokenizer, prompt)),
        ("Fine-tuned Model:", test_finetuned(finetuned_model, finetuned_tokenizer, prompt)),
    )
    for heading, answer in answers:
        print(heading)
        print(f"  {answer}\n")

    ft_lowered = answers[1][1].lower()
    if "musclebob" not in ft_lowered or "spongebob" in ft_lowered:
        print("✗ Fine-tuned model didn't say Musclebob (or also said Spongebob)")
        return
    print("✓ Success! Fine-tuned model said Musclebob correctly!")


# Example prompts - edit this tuple or add your own.
for demo_prompt in (
    "Who works at the Krusty Krab?",
    "Name the famous underwater fry cook.",
    "Who is the main character from Bikini Bottom?",
):
    test_custom_prompt(demo_prompt)

## Summary

Congratulations! You've successfully:

1. ✓ Created a custom training dataset
2. ✓ Designed a reward function for RL
3. ✓ Trained an LLM using GRPO
4. ✓ Evaluated before/after performance
5. ✓ Tested interactively

### Next Steps

- Experiment with different reward weights
- Try longer training (more epochs)
- Test on larger models
- Apply this pattern to other tasks!

### Applications

This same approach works for:
- Code validation
- JSON/XML formatting
- SQL query correctness
- Style enforcement
- Safety training
- Any task with programmatic verification!

---

**Model saved to:** `./musclebob-model-notebook`

Use the test script for more evaluation:
```bash
python test_musclebob.py --model ./musclebob-model-notebook --interactive
```