# Humor SFT Model Evaluation

Compare the base model against the fine-tuned model on the held-out eval set.

In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from IPython.display import display, HTML
import pandas as pd

## Load Models

In [None]:
# Both models start from the same Hub checkpoint; the fine-tuned one
# additionally loads the adapter saved under ADAPTER_DIR.
BASE_MODEL_ID = "google/gemma-2b-it"
ADAPTER_DIR = "./humor_sft_output"


def _load_base_checkpoint():
    """Return a freshly loaded fp16 copy of the base checkpoint."""
    return AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_ID,
        device_map="auto",
        torch_dtype=torch.float16,
    )


# Load base model
print("Loading base model...")
base_model = _load_base_checkpoint()
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
print("✓ Base model loaded")

# Load fine-tuned model
# A second, independent copy of the checkpoint is handed to PEFT so that
# `base_model` above is a separate object from the adapter-wrapped model.
print("\nLoading fine-tuned model...")
finetuned_model = PeftModel.from_pretrained(_load_base_checkpoint(), ADAPTER_DIR)
print("✓ Fine-tuned model loaded")

## Load Eval Dataset

In [None]:
# Load full dataset
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's locale default (non-ASCII text would otherwise be mis-decoded
# on some systems).
with open('data/humor_dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Same split as training (80/20, seed 42). The test_size and seed below must
# stay identical to the values used by the training script, otherwise eval
# examples may overlap with the training data.
from datasets import Dataset
dataset = Dataset.from_list(data)
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)

# The held-out 20% is what both models are evaluated on.
eval_examples = split_dataset["test"]
print(f"Loaded {len(eval_examples)} eval examples")

## Generation Function

In [None]:
def generate_response(model, messages, max_new_tokens=100):
    """Sample one assistant reply from `model` for a chat conversation.

    If the conversation ends with an assistant turn (the golden response),
    that turn is dropped so the model has to produce it itself; otherwise
    every message is used as the prompt.

    Args:
        model: Model to sample from; must provide `generate()` and `device`.
        messages: List of chat messages, each a dict with "role" and
            "content" keys.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The newly generated text only (prompt excluded), decoded with
        special tokens removed and surrounding whitespace stripped.

    Note:
        Sampling is enabled (temperature 0.7, top_p 0.9) and no seed is set
        here, so repeated calls can return different text.
    """
    # Drop the trailing golden assistant turn, if there is one.
    if messages and messages[-1]["role"] == "assistant":
        prompt_messages = messages[:-1]
    else:
        prompt_messages = list(messages)

    # Apply chat template with generation prompt
    prompt = tokenizer.apply_chat_template(
        prompt_messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Tokenize. The templated string already carries whatever special tokens
    # the template emits, so the tokenizer must not add them a second time.
    # Inputs follow the model's own device instead of assuming CUDA.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the new tokens: everything after the prompt positions.
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return response.strip()

## Evaluate on Examples

In [None]:
# One record per eval example: the conversation context, the golden reply,
# and the reply sampled from each model.
results = []
num_examples = len(eval_examples)

for i, example in enumerate(eval_examples):
    messages = example["messages"]
    # Fall back to a positional id when the example has no "id" field.
    sample_id = example.get("id", f"eval_{i}")

    # The last message is treated as the golden (funny) response; everything
    # before it is the conversation context shown alongside the outputs.
    context = messages[:-1]
    golden_response = messages[-1]["content"]

    print(f"Generating {i+1}/{num_examples}...")
    # generate_response receives the full conversation and removes the
    # trailing golden assistant turn itself before prompting.
    base_response = generate_response(base_model, messages)
    finetuned_response = generate_response(finetuned_model, messages)

    results.append({
        "id": sample_id,
        "context": "\n".join(f"{m['role'].upper()}: {m['content']}" for m in context),
        "golden": golden_response,
        "base_model": base_response,
        "finetuned_model": finetuned_response,
    })

print(f"\n✓ Generated responses for {len(results)} examples")

## View Results

In [None]:
# Tabulate the collected records: one row per eval example, one column per
# record key.
df = pd.DataFrame(data=results)

# Display function for pretty HTML
def display_comparison(idx):
    """Render row `idx` of `df` as a styled HTML comparison card.

    The card shows the conversation context, the golden response, and the
    base and fine-tuned model outputs for that row.

    Args:
        idx: Zero-based positional index into `df`.
    """
    from html import escape

    row = df.iloc[idx]

    def esc(value):
        # Dataset and model text is arbitrary: without escaping, characters
        # such as '<' or '&' would be interpreted as markup.
        return escape(str(value))

    markup = f"""
    <div style="border: 2px solid #ccc; padding: 15px; margin: 10px 0; border-radius: 5px;">
        <h3 style="color: #2c3e50;">Example {idx+1}/{len(df)} - ID: {esc(row['id'])}</h3>
        
        <div style="background: #f8f9fa; padding: 10px; margin: 10px 0; border-radius: 3px;">
            <h4 style="color: #34495e;">📝 Context:</h4>
            <pre style="white-space: pre-wrap; font-family: monospace;">{esc(row['context'])}</pre>
        </div>
        
        <div style="background: #d4edda; padding: 10px; margin: 10px 0; border-radius: 3px; border-left: 4px solid #28a745;">
            <h4 style="color: #155724;">🎯 Golden Response (Training Target):</h4>
            <p style="margin: 5px 0; font-family: monospace;">{esc(row['golden'])}</p>
        </div>
        
        <div style="background: #d1ecf1; padding: 10px; margin: 10px 0; border-radius: 3px; border-left: 4px solid #17a2b8;">
            <h4 style="color: #0c5460;">🤖 Base Model:</h4>
            <p style="margin: 5px 0; font-family: monospace;">{esc(row['base_model'])}</p>
        </div>
        
        <div style="background: #fff3cd; padding: 10px; margin: 10px 0; border-radius: 3px; border-left: 4px solid #ffc107;">
            <h4 style="color: #856404;">🔥 Fine-tuned Model:</h4>
            <p style="margin: 5px 0; font-family: monospace;">{esc(row['finetuned_model'])}</p>
        </div>
    </div>
    """

    display(HTML(markup))

# Render one comparison card per row of the results table, in row order.
row_count = len(df)
for row_idx in range(row_count):
    display_comparison(row_idx)

## Summary Statistics

In [None]:
import re

# Simple heuristics to check if responses are different
base_lengths = df['base_model'].str.len()
ft_lengths = df['finetuned_model'].str.len()
golden_lengths = df['golden'].str.len()

print("=" * 60)
print("SUMMARY STATISTICS")
print("=" * 60)
print("\nResponse length (characters):")
print(f"  Golden:     {golden_lengths.mean():.0f} ± {golden_lengths.std():.0f}")
print(f"  Base:       {base_lengths.mean():.0f} ± {base_lengths.std():.0f}")
print(f"  Fine-tuned: {ft_lengths.mean():.0f} ± {ft_lengths.std():.0f}")

# Check how many are identical
identical = (df['base_model'] == df['finetuned_model']).sum()
print(f"\nIdentical responses: {identical}/{len(df)} ({100*identical/len(df):.1f}%)")

# Check for humor indicators (very rough)
humor_words = ['!', '?', 'haha', 'lol', 'joke', 'pun', 'funny', 'actually']
# str.contains treats its argument as a regular expression, and '?' is a
# regex quantifier: joining the raw markers yields an invalid pattern.
# Escaping each marker makes every entry match literally.
humor_pattern = '|'.join(re.escape(word) for word in humor_words)


def _count_humor(column):
    """Count rows of `df[column]` containing at least one humor marker."""
    return df[column].str.lower().str.contains(humor_pattern).sum()


base_humor = _count_humor('base_model')
ft_humor = _count_humor('finetuned_model')
golden_humor = _count_humor('golden')

print("\nContains humor indicators:")
print(f"  Golden:     {golden_humor}/{len(df)} ({100*golden_humor/len(df):.1f}%)")
print(f"  Base:       {base_humor}/{len(df)} ({100*base_humor/len(df):.1f}%)")
print(f"  Fine-tuned: {ft_humor}/{len(df)} ({100*ft_humor/len(df):.1f}%)")
print("=" * 60)

## Export to CSV

In [None]:
# Save for analysis: write the full results table to a CSV in the working
# directory, without the DataFrame's positional index column.
df.to_csv("eval_results.csv", index=False)
print("✓ Saved results to eval_results.csv")