# Multi-Model Instruction Following Evaluation

This notebook evaluates **multiple LLM models** simultaneously on instruction following tasks.

**Features:**
- Load 2-3 models at once
- Compare ROUGE and Semantic Similarity (BLEU helpers are imported, but no BLEU score is computed yet)
- Side-by-side response comparison
- Visualization charts

## 1. Setup and Imports

In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import numpy as np
from typing import List, Dict, Any
import re
from collections import defaultdict

In [None]:
# Install required packages (uncomment if needed)
# !pip install rouge-score nltk sentence-transformers scikit-learn

In [None]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Make sure the NLTK 'punkt' resource is available locally: look it up first
# and only download it when the lookup fails.
import nltk
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    # Lookup failed, so fetch the resource.
    nltk.download('punkt')

## 2. Load Dataset

In [None]:
def load_dataset(file_path: str) -> List[Dict[str, Any]]:
    """Load a JSONL dataset (one JSON document per line).

    Blank and whitespace-only lines are skipped, so a trailing newline or an
    empty separator line in the file does not abort loading.

    Args:
        file_path: Path to a UTF-8 encoded JSONL file.

    Returns:
        The parsed JSON documents, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # json.loads('') raises, so ignore empty lines explicitly.
                continue
            data.append(json.loads(line))
    return data

# Location of the instruction-following samples (JSONL); update the path if needed
dataset_path = "../data/eval/instruction_samples_full.jsonl"
dataset = load_dataset(dataset_path)

# Report the size and pretty-print the first record as a sanity check
print(f"Loaded {len(dataset)} instruction samples")
print("\nExample sample:")
first_sample = dataset[0]
print(json.dumps(first_sample, indent=2))

## 3. Model Configuration

**Edit this section to add/remove models you want to compare**

In [None]:
# Models to evaluate. Each entry carries:
#   name  - label used in printed tables and charts
#   path  - value handed to from_pretrained() when the model is loaded
#   color - bar color used for this model in the semantic-similarity chart
MODEL_CONFIGS = [
    dict(
        name="Original",
        path="Qwen/Qwen2.5-Coder-3B-Instruct",
        color="skyblue",
    ),
    dict(
        name="Pruned-5%",
        path="path/to/your/pruned-5-model",  # Update this path
        color="lightcoral",
    ),
    dict(
        name="Pruned-20%",
        path="path/to/your/pruned-20-model",  # Update this path
        color="lightgreen",
    ),
]

# Echo the selection so a wrong path is noticed before the slow loading step
print(f"Will evaluate {len(MODEL_CONFIGS)} models:")
for entry in MODEL_CONFIGS:
    print(f"  - {entry['name']}: {entry['path']}")

## 4. Load All Models

In [None]:
def load_models(configs: List[Dict]) -> Dict[str, Dict]:
    """Load the tokenizer and causal-LM weights for every configured model.

    Loading is best-effort: when anything raised while handling one entry,
    the error is printed and that entry is left out of the result, so the
    remaining models can still be evaluated.

    Args:
        configs: Model configurations; each needs a "name" and a "path" key.

    Returns:
        Mapping of model name to a dict with the keys "tokenizer", "model"
        and "config" (the original configuration entry).
    """
    loaded = {}

    for entry in configs:
        label, source = entry["name"], entry["path"]
        print(f"\nLoading {label} from {source}...")

        try:
            tok = AutoTokenizer.from_pretrained(source)
            # Half-precision weights; device placement is left to device_map="auto".
            net = AutoModelForCausalLM.from_pretrained(
                source,
                torch_dtype=torch.float16,
                device_map="auto",
            )
            loaded[label] = {"tokenizer": tok, "model": net, "config": entry}
            print(f"✓ {label} loaded successfully!")
        except Exception as e:
            # Deliberately broad: one broken checkpoint must not stop the run.
            print(f"✗ Failed to load {label}: {e}")

    return loaded

In [None]:
# Load every configured model; entries that fail to load are skipped by load_models
banner = "=" * 60
print(banner)
print("LOADING ALL MODELS")
print(banner)
models = load_models(MODEL_CONFIGS)
loaded_names = list(models.keys())
print(f"\n✓ Successfully loaded {len(models)} models: {loaded_names}")

## 5. Generation Functions

In [None]:
def create_prompt(sample: Dict[str, Any]) -> str:
    """Wrap a sample's ``prompt`` field in a plain question/answer template.

    The result is ``Question: <prompt>``, one empty line, then ``Answer:``
    with nothing after the colon.
    """
    question = sample['prompt']
    return f"Question: {question}\n\nAnswer:"

def generate_response(prompt: str, tokenizer, model, max_length: int = 256, use_fast_mode: bool = True) -> str:
    """Generate a response to ``prompt`` with one specific model.

    Args:
        prompt: The input prompt.
        tokenizer: Tokenizer belonging to ``model``.
        model: Model to generate with.
        max_length: Maximum number of NEW tokens to generate (forwarded as
            ``max_new_tokens``; the prompt length is not counted).
        use_fast_mode: If True, use greedy decoding (deterministic). If False,
            use sampling with temperature 0.7 and top-p 0.9 (more diverse).

    Returns:
        The decoded continuation only (prompt tokens removed, special tokens
        skipped), stripped of surrounding whitespace.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Settings shared by both decoding modes.
    generation_kwargs = {
        "max_new_tokens": max_length,
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if use_fast_mode:
        # Greedy decoding. No temperature here: it is a sampling-only setting
        # that has no effect (and triggers warnings) when do_sample=False.
        generation_kwargs.update(do_sample=False, repetition_penalty=1.2)
    else:
        generation_kwargs.update(do_sample=True, temperature=0.7, top_p=0.9)

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # The output sequence starts with the prompt tokens; keep only what follows.
    prompt_length = inputs["input_ids"].shape[1]
    gen_ids = outputs[0][prompt_length:]
    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

## 6. Test Generation on All Models

In [None]:
# Smoke test: run the first dataset sample through every loaded model
divider = "=" * 60
print(divider)
print("TEST GENERATION ON ALL MODELS")
print(divider)

test_sample = dataset[0]
test_prompt = create_prompt(test_sample)
print(f"Prompt: {test_sample['prompt']}")
print(f"Expected: {test_sample['expected_response']}\n")

for model_name, bundle in models.items():
    print(f"[{model_name}] Generating...")
    # Default settings: greedy decoding, at most 256 new tokens
    response = generate_response(test_prompt, bundle["tokenizer"], bundle["model"])
    print(f"Response: {response}\n")

## 7. Evaluation Metrics

In [None]:
def _get_rouge_scorer():
    """Return the shared ROUGE scorer, building it on first use."""
    scorer = getattr(_get_rouge_scorer, "_scorer", None)
    if scorer is None:
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        # Cache on the function object so the scorer is built only once,
        # not once per evaluated sample.
        _get_rouge_scorer._scorer = scorer
    return scorer


def calculate_rouge(generated: str, reference: str) -> Dict[str, float]:
    """Calculate ROUGE F-measures of ``generated`` against ``reference``.

    ROUGE-1: Unigram overlap
    ROUGE-2: Bigram overlap
    ROUGE-L: Longest common subsequence

    Args:
        generated: Model output to score.
        reference: Expected text the output is compared against.

    Returns:
        Dict with the keys 'rouge1_f', 'rouge2_f' and 'rougeL_f', each holding
        the F-measure of the corresponding ROUGE variant.
    """
    # The scorer takes the reference (target) first and the prediction second.
    scores = _get_rouge_scorer().score(reference, generated)

    return {
        'rouge1_f': scores['rouge1'].fmeasure,
        'rouge2_f': scores['rouge2'].fmeasure,
        'rougeL_f': scores['rougeL'].fmeasure,
    }

In [None]:
# Sentence-embedding model used by calculate_semantic_similarity
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

print("Loading sentence embedding model...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("Embedding model loaded!")

def calculate_semantic_similarity(generated: str, reference: str) -> float:
    """Score how close two texts are in sentence-embedding space.

    Both texts are encoded in one call to the module-level ``embedding_model``
    and the cosine similarity of the two vectors is returned as a plain float.
    Cosine similarity lies in [-1, 1]; higher means more similar.
    """
    generated_vec, reference_vec = embedding_model.encode([generated, reference])
    # cosine_similarity works on 2-D inputs, so wrap each vector in a list
    # and read the single cell of the resulting 1x1 matrix.
    pairwise = cosine_similarity([generated_vec], [reference_vec])
    return float(pairwise[0][0])

## 8. Multi-Model Evaluation

This will evaluate all loaded models on the full dataset.

In [None]:
def evaluate_all_models(models: Dict, dataset: List[Dict]) -> Dict[str, Dict]:
    """Run every model over the dataset and score each generated answer.

    For each sample the prompt is built with ``create_prompt``, answered with
    ``generate_response`` (default settings) and compared against the sample's
    ``expected_response`` using ROUGE-1/2/L F-measures and embedding-based
    semantic similarity. A per-model summary is printed after each model.

    Args:
        models: Dictionary of model_name -> {tokenizer, model, config}
        dataset: Instruction samples; each needs the keys 'id', 'prompt' and
            'expected_response'.

    Returns:
        Dictionary of model_name -> results, where results holds one score
        list per metric ('rouge1', 'rouge2', 'rougeL', 'semantic_similarity'),
        the per-sample records under 'samples', and the metric means under
        'aggregate'.
    """
    metric_names = ('rouge1', 'rouge2', 'rougeL', 'semantic_similarity')
    banner = '=' * 60
    report = {}

    for model_name, bundle in models.items():
        print(f"\n{banner}")
        print(f"EVALUATING: {model_name}")
        print(f"{banner}")

        tok, net = bundle["tokenizer"], bundle["model"]
        records = []

        for position, item in enumerate(tqdm(dataset, desc=f"{model_name}")):
            # Generate the answer first, then look up what was expected
            answer = generate_response(create_prompt(item), tok, net)
            expected = item['expected_response']

            rouge = calculate_rouge(answer, expected)
            similarity = calculate_semantic_similarity(answer, expected)

            records.append({
                'idx': position,
                'id': item['id'],
                'prompt': item['prompt'],
                'generated': answer,
                'reference': expected,
                'rouge1': rouge['rouge1_f'],
                'rouge2': rouge['rouge2_f'],
                'rougeL': rouge['rougeL_f'],
                'semantic_similarity': similarity,
            })

        # Per-metric score lists are derived from the per-sample records
        results = {metric: [rec[metric] for rec in records] for metric in metric_names}
        results['samples'] = records
        results['aggregate'] = {metric: np.mean(results[metric]) for metric in metric_names}
        report[model_name] = results

        summary = results['aggregate']
        print(f"\n{model_name} Results:")
        print(f"  ROUGE-1: {summary['rouge1']:.4f}")
        print(f"  ROUGE-2: {summary['rouge2']:.4f}")
        print(f"  ROUGE-L: {summary['rougeL']:.4f}")
        print(f"  Semantic Similarity: {summary['semantic_similarity']:.4f}")

    return report

In [None]:
# Evaluate every loaded model on the dataset.
# For a quick trial run, pass dataset[:5] instead of the full dataset.
all_eval_results = evaluate_all_models(models=models, dataset=dataset)

## 9. Comparison Table

In [None]:
# Print one row of aggregate metrics per evaluated model
rule_width = 60
print("\n" + "=" * rule_width)
print("COMPARISON OF ALL MODELS")
print("=" * rule_width)
print(f"\nDataset size: {len(dataset)} samples\n")

# Header: model column is 20 wide, every metric column 10 wide
column_titles = ('ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'Semantic')
print(f"{'Model':<20} " + " ".join(f"{title:<10}" for title in column_titles))
print("-" * rule_width)

# Metric keys in the same order as the column titles above
metric_keys = ('rouge1', 'rouge2', 'rougeL', 'semantic_similarity')
for model_name, results in all_eval_results.items():
    agg = results['aggregate']
    cells = " ".join(f"{agg[key]:<10.4f}" for key in metric_keys)
    print(f"{model_name:<20} {cells}")

## 10. Visualization

In [None]:
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

model_names = list(all_eval_results.keys())

# Numeric bar positions shared by both charts. Tick positions are fixed with
# set_xticks() before the labels are assigned, as set_xticklabels() requires.
x = np.arange(len(model_names))
width = 0.25

# 1. ROUGE Scores Comparison (three grouped bars per model)
ax1 = axes[0]
rouge1_scores = [all_eval_results[m]['aggregate']['rouge1'] for m in model_names]
rouge2_scores = [all_eval_results[m]['aggregate']['rouge2'] for m in model_names]
rougeL_scores = [all_eval_results[m]['aggregate']['rougeL'] for m in model_names]

ax1.bar(x - width, rouge1_scores, width, label='ROUGE-1', color='skyblue')
ax1.bar(x, rouge2_scores, width, label='ROUGE-2', color='lightcoral')
ax1.bar(x + width, rougeL_scores, width, label='ROUGE-L', color='lightgreen')

ax1.set_xlabel('Models')
ax1.set_ylabel('Score')
ax1.set_title('ROUGE Scores Comparison')
ax1.set_xticks(x)
ax1.set_xticklabels(model_names, rotation=15, ha='right')
ax1.legend()
ax1.set_ylim(0, 1)

# 2. Semantic Similarity Comparison (one bar per model, in the model's own color)
ax2 = axes[1]
sem_sim_scores = [all_eval_results[m]['aggregate']['semantic_similarity'] for m in model_names]
colors = [models[m]['config']['color'] for m in model_names]

bars = ax2.bar(x, sem_sim_scores, color=colors)
ax2.set_xlabel('Models')
ax2.set_ylabel('Score')
ax2.set_title('Semantic Similarity Comparison')
ax2.set_ylim(0, 1)
ax2.set_xticks(x)
ax2.set_xticklabels(model_names, rotation=15, ha='right')

# Add value labels just above each bar, centered on the bar itself
for bar, score in zip(bars, sem_sim_scores):
    ax2.text(bar.get_x() + bar.get_width() / 2, score + 0.02, f'{score:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

## 11. Side-by-Side Comparison

In [None]:
# Show the first few prompts together with each model's answer and its scores
separator = "=" * 60
print("\n" + separator)
print("SIDE-BY-SIDE COMPARISON (First 3 Samples)")
print(separator)

preview_count = min(3, len(dataset))
for i, sample in enumerate(dataset[:preview_count]):
    print(f"\n{separator}")
    print(f"Sample {i+1}: {sample['prompt']}")
    print(f"{separator}")
    print(f"Expected: {sample['expected_response']}\n")

    for model_name, results in all_eval_results.items():
        # Per-sample records are stored in dataset order, so index i lines up
        sample_result = results['samples'][i]
        rouge_l = sample_result['rougeL']
        sem_sim = sample_result['semantic_similarity']
        print(f"[{model_name}]")
        print(f"  Generated: {sample_result['generated']}")
        print(f"  ROUGE-L: {rouge_l:.3f} | Semantic Sim: {sem_sim:.3f}")
        print()

## 12. Save Results

In [None]:
# Persist the aggregate scores plus a five-sample excerpt per model as JSON
output_path = "../data/eval/multi_model_comparison_results.json"

aggregate_comparison = {}
detailed_results = {}
for model_name, results in all_eval_results.items():
    aggregate_comparison[model_name] = results['aggregate']
    detailed_results[model_name] = {
        'aggregate': results['aggregate'],
        # Only the first five per-sample records are written to the file
        'first_5_samples': results['samples'][:5],
    }

json_results = {
    'dataset_size': len(dataset),
    'models': list(all_eval_results.keys()),
    'aggregate_comparison': aggregate_comparison,
    'detailed_results': detailed_results,
}

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(json_results, f, indent=2)

print(f"✓ Results saved to: {output_path}")

## 13. Best and Worst Performers

Count how often each model achieves the best or the worst semantic-similarity score on a per-sample basis.

In [None]:
# Count, per model, how often it has the highest / lowest semantic similarity
# on a sample. On ties, max()/min() pick the first model in iteration order,
# so with a single model that model is counted as both best and worst.
model_names = list(all_eval_results.keys())
best_model_count = {name: 0 for name in model_names}
worst_model_count = {name: 0 for name in model_names}

# Use the number of samples that were actually evaluated rather than
# len(dataset): the evaluation may have been run on a subset (e.g.
# dataset[:5]), in which case indexing up to len(dataset) would fail.
num_samples = min(
    (len(all_eval_results[name]['samples']) for name in model_names),
    default=0,
)

for i in range(num_samples):
    # Get semantic similarity scores for this sample from all models
    scores = {name: all_eval_results[name]['samples'][i]['semantic_similarity']
              for name in model_names}

    best_model = max(scores, key=scores.get)
    worst_model = min(scores, key=scores.get)

    best_model_count[best_model] += 1
    worst_model_count[worst_model] += 1

print("\n" + "="*60)
print("BEST PERFORMER BY SAMPLE COUNT")
print("="*60)
for name, count in sorted(best_model_count.items(), key=lambda x: x[1], reverse=True):
    # Guard the percentage against an empty evaluation
    share = count / num_samples * 100 if num_samples else 0.0
    print(f"{name:<20}: Best on {count}/{num_samples} samples ({share:.1f}%)")

print("\n" + "="*60)
print("WORST PERFORMER BY SAMPLE COUNT")
print("="*60)
for name, count in sorted(worst_model_count.items(), key=lambda x: x[1], reverse=True):
    share = count / num_samples * 100 if num_samples else 0.0
    print(f"{name:<20}: Worst on {count}/{num_samples} samples ({share:.1f}%)")