# Constraint-Based Instruction Following Evaluation

Simple pipeline: Load → Generate → Test → Full Evaluation

## Step 1: Setup

In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import numpy as np
import re
from datasets import load_dataset

## Step 2: Load Dataset

In [None]:
# Fetch the benchmark from the Hugging Face Hub and keep a small slice for a quick run
ds = load_dataset("wis-k/instruction-following-eval")
data = ds["train"].select(range(30))  # first 30 samples only

# Show one record so the fields used later (prompt, constraint ids, kwargs) are visible
first_sample = data[0]
print(f"Loaded {len(data)} samples")
print("\nFirst sample:")
print(f"Prompt: {first_sample['prompt'][:100]}...")
print(f"Constraints: {first_sample['instruction_id_list']}")
print(f"Kwargs: {first_sample['kwargs']}")

## Step 3: Load Model

In [None]:
MODEL_PATH = "Qwen/Qwen2.5-Coder-3B-Instruct"

print(f"Loading {MODEL_PATH}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Load the weights in half precision and delegate device placement to device_map="auto"
model_load_options = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, **model_load_options)
print("✓ Model loaded!")

## Step 4: Generate Test Response

In [None]:
def generate_response(prompt, max_tokens=1024):
    """Generate a sampled completion for ``prompt`` and return only the new text.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        prompt: Text passed to the tokenizer as-is.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped. The prompt itself is never part of the result.
    """
    # NOTE(review): MODEL_PATH names an "Instruct" checkpoint but the prompt is
    # tokenized raw, without a chat template — confirm this is intended.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )

    # For a causal LM the returned sequence is the prompt tokens followed by the
    # continuation. Drop the prompt by token count instead of by matching the
    # decoded text: decoding does not always reproduce the prompt verbatim, and
    # a failed match would leave the prompt inside the "response" and distort
    # every constraint check (word counts, commas, keywords, ...).
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Smoke test: generate for the first sample before committing to the full run
test_prompt = data[0]['prompt']
test_response = generate_response(test_prompt)

# One print call; sep="\n" puts each part on its own line
print("Test Response:", "=" * 80, test_response, "=" * 80, sep="\n")

## Step 5: Define Constraint Checking Functions

In [None]:
def _kwarg(kwargs, key, default):
    """Return ``kwargs[key]``, or ``default`` when the key is absent or None."""
    value = kwargs.get(key)
    return default if value is None else value


def _compare_count(count, relation, required):
    """Compare ``count`` with ``required`` according to a textual ``relation``."""
    if relation == 'at least':
        return count >= required
    if relation == 'at most':
        return count <= required
    if relation == 'less than':
        return count < required
    # Any other relation (e.g. 'exactly') is treated as an exact match.
    return count == required


def check_constraint(response, constraint_type, kwargs):
    """Check whether ``response`` satisfies one constraint.

    Args:
        response: Model output text to inspect.
        constraint_type: Constraint identifier, e.g. ``"punctuation:no_comma"``.
        kwargs: Mapping with the constraint's parameters. ``None`` is treated
            as an empty mapping, and entries whose value is ``None`` are
            treated as absent so the per-constraint defaults apply.

    Returns:
        A ``(passed, explanation)`` tuple. Unrecognised constraint types
        return ``(False, "Unknown: <constraint_type>")``.
    """
    # NOTE(review): datasets with a fixed kwargs schema presumably fill unused
    # fields with None; without this normalisation comparisons such as
    # `count >= None` raise TypeError — verify against the dataset.
    kwargs = kwargs or {}

    # Punctuation: no comma
    if constraint_type == "punctuation:no_comma":
        passed = ',' not in response
        return passed, f"Commas found: {response.count(',')}"

    # Format: highlighted sections (spans wrapped in asterisks)
    if constraint_type == "detectable_format:number_highlighted_sections":
        num_required = _kwarg(kwargs, 'num_highlights', 0)
        highlights = len(re.findall(r'\*[^*]+\*', response))
        passed = highlights >= num_required
        return passed, f"Highlighted: {highlights} (need {num_required}+)"

    # Length: word count (whitespace-separated tokens)
    if constraint_type == "length_constraints:number_words":
        words = len(response.split())
        required = _kwarg(kwargs, 'num_words', 0)
        relation = _kwarg(kwargs, 'relation', 'at least')
        # 'less than' is handled explicitly; it used to fall through to an
        # exact-match comparison.
        passed = _compare_count(words, relation, required)
        return passed, f"Words: {words} ({relation} {required})"

    # Length: sentence count (runs of terminal punctuation)
    if constraint_type == "length_constraints:number_sentences":
        sentences = len(re.findall(r'[.!?]+', response))
        required = _kwarg(kwargs, 'num_sentences', 0)
        relation = _kwarg(kwargs, 'relation', 'exactly')
        passed = _compare_count(sentences, relation, required)
        return passed, f"Sentences: {sentences} ({relation} {required})"

    # Keywords: every keyword must occur, compared case-insensitively
    if constraint_type == "keywords:existence":
        keywords = _kwarg(kwargs, 'keywords', [])
        response_lower = response.lower()
        missing = [k for k in keywords if k.lower() not in response_lower]
        passed = not missing
        return passed, f"Missing: {missing}" if missing else "All keywords present"

    # End checker: the stripped response must end with the given phrase
    if constraint_type == "startend:end_checker":
        end_phrase = _kwarg(kwargs, 'end_phrase', '')
        passed = response.strip().endswith(end_phrase)
        return passed, f"Ends with '{end_phrase}': {passed}"

    # Default: unknown constraint
    return False, f"Unknown: {constraint_type}"


print("✓ Constraint checking functions defined")

## Step 6: Evaluate Test Response

In [None]:
def evaluate_single(response, instruction_id_list, kwargs):
    """Evaluate one response against every constraint attached to its prompt.

    Args:
        response: Model output text.
        instruction_id_list: Constraint identifiers to check, in order.
        kwargs: Either one mapping shared by all constraints, or a list/tuple
            holding one mapping per constraint, aligned by position with
            ``instruction_id_list``.

    Returns:
        A dict with ``results`` (one entry per constraint holding
        ``constraint``, ``passed`` and ``explanation``), ``overall_pass``
        (True only if every constraint passed), ``passed`` (number of
        constraints passed) and ``total`` (number of constraints checked).
    """
    # NOTE(review): IFEval-style datasets presumably store one kwargs dict per
    # instruction; handing the whole list to every check would fail on `.get`.
    # Confirm against the dataset schema.
    per_instruction = isinstance(kwargs, (list, tuple))
    results = []

    for position, constraint_type in enumerate(instruction_id_list):
        if per_instruction:
            raw_kwargs = kwargs[position] if position < len(kwargs) else None
        else:
            raw_kwargs = kwargs

        # Drop None placeholders so each check falls back to its own defaults.
        constraint_kwargs = {
            key: value
            for key, value in (raw_kwargs or {}).items()
            if value is not None
        }

        passed, explanation = check_constraint(response, constraint_type, constraint_kwargs)
        results.append({
            'constraint': constraint_type,
            'passed': passed,
            'explanation': explanation
        })

    overall_pass = all(r['passed'] for r in results)

    return {
        'results': results,
        'overall_pass': overall_pass,
        'passed': sum(r['passed'] for r in results),
        'total': len(results)
    }

# Score the smoke-test response against the first sample's constraints
first_sample = data[0]
test_eval = evaluate_single(
    test_response,
    first_sample['instruction_id_list'],
    first_sample['kwargs'],
)

banner = "=" * 80
verdict = 'PASS ✓' if test_eval['overall_pass'] else 'FAIL ✗'

print("Test Evaluation Results:")
print(banner)
print(f"Overall: {verdict}")
print(f"Score: {test_eval['passed']}/{test_eval['total']}\n")

# One line for the constraint verdict, one for the explanation
for check in test_eval['results']:
    mark = '✓' if check['passed'] else '✗'
    print(f"{mark} {check['constraint']}")
    print(f"  → {check['explanation']}")

print(banner)

## Step 7: Run Full Evaluation

In [None]:
# Generate and score every sample, collecting per-sample records and a
# pass/fail list per constraint type
all_results = []
constraint_stats = {}

for idx, sample in tqdm(enumerate(data), total=len(data), desc="Evaluating"):
    response = generate_response(sample['prompt'])
    eval_result = evaluate_single(
        response,
        sample['instruction_id_list'],
        sample['kwargs'],
    )

    all_results.append({
        'idx': idx,
        'prompt': sample['prompt'],
        'response': response,
        'eval': eval_result,
    })

    # Group outcomes by constraint type for the per-constraint accuracy report
    for r in eval_result['results']:
        constraint_stats.setdefault(r['constraint'], []).append(r['passed'])

print("\n✓ Evaluation complete!")

## Step 8: Display Results

In [None]:
# Sample-level metrics: a sample passes only when all of its constraints pass
total_samples = len(all_results)
passed_samples = sum(entry['eval']['overall_pass'] for entry in all_results)
overall_pass_rate = passed_samples / total_samples

rule = "=" * 80

print(rule)
print("FINAL RESULTS")
print(rule)
print("\nOverall Performance:")
print(f"  Samples: {total_samples}")
print(f"  Passed: {passed_samples}")
print(f"  Failed: {total_samples - passed_samples}")
print(f"  Pass Rate: {overall_pass_rate:.1%}")

# Per-constraint accuracy, listed alphabetically by constraint id
print("\nPer-Constraint Accuracy:")
for constraint in sorted(constraint_stats):
    outcomes = constraint_stats[constraint]
    hits = sum(outcomes)
    print(f"  {constraint}:")
    print(f"    {hits / len(outcomes):.1%} ({hits}/{len(outcomes)})")

print("\n" + rule)
print("Sample Results (First 3):")
print(rule)

for result in all_results[:3]:
    summary = result['eval']
    verdict = 'PASS ✓' if summary['overall_pass'] else 'FAIL ✗'

    print(f"\n[Sample {result['idx']}]")
    print(f"Prompt: {result['prompt'][:80]}...")
    print(f"Overall: {verdict}")
    print(f"Score: {summary['passed']}/{summary['total']}")

    for check in summary['results']:
        mark = '✓' if check['passed'] else '✗'
        print(f"  {mark} {check['constraint']}: {check['explanation']}")

## Step 9: Save Results

In [None]:
# Write the run summary plus every per-sample record to a JSON file
constraint_accuracy = {
    name: sum(outcomes) / len(outcomes)
    for name, outcomes in constraint_stats.items()
}

output = {
    'model': MODEL_PATH,
    'total_samples': total_samples,
    'overall_pass_rate': overall_pass_rate,
    'constraint_accuracy': constraint_accuracy,
    'all_results': all_results,
}

with open('eval_results.json', 'w') as f:
    json.dump(output, f, indent=2)

print("✓ Results saved to eval_results.json")