# Instruction Following Evaluation with Check Functions

Evaluates model responses using programmatic constraint checkers instead of an LLM judge.

## Setup

In [27]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import numpy as np
from typing import List, Dict, Any
import re
from datasets import load_dataset
from collections import defaultdict

## Load Dataset

In [28]:
# Load from Hugging Face dataset
# Only the "train" split is used; `data` is the subset every later cell works on.
ds = load_dataset("wis-k/instruction-following-eval")
train_ds = ds["train"]
data = train_ds.select(range(30))  # Select 30 samples (row indices 0-29)

# Preview the first row: the prompt truncated to 100 characters, its list of
# instruction ids, and the kwargs dict stored at position 0 of its kwargs list.
print(f"Loaded {len(data)} samples")
print(f"\nFirst sample:")
print(f"Prompt: {data[0]['prompt'][:100]}...")
print(f"Constraints: {data[0]['instruction_id_list']}")
print(f"Kwargs: {data[0]['kwargs'][0]}")

Loaded 30 samples

First sample:
Prompt: Write a 300+ word summary of the wikipedia page "https://en.wikipedia.org/wiki/Raymond_III,_Count_of...
Constraints: ['punctuation:no_comma', 'detectable_format:number_highlighted_sections', 'length_constraints:number_words']
Kwargs: {'num_highlights': None, 'relation': None, 'num_words': None, 'num_placeholders': None, 'prompt_to_repeat': None, 'num_bullets': None, 'section_spliter': None, 'num_sections': None, 'capital_relation': None, 'capital_frequency': None, 'keywords': None, 'num_paragraphs': None, 'language': None, 'let_relation': None, 'letter': None, 'let_frequency': None, 'end_phrase': None, 'forbidden_words': None, 'keyword': None, 'frequency': None, 'num_sentences': None, 'postscript_marker': None, 'first_word': None, 'nth_paragraph': None}


## Define Check Functions

In [29]:
def check_punctuation_no_comma(response: str, kwargs: Dict) -> tuple:
    """Pass only when the response contains no ',' character.

    ``kwargs`` is accepted for signature parity with the other checkers and is
    not read. Returns ``(passed, explanation)``.
    """
    comma_total = response.count(',')
    return comma_total == 0, f"Commas: {comma_total}"

def check_highlighted_sections(response: str, kwargs: Dict) -> tuple:
    """Check that the response has enough markdown highlights (``*text*``).

    Args:
        response: Model output to inspect.
        kwargs: Constraint parameters; ``num_highlights`` is the minimum number
            of highlighted spans required. A missing or ``None`` value is
            treated as 0.

    Returns:
        ``(passed, explanation)`` where ``passed`` is True when at least
        ``num_highlights`` non-blank highlights were found.
    """
    # Unused kwargs arrive as explicit None values, so a .get() default alone
    # would not apply and the comparison below would raise TypeError.
    num_required = kwargs.get('num_highlights') or 0
    # A highlight must open and close on the same line; excluding '\n' keeps
    # consecutive "* item" bullet lines from being read as one highlight.
    candidates = re.findall(r'\*[^*\n]+\*', response)
    # Ignore spans that contain only whitespace between the asterisks.
    count = sum(1 for span in candidates if span.strip('*').strip())
    return count >= num_required, f"Highlights: {count}/{num_required}"

def check_number_words(response: str, kwargs: Dict) -> tuple:
    """Check the whitespace-delimited word count against a threshold.

    Args:
        response: Model output to inspect.
        kwargs: Constraint parameters. ``num_words`` is the threshold and
            ``relation`` is ``'at least'`` or ``'less than'``; any other
            relation means an exact match. Missing or ``None`` values fall
            back to 0 and ``'at least'``.

    Returns:
        ``(passed, explanation)``.
    """
    words = len(response.split())
    # `or` rather than a .get() default: keys may be present with value None.
    required = kwargs.get('num_words') or 0
    relation = kwargs.get('relation') or 'at least'

    if relation == 'at least':
        passed = words >= required
    elif relation == 'less than':
        passed = words < required
    else:
        passed = words == required

    return passed, f"Words: {words} ({relation} {required})"

def check_number_placeholders(response: str, kwargs: Dict) -> tuple:
    """Check that the response has enough ``[placeholder]`` tokens.

    Args:
        response: Model output to inspect.
        kwargs: Constraint parameters; ``num_placeholders`` is the minimum
            number of bracketed placeholders such as ``[address]``. A missing
            or ``None`` value is treated as 0.

    Returns:
        ``(passed, explanation)``.
    """
    # `or 0` covers the key being present with an explicit None value.
    num_required = kwargs.get('num_placeholders') or 0
    # A placeholder is a non-empty run of characters enclosed in square brackets.
    count = len(re.findall(r'\[[^\]]+\]', response))
    return count >= num_required, f"Placeholders: {count}/{num_required}"

def check_repeat_prompt(response: str, kwargs: Dict) -> tuple:
    """Check that the response begins by repeating the given prompt.

    Args:
        response: Model output to inspect.
        kwargs: Constraint parameters; ``prompt_to_repeat`` is the text the
            response must start with. Missing, ``None`` or blank means there is
            nothing to check and the constraint passes.

    Returns:
        ``(passed, explanation)``.
    """
    expected = (kwargs.get('prompt_to_repeat') or '').strip()
    if not expected:
        return True, "No prompt to repeat"
    # Compare stripped text against stripped text. Slicing the response to the
    # unstripped prompt length made prompts with trailing whitespace never match.
    passed = response.strip().startswith(expected)
    return passed, f"Prompt repeated: {passed}"

def check_title_format(response: str, kwargs: Dict) -> tuple:
    """Pass when the response contains a ``<<title>>`` marker.

    The marker must hold at least one character other than ``>`` between the
    double angle brackets. ``kwargs`` is not read.
    Returns ``(passed, explanation)``.
    """
    title_match = re.search(r'<<[^>]+>>', response)
    has_title = title_match is not None
    return has_title, f"Title found: {has_title}"

def check_english_lowercase(response: str, kwargs: Dict) -> tuple:
    """Pass when the response contains no uppercase character.

    Characters without case (digits, punctuation, whitespace) are ignored, so
    an empty response passes. ``kwargs`` is not read.
    Returns ``(passed, explanation)``.
    """
    all_lower = not any(map(str.isupper, response))
    return all_lower, f"All lowercase: {all_lower}"

def check_number_bullet_lists(response: str, kwargs: Dict) -> tuple:
    """Check that the response has exactly the required number of bullets.

    A bullet is a line that starts, after optional indentation, with ``*`` or
    ``-`` followed by a space. Both markdown markers are counted so a response
    that uses ``-`` is not scored as having zero bullets.

    Args:
        response: Model output to inspect.
        kwargs: Constraint parameters; ``num_bullets`` is the exact number of
            bullets expected. A missing or ``None`` value is treated as 0.

    Returns:
        ``(passed, explanation)``.
    """
    # `or 0` covers the key being present with an explicit None value.
    num_required = kwargs.get('num_bullets') or 0
    # The trailing space keeps "**bold**" lines and "---" rules from counting.
    bullets = re.findall(r'^[ \t]*[*-] ', response, re.MULTILINE)
    count = len(bullets)
    return count == num_required, f"Bullets: {count}/{num_required}"

def check_english_capital(response: str, kwargs: Dict) -> tuple:
    """Pass when every alphabetic character in the response is uppercase.

    Non-alphabetic characters are ignored; a response with no letters at all
    fails. ``kwargs`` is not read. Returns ``(passed, explanation)``.
    """
    alpha_chars = [ch for ch in response if ch.isalpha()]
    if len(alpha_chars) == 0:
        return False, "No letters"

    all_upper = True
    for ch in alpha_chars:
        if not ch.isupper():
            all_upper = False
            break
    return all_upper, f"All uppercase: {all_upper}"

def check_multiple_sections(response: str, kwargs: Dict) -> tuple:
    """Check that the response has enough numbered section markers.

    A marker is the splitter text followed by optional whitespace and a number
    (for example ``SECTION 1``), matched case-insensitively.

    Args:
        response: Model output to inspect.
        kwargs: Constraint parameters. ``num_sections`` is the minimum number
            of markers and ``section_spliter`` is the marker text. Missing or
            ``None`` values fall back to 0 and ``'SECTION'``.

    Returns:
        ``(passed, explanation)``.
    """
    # `or` rather than a .get() default: keys may be present with value None.
    num_required = kwargs.get('num_sections') or 0
    splitter = kwargs.get('section_spliter') or 'SECTION'
    # Escape the splitter so it is matched literally; an unescaped splitter
    # such as "***" is an invalid regular expression.
    pattern = re.escape(splitter) + r'\s*\d+'
    count = len(re.findall(pattern, response, re.IGNORECASE))
    return count >= num_required, f"Sections: {count}/{num_required}"

def check_capital_word_frequency(response: str, kwargs: Dict) -> tuple:
    """Check how many all-caps words the response contains.

    A word counts when, after surrounding punctuation is removed, it has at
    least two characters, is purely alphabetic and is fully uppercase.

    Args:
        response: Model output to inspect.
        kwargs: Constraint parameters. ``capital_frequency`` is the threshold
            and ``capital_relation`` is ``'at least'`` or ``'less than'``; any
            other relation means an exact match. Missing or ``None`` values
            fall back to 0 and ``'at least'``.

    Returns:
        ``(passed, explanation)``.
    """
    # `or` rather than a .get() default: keys may be present with value None.
    capital_frequency = kwargs.get('capital_frequency') or 0
    capital_relation = kwargs.get('capital_relation') or 'at least'

    count = 0
    for token in response.split():
        # Trim leading/trailing punctuation so "STOP!" and "NOW," still count;
        # testing the raw token with isalpha() silently dropped them.
        core = re.sub(r'^\W+|\W+$', '', token)
        if len(core) >= 2 and core.isalpha() and core.isupper():
            count += 1

    if capital_relation == 'at least':
        passed = count >= capital_frequency
    elif capital_relation == 'less than':
        passed = count < capital_frequency
    else:
        passed = count == capital_frequency

    return passed, f"Capital words: {count} ({capital_relation} {capital_frequency})"

def check_quotation(response: str, kwargs: Dict) -> tuple:
    """Check that the whole response is wrapped in double quotation marks.

    Surrounding whitespace is ignored. ``kwargs`` is not read.

    Returns:
        ``(passed, explanation)``.
    """
    body = response.strip()
    # Require two characters so a lone '"' cannot serve as both the opening
    # and the closing quote.
    has_quotes = len(body) >= 2 and body.startswith('"') and body.endswith('"')
    return has_quotes, f"Quoted: {has_quotes}"

def check_keywords_existence(response: str, kwargs: Dict) -> tuple:
    """Pass when every keyword occurs in the response (case-insensitive substring).

    ``kwargs['keywords']`` is the list of required keywords; when it is
    missing, ``None`` or empty the check passes trivially.
    Returns ``(passed, explanation)``.
    """
    keywords = kwargs.get('keywords', [])
    if not keywords:
        return True, "No keywords"

    haystack = response.lower()
    missing = []
    for keyword in keywords:
        if keyword.lower() not in haystack:
            missing.append(keyword)

    if missing:
        return False, f"Missing: {missing}"
    return True, "All present"

def check_json_format(response: str, kwargs: Dict) -> tuple:
    """Check that the response is valid JSON, optionally inside a code fence.

    A leading ``` fence (with an optional ``json`` tag in any letter case) and
    a trailing ``` fence are removed before parsing. ``kwargs`` is not read.

    Returns:
        ``(True, "Valid JSON")`` when parsing succeeds, otherwise
        ``(False, "Invalid JSON")``.
    """
    payload = response.strip()
    if payload.startswith('```'):
        # Case-insensitive so "```JSON" and "```Json" fences are stripped too.
        payload = re.sub(r'^```(?:json)?\s*', '', payload, flags=re.IGNORECASE)
        payload = re.sub(r'```\s*$', '', payload)
    try:
        json.loads(payload)
    # json.JSONDecodeError subclasses ValueError. A bare `except:` would also
    # swallow KeyboardInterrupt and SystemExit.
    except (ValueError, RecursionError):
        return False, "Invalid JSON"
    return True, "Valid JSON"

def check_number_paragraphs(response: str, kwargs: Dict) -> tuple:
    """Compare the number of non-blank paragraphs with ``num_paragraphs``.

    Paragraphs are separated by a blank line or by a ``***`` divider. The count
    must be >= the target when ``relation`` is ``'at least'`` (also the value
    used when the key is absent); any other relation value requires equality.
    Returns ``(passed, explanation)``.
    """
    num_required = kwargs.get('num_paragraphs', 0)
    relation = kwargs.get('relation', 'at least')

    chunks = re.split(r'\n\s*\n|\*\*\*', response)
    count = sum(1 for chunk in chunks if chunk.strip())

    passed = count >= num_required if relation == 'at least' else count == num_required
    return passed, f"Paragraphs: {count} ({relation} {num_required})"

def check_two_responses(response: str, kwargs: Dict) -> tuple:
    """Check that the response holds two different answers split by ``******``.

    Blank parts (for example a separator at the very end) are not counted as
    answers, and the two answers must differ. ``kwargs`` is not read.

    Returns:
        ``(passed, explanation)``; the explanation reports the number of
        non-blank parts found.
    """
    answers = [part.strip() for part in response.split('******')]
    answers = [part for part in answers if part]
    # Counting raw split parts let "answer******" pass with a single answer.
    passed = len(answers) == 2 and answers[0] != answers[1]
    return passed, f"Parts: {len(answers)}/2"

def check_response_language(response: str, kwargs: Dict) -> tuple:
    """Simplified language check: pass whenever the response is not blank.

    No language detection is performed; ``kwargs['language']`` (``'en'`` when
    the key is absent) is only echoed in the explanation.
    Returns ``(passed, explanation)``.
    """
    lang = kwargs.get('language', 'en')
    non_blank = bool(response.strip())
    return non_blank, f"Lang check ({lang})"

def check_letter_frequency(response: str, kwargs: Dict) -> tuple:
    """Check how often a letter occurs in the response, ignoring case.

    Args:
        response: Model output to inspect.
        kwargs: Constraint parameters. ``letter`` is the letter to count,
            ``let_frequency`` is the threshold and ``let_relation`` is
            ``'at least'`` or ``'less than'``; any other relation means an
            exact match. Missing or ``None`` values fall back to no letter
            (trivial pass), 0 and ``'at least'``.

    Returns:
        ``(passed, explanation)``.
    """
    # `or` rather than a .get() default: keys may be present with value None.
    letter = kwargs.get('letter') or ''
    frequency = kwargs.get('let_frequency') or 0
    relation = kwargs.get('let_relation') or 'at least'

    if not letter:
        return True, "No letter"

    # Count case-insensitively so 'A' and 'a' are the same letter.
    count = response.lower().count(letter.lower())

    if relation == 'at least':
        passed = count >= frequency
    elif relation == 'less than':
        passed = count < frequency
    else:
        passed = count == frequency

    return passed, f"'{letter}': {count} ({relation} {frequency})"

def check_end_checker(response: str, kwargs: Dict) -> tuple:
    """Check that the response ends with the required phrase.

    Args:
        response: Model output to inspect.
        kwargs: Constraint parameters; ``end_phrase`` is the required ending.
            Missing, ``None`` or blank means there is nothing to check and the
            constraint passes.

    Returns:
        ``(passed, explanation)``.
    """
    # Strip the phrase as well as the response: the response loses its
    # trailing whitespace, so an unstripped phrase ending in a space or
    # newline could never match.
    end_phrase = (kwargs.get('end_phrase') or '').strip()
    if not end_phrase:
        return True, "No end phrase"
    passed = response.strip().endswith(end_phrase)
    return passed, f"Ends correctly: {passed}"

def check_forbidden_words(response: str, kwargs: Dict) -> tuple:
    """Check that none of the forbidden words appear in the response.

    Matching is case-insensitive and on whole words, so a forbidden ``cat``
    is not triggered by ``concatenate``.

    Args:
        response: Model output to inspect.
        kwargs: Constraint parameters; ``forbidden_words`` is the list of
            banned words. Missing, ``None`` or empty passes trivially.

    Returns:
        ``(passed, explanation)``; the explanation lists the words found.
    """
    forbidden = kwargs.get('forbidden_words') or []
    if not forbidden:
        return True, "No forbidden words"

    found = []
    for word in forbidden:
        # Lookarounds rather than \b so words that begin or end with a
        # non-word character (e.g. "c++") are still bounded correctly.
        pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
        if re.search(pattern, response, re.IGNORECASE):
            found.append(word)

    if found:
        return False, f"Forbidden: {found}"
    return True, "None"

# Registry: instruction id -> checker function, grouped by instruction family.
# Ids that are not listed here are reported as "Unknown" by evaluate_single.
CONSTRAINT_CHECKERS = {
    # punctuation
    'punctuation:no_comma': check_punctuation_no_comma,

    # detectable_format
    'detectable_format:number_highlighted_sections': check_highlighted_sections,
    'detectable_format:title': check_title_format,
    'detectable_format:number_bullet_lists': check_number_bullet_lists,
    'detectable_format:multiple_sections': check_multiple_sections,
    'detectable_format:json_format': check_json_format,

    # detectable_content
    'detectable_content:number_placeholders': check_number_placeholders,

    # length_constraints
    'length_constraints:number_words': check_number_words,
    'length_constraints:number_paragraphs': check_number_paragraphs,

    # combination
    'combination:repeat_prompt': check_repeat_prompt,
    'combination:two_responses': check_two_responses,

    # change_case
    'change_case:english_lowercase': check_english_lowercase,
    'change_case:english_capital': check_english_capital,
    'change_case:capital_word_frequency': check_capital_word_frequency,

    # startend
    'startend:quotation': check_quotation,
    'startend:end_checker': check_end_checker,

    # keywords
    'keywords:existence': check_keywords_existence,
    'keywords:letter_frequency': check_letter_frequency,
    'keywords:forbidden_words': check_forbidden_words,

    # language
    'language:response_language': check_response_language,
}

print(f"✓ Loaded {len(CONSTRAINT_CHECKERS)} check functions")

✓ Loaded 20 check functions


## Evaluation Function

In [30]:
def evaluate_single(response: str, instruction_id_list: List[str], kwargs_list: List[Dict]) -> Dict:
    """Run every listed constraint checker against one response.

    Each instruction id is paired with the kwargs at the same position in
    ``kwargs_list``; a missing or ``None`` entry becomes an empty dict. An id
    with no registered checker, or a checker that raises, is recorded as a
    failure with an explanatory message.

    Returns a dict with ``results`` (one ``{'constraint', 'passed',
    'explanation'}`` entry per instruction), ``passed``, ``total`` and
    ``pass_rate`` (0 when there are no instructions).
    """
    outcomes = []

    for position, constraint_id in enumerate(instruction_id_list):
        params = kwargs_list[position] if position < len(kwargs_list) else {}
        if params is None:
            params = {}

        checker = CONSTRAINT_CHECKERS.get(constraint_id)

        if not checker:
            ok, note = False, f"Unknown: {constraint_id}"
        else:
            try:
                ok, note = checker(response, params)
            except Exception as exc:
                # A crashing checker counts as a failed constraint.
                ok, note = False, f"Error: {str(exc)}"

        outcomes.append({
            'constraint': constraint_id,
            'passed': ok,
            'explanation': note
        })

    n_total = len(outcomes)
    n_passed = sum(item['passed'] for item in outcomes)

    return {
        'results': outcomes,
        'passed': n_passed,
        'total': n_total,
        'pass_rate': n_passed / n_total if n_total > 0 else 0
    }

print("✓ Evaluation function ready")

✓ Evaluation function ready


In [31]:
!dir "C:/Users/namnd/.cache/huggingface/hub/"

 Volume in drive C has no label.
 Volume Serial Number is 3E16-8FE4

 Directory of C:\Users\namnd\.cache\huggingface\hub

09/12/2025  14:20    <DIR>          .
09/12/2025  14:20    <DIR>          ..
04/12/2025  13:57    <DIR>          .locks
09/12/2025  14:20    <DIR>          datasets--wis-k--instruction-following-eval
09/12/2025  14:28    <DIR>          models--Qwen--Qwen2.5-Coder-3B-Instruct
               0 File(s)              0 bytes
               5 Dir(s)  347,849,576,448 bytes free


In [33]:
from transformers import BitsAndBytesConfig

MODEL_PATH = "C:/Users/namnd/Documents/QwenCoder-40"

print(f"Loading {MODEL_PATH}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# 8-bit quantization. BitsAndBytesConfig has no `bnb_8bit_compute_dtype`
# option (a compute dtype can only be chosen for the 4-bit path, via
# `bnb_4bit_compute_dtype`), so only `load_in_8bit` is passed here.
# Needs an up-to-date bitsandbytes install: `pip install -U bitsandbytes`.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    quantization_config=quantization_config,
    device_map="cuda:0",  # place the whole model on the first GPU
    low_cpu_mem_usage=True,
)
model.eval()  # inference only
torch.cuda.empty_cache()
print("✓ Model loaded!")

Loading C:/Users/namnd/Documents/QwenCoder-40...


ImportError: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

In [26]:
def generate_response(prompt: str, max_tokens: int = 800, temperature: float = 0.4) -> str:
    """Generate the model's reply to a single-turn user prompt.

    The prompt is wrapped with the tokenizer's chat template, sampled from the
    module-level ``model`` and returned without the prompt text.

    Args:
        prompt: User message to send to the model.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature (sampling is always enabled, with
            ``top_p=0.9``).

    Returns:
        The decoded reply with special tokens removed and surrounding
        whitespace stripped.
    """
    messages = [{"role": "user", "content": prompt}]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            do_sample=True,
            top_p=0.9,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the tokens that follow the prompt. Splitting the decoded
    # text on the word "assistant" cut off any reply that itself contained
    # that word, which corrupted the text handed to the checkers.
    prompt_length = inputs["input_ids"].shape[1]
    gen_ids = outputs[0][prompt_length:]
    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

print("✓ Generation function ready")

✓ Generation function ready


In [18]:
# Environment sanity check: report whether CUDA is usable, which device the
# loaded model sits on, and the dtype the model reports.
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device: {model.device}")
print(f"Model dtype: {model.dtype}")

CUDA available: True
Device: cuda:0
Model dtype: torch.float16


## Test on One Sample

In [19]:
# Smoke test: generate and score one dataset row before the full run.
TEST_SAMPLE_INDEX = 0

print(f"Testing sample {TEST_SAMPLE_INDEX}...")
test_sample = data[TEST_SAMPLE_INDEX]

# Prompt preview, truncated to 200 characters
print("\nPrompt:")
print(test_sample['prompt'][:200] + "...")

print("\nGenerating response...")
test_response = generate_response(test_sample['prompt'])

# Full model output framed by 80-column rules
print("\nResponse:")
print("=" * 80)
print(test_response)
print("=" * 80)

# Each instruction id followed by the kwargs stored at the same position
print("\nConstraints:")
for number, constraint_id in enumerate(test_sample['instruction_id_list'], start=1):
    print(f"  {number}. {constraint_id}")
    print(f"     {test_sample['kwargs'][number - 1]}")

print("\nEvaluating...")
test_eval = evaluate_single(
    test_response,
    test_sample['instruction_id_list'],
    test_sample['kwargs']
)

print("\n" + "=" * 80)
print(f"RESULTS: {test_eval['passed']}/{test_eval['total']} = {test_eval['pass_rate']:.1%}")
print("=" * 80)

# One line per constraint: pass/fail mark, id, checker explanation
for outcome in test_eval['results']:
    if outcome['passed']:
        mark = '✓'
    else:
        mark = '✗'
    print(f"{mark} {outcome['constraint']}: {outcome['explanation']}")

Testing sample 0...

Prompt:
Write a 300+ word summary of the wikipedia page "https://en.wikipedia.org/wiki/Raymond_III,_Count_of_Tripoli". Do not use any commas and highlight at least 3 sections that has titles in markdown forma...

Generating response...


KeyboardInterrupt: 

## Run Full Evaluation

In [None]:
print("Running full evaluation...\n")

all_results = []
# constraint id -> list of pass/fail outcomes across all samples
constraint_stats = defaultdict(list)

# Open text file for detailed output
txt_output_path = 'instruction_eval_detailed.txt'
with open(txt_output_path, 'w', encoding='utf-8') as txt_file:
    txt_file.write("INSTRUCTION FOLLOWING EVALUATION - DETAILED RESULTS\n")
    txt_file.write("=" * 80 + "\n\n")

    for idx, sample in enumerate(tqdm(data, desc="Evaluating")):
        # Fall back to the row index when the sample carries no 'key' field.
        sample_key = sample.get('key', idx)

        # Generate
        response = generate_response(sample['prompt'])

        # Evaluate
        eval_result = evaluate_single(
            response,
            sample['instruction_id_list'],
            sample['kwargs']
        )

        # Store
        all_results.append({
            'idx': idx,
            'key': sample_key,
            'prompt': sample['prompt'],
            'response': response,
            'eval': eval_result
        })

        # Track per-constraint
        for r in eval_result['results']:
            constraint_stats[r['constraint']].append(r['passed'])

        # Write to text file
        txt_file.write(f"\nSAMPLE {idx} (Key: {sample_key})\n")
        txt_file.write("=" * 80 + "\n\n")

        txt_file.write("PROMPT:\n")
        txt_file.write("-" * 80 + "\n")
        txt_file.write(sample['prompt'] + "\n\n")

        txt_file.write("MODEL OUTPUT:\n")
        txt_file.write("-" * 80 + "\n")
        txt_file.write(response + "\n\n")

        txt_file.write("EVALUATION:\n")
        txt_file.write("-" * 80 + "\n")
        txt_file.write(f"Pass Rate: {eval_result['pass_rate']:.1%} ({eval_result['passed']}/{eval_result['total']})\n\n")

        txt_file.write("Constraint Results:\n")
        for r in eval_result['results']:
            status = '✓ PASS' if r['passed'] else '✗ FAIL'
            txt_file.write(f"  {status} - {r['constraint']}\n")
            txt_file.write(f"           {r['explanation']}\n")

        txt_file.write("\n" + "=" * 80 + "\n")

    # Calculate final metrics
    total_samples = len(all_results)
    # Guard the mean so an empty run still writes a summary instead of
    # raising ZeroDivisionError.
    if total_samples:
        average_pass_rate = sum(r['eval']['pass_rate'] for r in all_results) / total_samples
    else:
        average_pass_rate = 0.0
    perfect = sum(1 for r in all_results if r['eval']['pass_rate'] == 1.0)
    partial = sum(1 for r in all_results if 0 < r['eval']['pass_rate'] < 1.0)
    failed = sum(1 for r in all_results if r['eval']['pass_rate'] == 0.0)

    # Write summary to text file
    txt_file.write("\n\n")
    txt_file.write("=" * 80 + "\n")
    txt_file.write("FINAL SUMMARY\n")
    txt_file.write("=" * 80 + "\n\n")

    txt_file.write("Overall Performance:\n")
    txt_file.write(f"  Total Samples: {total_samples}\n")
    txt_file.write(f"  Average Pass Rate: {average_pass_rate:.1%}\n\n")

    txt_file.write("Breakdown:\n")
    txt_file.write(f"  Perfect (100%): {perfect} samples\n")
    txt_file.write(f"  Partial (>0% <100%): {partial} samples\n")
    txt_file.write(f"  Failed (0%): {failed} samples\n\n")

    txt_file.write("Per-Constraint Accuracy:\n")
    txt_file.write("-" * 80 + "\n")
    # Each list in constraint_stats has at least one entry, so the division
    # below cannot hit zero.
    for constraint, passes in sorted(constraint_stats.items()):
        accuracy = sum(passes) / len(passes)
        txt_file.write(f"  {constraint}:\n")
        txt_file.write(f"    Accuracy: {accuracy:.1%} ({sum(passes)}/{len(passes)})\n")

    txt_file.write("\n" + "=" * 80 + "\n")

print(f"\n✓ Evaluation complete!")
print(f"✓ Detailed results saved to {txt_output_path}")
print(f"✓ Average Pass Rate: {average_pass_rate:.1%}")