# HIX-Based LLM Evaluation Test

This notebook implements the HIX-style scoring methodology to evaluate LLMs on Easy Language conversion.

## Objective
Test two LLMs on their ability to simplify complex German texts according to Easy Language rules, scored with HIX-like metrics.

## Models to Test
- `qwen/qwen3-32b` - Strong multilingual model
- `llama-3.3-70b-versatile` - Proven reliable model

## Evaluation Criteria
- **HIX Components**: Sentence length, word length, clause structure
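- **Guardrails**: At most 10% of sentences longer than 20 words, little passive voice (at most 0.15 passive-auxiliary matches per sentence), few negations (at most 10% of words)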
- **Score Range**: 0-20 (higher = easier to read)


In [None]:
# 1. Setup & Dependencies
%pip install pandas groq python-dotenv --quiet

import os
import re
import json
import pandas as pd
from pathlib import Path
from groq import Groq
from dotenv import load_dotenv

# Pull variables from a .env file into the process environment, then read the key
load_dotenv()
api_key = os.getenv("GROQ_API_KEY")

# The client is only created when a key is present; otherwise the problem is reported
if api_key:
    client = Groq(api_key=api_key)
    print("Connected to Groq API")
else:
    print("ERROR: GROQ_API_KEY not found")


In [None]:
# 2. Configuration

# Models to test
# Each identifier is passed unchanged as the `model` argument of the chat
# completion request made in simplify_text().
MODELS = [
    "qwen/qwen3-32b",
    "llama-3.3-70b-versatile"
]

# Complex German test texts (from samples)
# Keys are short text-type labels used in the printed reports and in the
# "text_type" column of the results; values are the source texts to simplify.
TEST_TEXTS = {
    "legal": """Das Bundesministerium der Justiz und für Verbraucherschutz und das Bundesamt für Justiz stellen für interessierte Bürgerinnen und Bürger nahezu das gesamte aktuelle Bundesrecht kostenlos im Internet bereit. Die Gesetze und Rechtsverordnungen können in ihrer jeweils geltenden Fassung abgerufen werden. Sie werden durch die Dokumentationsstelle im Bundesamt für Justiz fortlaufend konsolidiert.""",
    
    "academic": """Die Arbeit analysiert das politische Framing in den Bundestagsdebatten rund um die Einführung und Abschaffung der sogenannten Praxisgebühr. Da Framing begrifflich und methodisch uneinheitlich verwendet wird, wurde ein politischer Framing-Ansatz hergeleitet. Insbesondere der konflikthaften Dimension politischen Framings kam bislang nur wenig Aufmerksamkeit zu."""
}

# Reference benchmarks (from Target corpus - Easy Language)
# These would normally be computed from a real corpus
# NOTE: the numbers below are hand-set placeholders. compute_hix_score() maps
# each metric linearly so that the "negative" value scores 0 and the "target"
# value scores 10.
BENCHMARKS = {
    "target": {  # P50 from Easy Language corpus
        "avg_sentence_length": 12.0,
        "avg_word_length": 5.5,
        "pct_long_sentences": 0.05,  # 5% sentences > 20 words
        "pct_long_words": 0.15       # 15% words > 6 chars
    },
    "negative": {  # P50 from Hard corpus
        "avg_sentence_length": 28.0,
        "avg_word_length": 7.5,
        "pct_long_sentences": 0.60,  # 60% sentences > 20 words
        "pct_long_words": 0.45       # 45% words > 6 chars
    }
}

# Guardrail thresholds
# check_guardrails() passes a text when each metric is <= its limit here.
# passive_rate is measured per sentence and negation_rate per word
# (see compute_metrics()).
GUARDRAILS = {
    "max_pct_long_sentences": 0.10,  # Max 10% sentences > 20 words
    "max_passive_rate": 0.15,        # Max 15% passive constructions
    "max_negation_rate": 0.10        # Max 10% negation words
}

# Echo the active configuration so the notebook output records what was run
print(f"Models: {MODELS}")
print(f"Test texts: {list(TEST_TEXTS.keys())}")
# Plain string: there is nothing to interpolate here
print("Benchmarks defined for scoring")


In [None]:
# 3. HIX Metrics Functions

def split_sentences(text: str) -> list:
    """Split German text into sentences.

    Dots inside a few common abbreviations (z.B., d.h., usw., ggf./ggfs.) are
    removed first so that they do not end a sentence. The text is then split
    on runs of ``.``, ``!`` and ``?``, except where such a run sits directly
    between two digits (``3.50``, ``10.000``, ``01.02.2024``).

    Args:
        text: Raw text to segment.

    Returns:
        The sentences in order, stripped of surrounding whitespace and without
        their terminating punctuation. Fragments that contain no word
        character (e.g. a trailing ``**`` or closing quote) are dropped so
        they are not counted as sentences.
    """
    # Case-insensitive so sentence-initial forms ("Z.B.") are covered as well;
    # deleting only the dots keeps the original casing of the abbreviation.
    text = re.sub(
        r'\b(?:z\.B\.|d\.h\.|usw\.|ggfs?\.)',
        lambda match: match.group(0).replace('.', ''),
        text,
        flags=re.IGNORECASE,
    )

    # A punctuation run is a boundary unless it has a digit on BOTH sides:
    # the first alternative needs a non-digit before it, the second a
    # non-digit (or the end of the text) after it.
    fragments = re.split(r'(?<!\d)[.!?]+|[.!?]+(?!\d)', text)
    stripped = (fragment.strip() for fragment in fragments)
    return [sentence for sentence in stripped if re.search(r'\w', sentence)]


def get_words(text: str) -> list:
    """Return every run of word characters found in ``text``, in order.

    Punctuation and hyphens act as separators, so ``Framing-Ansatz`` yields
    two entries; digits count as word characters.
    """
    word_pattern = re.compile(r'\b\w+\b')
    return [match.group(0) for match in word_pattern.finditer(text)]


def compute_metrics(text: str) -> dict:
    """Measure the readability features of ``text`` used for HIX-style scoring.

    Returns:
        ``{"error": "No content"}`` when the text yields no sentences or no
        words. Otherwise a dict with the sentence and word counts, the average
        sentence length (words) and word length (characters), the share of
        sentences longer than 20 words, the share of words longer than 6
        characters, and the passive and negation rates. Averages are rounded
        to one decimal, shares and rates to three.
    """
    sentence_list = split_sentences(text)
    word_list = get_words(text)

    if not (sentence_list and word_list):
        return {"error": "No content"}

    sentence_count = len(sentence_list)
    word_count = len(word_list)

    # Sentence-level figures: words per sentence and share above 20 words
    words_per_sentence = [len(get_words(sentence)) for sentence in sentence_list]
    mean_sentence_len = sum(words_per_sentence) / sentence_count
    share_long_sentences = (
        len([count for count in words_per_sentence if count > 20]) / sentence_count
    )

    # Word-level figures: characters per word and share above 6 characters
    chars_per_word = [len(word) for word in word_list]
    mean_word_len = sum(chars_per_word) / word_count
    share_long_words = len([size for size in chars_per_word if size > 6]) / word_count

    # Passive heuristic: counts forms of "werden" only; the participle is not
    # checked. The count is normalised per sentence.
    passive_hits = re.findall(
        r'\b(wird|werden|wurde|wurden|worden)\b', text, re.IGNORECASE
    )
    passive_per_sentence = len(passive_hits) / sentence_count

    # Negation words, normalised per word
    negation_hits = re.findall(
        r'\b(nicht|kein|keine|keiner|keinem|keinen|nie|niemals|nichts)\b',
        text,
        re.IGNORECASE,
    )
    negation_per_word = len(negation_hits) / word_count

    return {
        "n_sentences": sentence_count,
        "n_words": word_count,
        "avg_sentence_length": round(mean_sentence_len, 1),
        "avg_word_length": round(mean_word_len, 1),
        "pct_long_sentences": round(share_long_sentences, 3),
        "pct_long_words": round(share_long_words, 3),
        "passive_rate": round(passive_per_sentence, 3),
        "negation_rate": round(negation_per_word, 3),
    }


def clamp(value: float, min_val: float, max_val: float) -> float:
    """Limit ``value`` to the interval from ``min_val`` to ``max_val``."""
    # Cap at the upper bound first, then raise to the lower bound
    capped = value if value < max_val else max_val
    return capped if capped > min_val else min_val


def compute_hix_score(metrics: dict) -> dict:
    """Turn readability metrics into per-metric scores and a 0-20 HIX-like total.

    Each scored metric is placed linearly between the "negative" benchmark
    (score 0) and the "target" benchmark (score 10) and clamped to that range.
    A metric missing from ``metrics`` is treated as equal to the negative
    benchmark. The total is the mean of the per-metric scores, doubled.

    Returns:
        A dict with one ``score_<metric>`` entry per scored metric plus
        ``hix_like``, all rounded to two decimals.
    """
    easy_ref = BENCHMARKS["target"]
    hard_ref = BENCHMARKS["negative"]
    scored_metrics = (
        "avg_sentence_length",
        "avg_word_length",
        "pct_long_sentences",
        "pct_long_words",
    )

    component_scores = {}
    for name in scored_metrics:
        hard_value = hard_ref[name]
        easy_value = easy_ref[name]

        # Identical benchmarks give no scale to interpolate on: neutral score
        if hard_value == easy_value:
            component_scores[f"score_{name}"] = 5.0
            continue

        observed = metrics.get(name, hard_value)
        position = 10 * (hard_value - observed) / (hard_value - easy_value)
        component_scores[f"score_{name}"] = round(clamp(position, 0, 10), 2)

    # Mean of the 0-10 components, doubled to reach the 0-20 range
    mean_component = sum(component_scores.values()) / len(component_scores)
    return {**component_scores, "hix_like": round(mean_component * 2, 2)}


def check_guardrails(metrics: dict) -> dict:
    """Compare metrics with the guardrail limits.

    A metric missing from ``metrics`` is read as 1.0. The result holds one
    boolean per guardrail plus ``all_passed``, which is True only when every
    individual check passed.
    """
    # (result key, metric key, limit key in GUARDRAILS)
    rules = (
        ("pass_long_sentences", "pct_long_sentences", "max_pct_long_sentences"),
        ("pass_passive", "passive_rate", "max_passive_rate"),
        ("pass_negation", "negation_rate", "max_negation_rate"),
    )
    verdicts = {
        flag: metrics.get(metric_key, 1.0) <= GUARDRAILS[limit_key]
        for flag, metric_key, limit_key in rules
    }
    verdicts["all_passed"] = all(verdicts.values())
    return verdicts


# List the helpers this cell defined, one line each
for status_line in (
    "HIX metrics functions defined:",
    "  - compute_metrics(text)",
    "  - compute_hix_score(metrics)",
    "  - check_guardrails(metrics)",
):
    print(status_line)


In [None]:
# 4. LLM Simplification Function

# German system prompt sent with every simplification request (see
# simplify_text). It lists sentence-level rules (15-20 words at most, at most
# one comma, no nested clauses, active voice), word-level rules (no foreign
# words, explain hard words, hyphenate long words, no idioms, spell out
# abbreviations, avoid negations) and asks for the simplified text only.
SYSTEM_PROMPT = """Du bist Experte für Einfache Sprache (Plain German / Leichte Sprache).
Vereinfache den folgenden Text nach diesen Regeln:

SATZEBENE:
- Maximal 15-20 Wörter pro Satz
- Höchstens ein Komma pro Satz
- Keine Schachtelsätze oder Einschübe
- Aktive Verben verwenden, Passiv vermeiden
- Klare, eindeutige Sätze

WORTEBENE:
- Keine Fremdwörter
- Schwierige Wörter erklären
- Lange Wörter mit Bindestrich trennen
- Keine Redewendungen oder Metaphern
- Abkürzungen ausschreiben
- Verneinungen vermeiden

WICHTIG:
- Gib NUR den vereinfachten Text aus
- Behalte alle wichtigen Informationen
- Keine Erklärungen oder Kommentare"""


def _strip_reasoning(text: str) -> str:
    """Remove ``<think>...</think>`` reasoning blocks from a model response."""
    # A block left unterminated (e.g. output cut off by max_tokens) is removed
    # through to the end of the text.
    without_reasoning = re.sub(
        r'<think>.*?(?:</think>|\Z)', '', text, flags=re.DOTALL | re.IGNORECASE
    )
    return without_reasoning.strip()


def simplify_text(text: str, model: str) -> str:
    """Simplify text using the specified LLM.

    Args:
        text: Source text, sent as the user message.
        model: Model identifier passed to the chat completion endpoint.

    Returns:
        The simplified text with surrounding whitespace and any
        ``<think>...</think>`` reasoning block removed. On failure the return
        value starts with ``"ERROR: "``; callers detect failures by that
        prefix. No exception is raised.
    """
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": text}
            ],
            temperature=0.2,
            max_tokens=1000
        )
        content = completion.choices[0].message.content
    except Exception as e:
        # Deliberately broad: any failure (missing client, network, API error)
        # should be reported for this model without aborting the evaluation.
        return f"ERROR: {e}"

    if content is None:
        return "ERROR: model returned no content"

    # Reasoning models may embed their chain of thought in the content; it
    # must not be measured as if it were part of the simplified text.
    simplified = _strip_reasoning(content)
    if not simplified:
        return "ERROR: model returned no usable text"
    return simplified


# Confirmation in the cell output that the prompt and function are defined
print("LLM simplification function ready")


In [None]:
# 5. Baseline: Analyze Original Texts

# Score the unmodified source texts to get a reference point for the models
print("BASELINE: Original Text Metrics", "=" * 60, sep="\n")

for text_type, text in TEST_TEXTS.items():
    # Header with a 100-character preview of the text
    print(f"\n[{text_type.upper()}]", f"Text: {text[:100]}...", sep="\n")

    metrics = compute_metrics(text)
    scores = compute_hix_score(metrics)
    guardrails = check_guardrails(metrics)

    # Raw metrics, then the combined score and the guardrail verdict
    print(
        f"\nMetrics:",
        f"  Sentences: {metrics['n_sentences']}, Words: {metrics['n_words']}",
        f"  Avg sentence length: {metrics['avg_sentence_length']} words",
        f"  Avg word length: {metrics['avg_word_length']} chars",
        f"  % long sentences (>20 words): {metrics['pct_long_sentences']*100:.1f}%",
        f"  Passive rate: {metrics['passive_rate']*100:.1f}%",
        sep="\n",
    )
    print(
        f"\nHIX-like Score: {scores['hix_like']}/20",
        f"Guardrails passed: {guardrails['all_passed']}",
        "-" * 60,
        sep="\n",
    )


In [None]:
# 6. Run LLM Evaluation

# One dict per successful (text, model) pair; consumed by the summary cells
results = []

print("LLM SIMPLIFICATION TEST")
print("=" * 60)

for text_type, original_text in TEST_TEXTS.items():
    print(f"\n[{text_type.upper()}] Testing models...")

    # Get original metrics for comparison
    orig_metrics = compute_metrics(original_text)
    if "error" in orig_metrics:
        # Nothing measurable in the source text, so there is no baseline
        print(f"  Skipped: {orig_metrics['error']}")
        continue
    orig_scores = compute_hix_score(orig_metrics)

    for model in MODELS:
        model_short = model.split("/")[-1]
        print(f"  {model_short}...", end=" ", flush=True)

        # Simplify; failures come back as a string starting with "ERROR"
        simplified = simplify_text(original_text, model)

        if simplified.startswith("ERROR"):
            print(f"Failed: {simplified}")
            continue

        # Compute metrics; an empty or punctuation-only answer has none
        simp_metrics = compute_metrics(simplified)
        if "error" in simp_metrics:
            print(f"Failed: {simp_metrics['error']}")
            continue
        simp_scores = compute_hix_score(simp_metrics)
        guardrails = check_guardrails(simp_metrics)

        # Store result (improvement rounded to drop float subtraction noise)
        result = {
            "text_type": text_type,
            "model": model,
            "original": original_text,
            "simplified": simplified,
            "orig_hix": orig_scores["hix_like"],
            "simp_hix": simp_scores["hix_like"],
            "hix_improvement": round(simp_scores["hix_like"] - orig_scores["hix_like"], 2),
            "orig_avg_sent_len": orig_metrics["avg_sentence_length"],
            "simp_avg_sent_len": simp_metrics["avg_sentence_length"],
            "guardrails_passed": guardrails["all_passed"],
            **{f"guard_{k}": v for k, v in guardrails.items()}
        }
        results.append(result)

        # "+" in the format spec prints the real sign, so a regression shows
        # as "-1.2" instead of "+-1.2"
        print(f"HIX: {orig_scores['hix_like']} -> {simp_scores['hix_like']} ({result['hix_improvement']:+.1f})")

print("\n" + "=" * 60)
print("Evaluation complete!")


In [None]:
# 7. Results Summary

# One row per successful (text, model) pair collected by the evaluation loop
df = pd.DataFrame(results)

print("RESULTS SUMMARY")
print("=" * 60)

if df.empty:
    # Without rows the frame has no "model" column and groupby would raise
    print("No successful simplifications to summarise.")
else:
    # Summary by model
    summary = df.groupby("model").agg({
        "simp_hix": "mean",
        "hix_improvement": "mean",
        "simp_avg_sent_len": "mean",
        "guardrails_passed": "mean"
    }).round(2)

    summary.columns = ["Avg HIX Score", "Avg Improvement", "Avg Sent Len", "Guardrails Pass %"]
    # Round before the int cast: 0.29 * 100 is 28.999... and would truncate to 28
    pass_pct = (summary["Guardrails Pass %"] * 100).round().astype(int)
    summary["Guardrails Pass %"] = pass_pct.astype(str) + "%"

    display(summary)

    # Best model = highest mean HIX-like score of the simplified texts
    mean_hix_by_model = df.groupby("model")["simp_hix"].mean()
    best_model = mean_hix_by_model.idxmax()
    best_score = mean_hix_by_model.max()
    print(f"\nBest Model: {best_model}")
    print(f"Average HIX Score: {best_score:.2f}/20")


In [None]:
# 8. Detailed Output Comparison

def _preview(text, limit):
    """Return ``text`` cut to ``limit`` characters, with "..." only if it was cut."""
    return text if len(text) <= limit else text[:limit] + "..."


print("DETAILED OUTPUT COMPARISON")
print("=" * 60)

# One report per stored result row
for _, row in df.iterrows():
    model_short = row["model"].split("/")[-1]
    print(f"\n[{row['text_type'].upper()}] Model: {model_short}")
    # "+" in the format spec prints the real sign (no "+-1.2" on regressions)
    print(f"HIX: {row['orig_hix']} -> {row['simp_hix']} (improvement: {row['hix_improvement']:+.1f})")
    print(f"Sentence length: {row['orig_avg_sent_len']} -> {row['simp_avg_sent_len']} words")
    print(f"Guardrails: {'PASSED' if row['guardrails_passed'] else 'FAILED'}")

    print(f"\nOriginal ({len(row['original'])} chars):")
    print(f"  {_preview(row['original'], 150)}")

    print(f"\nSimplified ({len(row['simplified'])} chars):")
    print(f"  {_preview(row['simplified'], 300)}")
    print("-" * 60)


In [None]:
# 9. Save Results

# Write the per-run results next to the other processed data; the directory
# is created on demand. The path is relative to the working directory.
output_path = Path("../data/processed/hix_evaluation_results.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print(f"Results saved to: {output_path}")

print("\n" + "=" * 60)
print("HIX EVALUATION TEST COMPLETE")
print("=" * 60)
# Guardrail limits are read from GUARDRAILS so this summary cannot drift from
# the thresholds that were actually applied.
print(f"""
Summary:
- Tested {len(MODELS)} models on {len(TEST_TEXTS)} text types
- Used HIX-style scoring (0-20 scale)
- Applied guardrails: max {GUARDRAILS['max_pct_long_sentences']:.0%} long sentences, max {GUARDRAILS['max_passive_rate']:.0%} passive, max {GUARDRAILS['max_negation_rate']:.0%} negations

Next Steps:
1. Expand test corpus with more texts
2. Compute real benchmarks from DEplain corpus
3. Add more models to comparison
4. Fine-tune best performing model
""")
