# Lab 5: Stage 2 — Blind Evaluation
This notebook implements an **anonymized scoring system** to evaluate model outputs objectively. 

### Objectives:
- **Blind Review**: Hide model identities and prompting strategies from the evaluator to prevent bias.
- **Progressive Saving**: Automatically save progress to the `annotations/` directory after each entry.
- **Randomized Order**: Present responses in a random order to avoid sequence bias.

In [1]:
import json
import random
from pathlib import Path

import pandas as pd
from IPython.display import clear_output

In [2]:
# Working directories, all derived from the resolved current working directory
ROOT = Path(".").resolve()
OUTPUT_DIR, TASKS_DIR, ANNOTATIONS_DIR = (
    ROOT / folder for folder in ("outputs", "tasks", "annotations")
)
# Only the annotations folder is created here; the other two are just referenced
ANNOTATIONS_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# Load task definitions (one JSON file per task), keyed by their 'id' field,
# so the scoring loop can show the task name, criteria and example input.
task_files = sorted(TASKS_DIR.glob('*.json'))
TASKS = {}
for f in task_files:
    d = json.loads(f.read_text(encoding='utf-8'))
    TASKS[d['id']] = d

# Find the latest results file: names are sorted lexicographically and the
# last one is taken.
result_files = sorted(OUTPUT_DIR.glob('lab5_experiment_*.csv'))
if not result_files:
    # Fail with an actionable message instead of a bare IndexError on [-1]
    raise FileNotFoundError(
        f"No 'lab5_experiment_*.csv' files found in {OUTPUT_DIR}. "
        "Generate experiment results before starting the scoring session."
    )
latest_file = result_files[-1]
df = pd.read_csv(latest_file)

# If an annotation file exists for this run, we resume from it; otherwise we
# start from a fresh copy of the results.
annotation_file = ANNOTATIONS_DIR / f"annotations_{latest_file.name}"

if annotation_file.exists():
    print(f"Resuming from existing annotations: {annotation_file.name}")
    df_annotated = pd.read_csv(annotation_file)
else:
    df_annotated = df.copy()

# The scoring loop filters on 'score', so guarantee the column exists on both
# the fresh and the resumed path.
if 'score' not in df_annotated.columns:
    df_annotated['score'] = None

In [4]:
def run_manual_scoring(df_to_score, task_map, save_path=None):
    """
    Interactive loop for blind scoring of model responses.

    Presents, in random order, every row of ``df_to_score`` whose ``success``
    flag is set and whose ``score`` is still missing. For each row only the
    task name, the evaluation criteria, the task's example input and the
    response text are printed; no other column of the row is displayed.

    Args:
        df_to_score: DataFrame with at least the columns ``success``,
            ``score``, ``task_id`` and ``response``. It is modified in place:
            each accepted score is written into the ``score`` column.
        task_map: Mapping from task id to a task dict. The keys ``name``,
            ``eval_criteria`` and ``eval_example['input']`` are looked up and
            replaced by ``'N/A'`` when absent.
        save_path: Destination CSV for progressive saving. When ``None``
            (the default) the module-level ``annotation_file`` is used, which
            keeps existing two-argument calls working unchanged.

    Returns:
        None. The function returns early when there is nothing to score or
        when the user enters ``'q'``. A ``KeyboardInterrupt`` raised during
        the session is caught and reported instead of propagating.
    """
    pending = df_to_score['success'] & df_to_score['score'].isna()
    indices = df_to_score[pending].index.tolist()

    if not indices:
        print("Scoring complete. All items have been processed.")
        return

    # Resolve the destination only once there is something to save, so the
    # global is not required when the caller passes an explicit path.
    if save_path is None:
        save_path = annotation_file

    random.shuffle(indices) # Ensure evaluation order is random
    total = len(indices)

    # Initial clear to clean up setup messages
    clear_output(wait=False)

    try:
        for count, idx in enumerate(indices, 1):
            row = df_to_score.loc[idx]
            task = task_map.get(row['task_id'], {})

            # Display only the original input to maintain blindness to the strategy
            input_text = task.get('eval_example', {}).get('input', 'N/A')

            # Context and instructions (always shown inside the loop, because
            # the output area is cleared between items)
            print("="*80)
            print(f"Progress: {count}/{total}")
            print(f"Scale: 0 (Fail) to 5 (Perfect). 'q' to Save & Exit.")
            print("-" * 20)
            print(f"TASK: {task.get('name', 'N/A')}")
            print(f"Criteria: {task.get('eval_criteria', 'N/A')}")
            print("-" * 20)
            print(f"INPUT: {input_text}")
            print("-" * 20)
            print(f"MODEL RESPONSE:\n{row['response']}")
            print("="*80)

            # Keep asking until a number in [0, 5] or 'q' is entered
            while True:
                val = input("Score (0-5) or 'q': ").strip().lower()
                if val == 'q':
                    # Every accepted score was already written to disk
                    print("\nSession paused. Progress saved.")
                    return
                try:
                    score = float(val)
                except ValueError:
                    print("Invalid input. Use 0-5 or 'q'.")
                    continue
                if 0 <= score <= 5:
                    df_to_score.at[idx, 'score'] = score
                    # Immediate persistent save
                    df_to_score.to_csv(save_path, index=False)
                    break
                print("Error: Score must be between 0 and 5.")

            # Clear and wait for the next iteration's content
            clear_output(wait=True)

        print("Scoring complete! All selected items have been processed.")

    except KeyboardInterrupt:
        print("\nInterrupted. Progress saved.")

# Start (or resume) the interactive session, then report where scores are written
run_manual_scoring(df_to_score=df_annotated, task_map=TASKS)
print(f"\nAnnotations are stored in: {annotation_file}")

Scoring complete! All selected items have been processed.

Annotations are stored in: /Users/wojciechbartoszek/Documents/studia/lingwistyka_obliczeniowa/Lingwistyka-Obliczeniowa/lab5/annotations/annotations_lab5_experiment_20260111_115522.csv
