In [None]:
import json
import random
from pathlib import Path

import pandas as pd

In [None]:
# Project layout: every folder used below is derived from the current working directory.
ROOT = Path(".").resolve()
OUTPUT_DIR, TASKS_DIR, ANNOTATIONS_DIR = (
    ROOT.joinpath(folder) for folder in ("outputs", "tasks", "annotations")
)

# Only the annotations folder is created here (no error if it already exists).
ANNOTATIONS_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Load task details for display, keyed by each task's 'id' field
task_files = sorted(TASKS_DIR.glob('*.json'))
TASKS = {}
for task_path in task_files:
    task_data = json.loads(task_path.read_text(encoding='utf-8'))
    TASKS[task_data['id']] = task_data

# Find the latest results file. File names are sorted lexicographically, so the
# last entry is taken as the latest one.
result_files = sorted(OUTPUT_DIR.glob('lab5_experiment_*.csv'))
if not result_files:
    # Fail with an actionable message instead of an IndexError on result_files[-1]
    raise FileNotFoundError(
        f"No 'lab5_experiment_*.csv' result files found in {OUTPUT_DIR}; "
        "nothing to annotate."
    )
latest_file = result_files[-1]
df = pd.read_csv(latest_file)

# If an annotation file exists for this run, we resume from it
annotation_file = ANNOTATIONS_DIR / f"annotations_{latest_file.name}"

if annotation_file.exists():
    print(f"Resuming from existing annotations: {annotation_file.name}")
    df_annotated = pd.read_csv(annotation_file)
else:
    df_annotated = df.copy()

# Guarantee a 'score' column on both the fresh and the resumed path.
# NaN (float) is used rather than None so the column holds the same kind of
# missing value that pandas produces when the annotations CSV is read back.
if 'score' not in df_annotated.columns:
    df_annotated['score'] = float('nan')

In [None]:
def run_manual_scoring(df_to_score, task_map, save_path=None):
    """Interactively collect 0-5 scores for successful, not-yet-scored rows.

    Rows are shown in random order. For each row the task name, description,
    criteria and the task's plain example input are printed alongside the
    model response, so the prompting strategy is not revealed to the scorer.
    The whole frame is written to CSV after every accepted score.

    Args:
        df_to_score: DataFrame with 'success', 'score', 'task_id' and
            'response' columns. Modified in place: accepted scores are
            written into the 'score' column.
        task_map: Mapping of task id to a task dict. The keys read are
            'name', 'description', 'eval_criteria' and
            'eval_example' -> 'input'; anything missing is shown as 'N/A'.
        save_path: Destination CSV path. When None (the default) the
            module-level ``annotation_file`` is used, which keeps existing
            two-argument calls working.

    Returns:
        None. Returns early when nothing is left to score or when the user
        types 'q'; a KeyboardInterrupt ends the session with a message.
    """
    if save_path is None:
        # Backward-compatible default: fall back to the notebook-level path.
        save_path = annotation_file

    # We only score successful runs that haven't been scored yet.
    # eq(True) yields a clean boolean mask even when 'success' is an object
    # column containing missing values, which a plain `&` does not guarantee.
    pending = df_to_score['success'].eq(True) & df_to_score['score'].isna()
    indices = df_to_score[pending].index.tolist()

    if not indices:
        print("All entries have been scored!")
        return

    random.shuffle(indices)

    total = len(indices)
    print(f"Total entries to score: {total}")
    print("Scoring Rules: 0 (poor) to 5 (excellent). Type 'q' to quit and save.")

    try:
        for position, idx in enumerate(indices, start=1):
            row = df_to_score.loc[idx]
            task = task_map.get(row['task_id'], {})

            # Extract plain input from the task data to hide prompting strategy.
            # `or {}` also covers an explicit null 'eval_example' entry.
            input_text = (task.get('eval_example') or {}).get('input', 'N/A')

            print("\n" + "="*80)
            print(f"Progress: {position}/{total}")
            print(f"TASK: {task.get('name', 'N/A')}")
            print(f"Description: {task.get('description', 'N/A')}")
            print(f"Criteria: {task.get('eval_criteria', 'N/A')}")
            print("-" * 20)
            print(f"INPUT: {input_text}")
            print("-" * 20)
            print(f"MODEL RESPONSE:\n{row['response']}")
            print("="*80)

            # Keep prompting until a valid score (or 'q') is entered
            while True:
                val = input("Score (0-5) or 'q': ").strip().lower()
                if val == 'q':
                    print("\nQuitting...")
                    return

                # Only the parse is guarded, so a ValueError raised while
                # storing or saving is not misreported as bad user input.
                try:
                    score = float(val)
                except ValueError:
                    print("Invalid input. Enter 0-5 or 'q'.")
                    continue

                # NaN fails this comparison too, so it is rejected as out of range
                if not 0 <= score <= 5:
                    print("Please enter a number between 0 and 5.")
                    continue

                df_to_score.at[idx, 'score'] = score
                # Save progress after each entry to be safe
                df_to_score.to_csv(save_path, index=False)
                break
    except KeyboardInterrupt:
        print("\nInterrupted. Progress saved.")

# Run the interactive scoring session on the loaded (or resumed) annotations
run_manual_scoring(df_annotated, TASKS)

# Confirm the file is actually on disk before reporting it as saved: a session
# that ends before any score is accepted on a fresh run leaves nothing behind.
if annotation_file.exists():
    print(f"\nFinal annotations saved to {annotation_file}")
else:
    print(f"\nNo annotations were saved; {annotation_file} does not exist yet.")