# Module 4: Human and LLM Headline Ranking
- Step 1: Load headlines
- Step 2: MANUALLY filter out some clearly trivial topics
- Step 3: Create pairwise matchups
- Step 4: Create headline label mappings
- Step 5: Crowd-source human evaluation
- Step 6: Obtain rankings from human evaluation results
- Step 7: Generate LLM headline rankings

Specify the week of meetings to process.

In [None]:
#### EDIT THIS
# First and last day of the week to process, written as "YYYYMMDD" strings.
# Both values are parsed with the "%Y%m%d" format and also appear in the
# names of the files this notebook writes.
MONDAY_DATE = "YYYYMMDD"
FRIDAY_DATE = "YYYYMMDD"
#### EDIT THIS

In [2]:
from datetime import datetime
# Inclusive date window of the week, parsed from the two YYYYMMDD strings.
WEEK = (MONDAY_DATE, FRIDAY_DATE)
START_DATE, END_DATE = (datetime.strptime(day, "%Y%m%d") for day in WEEK)

## Step 1: Load headlines

In [None]:
import pandas as pd
from pathlib import Path
from datetime import datetime

The LLM generated headlines should be saved in the folder `_interim/agenda_segments/`.

In [None]:
# Folder that holds the agenda-segment CSV files with the generated headlines.
AGENDA_SEGMENTS_PATH = Path("../_interim/agenda_segments/")
# Stop here if the folder is missing; nothing below can run without it.
assert AGENDA_SEGMENTS_PATH.exists()

Load headlines.

In [None]:
# Parallel lists: position k in every list refers to the same agenda segment.
claude_headlines, gemini_headlines, openai_headlines, manual_headlines = [], [], [], []

# columns that must all hold a real headline for a segment to be kept
LLM_HEADLINE_COLUMNS = ("claude_headline", "gemini_headline", "openai_headline")

# Path.glob yields files in arbitrary order; sort them so the headline order
# (and everything derived from it) is the same on every run and machine.
for aseg_file in sorted(AGENDA_SEGMENTS_PATH.glob("*.csv")):

    # check if meeting took place in specified week
    # (the first 8 characters of the file name are read as a YYYYMMDD date)
    date_str = aseg_file.stem[:8]
    try:
        file_date = datetime.strptime(date_str, "%Y%m%d")
    except ValueError:
        # skip, file name does not start with a valid date
        continue
    if not START_DATE <= file_date <= END_DATE:
        # skip, meeting did not take place in specified week
        continue

    # load headlines; drop any segment for which at least one LLM has no headline
    df = pd.read_csv(aseg_file)
    for _, row in df.iterrows():
        if any(row[column] == "NO_HEADLINE" for column in LLM_HEADLINE_COLUMNS):
            continue

        manual_headlines.append(row["true_headline"])
        claude_headlines.append(row["claude_headline"])
        gemini_headlines.append(row["gemini_headline"])
        openai_headlines.append(row["openai_headline"])


## Step 2: MANUALLY choose clearly unimportant topics to exclude (e.g. roll call, meeting adjournment, etc.)

In [None]:
# Ask, topic by topic, whether the topic should be excluded.
# Claude's headline stands in for the topic; answering `s` excludes it,
# any other answer keeps it.
skip_flags = []
skipped_count = 0

print(len(claude_headlines))
for topic in claude_headlines:
    answer = input(topic + ": ")
    flag = 1 if answer.strip().lower() == "s" else 0
    skip_flags.append(flag)
    skipped_count += flag


# combine the four headline lists with the flags, then keep the non-skipped rows
df = pd.DataFrame({
    "manual_headline": manual_headlines,
    "claude_headline": claude_headlines,
    "gemini_headline": gemini_headlines,
    "openai_headline": openai_headlines,
    "skip": skip_flags
})
df = df.loc[df["skip"] == 0]

The filtered headline dataframe will be saved to the folder `_interim/headlines/`.

In [None]:
# Output folder for the filtered headline table; created (with parents) if absent.
HEADLINES_PATH = Path("../_interim/headlines/")
HEADLINES_PATH.mkdir(parents=True, exist_ok=True)

In [None]:
# Save the filtered table as <MONDAY_DATE>_<FRIDAY_DATE>.csv.
# NOTE: to_csv writes the row index by default, so the file gets an extra
# unnamed first column holding the pre-filter row numbers.
df.to_csv(HEADLINES_PATH / f"{MONDAY_DATE}_{FRIDAY_DATE}.csv")

## Step 3: Create pairwise matchups

In [12]:
import random

The dataframe with two columns, `Headline 1` and `Headline 2`, containing the pairwise matchups for human evaluators, is saved to the folder `_interim/matchups/`.

In [None]:
# Output folder for the pairwise matchup table; created (with parents) if absent.
MATCHUPS_PATH = Path("../_interim/matchups/")
MATCHUPS_PATH.mkdir(parents=True, exist_ok=True)

Set up a helper function to randomize the position of the headlines within each matchup.

In [None]:
def add_pairwise_question(matchups_df, h1, h2):
    """Append one matchup row to ``matchups_df`` with the two headlines in random order.

    A coin flip decides which headline goes into the ``Headline 1`` column
    and which into ``Headline 2``. The row is written at index
    ``len(matchups_df)``; the same dataframe object is modified in place
    and returned.
    """
    if random.randint(0, 1) == 0:
        first, second = h1, h2
    else:
        first, second = h2, h1

    next_position = len(matchups_df)
    matchups_df.loc[next_position] = {"Headline 1": first, "Headline 2": second}
    return matchups_df

In [30]:
LLM_CODE = {"C": "claude", "G": "gemini", "O": "openai"}

matchups_df = pd.DataFrame(columns=["Headline 1", "Headline 2"])
df = pd.read_csv(HEADLINES_PATH / f"{MONDAY_DATE}_{FRIDAY_DATE}.csv")

n_topics = len(df)
# headline column of each LLM, always visited in the order Claude, Gemini, OpenAI
llm_columns = [f"{LLM_CODE[code]}_headline" for code in ["C", "G", "O"]]

# 1) same-model matchups: every unordered pair of topics, once per LLM
for first in range(n_topics):
    for second in range(first + 1, n_topics):
        for column in llm_columns:
            matchups_df = add_pairwise_question(
                matchups_df, df.iloc[first][column], df.iloc[second][column]
            )

# 2) manual headline of one topic against each LLM headline of every other topic
for manual_idx, manual_row in df.iterrows():
    manual = manual_row["manual_headline"]
    if manual != "NO_HEADLINE":
        for llm_idx, llm_row in df.iterrows():
            if llm_idx != manual_idx:
                for column in llm_columns:
                    matchups_df = add_pairwise_question(matchups_df, manual, llm_row[column])

# 3) manual against manual: every unordered pair where both topics have one
for first in range(n_topics):
    m1 = df.iloc[first]["manual_headline"]
    for second in range(first + 1, n_topics):
        m2 = df.iloc[second]["manual_headline"]
        if m1 != "NO_HEADLINE" and m2 != "NO_HEADLINE":
            matchups_df = add_pairwise_question(matchups_df, m1, m2)


matchups_csv = MATCHUPS_PATH / f"{MONDAY_DATE}_{FRIDAY_DATE}.csv"
matchups_df.to_csv(matchups_csv, index=False)
print(f"total question pairs generated: {len(matchups_df)}")

Total question pairs generated: 1761


## Step 4: Create label mapping to easily identify headlines

In [27]:
import json

Label mappings are saved to the folder `_interim/headlines/` as `.json` files.

In [29]:
# The label files are written next to the headline table, so the folder must exist.
assert HEADLINES_PATH.exists()

In [None]:
# Two lookup tables: headline text -> label and label -> headline text.
# A label is a source prefix (M = manual, C = Claude, G = Gemini, O = OpenAI)
# followed by the row index of the topic in the saved headline table.
headlines_to_labels = {}
labels_to_headlines = {}

prefixed_columns = [
    ("M", "manual_headline"),
    ("C", "claude_headline"),
    ("G", "gemini_headline"),
    ("O", "openai_headline"),
]

df = pd.read_csv(HEADLINES_PATH / f"{MONDAY_DATE}_{FRIDAY_DATE}.csv")
for idx, row in df.iterrows():
    for prefix, column in prefixed_columns:
        headline = row[column]
        # a topic without a manual headline gets no M label
        if prefix == "M" and headline == "NO_HEADLINE":
            continue
        label = f"{prefix}{idx}"
        headlines_to_labels[headline] = label
        labels_to_headlines[label] = headline


label_payload = {
    "headlines_to_labels": headlines_to_labels,
    "labels_to_headlines": labels_to_headlines
}
with open(HEADLINES_PATH / f"{MONDAY_DATE}_{FRIDAY_DATE}_labels.json", "w") as f:
    json.dump(label_payload, f)

## Step 5: Human evaluation

The `.csv` file containing the pairwise matchups is now ready to export to Prolific. Once the evaluators have completed their selections, upload the `.csv` file containing their selections to the folder `___input/evaluations/`.

In [None]:
# Folder where the evaluators' responses are uploaded; it must already exist.
EVALUATIONS_PATH = Path("../___input/evaluations")
assert EVALUATIONS_PATH.exists()

## Step 6: Obtain rankings from pairwise matchups

In [33]:
import pandas as pd
import json
import csv
from pathlib import Path
from trueskill import Rating, rate_1vs1

Use the TrueSkill ranking algorithm to obtain rankings. 

In [34]:
def trueskill_ranking(item_labels, comparisons, label_to_headline):
    """Rank labelled items from pairwise outcomes using TrueSkill ratings.

    Args:
        item_labels: labels of the items to rank; each starts from a fresh
            ``Rating()``.
        comparisons: iterable of ``(winner, loser)`` label pairs, applied in
            order. A pair is ignored unless both labels are in ``item_labels``.
        label_to_headline: mapping from label to headline text; labels that
            are missing from it are reported as ``"UNKNOWN"``.

    Returns:
        A list of dicts sorted by descending ``mu``, one per item, with the
        keys ``rank`` (starting at 1), ``label``, ``headline``, ``score_mu``,
        ``score_sigma`` and ``score_95ci`` (1.96 * sigma), all rounded to two
        decimals.
    """
    ratings = {}
    for item in item_labels:
        ratings[item] = Rating()

    for winner, loser in comparisons:
        # matchups that involve an item outside this label group do not count
        if winner not in ratings or loser not in ratings:
            continue
        ratings[winner], ratings[loser] = rate_1vs1(ratings[winner], ratings[loser])

    z = 1.96  # 95% confidence
    ordered = sorted(ratings.items(), key=lambda pair: pair[1].mu, reverse=True)

    return [
        {
            "rank": position,
            "label": label,
            "headline": label_to_headline.get(label, "UNKNOWN"),
            "score_mu": round(rating.mu, 2),
            "score_sigma": round(rating.sigma, 2),
            "score_95ci": round(z * rating.sigma, 2),
        }
        for position, (label, rating) in enumerate(ordered, start=1)
    ]

Rankings will be saved to the folder `_interim/rankings/` and to the appropriate subfolder: 
- `_interim/rankings/claude_only/`
- `_interim/rankings/gemini_only/`
- `_interim/rankings/openai_only/`
- `_interim/rankings/claude_manual/`
- `_interim/rankings/gemini_manual/`
- `_interim/rankings/openai_manual/`

In [None]:
# Output folders for the rankings derived from the human evaluation:
# one per LLM for the LLM-only rankings ...
CLAUDE_ONLY_PATH = Path("../_interim/rankings/claude_only/")
GEMINI_ONLY_PATH = Path("../_interim/rankings/gemini_only/")
OPENAI_ONLY_PATH = Path("../_interim/rankings/openai_only/")

# ... and one per LLM for the rankings that also include the manual headlines.
CLAUDE_MANUAL_PATH = Path("../_interim/rankings/claude_manual/")
GEMINI_MANUAL_PATH = Path("../_interim/rankings/gemini_manual/")
OPENAI_MANUAL_PATH = Path("../_interim/rankings/openai_manual/")

# create whichever folders do not exist yet
for ranking_folder in (
    CLAUDE_ONLY_PATH,
    GEMINI_ONLY_PATH,
    OPENAI_ONLY_PATH,
    CLAUDE_MANUAL_PATH,
    GEMINI_MANUAL_PATH,
    OPENAI_MANUAL_PATH,
):
    ranking_folder.mkdir(parents=True, exist_ok=True)


In [40]:
df = pd.read_csv(HEADLINES_PATH / f"{MONDAY_DATE}_{FRIDAY_DATE}.csv")
# row indices of the topics that have a manual headline
manual_indices = df.index[df["manual_headline"] != "NO_HEADLINE"].tolist()
total_headlines = len(df)

# labels: one per topic and LLM, plus M<i> for topics with a manual headline
claude_labels = [f"C{i}" for i in range(total_headlines)]
gemini_labels = [f"G{i}" for i in range(total_headlines)]
openai_labels = [f"O{i}" for i in range(total_headlines)]

manual_labels = [f"M{i}" for i in manual_indices]
manual_claude_labels = claude_labels + manual_labels
manual_gemini_labels = gemini_labels + manual_labels
manual_openai_labels = openai_labels + manual_labels

with open(HEADLINES_PATH / f"{MONDAY_DATE}_{FRIDAY_DATE}_labels.json", "r") as f:
    label_map = json.load(f)

labels_to_headlines = label_map["labels_to_headlines"]
headlines_to_labels = label_map["headlines_to_labels"]



# human evaluation responses
responses_df = pd.read_csv(EVALUATIONS_PATH / f"{MONDAY_DATE}_{FRIDAY_DATE}.csv")

# obtain list of tuples representing the matchups, (winner id, loser id)
comparisons = []
unanswered_rows = 0
for _, row in responses_df.iterrows():
    # a headline missing from the label map raises KeyError on purpose:
    # it means the responses file does not belong to this week's labels
    id1 = headlines_to_labels[row["Headline 1"]]
    id2 = headlines_to_labels[row["Headline 2"]]

    response = row["Annotator1_Response"]
    choice = response.strip() if isinstance(response, str) else response

    if choice == "Headline 1":
        comparisons.append((id1, id2))
    elif choice == "Headline 2":
        comparisons.append((id2, id1))
    elif pd.isna(response) or choice == "":
        # no selection was recorded; do not count it as a win for either side
        unanswered_rows += 1
    else:
        # anything else must not silently become a win for Headline 2
        raise ValueError(f"unrecognized annotator response: {response!r}")

if unanswered_rows:
    print(f"skipped {unanswered_rows} matchups without a recorded selection")



# different rankings to generate
ranking_tasks = [
    ("claude_manual", manual_claude_labels, CLAUDE_MANUAL_PATH),
    ("gemini_manual", manual_gemini_labels, GEMINI_MANUAL_PATH),
    ("openai_manual", manual_openai_labels, OPENAI_MANUAL_PATH),
    ("claude_only", claude_labels, CLAUDE_ONLY_PATH),
    ("gemini_only", gemini_labels, GEMINI_ONLY_PATH),
    ("openai_only", openai_labels, OPENAI_ONLY_PATH),
]


for name, label_group, save_folder_path in ranking_tasks:
    # each ranking only uses the comparisons whose two labels are both in its group
    results = trueskill_ranking(label_group, comparisons, labels_to_headlines)

    if not results:
        # nothing to rank (empty label group); the header is taken from the first row
        print(f"no items to rank for {name}; nothing written")
        continue

    save_path = save_folder_path / f"{MONDAY_DATE}_{FRIDAY_DATE}.csv"

    with open(save_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=results[0].keys())
        writer.writeheader()
        writer.writerows(results)


## Step 7: Generate LLM rankings

In [None]:
import csv
import time
import pandas as pd
from pathlib import Path
from itertools import combinations
from trueskill import TrueSkill
import anthropic
from openai import OpenAI
import google.generativeai as genai
import os

  from .autonotebook import tqdm as notebook_tqdm


Load API keys and setup clients.

In [None]:
# load API keys from the environment
# (each provider's client must receive the key stored under its own name)
claude_key = os.getenv("CLAUDE_KEY")
openai_key = os.getenv("OPENAI_KEY")
gemini_key = os.getenv("GEMINI_KEY")

# set up one client per provider
claude_client = anthropic.Anthropic(api_key=claude_key)
openai_client = OpenAI(api_key=openai_key)
genai.configure(api_key=gemini_key)

Define the prompt that instructs the LLM to pick the more "important" headline. The definition of "importance" is also given in the prompt.

In [None]:
def make_comparison_prompt(headline1, headline2):
    """Build the pairwise-comparison prompt for two headlines.

    The returned text states the task, defines what "important" means, lists
    the factors to consider, shows ``headline1`` and ``headline2`` as
    Headline 1 and Headline 2, asks for a one-line answer (`Headline 1` or
    `Headline 2`) and ends with two worked examples.
    """
    return f"""
You will be shown two headlines from city council meetings.

### Your Task
Select the headline that is more important, using the definition below.

### What Does “Important” Mean?
A headline is important if:
- It reflects a major change to the status quo,
- OR it has a large impact on a large number of people,
- OR it has a large impact on a marginalized group (e.g., people facing poverty, discrimination, or limited access to resources),
- OR it covers an issue that is especially newsworthy due to its civic relevance, urgency, or long-term consequences.

### Consider These Factors
- **Scope**: How many people in the city are affected?
- **Depth**: How significant or lasting is the impact?
- **Equity**: Does it affect vulnerable or underserved communities?

---

### Compare the Headlines Below

Headline 1: {headline1}
Headline 2: {headline2}

---

Your output should be a single line: either `Headline 1` or `Headline 2` — no explanation.

---

### Examples

**Example 1**
Headline 1: City Council Approves $20 Million Affordable Housing Project  
Headline 2: Council Discusses Adding Public Art  
**More Important**: Headline 1

**Example 2**
Headline 1: City Declares 'Local History Month'  
Headline 2: Council Votes to Close Health Clinic Despite Protests  
**More Important**: Headline 2
"""

Define some helper functions to feed prompts to LLMs.

In [None]:
def compare_headlines_claude(h1, h2):
    """Ask Claude which of the two headlines is more important.

    Sends the comparison prompt as a single user message (temperature 0,
    at most 64 output tokens) and returns the text of the first content
    block of the reply, stripped of surrounding whitespace.
    """
    request = {
        "model": "claude-sonnet-4-20250514",
        "max_tokens": 64,
        "temperature": 0,
        "messages": [{"role": "user", "content": make_comparison_prompt(h1, h2)}],
    }
    reply = claude_client.messages.create(**request)
    first_block = reply.content[0]
    return first_block.text.strip()

def compare_headlines_gemini(h1, h2):
    """Ask Gemini which of the two headlines is more important.

    Sends the comparison prompt to the `gemini-2.5-pro` model and returns
    the reply text stripped of surrounding whitespace.
    """
    prompt = make_comparison_prompt(h1, h2)
    model = genai.GenerativeModel('gemini-2.5-pro')
    # Pin the temperature to 0, as the Claude and OpenAI helpers do, so that
    # all three judges are queried under the same sampling setting.
    response = model.generate_content(prompt, generation_config={"temperature": 0})
    return response.text.strip()

def compare_headlines_openai(h1, h2):
    """Ask the OpenAI model which of the two headlines is more important.

    Sends the comparison prompt as a single user message (temperature 0,
    at most 64 output tokens) and returns the content of the first choice,
    stripped of surrounding whitespace.
    """
    user_message = {"role": "user", "content": make_comparison_prompt(h1, h2)}
    completion = openai_client.chat.completions.create(
        model="gpt-4.1-2025-04-14",
        messages=[user_message],
        temperature=0,
        max_tokens=64
    )
    answer = completion.choices[0].message.content
    return answer.strip()

Each of the three LLMs will produce a ranking for each of the three lists of LLM generated headlines. Thus, there will be a total of nine ranking lists. The rankings will be saved in `_interim/rankings/` and under the appropriate subfolder: 
- `_interim/rankings/claude_ranks_claude/`
- `_interim/rankings/claude_ranks_gemini/`
- `_interim/rankings/claude_ranks_openai/`
- `_interim/rankings/gemini_ranks_claude/`
- `_interim/rankings/gemini_ranks_gemini/`
- `_interim/rankings/gemini_ranks_openai/`
- `_interim/rankings/openai_ranks_claude/`
- `_interim/rankings/openai_ranks_gemini/`
- `_interim/rankings/openai_ranks_openai/`

In [None]:
# Output folders for the LLM-produced rankings, one per combination named in
# the folder (<llm>_ranks_<llm>).
CLAUDE_RANKS_CLAUDE_PATH = Path("../_interim/rankings/claude_ranks_claude/")
CLAUDE_RANKS_GEMINI_PATH = Path("../_interim/rankings/claude_ranks_gemini/")
CLAUDE_RANKS_OPENAI_PATH = Path("../_interim/rankings/claude_ranks_openai/")

GEMINI_RANKS_CLAUDE_PATH = Path("../_interim/rankings/gemini_ranks_claude/")
GEMINI_RANKS_GEMINI_PATH = Path("../_interim/rankings/gemini_ranks_gemini/")
GEMINI_RANKS_OPENAI_PATH = Path("../_interim/rankings/gemini_ranks_openai/")

OPENAI_RANKS_CLAUDE_PATH = Path("../_interim/rankings/openai_ranks_claude/")
OPENAI_RANKS_GEMINI_PATH = Path("../_interim/rankings/openai_ranks_gemini/")
OPENAI_RANKS_OPENAI_PATH = Path("../_interim/rankings/openai_ranks_openai/")

# create whichever folders do not exist yet
for llm_ranking_folder in (
    CLAUDE_RANKS_CLAUDE_PATH,
    CLAUDE_RANKS_GEMINI_PATH,
    CLAUDE_RANKS_OPENAI_PATH,
    GEMINI_RANKS_CLAUDE_PATH,
    GEMINI_RANKS_GEMINI_PATH,
    GEMINI_RANKS_OPENAI_PATH,
    OPENAI_RANKS_CLAUDE_PATH,
    OPENAI_RANKS_GEMINI_PATH,
    OPENAI_RANKS_OPENAI_PATH,
):
    llm_ranking_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# load headlines
df = pd.read_csv(HEADLINES_PATH / f"{MONDAY_DATE}_{FRIDAY_DATE}.csv")
claude_headlines = df["claude_headline"].tolist()
gemini_headlines = df["gemini_headline"].tolist()
openai_headlines = df["openai_headline"].tolist()

headlines_by_model = {
    "claude": claude_headlines,
    "gemini": gemini_headlines,
    "openai": openai_headlines
}

# pairwise-comparison helper of each ranking LLM
compare_by_model = {
    "claude": compare_headlines_claude,
    "gemini": compare_headlines_gemini,
    "openai": compare_headlines_openai,
}

# load headline labels
with open(HEADLINES_PATH / f"{MONDAY_DATE}_{FRIDAY_DATE}_labels.json", "r") as f:
    label_map = json.load(f)

labels_to_headlines = label_map["labels_to_headlines"]
headlines_to_labels = label_map["headlines_to_labels"]


# the ranking LLM is the LLM that will be doing the pairwise comparisons
# the generating LLM is the LLM that generated the headlines
# Each task is (ranking LLM, generating LLM, output folder), which is the order
# the folder names use: <ranking LLM>_ranks_<generating LLM>.
ranking_tasks = [
    ("claude", "claude", CLAUDE_RANKS_CLAUDE_PATH),
    ("claude", "gemini", CLAUDE_RANKS_GEMINI_PATH),
    ("claude", "openai", CLAUDE_RANKS_OPENAI_PATH),

    ("gemini", "claude", GEMINI_RANKS_CLAUDE_PATH),
    ("gemini", "gemini", GEMINI_RANKS_GEMINI_PATH),
    ("gemini", "openai", GEMINI_RANKS_OPENAI_PATH),

    ("openai", "claude", OPENAI_RANKS_CLAUDE_PATH),
    ("openai", "gemini", OPENAI_RANKS_GEMINI_PATH),
    ("openai", "openai", OPENAI_RANKS_OPENAI_PATH),
]

for rank_llm, gen_llm, save_folder_path in ranking_tasks:
    # ratings are keyed by headline text, so drop repeated headlines (keeping
    # first-seen order) instead of comparing a headline against itself
    headlines = list(dict.fromkeys(headlines_by_model[gen_llm]))
    compare_headlines = compare_by_model[rank_llm]

    # define trueskill environment (a comparison always has a winner, never a draw)
    ts = TrueSkill(draw_probability=0)
    ratings = {h: ts.create_rating() for h in headlines}


    # run pairwise comparisons
    pairs = list(combinations(headlines, 2))

    for i, (h1, h2) in enumerate(pairs, 1):
        time.sleep(5)  # rate limiting

        raw_answer = compare_headlines(h1, h2)
        # the prompt shows the two options in backticks, so accept an answer
        # that comes back wrapped in markdown backticks or asterisks
        winner = raw_answer.strip().strip("`*").strip()

        # update ratings based on selection (rate_1vs1 takes the winner first)
        if winner == "Headline 1":
            ratings[h1], ratings[h2] = ts.rate_1vs1(ratings[h1], ratings[h2])
        elif winner == "Headline 2":
            ratings[h2], ratings[h1] = ts.rate_1vs1(ratings[h2], ratings[h1])
        else:
            raise ValueError(f"!!!! INVALID LLM SELECTION: {raw_answer}")

        print(f"[{i}/{len(pairs)}] {headlines_to_labels[h1]} vs {headlines_to_labels[h2]} → {winner}")



    # create final ranking and save
    ranked = sorted(ratings.items(), key=lambda x: x[1].mu, reverse=True)
    z = 1.96  # 95% confidence interval

    results = []
    print("\nfinal rankings:")
    for i, (headline, rating) in enumerate(ranked, 1):
        label = headlines_to_labels[headline]
        ci = z * rating.sigma
        print(f"{i}. {label} — {headline} (score: {rating.mu:.2f} ± {ci:.2f})")
        results.append({
            "rank": i,
            "label": label,
            "headline": headline,
            "score_mu": round(rating.mu, 2),
            "score_sigma": round(rating.sigma, 2),
            "score_95ci": round(ci, 2)
        })

    if not results:
        # no headlines to rank; the csv header is taken from the first row
        print(f"no headlines to rank for {rank_llm} ranking {gen_llm}; nothing written")
        continue

    # save to csv
    with open(save_folder_path / f"{MONDAY_DATE}_{FRIDAY_DATE}.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=results[0].keys())
        writer.writeheader()
        writer.writerows(results)


This concludes Module 4: Headline Ranking.