# Reproducing Main Results

In [None]:
%load_ext autoreload
%autoreload 2
%env HF_TOKEN=<your HF token here>

In [2]:
from typing import Literal

# Accepted values for the two notebook-level switches below.
DatasetName = Literal["visionarena", "searcharena", "lmarena"]
RatingSystemName = Literal["elo", "trueskill", "glicko2", "bt"]

# Dataset to run the experiments on.
dataset: DatasetName = "lmarena"

# Rating system to evaluate.
rating_system: RatingSystemName = "elo"

In [None]:
import json
import pandas as pd
from datasets import load_dataset

from lmarena_draws.rating import EloRatingSystem, TrueSkillRatingSystem, Glicko2RatingSystem, OnlineBTRatingSystem  

# Lookup table: value of `rating_system` -> rating-system class to instantiate.
_RATING_SYSTEM_CLASSES = {
    "elo": EloRatingSystem,
    "trueskill": TrueSkillRatingSystem,
    "glicko2": Glicko2RatingSystem,
    "bt": OnlineBTRatingSystem,
}

# Reject unknown names up front with the same error the selector always raised.
if rating_system not in _RATING_SYSTEM_CLASSES:
    raise ValueError(f"Invalid rating system: {rating_system}")

sys_cls = _RATING_SYSTEM_CLASSES[rating_system]

# Per-dataset settings: (split name, judge column name, Hugging Face repo id).
_DATASET_SPECS = {
    "visionarena": ("train", "judge", "lmarena-ai/VisionArena-Battle"),
    "searcharena": ("test", "judge", "lmarena-ai/search-arena-24k"),
    "lmarena": ("train", "judge_hash", "lmarena-ai/arena-human-preference-100k"),
}

if dataset not in _DATASET_SPECS:
    raise ValueError(f"Invalid dataset: {dataset}")

# Only the selected dataset is fetched.
split, judge_column, _repo_id = _DATASET_SPECS[dataset]
ds = load_dataset(_repo_id)

In [4]:

# Take the configured split and keep only the columns the experiments read.
# NOTE(review): despite its name, `df` is whatever `ds[split]` returns
# (presumably a `datasets.Dataset`, given `select_columns`), not a pandas frame.
df = ds[split]
df = df.select_columns(["model_a", "model_b", "winner", judge_column])

# Every distinct model that appears on either side of a battle.
# Sorted so the list order is identical on every kernel restart; plain set
# iteration order for strings changes with Python's hash randomization.
models = sorted(set(df["model_a"]) | set(df["model_b"]))

In [5]:
# Build battles
from lmarena_draws.rating import Battle, BattleOutcome
from tqdm import tqdm


battles = []

print("Building battles...")

# Labels matched exactly. None of these carries a sub-outcome.
# NOTE(review): a "model_a" win is recorded as LOSS and a "model_b" win as WIN;
# presumably the outcome is expressed from model_b's side - confirm against
# the definition of Battle / BattleOutcome.
_EXACT_LABELS = {
    "model_a": BattleOutcome.LOSS,
    "model_b": BattleOutcome.WIN,
    "tie": BattleOutcome.DRAW,
}

for record in tqdm(df):
    label = record["winner"]

    # Rows without a verdict are skipped.
    if label is None:
        continue

    if label in _EXACT_LABELS:
        result, detail = _EXACT_LABELS[label], None
    elif "bothbad" in label:
        # Any label containing "bothbad" is a draw tagged with that sub-outcome.
        result, detail = BattleOutcome.DRAW, "bothbad"
    else:
        # Unrecognised labels are dropped.
        continue

    battles.append(
        Battle(
            model_a=record["model_a"],
            model_b=record["model_b"],
            outcome=result,
            sub_outcome=detail,
            user_id=record[judge_column],
        )
    )

Building battles...


  0%|          | 0/106134 [00:00<?, ?it/s]

100%|██████████| 106134/106134 [00:03<00:00, 33593.05it/s]


In [6]:
# Sanity check: number of battles kept by the loop above.
len(battles)

106134

## Acc. Columns

In [7]:
import numpy as np

# Fraction of the battle list used as burn-in (and as the tuning slice for the
# draw-margin grid search).
BURN_IN_FRACTION = 0.05
# Candidate draw margins tried by the grid search.
DRAW_MARGIN_GRID = np.linspace(0.05, 0.45, 9)

ALL_OUTCOMES = (BattleOutcome.WIN, BattleOutcome.DRAW, BattleOutcome.LOSS)
DECISIVE_OUTCOMES = (BattleOutcome.WIN, BattleOutcome.LOSS)  # no draw updates


def _run_acc_experiment(draw_margin, eval_battles, burn_in, update_categories, **extra):
    """Run one prequential evaluation on a freshly constructed rating system.

    Args:
        draw_margin: value forwarded as `draw_margin` to `sys_cls`.
        eval_battles: list of battles handed to `prequential_losses`.
        burn_in: value forwarded as `burn_in`.
        update_categories: outcomes forwarded as `update_categories`
            (copied into a new list for every call).
        **extra: additional keyword arguments for `prequential_losses`
            (e.g. `dropout_rate`).

    Returns:
        Whatever `prequential_losses` returns; callers read `.macro_accuracy`.
    """
    # A new system per run so no state carries over between experiments.
    rater = sys_cls(models, draw_margin=draw_margin)
    return rater.prequential_losses(
        eval_battles,
        burn_in=burn_in,
        update_categories=list(update_categories),
        disable_tqdm=True,
        **extra,
    )


draw_pct = np.mean([x.outcome == BattleOutcome.DRAW for x in battles])
burn_in = int(BURN_IN_FRACTION * len(battles))
best_draw_margin = None
best_acc = 0

# Grid search for best draw margin, scored on the burn-in slice only.
for draw_margin in DRAW_MARGIN_GRID:
    results = _run_acc_experiment(draw_margin, battles[:burn_in], 0, ALL_OUTCOMES)
    acc = results.macro_accuracy

    if acc > best_acc:
        best_acc = acc
        best_draw_margin = draw_margin

print(f"best draw margin: {best_draw_margin}, best accuracy: {best_acc}")

# Fail loudly instead of silently passing draw_margin=None to the runs below.
if best_draw_margin is None:
    raise RuntimeError(
        "Draw-margin grid search found no margin with accuracy above 0; "
        "check the burn-in slice and the rating system."
    )

results = _run_acc_experiment(best_draw_margin, battles, burn_in, ALL_OUTCOMES)
print("w/draw updates: ", results.macro_accuracy)

results = _run_acc_experiment(best_draw_margin, battles, burn_in, DECISIVE_OUTCOMES)
print("w/o draw updates: ", results.macro_accuracy)

# Same update categories as the first run, but with `dropout_rate` set to the
# observed share of draws.
# NOTE(review): presumably this drops updates at random at that rate - confirm
# in `prequential_losses`. No seed is set here, so this figure may vary per run.
results = _run_acc_experiment(
    best_draw_margin, battles, burn_in, ALL_OUTCOMES, dropout_rate=draw_pct
)
print("random omission: ", results.macro_accuracy)

best draw margin: 0.2, best accuracy: 0.38193802191398346
w/draw updates:  0.36791845908897614
w/o draw updates:  0.3815203382035336
random omission:  0.36712437327017583


## WL-Acc. Columns

In [8]:
# Share of draws among all battles; reused as the dropout rate of the last run.
draw_pct = np.mean([b.outcome == BattleOutcome.DRAW for b in battles])
burn_in = int(0.05 * len(battles))
# These runs use a fixed draw margin of zero rather than a tuned one.
best_draw_margin = 0.0

# One row per reported figure: (printed label, outcomes used for updates,
# extra keyword arguments for `prequential_losses`).
_wl_experiments = [
    (
        "w/draw updates: ",
        [BattleOutcome.WIN, BattleOutcome.DRAW, BattleOutcome.LOSS],
        {},
    ),
    (
        "w/o draw updates: ",
        [BattleOutcome.WIN, BattleOutcome.LOSS],  # no draw updates
        {},
    ),
    (
        "random omission: ",
        [BattleOutcome.WIN, BattleOutcome.DRAW, BattleOutcome.LOSS],
        {"dropout_rate": draw_pct},
    ),
]

for _label, update_categories, _extra in _wl_experiments:
    # Evaluation is always restricted to decisive outcomes in this section.
    eval_categories = [BattleOutcome.WIN, BattleOutcome.LOSS]

    # Fresh rating system for every run.
    rater = sys_cls(models, draw_margin=best_draw_margin)
    results = rater.prequential_losses(
        battles,
        no_ties=True,
        burn_in=burn_in,
        eval_categories=eval_categories,
        update_categories=update_categories,
        disable_tqdm=True,
        **_extra,
    )

    print(_label, results.macro_accuracy)

w/draw updates:  0.5711721288263603
w/o draw updates:  0.5831818726363294
random omission:  0.5720712459472387
