# Chatbot Arena with Evalica

[![Open in Colab][colab_badge]][colab_link] [![Binder][binder_badge]][binder_link]

[colab_badge]: https://colab.research.google.com/assets/colab-badge.svg
[colab_link]: https://colab.research.google.com/github/dustalov/evalica/blob/master/Chatbot-Arena.ipynb
[binder_badge]: https://mybinder.org/badge_logo.svg
[binder_link]: https://mybinder.org/v2/gh/dustalov/evalica/HEAD?labpath=Chatbot-Arena.ipynb

We follow LMSYS's [Chatbot Arena: MLE Elo Rating](https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH) notebook and implement a similar leaderboard with the [Evalica](https://github.com/dustalov/evalica) library, which efficiently implements pairwise comparison aggregation routines in Rust.

## Data

In [None]:
# Download the public Chatbot Arena battle dump into the working directory.
# -L follows redirects, -O keeps the remote file name, and -C - resumes a partial download.
!curl -LOC - 'https://storage.googleapis.com/arena_external_data/public/clean_battle_20240814_public.json'

In [None]:
from __future__ import annotations  # noqa: F404

from typing import TYPE_CHECKING

import evalica
import numpy as np
import pandas as pd
import plotly.express as px
from tqdm.auto import trange

if TYPE_CHECKING:
    from plotly.graph_objects import Figure

%config InlineBackend.figure_formats = ['svg']

In [None]:
# Display the installed Evalica version so the results below can be tied to it.
evalica.__version__

In [None]:
# Load the battle dump fetched by the download cell; the file name has to match
# the one in the curl URL, otherwise a fresh run fails with a missing file.
df_arena = pd.read_json("clean_battle_20240814_public.json")

# Keep only the rows flagged in the "anony" column.
df_arena = df_arena[df_arena["anony"]]

# Keep only the rows whose "dedup_tag" record has a truthy "sampled" entry.
# The explicit copy makes the column assignment below act on an independent
# frame instead of a filtered slice of the loaded one.
df_arena = df_arena[df_arena["dedup_tag"].apply(lambda x: x.get("sampled", False))].copy()

# Translate the textual outcomes into Evalica's winner codes; both kinds of
# ties collapse into a draw, and any label missing from the mapping becomes NaN.
df_arena["winner"] = df_arena["winner"].map({
    "model_a": evalica.Winner.X,
    "model_b": evalica.Winner.Y,
    "tie": evalica.Winner.Draw,
    "tie (bothbad)": evalica.Winner.Draw,
})

# Drop the battles whose outcome could not be mapped.
df_arena = df_arena[df_arena["winner"].notna()]
df_arena

In [None]:
# Battles with a decisive outcome, i.e. every row whose winner is not a draw.
is_decisive = df_arena["winner"] != evalica.Winner.Draw
df_arena_no_ties = df_arena.loc[is_decisive]

## Pairwise Win Fractions

In [None]:
%%time
# Score every model with Evalica's average win rate over all battles in df_arena.
# No tie_weight is passed, so draws are handled by the library default.
average_win_rates = evalica.average_win_rate(
    df_arena["model_a"],
    df_arena["model_b"],
    df_arena["winner"],
)

# Show the per-model scores as a one-column table.
average_win_rates.scores.to_frame()

In [None]:
%%time
# Same aggregation as the previous cell, but with tie_weight=0.
# NOTE(review): the full df_arena (draws included) is passed here rather than
# df_arena_no_ties; presumably tie_weight=0 makes the draws count for nothing.
average_win_rates_no_ties = evalica.average_win_rate(
    df_arena["model_a"],
    df_arena["model_b"],
    df_arena["winner"],
    tie_weight=0,  # LMSYS' leaderboard excludes ties
)

# These scores are reused below to pick the models shown in the heatmaps.
average_win_scores_no_ties = average_win_rates_no_ties.scores
average_win_scores_no_ties.to_frame()

In [None]:
def visualize(df_pairwise: pd.DataFrame, title: str | None = None) -> Figure:
    """Render a pairwise table as an annotated heatmap.

    Args:
        df_pairwise: Table to draw; rows are labelled as winners and columns as losers.
        title: Optional caption, centred horizontally and placed near the bottom.

    Returns:
        The configured Plotly figure.
    """
    layout = {
        "title": title,
        "title_x": 0.5,
        "title_y": 0.075,
        "xaxis_title": "Loser",
        "yaxis_title": "Winner",
        "xaxis_side": "top",
        "width": 800,
        "height": 640,
    }
    hover = "Winner: %{y}<br>Loser: %{x}<br>Fraction of Wins: %{z}"

    # Cell values are printed with two decimals on a red-to-blue colour scale.
    heatmap = px.imshow(df_pairwise, color_continuous_scale="RdBu", text_auto=".2f")
    heatmap.update_layout(**layout)
    heatmap.update_traces(hovertemplate=hover)

    return heatmap

In [None]:
%%time
xs_indexed, ys_indexes, index = evalica.indexing(df_arena["model_a"], df_arena["model_b"])

matrices = evalica.matrices(
    xs_indexed,
    ys_indexes,
    df_arena["winner"],
    index,
)

df_matrix = pd.DataFrame.from_records(
    matrices.win_matrix,
    index=index,
    columns=index,
)

visualize(df_matrix.loc[
          average_win_scores_no_ties.index[:15].tolist(),
          average_win_scores_no_ties.index[:15].tolist(),
], title="Win Counts")

In [None]:
# Fraction of wins of the row model against the column model:
# wins / (wins + losses). Pairs with no wins in either direction yield NaN.
pairwise_totals = df_matrix + df_matrix.T
df_matrix_proba = df_matrix.div(pairwise_totals)

# Keep the first 15 entries of the tie-free average win rate scores on both axes.
top_models = average_win_scores_no_ties.index[:15].tolist()
df_matrix_proba = df_matrix_proba.loc[top_models, top_models]

visualize(df_matrix_proba, title="Win Fractions")

## Elo Ranking

In [None]:
%%time
# Rate the models with Evalica's Elo implementation over all battles,
# leaving every optional parameter at the library default.
elo = evalica.elo(
    df_arena["model_a"],
    df_arena["model_b"],
    df_arena["winner"],
)

# Show the per-model ratings as a one-column table.
elo.scores.to_frame()

In [None]:
# Pairwise table for the first 15 entries of the Elo scores.
# NOTE(review): confirm that pairwise_frame's formula is appropriate for
# Elo-scale ratings before reading the cells as win probabilities.
top_elo = elo.scores.head(15)
df_elo = evalica.pairwise_frame(top_elo)

visualize(df_elo, title="Elo Win Probabilities")

## Bradley&ndash;Terry Ranking

In [None]:
%%time
# Fit Evalica's Bradley–Terry model over all battles,
# leaving every optional parameter at the library default.
bt = evalica.bradley_terry(
    df_arena["model_a"],
    df_arena["model_b"],
    df_arena["winner"],
)

# Show the per-model scores as a one-column table.
bt.scores.to_frame()

In [None]:
# Pairwise table for the first 15 entries of the Bradley–Terry scores.
top_bt = bt.scores.head(15)
df_bt = evalica.pairwise_frame(top_bt)

visualize(df_bt, title="Bradley–Terry Win Probabilities")

## Bradley&ndash;Terry Bootstrap

In [None]:
%%time
BOOTSTRAP_ROUNDS = 10

bt_bootstrap = []

for seed in trange(BOOTSTRAP_ROUNDS, desc="Bootstrap"):
    df_sample = df_arena.sample(frac=1.0, replace=True, random_state=seed)

    result = evalica.bradley_terry(
        df_sample["model_a"],
        df_sample["model_b"],
        df_sample["winner"],
        index=index,  # we safely save some time by not reindexing the elements
    )

    bt_bootstrap.append(result.scores)

df_bootstrap = pd.DataFrame(bt_bootstrap)
df_bootstrap = df_bootstrap[df_bootstrap.median().index]

df_bootstrap

In [None]:
# Median score of each model across the bootstrap rounds, as a one-column table.
df_bootstrap.median(axis="index").to_frame("bradley_terry")

In [None]:
# Per-model 2.5%, 50% and 97.5% quantiles of the bootstrapped scores.
quantile_levels = {"lower": .025, "rating": .5, "upper": .975}
df_bootstrap_ci = pd.DataFrame({
    column: df_bootstrap.quantile(level) for column, level in quantile_levels.items()
})

# Turn the model names into a regular column and rank by the median rating.
df_bootstrap_ci = df_bootstrap_ci.reset_index(names="model")
df_bootstrap_ci = df_bootstrap_ci.sort_values("rating", ascending=False)

# Error bars are stored as distances from the rating rather than as absolute bounds.
df_bootstrap_ci = df_bootstrap_ci.assign(
    error_y=df_bootstrap_ci["upper"] - df_bootstrap_ci["rating"],
    error_y_minus=df_bootstrap_ci["rating"] - df_bootstrap_ci["lower"],
    rating_rounded=df_bootstrap_ci["rating"].round(2),
)

df_bootstrap_ci

In [None]:
def visualize_ci(df_ci: pd.DataFrame, title: str | None = None) -> Figure:
    """Plot per-model ratings as points with asymmetric error bars.

    Args:
        df_ci: Table providing the "model", "rating", "error_y" and
            "error_y_minus" columns.
        title: Optional figure title, centred horizontally.

    Returns:
        The configured Plotly figure.
    """
    scatter = px.scatter(
        df_ci,
        x="model",
        y="rating",
        error_y="error_y",
        error_y_minus="error_y_minus",
        title=title,
    )

    layout = {
        "xaxis_title": "Model",
        "yaxis_title": "Score",
        "width": 800,
        "height": 640,
        "title_x": .5,
    }
    scatter.update_layout(**layout)

    scatter.update_traces(hovertemplate="Model: %{x}<br>Score: %{y}")

    return scatter

In [None]:
# Plot the first 30 rows of the confidence-interval table.
visualize_ci(
    df_bootstrap_ci.iloc[:30],
    title="Bootstrapped Confidence Intervals for Bradley–Terry Scores",
)