In [None]:
import os

In [None]:
# Placeholder connection settings for the RDS database. Replace the values
# locally before running and never commit real credentials.
# NOTE(review): presumably read by db.db_utils when it connects -- confirm.
os.environ['RDS_HOSTNAME']="YOUR_RDS_DB_HOSTNAME"
os.environ['RDS_DB_NAME']="YOUR_RDS_DB_NAME"
os.environ['RDS_USERNAME']="YOUR_RDS_USERNAME"
os.environ['RDS_PASSWORD']="YOUR_RDS_PASSWORD"

In [None]:
# Name of the evaluating model; passed as `eval_model_name` when fetching battles.
EVAL_MODEL = 'gpt-4-0613'

In [41]:
import pprint
from collections import defaultdict
from copy import deepcopy

import numpy as np
import pandas as pd
import plotly.express as px
from pandas import option_context
from tqdm import tqdm

from db.db_utils import get_battles_for_eval_model
from src.parse import db_battle_to_dict

In [42]:
# Show floats with two decimals and never truncate rows when displaying frames.
pd.options.display.float_format = "{:.2f}".format
pd.options.display.max_rows = None

# Pretty-printer for ad-hoc inspection (not referenced by the cells shown here).
pp = pprint.PrettyPrinter(indent=4)

In [44]:
# Fetch every stored battle that was evaluated by EVAL_MODEL.
battle_data_raw = get_battles_for_eval_model(eval_model_name=EVAL_MODEL)
print(f"Found {len(battle_data_raw)} battles")
# Show one sample record, guarding against an empty result.
if len(battle_data_raw) > 0:
    print(f"First battle: {battle_data_raw[0]}")

In [45]:
# Convert each raw database record with db_battle_to_dict.
battle_data = [db_battle_to_dict(b) for b in battle_data_raw]
print(f"Parsed {len(battle_data)} battles")
# Show one parsed record, guarding against an empty result.
if len(battle_data) > 0:
    print(f"First battle: {battle_data[0]}")

[{'eval_by_model_id': '45c54537-2a6e-4a85-b65b-c271090abb76',
  'model_a': 'h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3',
  'model_b': 'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-13b',
  'win': 'h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3',
  'score_a': 3.0,
  'score_b': 9.0,
  'other': None,
  'tstamp': 1688595206.061889},
 {'eval_by_model_id': '057b8bb5-fd06-4182-9c07-770923d53729',
  'model_a': 'OpenAssistant/falcon-40b-sft-top1-560',
  'model_b': 'lmsys/vicuna-33b-v1.3',
  'win': 'lmsys/vicuna-33b-v1.3',
  'score_a': 4.0,
  'score_b': 9.0,
  'other': None,
  'tstamp': 1688595222.128842},
 {'eval_by_model_id': '232a9ddb-3307-4385-babd-59c633ef7aa9',
  'model_a': 'h2oai/h2ogpt-research-oasst1-llama-65b',
  'model_b': 'lmsys/vicuna-33b-v1.3',
  'win': 'h2oai/h2ogpt-research-oasst1-llama-65b',
  'score_a': 10.0,
  'score_b': 8.0,
  'other': None,
  'tstamp': 1688595238.379219},
 {'eval_by_model_id': '40bb7237-3abc-4136-9832-3b767cbb663c',
  'model_a': 'gpt-4-0314',
  'model_b': 'h2oai/h

In [47]:
def _resolve_winner(battle):
    """Map a battle's ``win`` value to ``"model_a"``, ``"model_b"`` or ``"tie"``.

    Returns ``None`` when the value cannot be interpreted.
    """
    raw_win = battle["win"]
    if raw_win is None:
        # A missing verdict is counted as a tie.
        return "tie"
    if raw_win == battle["model_a"]:
        return "model_a"
    if raw_win == battle["model_b"]:
        return "model_b"
    if raw_win in ("model_a", "model_b", "tie"):
        # Already normalised (for example when this cell is re-run): keep as is
        # instead of reporting every previously processed battle as unknown.
        return raw_win
    return None


# Records whose verdict could not be interpreted; inspected in the next cell.
bad_evals = []
normalized_battles = []
for r in battle_data:
    win = _resolve_winner(r)
    if win is None:
        print(f"Unknown win: {r['win']}")
        bad_evals.append(r)
        continue

    normalized_battles.append(
        {
            **r,
            "win": win,
            # Keep only the part after the last "/" (drops the organisation prefix).
            "model_a": r["model_a"].split("/")[-1],
            "model_b": r["model_b"].split("/")[-1],
        }
    )

# Rejected records are left out so later cells only see rows whose `win`
# is one of "model_a", "model_b" or "tie".
battle_data = normalized_battles

In [48]:
# Inspect the rejected records; an empty list means every verdict was recognised.
bad_evals

[]

In [49]:
# Spot-check the first few normalised records.
battle_data[:5]

[{'eval_by_model_id': '45c54537-2a6e-4a85-b65b-c271090abb76',
  'model_a': 'h2ogpt-gm-oasst1-en-2048-falcon-7b-v3',
  'model_b': 'h2ogpt-gm-oasst1-en-2048-open-llama-13b',
  'win': 'model_a',
  'score_a': 3.0,
  'score_b': 9.0,
  'tstamp': 1688595206.061889},
 {'eval_by_model_id': '057b8bb5-fd06-4182-9c07-770923d53729',
  'model_a': 'falcon-40b-sft-top1-560',
  'model_b': 'vicuna-33b-v1.3',
  'win': 'model_b',
  'score_a': 4.0,
  'score_b': 9.0,
  'tstamp': 1688595222.128842},
 {'eval_by_model_id': '232a9ddb-3307-4385-babd-59c633ef7aa9',
  'model_a': 'h2ogpt-research-oasst1-llama-65b',
  'model_b': 'vicuna-33b-v1.3',
  'win': 'model_a',
  'score_a': 10.0,
  'score_b': 8.0,
  'tstamp': 1688595238.379219},
 {'eval_by_model_id': '40bb7237-3abc-4136-9832-3b767cbb663c',
  'model_a': 'gpt-4-0314',
  'model_b': 'h2ogpt-gm-oasst1-en-2048-falcon-40b-v1',
  'win': 'model_a',
  'score_a': 10.0,
  'score_b': 8.0,
  'tstamp': 1688595251.451777},
 {'eval_by_model_id': 'a42ecdd4-7e57-4b64-8e20-79abdf

In [50]:
# One row per battle, ordered by timestamp (oldest first). compute_elo walks
# rows in order, so this ordering determines the sequence of Elo updates.
battles_raw = pd.DataFrame(battle_data).sort_values(ascending=True, by=["tstamp"])
# Widen the column display temporarily so long model names are not truncated.
with option_context("display.max_colwidth", 400):
    display(battles_raw.head(10))

Unnamed: 0,eval_by_model_id,model_a,model_b,win,score_a,score_b,tstamp
0,45c54537-2a6e-4a85-b65b-c271090abb76,h2ogpt-gm-oasst1-en-2048-falcon-7b-v3,h2ogpt-gm-oasst1-en-2048-open-llama-13b,model_a,3.0,9.0,1688595206.06
1,057b8bb5-fd06-4182-9c07-770923d53729,falcon-40b-sft-top1-560,vicuna-33b-v1.3,model_b,4.0,9.0,1688595222.13
2,232a9ddb-3307-4385-babd-59c633ef7aa9,h2ogpt-research-oasst1-llama-65b,vicuna-33b-v1.3,model_a,10.0,8.0,1688595238.38
3,40bb7237-3abc-4136-9832-3b767cbb663c,gpt-4-0314,h2ogpt-gm-oasst1-en-2048-falcon-40b-v1,model_a,10.0,8.0,1688595251.45
4,a42ecdd4-7e57-4b64-8e20-79abdf63501b,h2ogpt-gm-oasst1-en-2048-falcon-7b-v3,gpt-4-0314,model_b,10.0,8.0,1688595290.16
5,5294088d-728c-4816-970b-9addea73ddbd,h2ogpt-gm-oasst1-en-2048-open-llama-13b,h2ogpt-gm-oasst1-en-2048-falcon-7b-v3,model_a,9.0,6.0,1688595320.79
6,08981e78-b502-4bec-a183-ed280cf88905,gpt-3.5-turbo-0613,h2ogpt-research-oasst1-llama-65b,model_b,9.0,8.5,1688595337.11
7,34a66327-ae84-4174-999b-de65d74c3dfc,h2ogpt-gm-oasst1-en-2048-open-llama-13b,gpt-4-0314,model_b,9.0,5.0,1688595355.54
8,3a32c2b4-a633-4a45-90f3-e275c52ca7ea,h2ogpt-gm-oasst1-en-2048-falcon-7b-v3,vicuna-33b-v1.3,model_b,8.0,9.0,1688595372.16
9,f27b2d8b-b233-473f-a41d-0637897ff450,h2ogpt-oasst1-falcon-40b,falcon-40b-sft-top1-560,model_a,9.0,8.0,1688595391.54


In [51]:
# Total number of battles loaded.
print(f"{len(battles_raw)} battles")

6300

In [52]:
# Keep only the columns used below (models, outcome, timestamp).
# inplace=False returns a new frame, so battles_raw is left untouched.
battles = battles_raw.drop(
    columns=["eval_by_model_id", "score_a", "score_b"], inplace=False
)
with option_context("display.max_colwidth", 400):
    display(battles.head(20))

Unnamed: 0,model_a,model_b,win,tstamp
0,h2ogpt-gm-oasst1-en-2048-falcon-7b-v3,h2ogpt-gm-oasst1-en-2048-open-llama-13b,model_a,1688595206.06
1,falcon-40b-sft-top1-560,vicuna-33b-v1.3,model_b,1688595222.13
2,h2ogpt-research-oasst1-llama-65b,vicuna-33b-v1.3,model_a,1688595238.38
3,gpt-4-0314,h2ogpt-gm-oasst1-en-2048-falcon-40b-v1,model_a,1688595251.45
4,h2ogpt-gm-oasst1-en-2048-falcon-7b-v3,gpt-4-0314,model_b,1688595290.16
5,h2ogpt-gm-oasst1-en-2048-open-llama-13b,h2ogpt-gm-oasst1-en-2048-falcon-7b-v3,model_a,1688595320.79
6,gpt-3.5-turbo-0613,h2ogpt-research-oasst1-llama-65b,model_b,1688595337.11
7,h2ogpt-gm-oasst1-en-2048-open-llama-13b,gpt-4-0314,model_b,1688595355.54
8,h2ogpt-gm-oasst1-en-2048-falcon-7b-v3,vicuna-33b-v1.3,model_b,1688595372.16
9,h2ogpt-oasst1-falcon-40b,falcon-40b-sft-top1-560,model_a,1688595391.54


In [53]:
# Bar chart of how often each outcome label occurs in the `win` column.
fig = px.bar(
    battles["win"].value_counts(),
    title="Counts of Battle Outcomes",
    text_auto=True,
    height=400,
)
fig.update_layout(xaxis_title="Battle Outcome", yaxis_title="Count", showlegend=False)
display(fig)

In [54]:
# Drop every battle whose outcome label contains "tie".
battles_no_ties = battles[~battles["win"].str.contains("tie")]

In [55]:
# Number of battles each model took part in, counting both the A and B side.
fig = px.bar(
    pd.concat([battles["model_a"], battles["model_b"]]).value_counts(),
    title="Battle Count for Each Model",
    text_auto=True,
)
fig.update_layout(xaxis_title="model", yaxis_title="Battle Count", showlegend=False)
fig.update_xaxes(tickangle=-90)
display(fig)

In [56]:
# Same per-model battle count, restricted to battles that did not end in a tie.
fig = px.bar(
    pd.concat([battles_no_ties["model_a"], battles_no_ties["model_b"]]).value_counts(),
    title="Battle Count for Each Model Without Ties",
    text_auto=True,
)
fig.update_layout(xaxis_title="model", yaxis_title="Battle Count", showlegend=False)
fig.update_xaxes(tickangle=-90)
display(fig)

In [57]:
def get_battle_counts(df):
    """Count the battles fought between every pair of models.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per battle with at least the columns ``model_a`` and
        ``model_b``.

    Returns
    -------
    pandas.DataFrame
        Symmetric square table over all models seen on either side (sorted
        labels). Entry ``[x, y]`` is the number of battles between ``x`` and
        ``y`` regardless of which one was listed as model A. A battle of a
        model against itself contributes 2 to the diagonal. An empty frame is
        returned when ``df`` contains no models.
    """
    # Every model seen on either side. Using this shared label set keeps the
    # table square, so adding its transpose never introduces NaN for a model
    # that only ever appeared as A or only as B.
    models = sorted(pd.concat([df["model_a"], df["model_b"]]).dropna().unique())
    if not models:
        return pd.DataFrame()

    ptbl = (
        df.groupby(["model_a", "model_b"])
        .size()
        .unstack(fill_value=0)
        .reindex(index=models, columns=models, fill_value=0)
    )
    # (A=x, B=y) and (A=y, B=x) are the same pairing: fold them together.
    battle_counts = ptbl + ptbl.T
    return battle_counts

In [58]:
# How often each model appears on the A side.
battles["model_a"].value_counts()

model_a
falcon-40b-instruct                        437
Llama-2-70b-chat-hf                        434
h2ogpt-oasst1-falcon-40b                   430
gpt-4-0613                                 430
claude-2                                   426
vicuna-33b-v1.3                            424
falcon-40b-sft-top1-560                    423
h2ogpt-gm-oasst1-en-2048-open-llama-13b    421
h2ogpt-gm-oasst1-en-2048-falcon-40b-v2     417
gpt-3.5-turbo-0613                         415
h2ogpt-research-oasst1-llama-65b           412
Llama-2-13b-chat-hf                        410
h2ogpt-gm-oasst1-en-2048-falcon-7b-v3      408
gpt-4-0314                                 407
h2ogpt-gm-oasst1-en-2048-falcon-40b-v1     406
Name: count, dtype: int64

In [59]:
# How often each model appears on the B side.
battles["model_b"].value_counts()

model_b
h2ogpt-gm-oasst1-en-2048-falcon-40b-v1     434
gpt-4-0314                                 433
h2ogpt-gm-oasst1-en-2048-falcon-7b-v3      432
Llama-2-13b-chat-hf                        430
h2ogpt-research-oasst1-llama-65b           428
gpt-3.5-turbo-0613                         425
h2ogpt-gm-oasst1-en-2048-falcon-40b-v2     423
h2ogpt-gm-oasst1-en-2048-open-llama-13b    419
falcon-40b-sft-top1-560                    417
vicuna-33b-v1.3                            416
claude-2                                   414
h2ogpt-oasst1-falcon-40b                   410
gpt-4-0613                                 410
Llama-2-70b-chat-hf                        406
falcon-40b-instruct                        403
Name: count, dtype: int64

In [60]:
def visualize_battle_count(df, title):
    """Draw a heatmap of the number of battles per model pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Battles, forwarded to ``get_battle_counts``.
    title : str
        Figure title.

    Returns
    -------
    The figure produced by ``px.imshow``, with rows and columns ordered from
    the model with the most battles to the one with the fewest.
    """
    pair_counts = get_battle_counts(df)
    # Busiest models first, on both axes.
    busiest_first = pair_counts.sum().sort_values(ascending=False).index
    heatmap = px.imshow(
        pair_counts.loc[busiest_first, busiest_first],
        title=title,
        text_auto=True,
        height=1000,
        width=1200,
    )
    return (
        heatmap.update_layout(
            xaxis_title="Model B",
            yaxis_title="Model A",
            xaxis_side="top",
            title_y=0.02,
            title_x=0.5,
        )
        .update_xaxes(tickangle=-90)
        .update_traces(
            hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Count: %{z}<extra></extra>"
        )
    )

In [61]:
# Pairwise battle counts over all battles, ties included.
fig = visualize_battle_count(
    battles, title="Battle Count of Each Combination of Models"
)
display(fig)

In [62]:
# Same heatmap for decisive battles only; the figure is shown as the cell's value.
visualize_battle_count(
    battles_no_ties, "Battle Count for Each Combination of Models (without Ties)"
)

In [63]:
def _count_pairs(rows, models):
    """Return a ``models`` x ``models`` table counting (model_a, model_b) rows."""
    if rows.empty:
        # Nothing to count: an explicit all-zero table keeps the arithmetic valid.
        return pd.DataFrame(0, index=models, columns=models)
    return (
        rows.groupby(["model_a", "model_b"])
        .size()
        .unstack(fill_value=0)
        .reindex(index=models, columns=models, fill_value=0)
    )


def compute_pairwise_win_fraction(battles):
    """Compute, for every ordered model pair, how often the row model beat the column model.

    Parameters
    ----------
    battles : pandas.DataFrame
        One row per battle with columns ``model_a``, ``model_b`` and ``win``
        (``"model_a"`` / ``"model_b"``; any other label counts as a battle
        that neither side won).

    Returns
    -------
    pandas.DataFrame
        Square table whose ``[x, y]`` entry is
        ``wins of x over y / battles between x and y``, with wins and battles
        pooled over both seat assignments. Pairs that never met (including the
        diagonal when no model fought itself) are NaN. Rows and columns are
        ordered by descending mean win fraction.
    """
    # All tables are built on one shared label set. Otherwise a model that
    # never won as A (or never as B) would be missing from one table and the
    # aligned sum below would turn its real win counts into NaN.
    models = sorted(
        pd.concat([battles["model_a"], battles["model_b"]]).dropna().unique()
    )

    # Times the A-side model won, indexed [A, B].
    a_win_ptbl = _count_pairs(battles[battles["win"] == "model_a"], models)
    # Times the B-side model won, indexed [A, B]; transposed below to [winner, loser].
    b_win_ptbl = _count_pairs(battles[battles["win"] == "model_b"], models)
    # Number of battles per (A, B) seating.
    num_battles_ptbl = _count_pairs(battles, models)

    # Wins of row over column (from either seat) divided by all their battles.
    row_beats_col_freq = (a_win_ptbl + b_win_ptbl.T) / (
        num_battles_ptbl + num_battles_ptbl.T
    )
    row_beats_col_freq = row_beats_col_freq.rename_axis(
        index="model_a", columns="model_b"
    )

    # Strongest model (highest mean win fraction) first, on both axes.
    prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
    model_names = list(prop_wins.keys())
    row_beats_col = row_beats_col_freq.loc[model_names, model_names]
    return row_beats_col

In [64]:
def visualize_pairwise_win_fraction(df, title):
    """Draw a heatmap of pairwise win fractions.

    Parameters
    ----------
    df : pandas.DataFrame
        Battles, forwarded to ``compute_pairwise_win_fraction``.
    title : str
        Figure title.

    Returns
    -------
    The figure produced by ``px.imshow``; cell ``(row, column)`` shows the
    fraction of battles in which the row model beat the column model.
    """
    # compute_pairwise_win_fraction already orders rows and columns by mean
    # win fraction, so the table is plotted as returned.
    wdf = compute_pairwise_win_fraction(df)

    fig = px.imshow(
        wdf,
        color_continuous_scale="RdBu",
        text_auto=".2f",
        title=title,
        height=1000,
        width=1200,
    )
    fig.update_layout(
        xaxis_title="Model B",
        yaxis_title="Model A",
        xaxis_side="top",
        title_y=0.02,
        title_x=0.5,
    )
    fig.update_xaxes(tickangle=-90)
    fig.update_traces(
        hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Fraction of A Wins: %{z}<extra></extra>"
    )

    return fig

In [65]:
# Observed head-to-head win fractions, computed from decisive battles only.
fig = visualize_pairwise_win_fraction(
    battles_no_ties, title="Fraction of Model A Wins for All Non-tied A vs. B Battles"
)
display(fig)

In [66]:
# Keep the win-fraction table itself for the average-win-rate chart below.
row_beats_col_freq = compute_pairwise_win_fraction(battles_no_ties)

In [67]:
# Unweighted mean of each model's win fraction against every opponent
# (row mean of the table), highest first.
fig = px.bar(
    row_beats_col_freq.mean(axis=1).sort_values(ascending=False),
    title="Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
    text_auto=".2f",
)
fig.update_layout(
    yaxis_title="Average Win Rate", xaxis_title="Model", showlegend=False, height=700
)
fig.update_xaxes(tickangle=-90)
display(fig)

In [68]:
def compute_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000):
    """Replay the battles in row order and return the resulting Elo ratings.

    Parameters
    ----------
    battles : pandas.DataFrame
        Needs the columns ``model_a``, ``model_b`` and ``win``; ``win`` must be
        ``"model_a"``, ``"model_b"``, ``"tie"`` or ``"tie (bothbad)"``.
    K : number
        Step size of each rating update.
    SCALE, BASE : number
        Parameters of the expected-score curve
        ``1 / (1 + BASE ** (rating_gap / SCALE))``.
    INIT_RATING : number
        Rating given to a model the first time it is seen.

    Returns
    -------
    collections.defaultdict
        Model name -> rating; unseen models read as ``INIT_RATING``.

    Raises
    ------
    Exception
        If a row carries any other ``win`` value.
    """

    def expected_score(own, opponent):
        # Probability-like expected result of `own` against `opponent`.
        return 1 / (1 + BASE ** ((opponent - own) / SCALE))

    # Score credited to model A for each outcome; model B receives 1 - score.
    score_for_a = {"model_a": 1, "model_b": 0, "tie": 0.5, "tie (bothbad)": 0.5}

    rating = defaultdict(lambda: INIT_RATING)
    matchups = zip(battles["model_a"], battles["model_b"], battles["win"])
    for model_a, model_b, win in matchups:
        ra, rb = rating[model_a], rating[model_b]
        ea = expected_score(ra, rb)
        eb = expected_score(rb, ra)
        if win not in score_for_a:
            raise Exception(f"unexpected vote {win}")
        sa = score_for_a[win]
        # Move each rating towards the observed result.
        rating[model_a] += K * (sa - ea)
        rating[model_b] += K * (1 - sa - eb)

    return rating

In [69]:
def pretty_print_elo_ratings(ratings):
    """Turn a ``{model: rating}`` mapping into a ranked leaderboard table.

    Parameters
    ----------
    ratings : mapping
        Model name -> Elo rating.

    Returns
    -------
    pandas.DataFrame
        Columns ``Model`` and ``Elo rating``, best model first, ratings cast to
        ``int`` and the index starting at 1 so it reads as a rank.
    """
    records = [[name, score] for name, score in ratings.items()]
    leaderboard = pd.DataFrame(records, columns=["Model", "Elo rating"])
    leaderboard = leaderboard.sort_values("Elo rating", ascending=False)
    leaderboard = leaderboard.reset_index(drop=True)
    # Integer ratings for display; 1-based index doubles as the rank.
    leaderboard["Elo rating"] = leaderboard["Elo rating"].astype(int)
    leaderboard.index = leaderboard.index + 1
    return leaderboard


# Elo ratings from a single pass over all battles (ties included), in row order.
elo_ratings = compute_elo(battles)

with option_context("display.max_colwidth", 400):
    display(pretty_print_elo_ratings(elo_ratings))

Unnamed: 0,Model,Elo rating
1,gpt-4-0314,1310
2,Llama-2-70b-chat-hf,1191
3,gpt-4-0613,1171
4,Llama-2-13b-chat-hf,1140
5,claude-2,1136
6,gpt-3.5-turbo-0613,1130
7,h2ogpt-gm-oasst1-en-2048-falcon-40b-v1,986
8,vicuna-33b-v1.3,962
9,h2ogpt-gm-oasst1-en-2048-falcon-40b-v2,962
10,falcon-40b-sft-top1-560,903


|    | Model                                   |   Elo rating |
|---:|:----------------------------------------|-------------:|
|  1 | gpt-4-0314                              |         1310 |
|  2 | Llama-2-70b-chat-hf                     |         1191 |
|  3 | gpt-4-0613                              |         1171 |
|  4 | Llama-2-13b-chat-hf                     |         1140 |
|  5 | claude-2                                |         1136 |
|  6 | gpt-3.5-turbo-0613                      |         1130 |
|  7 | h2ogpt-gm-oasst1-en-2048-falcon-40b-v1  |          986 |
|  8 | vicuna-33b-v1.3                         |          962 |
|  9 | h2ogpt-gm-oasst1-en-2048-falcon-40b-v2  |          962 |
| 10 | falcon-40b-sft-top1-560                 |          903 |
| 11 | h2ogpt-research-oasst1-llama-65b        |          876 |
| 12 | h2ogpt-oasst1-falcon-40b                |          847 |
| 13 | h2ogpt-gm-oasst1-en-2048-falcon-7b-v3   |          803 |
| 14 | h2ogpt-gm-oasst1-en-2048-open-lla

In [70]:
def predict_win_rate(elo_ratings, K=32, SCALE=400, BASE=10, INIT_RATING=1000):
    """Predict the win probability of every model against every other from Elo ratings.

    Parameters
    ----------
    elo_ratings : mapping
        Model name -> Elo rating.
    K, INIT_RATING : number
        Unused; kept so existing calls remain valid.
    SCALE, BASE : number
        Parameters of the expected-score curve
        ``1 / (1 + BASE ** ((rating_b - rating_a) / SCALE))``.

    Returns
    -------
    pandas.DataFrame
        Square table with sorted model names on both axes. Row ``a``
        (axis name ``model_a``), column ``b`` (axis name ``model_b``) holds the
        predicted probability that ``a`` beats ``b``; the diagonal is NaN.
    """
    names = sorted(elo_ratings.keys())
    # Row a, column b: expected score of a against b. A model has no
    # prediction against itself, hence NaN on the diagonal.
    # (np.nan rather than np.NAN: the upper-case alias was removed in NumPy 2.0.)
    table = [
        [
            np.nan
            if a == b
            else 1 / (1 + BASE ** ((elo_ratings[b] - elo_ratings[a]) / SCALE))
            for b in names
        ]
        for a in names
    ]

    df = pd.DataFrame(table, index=names, columns=names)
    # Name the axes after what they hold: rows are model A, columns model B.
    df.index.name = "model_a"
    df.columns.name = "model_b"
    return df

In [71]:
# Win probabilities implied by the Elo ratings of all battles.
win_rate = predict_win_rate(compute_elo(battles))

In [72]:
# Order both axes by each model's mean predicted win rate, best first.
ordered_models = win_rate.mean(axis=1).sort_values(ascending=False).index

# Heatmap of the predicted win rates in that order.
fig = px.imshow(
    win_rate.loc[ordered_models, ordered_models],
    color_continuous_scale="RdBu",
    text_auto=".2f",
    title="Predicted Win Rate Using Elo Ratings for Model A in an A vs. B Battle",
    height=1000,
    width=1200,
)
fig.update_layout(
    xaxis_title="Model B",
    yaxis_title="Model A",
    xaxis_side="top",
    title_y=0.02,
    title_x=0.5,
)
fig.update_xaxes(tickangle=-90)
fig.update_traces(
    hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Win Rate: %{z}<extra></extra>"
)
display(fig)

In [73]:
# Number of bootstrap resamples used for the Elo confidence intervals below.
BOOTSTRAP_ROUNDS = 1000

In [74]:
def get_bootstrap_result(battles, func_compute_elo, num_round, random_state=None):
    """Bootstrap a rating function by resampling the battles with replacement.

    Parameters
    ----------
    battles : pandas.DataFrame
        Battles to resample; each round draws ``len(battles)`` rows with
        replacement.
    func_compute_elo : callable
        Called with each resampled frame; must return a mapping
        ``model -> rating``.
    num_round : int
        Number of bootstrap rounds.
    random_state : int or numpy.random.Generator, optional
        Seed (or generator) for reproducible resampling. ``None`` (default)
        keeps the unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per round and one column per model, columns ordered by
        descending median rating.
    """
    # A single generator is shared by all rounds so every round draws a
    # different sample while the whole run stays reproducible.
    rng = None if random_state is None else np.random.default_rng(random_state)
    rows = [
        func_compute_elo(battles.sample(frac=1.0, replace=True, random_state=rng))
        for _ in tqdm(range(num_round), desc="bootstrap")
    ]
    df = pd.DataFrame(rows)
    return df[df.median().sort_values(ascending=False).index]

In [75]:
def visualize_bootstrap_scores(df, title):
    """Plot the median rating of each model with a 95% bootstrap interval.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per bootstrap round and one column per model.
    title : str
        Figure title.

    Returns
    -------
    The figure produced by ``px.scatter``: one point per model at its median
    rating, with error bars reaching the 2.5% and 97.5% quantiles.
    """
    quantiles = pd.DataFrame(
        {
            "lower": df.quantile(0.025),
            "rating": df.quantile(0.5),
            "upper": df.quantile(0.975),
        }
    )
    summary = (
        quantiles.reset_index(names="model")
        .sort_values("rating", ascending=False)
        .assign(
            # Error-bar lengths are measured from the median.
            error_y=lambda t: t["upper"] - t["rating"],
            error_y_minus=lambda t: t["rating"] - t["lower"],
            rating_rounded=lambda t: np.round(t["rating"], 2),
        )
    )
    scatter = px.scatter(
        summary,
        x="model",
        y="rating",
        error_y="error_y",
        error_y_minus="error_y_minus",
        text="rating_rounded",
        title=title,
        height=900,
    )
    scatter.update_xaxes(tickangle=-90)
    scatter.update_layout(xaxis_title="Model", yaxis_title="Rating")
    return scatter

In [76]:
def make_pretty(df, col_name):
    """Return a display-ready copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to format; it is not modified.
    col_name : str
        Column to cast to ``int``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``col_name`` cast to ``int`` and every index label
        increased by 1 (so a default 0-based index reads as a rank).
    """
    # Work on a copy: changing the caller's frame would shift its index again
    # on every call, e.g. when the calling cell is re-run.
    pretty = df.copy()
    pretty[col_name] = pretty[col_name].astype(int)
    pretty.index = pretty.index + 1
    return pretty

In [77]:
# Re-estimate the Elo ratings on BOOTSTRAP_ROUNDS resamples of the battles.
bootstrap_elo_lu = get_bootstrap_result(battles, compute_elo, BOOTSTRAP_ROUNDS)
# Median rating per model across the rounds, as a two-column table.
bootstrap_lu_median = (
    bootstrap_elo_lu.median().reset_index().set_axis(["model", "rating"], axis=1)
)
bootstrap_lu_median = make_pretty(bootstrap_lu_median, "rating")

with option_context("display.max_colwidth", 400):
    display(bootstrap_lu_median)
# Also print the table as Markdown text.
print(bootstrap_lu_median.to_markdown())

bootstrap: 100%|██████████| 1000/1000 [00:14<00:00, 67.82it/s]


Unnamed: 0,model,rating
1,gpt-4-0314,1340
2,Llama-2-70b-chat-hf,1182
3,gpt-4-0613,1171
4,Llama-2-13b-chat-hf,1154
5,claude-2,1129
6,gpt-3.5-turbo-0613,1118
7,h2ogpt-gm-oasst1-en-2048-falcon-40b-v1,1008
8,vicuna-33b-v1.3,959
9,h2ogpt-gm-oasst1-en-2048-falcon-40b-v2,958
10,falcon-40b-sft-top1-560,880


|    | model                                   |   rating |
|---:|:----------------------------------------|---------:|
|  1 | gpt-4-0314                              |     1340 |
|  2 | Llama-2-70b-chat-hf                     |     1182 |
|  3 | gpt-4-0613                              |     1171 |
|  4 | Llama-2-13b-chat-hf                     |     1154 |
|  5 | claude-2                                |     1129 |
|  6 | gpt-3.5-turbo-0613                      |     1118 |
|  7 | h2ogpt-gm-oasst1-en-2048-falcon-40b-v1  |     1008 |
|  8 | vicuna-33b-v1.3                         |      959 |
|  9 | h2ogpt-gm-oasst1-en-2048-falcon-40b-v2  |      958 |
| 10 | falcon-40b-sft-top1-560                 |      880 |
| 11 | h2ogpt-research-oasst1-llama-65b        |      871 |
| 12 | h2ogpt-oasst1-falcon-40b                |      858 |
| 13 | h2ogpt-gm-oasst1-en-2048-open-llama-13b |      809 |
| 14 | h2ogpt-gm-oasst1-en-2048-falcon-7b-v3   |      780 |
| 15 | falcon-40b-instruct              

In [78]:
# Median rating per model with its bootstrap interval.
fig = visualize_bootstrap_scores(bootstrap_elo_lu, "Bootstrap of Elo Estimates")
display(fig)

In [79]:
# Full distribution of the bootstrapped ratings per model; melt() reshapes the
# frame to long form with the columns "variable" (model) and "value" (rating).
fig = px.violin(bootstrap_elo_lu.melt(), x="variable", y="value", height=900).update_xaxes(
    tickangle=-90
)
display(fig)