In [1]:
import argparse
import os
import time
import requests
import numpy as np
import pandas as pd
from functools import partial
from original import get_bootstrap_result as og_bootstrap, compute_mle_elo as og_mle
from faster import get_bootstrap_result as fast_bootstrap, compute_mle_elo as fast_mle

In [2]:
# Load and filter the data, using the same logic as the original notebook:
# https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH#scrollTo=EZvUIOhVZD27
url = "https://storage.googleapis.com/arena_external_data/public/clean_battle_20240814_public.json"
# The timeout bounds how long we wait on connect / between received bytes, so a stalled
# connection cannot hang the notebook forever.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page to disk and hitting a
# confusing JSON parse failure further down.
response.raise_for_status()

with open('local_file_name.json', 'wb') as file:
    file.write(response.content)

# load the JSON data from the local file, ordered by the "tstamp" column
with open('local_file_name.json', 'r') as file:
    battles = pd.read_json(file).sort_values(ascending=True, by=["tstamp"])

# we use anony battles only for leaderboard
battles = battles[battles["anony"] == True]

# we de-duplicate the top 0.1% redundant prompts
# see https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication
print("Before dedup: ", len(battles))
# keep a row only when its dedup_tag dict has a truthy "sampled" entry (missing key -> dropped)
battles = battles[battles["dedup_tag"].apply(lambda x: x.get("sampled", False))]
print("After dedup: ", len(battles))

Before dedup:  1799991
After dedup:  1670250


In [3]:
# First, compare the two MLE implementations directly (no bootstrap):
# run the original compute_mle_elo on the filtered battles.
original_ratings = og_mle(battles)

In [4]:
# Run the "faster" compute_mle_elo on the same battles.
# Ironically, this single call is slower than the original: it does some one-time
# preprocessing that makes all the subsequent bootstrap calls **much** faster.
fast_ratings = fast_mle(battles)

In [5]:
# Compare the two results entry by entry. Select the fast ratings by the original's index
# labels so a difference in ordering between the two results cannot pair up different
# entries; a label missing from fast_ratings raises KeyError instead of being silently
# mis-compared.
# NOTE(review): assumes both results are pandas Series sharing the same index labels
# (presumably model names) -- confirm against original.py / faster.py.
aligned_fast_ratings = fast_ratings.loc[original_ratings.index]
mean_abs_rating_diff = np.mean(np.abs(original_ratings.values - aligned_fast_ratings.values))
print(f'mean absolute difference: {mean_abs_rating_diff}')

mean absolute difference: 7.167734795018973e-11


In [6]:
# Next, look at the bootstrap: run the original get_bootstrap_result for 100 iterations,
# passing the original compute_mle_elo as its second argument.
# This is the slow step (just under 20 minutes in the recorded run).
original_bootstrap_ratings = og_bootstrap(battles, og_mle, 100)

bootstrap:   0%|          | 0/100 [00:00<?, ?it/s]

bootstrap: 100%|██████████| 100/100 [19:43<00:00, 11.83s/it]


In [7]:
# Same 100 bootstrap iterations with the fast get_bootstrap_result;
# unlike the original, it is called without a separate MLE function argument.
fast_bootstrap_ratings = fast_bootstrap(battles, 100)

In [8]:
# Column-wise mean and standard deviation over the bootstrap samples (axis=0).
# Each result is sorted by median, so randomness can leave the columns of the two frames
# in different orders; select the fast frame's columns in the original's order first.
column_order = original_bootstrap_ratings.columns
og_samples = original_bootstrap_ratings.values
fast_samples = fast_bootstrap_ratings[column_order].values

og_bootstrap_mean = og_samples.mean(axis=0)
og_bootstrap_std = og_samples.std(axis=0)
fast_bootstrap_mean = fast_samples.mean(axis=0)
fast_bootstrap_std = fast_samples.std(axis=0)

# Summarise each comparison as a single mean absolute difference across columns.
mean_gap = np.mean(np.abs(og_bootstrap_mean - fast_bootstrap_mean))
std_gap = np.mean(np.abs(og_bootstrap_std - fast_bootstrap_std))
print(f'mean abs diff of means: {mean_gap}')
print(f'mean abs diff of stds: {std_gap}')

mean abs diff of means: 0.3105859687304655
mean abs diff of stds: 0.24714694090303316


In [9]:
# Re-run both bootstraps with more samples (200 instead of 100) to check whether the
# difference between the two implementations gets smaller.
num_rounds2 = 200
original_bootstrap_ratings2 = og_bootstrap(battles, og_mle, num_rounds2)
fast_bootstrap_ratings2 = fast_bootstrap(battles, num_rounds2)

bootstrap:   0%|          | 0/200 [00:00<?, ?it/s]

bootstrap: 100%|██████████| 200/200 [35:36<00:00, 10.68s/it]


In [10]:
# Column-wise mean and standard deviation over the 200-sample bootstrap results (axis=0).
# Each result is sorted by median, so randomness can leave the columns of the two frames
# in different orders; select the fast frame's columns in the original's order first.
column_order2 = original_bootstrap_ratings2.columns
og_samples2 = original_bootstrap_ratings2.values
fast_samples2 = fast_bootstrap_ratings2[column_order2].values

og_bootstrap_mean2 = og_samples2.mean(axis=0)
og_bootstrap_std2 = og_samples2.std(axis=0)
fast_bootstrap_mean2 = fast_samples2.mean(axis=0)
fast_bootstrap_std2 = fast_samples2.std(axis=0)

# Summarise each comparison as a single mean absolute difference across columns.
mean_gap2 = np.mean(np.abs(og_bootstrap_mean2 - fast_bootstrap_mean2))
std_gap2 = np.mean(np.abs(og_bootstrap_std2 - fast_bootstrap_std2))
print(f'mean abs diff of means: {mean_gap2}')
print(f'mean abs diff of stds: {std_gap2}')

mean abs diff of means: 0.18536835223940057
mean abs diff of stds: 0.18313215115635942
