In [26]:
import gdown
import json
import math
import numpy as np
import os
import pandas as pd
import tempfile

def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000,
                    anchor_model="llama-13b", anchor_rating=800):
    """Fit Elo-style ratings to pairwise battles with a logistic regression.

    Every battle becomes one row of a design matrix holding ``+log(BASE)`` in
    the column of ``model_a`` and ``-log(BASE)`` in the column of ``model_b``.
    A logistic regression without intercept is fitted to the target
    "model_a won", and its coefficients are mapped to ratings as
    ``SCALE * coef + INIT_RATING``.

    The battle list is duplicated before fitting so that a tie can be encoded
    as one win for each side: in the first copy a tie is labelled as a win for
    ``model_a``, in the second copy it keeps label 0 (a win for ``model_b``).
    Decisive battles carry the same label in both copies.

    Args:
        df: DataFrame with columns ``"model_a"``, ``"model_b"`` and
            ``"winner"``. Recognised ``winner`` values are ``"model_a"``,
            ``"tie"`` and ``"tie (bothbad)"``; any other value is labelled 0,
            i.e. treated as a win for ``model_b``.
        SCALE: Multiplier applied to the fitted coefficients.
        BASE: Logarithm base used for the design-matrix entries.
        INIT_RATING: Offset added to the scaled coefficients.
        anchor_model: Model whose rating is pinned to ``anchor_rating`` by
            shifting all ratings by the same constant. When it is ``None`` or
            does not occur in ``df`` the ratings are returned unshifted
            instead of raising ``KeyError``.
        anchor_rating: Rating assigned to ``anchor_model``.

    Returns:
        pandas Series of ratings indexed by model name, sorted from highest
        to lowest.
    """
    from sklearn.linear_model import LogisticRegression

    # Map every model name seen on either side to a column index.
    models = pd.concat([df["model_a"], df["model_b"]]).unique()
    models = pd.Series(np.arange(len(models)), index=models)

    # duplicate battles
    df = pd.concat([df, df], ignore_index=True)
    p = len(models.index)
    n = df.shape[0]

    rows = np.arange(n)
    X = np.zeros([n, p])
    X[rows, models[df["model_a"]].to_numpy()] = +math.log(BASE)
    X[rows, models[df["model_b"]].to_numpy()] = -math.log(BASE)

    # one A win => two A win
    Y = np.zeros(n)
    Y[(df["winner"] == "model_a").to_numpy()] = 1.0

    # one tie => one A win + one B win
    # Work on a writable NumPy copy so the "first half only" slice below is
    # unambiguously positional.
    tie_idx = (
        (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
    ).to_numpy(copy=True)
    tie_idx[n // 2:] = False
    Y[tie_idx] = 1.0

    lr = LogisticRegression(fit_intercept=False)
    lr.fit(X, Y)

    elo_scores = SCALE * lr.coef_[0] + INIT_RATING

    # Calibrate: shift every rating so the anchor lands on anchor_rating.
    # Skipped when the anchor never played, so datasets without it still work.
    if anchor_model is not None and anchor_model in models.index:
        elo_scores += anchor_rating - elo_scores[models[anchor_model]]
    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)

def preety_print_model_ratings(ratings):
    """Build a ranked leaderboard table from a name -> rating mapping.

    Args:
        ratings: Mapping-like object supporting ``keys()`` and item lookup
            that maps a model name to its numeric rating.

    Returns:
        DataFrame with columns ``"Model"`` and ``"Elo rating"``, ordered from
        highest to lowest rating, with ratings converted to integers after
        adding 0.5 and with a 1-based index serving as the rank.
    """
    rows = [[name, ratings[name]] for name in ratings.keys()]
    table = pd.DataFrame(rows, columns=["Model", "Elo rating"])
    table = table.sort_values("Elo rating", ascending=False)
    table = table.reset_index(drop=True)

    # Adding 0.5 before the integer cast rounds to the nearest whole rating.
    shifted = table["Elo rating"] + 0.5
    table["Elo rating"] = shifted.astype(int)

    # Ranks start at 1 rather than 0.
    table.index = table.index + 1
    return table

# Cache locations for the raw battle log and the rating summary derived from it.
# NOTE(review): this is the same directory the later cell uses as its git
# checkout of the leaderboard Space. If this cell creates it first, that
# cell's git.Repo(repo_dir) call will see a non-git directory — confirm
# whether sharing the directory is intended.
default_repo_dir = os.path.join(tempfile.gettempdir(), "chatbot-arena-leaderboard")
file = os.path.join(default_repo_dir, "chatbot_arena_raw_results.json")
summaries_file = os.path.join(default_repo_dir, "chatbot_arena_summary_results.json")

# Key the cache on the summary file itself rather than on the directory: the
# directory may already exist (e.g. as a git checkout) without the summary in it.
if not os.path.exists(summaries_file):
    os.makedirs(default_repo_dir, exist_ok=True)
    if not os.path.exists(file):
        url = "https://drive.google.com/file/d/1jjJ8k3L-BzFKSevoGo6yaJ-jCjc2SCK1/view"
        # Download straight to the cache path; renaming from the working
        # directory would fail when it is on a different filesystem.
        gfile = gdown.download(url, output=file, quiet=False, fuzzy=True)
        if gfile is None:
            # Some gdown versions signal a failed download by returning None.
            raise RuntimeError(f"failed to download {url}")
    battles = pd.read_json(file).sort_values(ascending=True, by=["tstamp"])
    elo_mle_ratings = compute_mle_elo(battles)
    with open(summaries_file, "w") as f:
        summary = elo_mle_ratings.to_dict()
        json.dump(summary, f, indent=2, sort_keys=True)

# Always read the ratings back from the cached summary.
with open(summaries_file) as f:
    summaries = json.load(f)
print(summaries)

{'RWKV-4-Raven-14B': 924.2317268388713, 'alpaca-13b': 903.7413141852036, 'chatglm-6b': 882.4642096846417, 'chatglm2-6b': 928.8566272430447, 'chatglm3-6b': 959.9758249227476, 'claude-1': 1149.3793408751726, 'claude-2.0': 1130.6167042119255, 'claude-2.1': 1118.77207689072, 'claude-instant-1': 1109.3800309435396, 'codellama-34b-instruct': 1041.413339611721, 'dolly-v2-12b': 822.3539115935789, 'dolphin-2.2.1-mistral-7b': 1064.7688756493415, 'falcon-180b-chat': 1031.1130323099937, 'fastchat-t5-3b': 873.504919860924, 'gemini-pro': 1113.7702079441667, 'gemini-pro-dev-api': 1120.446709756838, 'gpt-3.5-turbo-0314': 1104.4732058401285, 'gpt-3.5-turbo-0613': 1116.1678581005403, 'gpt-3.5-turbo-1106': 1072.3127692943162, 'gpt-4-0314': 1190.4987391795019, 'gpt-4-0613': 1159.625234070882, 'gpt-4-turbo': 1249.3346609192497, 'gpt4all-13b-snoozy': 936.7062953652393, 'guanaco-33b': 1030.9219002800637, 'koala-13b': 965.4680021248472, 'llama-13b': 800.0, 'llama-2-13b-chat': 1036.726951506475, 'llama-2-70b-c

In [None]:
import datetime
# Display the current timestamp as the cell output (presumably to record when
# the notebook was run — confirm it is still needed).
datetime.datetime.now()

In [80]:
# git clone git@hf.co:spaces/lmsys/chatbot-arena-leaderboard
# git clone https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard
import git
import os
# Local working copy of the leaderboard Space and the remote it tracks.
repo_dir = os.path.join(tempfile.gettempdir(), "chatbot-arena-leaderboard")
hf_repo = "https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard"

# Clone when the working copy path does not exist yet; otherwise pull from
# its "origin" remote.
if not os.path.exists(repo_dir):
    git.Repo.clone_from(hf_repo, repo_dir)
else:
    git.Repo(repo_dir).remotes.origin.pull()

import datetime
# Running "newest so far" (file name, date) pairs. Both start at a sentinel
# dated 1970-01-01 so any later-dated file replaces it.
latest_elo = ("null", datetime.datetime(1970, 1, 1))
latest_leaderboard = ("null", datetime.datetime(1970, 1, 1))


def latest(maybe, last):
    """Return whichever (name, date) pair has the later date.

    ``maybe`` wins only when its date is strictly later; on equal dates
    ``last`` is kept.
    """
    if maybe[1] > last[1]:
        return maybe
    return last

# Scan the checkout for dated "leaderboard_*" / "elo_*" files and remember the
# newest one of each kind.
for name in os.listdir(repo_dir):
    splt = name.split("_")
    kind = splt[0]
    if kind not in ("leaderboard", "elo"):
        continue

    try:
        # Expected shape: "<kind>_<label>_<YYYYMMDD>.<ext>".
        date, _ = splt[2].split(".")
        year = int(date[:4])
        month = int(date[4:6])
        day = int(date[6:8])
        dt = datetime.datetime(year, month, day)
    except (IndexError, ValueError):
        # The name starts with a known prefix but is not a dated file (too few
        # "_" parts, not exactly one ".", or not a valid date): skip it rather
        # than abort the whole scan.
        continue
    maybe = (name, dt)

    if kind == "leaderboard":
        latest_leaderboard = latest(maybe, latest_leaderboard)
    else:
        latest_elo = latest(maybe, latest_elo)

import csv

# The scan leaves the "null" sentinel in place when no dated leaderboard file
# exists; report that directly instead of failing on a bogus ".../null" path.
if latest_leaderboard[0] == "null":
    raise FileNotFoundError(f"no dated leaderboard file found in {repo_dir}")

leaderboard_file = os.path.join(repo_dir, latest_leaderboard[0])
print(leaderboard_file)
# newline="" lets the csv module handle line endings itself.
with open(leaderboard_file, newline="") as f:
    # Echo the header row, then parse the remaining rows.
    print(f.readline().rstrip())
    # csv.reader copes with quoted fields that contain commas, which a plain
    # split(",") would cut into extra columns.
    for row in csv.reader(f):
        if not row:
            # Blank line (e.g. trailing newline at end of file).
            continue
        # Unpacking doubles as a schema check: a row without exactly eight
        # fields raises ValueError.
        key, model, score, mmlu, date, license_, org, link = row

elo_file = os.path.join(repo_dir, latest_elo[0])
print(elo_file)
import pickle
# NOTE(review): pickle.load can execute arbitrary code from the file; this one
# comes from a remote repository, so only run this against a source you trust.
with open(elo_file, "rb") as f:
    data = pickle.load(f)
    # print(data["rating_system"]) -- bt? whatever
    # Online ratings as (model, rating) pairs, highest rating first.
    print(sorted(data["elo_rating_online"].items(), key=lambda pair: pair[1], reverse=True))
    print(data["elo_rating_final"].to_list())
    # List the remaining top-level keys of the results object.
    for k, v in data.items():
        if k in ["rating_system"]:
            continue
        print(k)

/tmp/chatbot-arena-leaderboard/leaderboard_table_20240202.csv
key,Model,MT-bench (score),MMLU,Knowledge cutoff date,License,Organization,Link
/tmp/chatbot-arena-leaderboard/elo_results_20240202.pkl
[('gpt-4-0125-preview', 1185.0242535070245), ('gpt-4-1106-preview', 1174.2202487750335), ('bard-jan-24-gemini-pro', 1157.7501884490775), ('gpt-4-0314', 1119.8450053650527), ('gpt-4-0613', 1114.8661181537755), ('gpt-3.5-turbo-0314', 1096.2755359505593), ('mistral-medium', 1092.3366290152812), ('claude-2.0', 1080.842239007839), ('gemini-pro-dev-api', 1059.1902937142418), ('claude-1', 1058.1615194287863), ('claude-2.1', 1056.7886202746788), ('gemini-pro', 1056.3051100624186), ('yi-34b-chat', 1043.816203851759), ('llama-2-70b-chat', 1034.2985767808748), ('gpt-3.5-turbo-0613', 1031.481077044045), ('starling-lm-7b-alpha', 1031.0682027163396), ('tulu-2-dpo-70b', 1029.4041534728487), ('dolphin-2.2.1-mistral-7b', 1028.7071580781405), ('guanaco-33b', 1028.5048373390007), ('llama2-70b-steerlm-chat', 10

usage: ipykernel_launcher.py [-h] [--share]
ipykernel_launcher.py: error: unrecognized arguments: --f=/home/alxmke/.local/share/jupyter/runtime/kernel-v2-1746iZVmGvTWowae.json


SystemExit: 2