In [68]:
import json
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [69]:
def compute_mle_elo(
    df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None
):
    """Fit Elo-scale ratings to pairwise battle outcomes by maximum likelihood.

    Every decided battle counts as two wins for the winner, and every tie
    ("tie" or "tie (bothbad)") counts as one win for each side. A logistic
    regression without intercept or penalty is then fitted on +/-log(BASE)
    indicator features, yielding one coefficient per model
    (a Bradley-Terry-style fit).

    Args:
        df: DataFrame with columns "model_a", "model_b" and "winner". Rows
            whose "winner" is not one of "model_a", "model_b", "tie" or
            "tie (bothbad)", or whose model names are missing, are ignored.
        SCALE: Multiplier applied to the fitted coefficients.
        BASE: Base of the rating scale; the features are +/-log(BASE).
        INIT_RATING: Offset added to the scaled coefficients.
        sample_weight: Unused; accepted only so existing callers keep working.

    Returns:
        pd.Series of ratings indexed by model name, sorted from highest to
        lowest. Empty when ``df`` contains no usable battle. If
        "mixtral-8x7b-instruct-v0.1" is among the models, all ratings are
        shifted so that this model is rated exactly 1114.
    """
    tie_labels = ["tie", "tie (bothbad)"]
    battles = df[df["winner"].isin(["model_a", "model_b"] + tie_labels)]
    battles = battles.dropna(subset=["model_a", "model_b"])
    if battles.empty:
        # Nothing to fit: no battles means no ratings.
        return pd.Series(dtype=float)

    # Imported lazily so scikit-learn is only required when a fit is performed.
    from sklearn.linear_model import LogisticRegression

    # Every model seen on either side. All count tables are aligned to this
    # single ordering so that a model missing from one table (e.g. it never
    # won as model_a) contributes zeros instead of NaNs that would silently
    # drop all of its battles.
    model_names = sorted(set(battles["model_a"]) | set(battles["model_b"]))

    def _count_table(rows):
        """Count battles per (model_a, model_b) on the full model grid."""
        if rows.empty:
            return pd.DataFrame(0, index=model_names, columns=model_names)
        counts = pd.pivot_table(
            rows,
            index="model_a",
            columns="model_b",
            aggfunc="size",
            fill_value=0,
        )
        return counts.reindex(index=model_names, columns=model_names, fill_value=0)

    ptbl_a_win = _count_table(battles[battles["winner"] == "model_a"])
    ptbl_b_win = _count_table(battles[battles["winner"] == "model_b"])
    ptbl_tie = _count_table(battles[battles["winner"].isin(tie_labels)])
    # A tie is symmetric: credit it to both orderings of the pair.
    ptbl_tie = ptbl_tie + ptbl_tie.T

    # ptbl_win.loc[x, y] == 2 * (times x beat y) + (ties between x and y).
    ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie
    win_counts = ptbl_win.to_numpy()

    models = pd.Series(np.arange(len(model_names)), index=model_names)

    p = len(models)
    log_base = math.log(BASE)
    X = np.zeros([p * (p - 1) * 2, p])
    Y = np.zeros(p * (p - 1) * 2)

    cur_row = 0
    sample_weights = []
    for i in range(p):
        for j in range(p):
            if i == j:
                continue
            # Two rows with identical features per ordered pair: label 1 is
            # weighted by i's wins over j, label 0 by j's wins over i.
            X[cur_row : cur_row + 2, i] = +log_base
            X[cur_row : cur_row + 2, j] = -log_base
            Y[cur_row] = 1.0
            sample_weights.append(win_counts[i, j])
            Y[cur_row + 1] = 0.0
            sample_weights.append(win_counts[j, i])
            cur_row += 2

    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6)
    lr.fit(X, Y, sample_weight=sample_weights)
    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
    # Anchor the scale: shift every rating so the reference model sits at 1114.
    if "mixtral-8x7b-instruct-v0.1" in models.index:
        elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]]
    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)

In [70]:
def preety_print_model_ratings(ratings):
    """Turn a name -> rating mapping into a ranked leaderboard table.

    Args:
        ratings: Mapping-like object (dict or pd.Series) supporting ``keys()``
            and item lookup, mapping model name to its Elo rating.

    Returns:
        pd.DataFrame with columns "Model" and "Elo rating", sorted by rating
        from highest to lowest, whose index is the 1-based rank.
    """
    rows = [(name, ratings[name]) for name in ratings.keys()]
    table = pd.DataFrame(rows, columns=["Model", "Elo rating"])
    table = table.sort_values("Elo rating", ascending=False)
    table = table.reset_index(drop=True)
    # Shift the default 0-based index so it reads as a rank starting at 1.
    table.index = table.index + 1
    return table

In [77]:
# Input files, relative to the notebook's working directory.
VOTE_HISTORY_PATH = "data/vh.json"
RIGGED_VOTES_PATH = (
    "voting_output/gpt-4o-2024-05-13_omni_bt_diff_acc_1.0_prob_dec_1.0_vote_num_4000.json"
)

# JSON text is UTF-8; pass the encoding explicitly so the platform default
# (which differs between operating systems) is never used.
with open(VOTE_HISTORY_PATH, "r", encoding="utf-8") as file:
    vh = json.load(file)

with open(RIGGED_VOTES_PATH, "r", encoding="utf-8") as file:
    rigged = json.load(file)

# Both files hold a dict; only the values become rows of the vote tables.
original_votes = list(vh.values())
init_df = pd.DataFrame(original_votes)

# The rigged votes are appended after the original ones.
modified_votes = original_votes + list(rigged.values())
votes_df = pd.DataFrame(modified_votes)

In [78]:
# Fit ratings twice: once on the original votes only, and once on the
# original votes with the rigged votes appended.
mle_elo_init = compute_mle_elo(init_df)
mle_elo_rigged = compute_mle_elo(votes_df)

In [79]:
# Leaderboard for the original votes (index is the 1-based rank); the bare
# name on the last line makes the notebook display the table.
init_ranking_df = preety_print_model_ratings(mle_elo_init)
init_ranking_df

Unnamed: 0,Model,Elo rating
1,chatgpt-4o-latest,1314.763423
2,gemini-1.5-pro-exp-0801,1297.837094
3,gpt-4o-2024-05-13,1286.336146
4,gpt-4o-mini-2024-07-18,1275.656203
5,claude-3-5-sonnet-20240620,1271.322521
...,...,...
125,chatglm-6b,880.759889
126,fastchat-t5-3b,870.003972
127,stablelm-tuned-alpha-7b,841.643041
128,dolly-v2-12b,819.238412


In [80]:
# .iloc is 0-based, so position 2 is the rank-3 row and position 1 the rank-2
# row: this is the Elo gap (rank 3 minus rank 2) under the original votes.
# It is negative because the table is sorted from highest to lowest rating.
init_ranking_df.iloc[2]["Elo rating"] - init_ranking_df.iloc[1]["Elo rating"]

-11.500947820867623

In [81]:
# Leaderboard after adding the rigged votes (index is the 1-based rank); the
# bare name on the last line makes the notebook display the table.
rigged_ranking_df = preety_print_model_ratings(mle_elo_rigged)
rigged_ranking_df

Unnamed: 0,Model,Elo rating
1,chatgpt-4o-latest,1309.719766
2,gemini-1.5-pro-exp-0801,1293.676252
3,gpt-4o-2024-05-13,1285.098494
4,gpt-4o-mini-2024-07-18,1271.831831
5,claude-3-5-sonnet-20240620,1269.305333
...,...,...
125,chatglm-6b,885.953899
126,fastchat-t5-3b,874.594598
127,stablelm-tuned-alpha-7b,848.266695
128,dolly-v2-12b,824.870885


In [83]:
# Change in the (rank 3 minus rank 2) Elo gap caused by the rigged votes:
# gap in the rigged leaderboard minus gap in the original leaderboard.
# Positions are 0-based, so .iloc[2] / .iloc[1] are the rank-3 / rank-2 rows.
(rigged_ranking_df.iloc[2]["Elo rating"] - rigged_ranking_df.iloc[1]["Elo rating"]) - (init_ranking_df.iloc[2]["Elo rating"] - init_ranking_df.iloc[1]["Elo rating"])

2.9231896281485206

In [88]:
# Number of rows in init_df where gpt-4o-2024-05-13 is model_a or model_b.
# Series.sum() counts the True values in one vectorized pass (the builtin
# sum() iterates the Series element by element); int() keeps a plain Python int.
int(((init_df["model_a"] == "gpt-4o-2024-05-13") | (init_df["model_b"] == "gpt-4o-2024-05-13")).sum())

69756

In [89]:
# Number of rows in init_df where llama-2-13b-chat is model_a or model_b.
# Series.sum() counts the True values in one vectorized pass (the builtin
# sum() iterates the Series element by element); int() keeps a plain Python int.
int(((init_df["model_a"] == "llama-2-13b-chat") | (init_df["model_b"] == "llama-2-13b-chat")).sum())

17731

In [91]:
# Number of rows in init_df where gemini-1.5-pro-exp-0801 is model_a or model_b.
# Series.sum() counts the True values in one vectorized pass (the builtin
# sum() iterates the Series element by element); int() keeps a plain Python int.
int(((init_df["model_a"] == "gemini-1.5-pro-exp-0801") | (init_df["model_b"] == "gemini-1.5-pro-exp-0801")).sum())

18048

In [93]:
# Number of rows in init_df where chatgpt-4o-latest is model_a or model_b.
# Series.sum() counts the True values in one vectorized pass (the builtin
# sum() iterates the Series element by element); int() keeps a plain Python int.
int(((init_df["model_a"] == "chatgpt-4o-latest") | (init_df["model_b"] == "chatgpt-4o-latest")).sum())

13079