In [None]:
%pip install -U plotly kaleido

In [1]:
import json
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


# Category names, in the order the loaders index them: both loaders pick a
# category with CATEGORIES[(question_id - 81) // 10], i.e. ten consecutive
# question ids per entry.
CATEGORIES = ["Writing", "Roleplay", "Reasoning", "Math", "Coding", "Extraction", "STEM", "Humanities"]
# Judgment files for the active benchmark, read line by line as JSON objects.
# Going by the file names these are gpt-4o single-answer and pairwise judgments - TODO confirm.
bench_single = "data/mn_vicuna_bench/model_judgment/gpt-4o_single.jsonl"
bench_pair = "data/mn_vicuna_bench/model_judgment/gpt-4o_pair.jsonl"
# Alternative input: uncomment the two lines below to read the mn_mt_bench judgments instead.
# bench_single = "data/mn_mt_bench/model_judgment/gpt-4o_single.jsonl"
# bench_pair = "data/mn_mt_bench/model_judgment/gpt-4o_pair.jsonl"

def get_model_df():
    """Load the judgment records from ``bench_single`` into a DataFrame.

    Every non-blank line of the file is parsed as one JSON object and becomes
    one row. A ``category`` field is added to each record by indexing
    ``CATEGORIES`` with ``(question_id - 81) // 10`` (ten question ids per
    category).

    Returns:
        pd.DataFrame: one row per record, holding every field found in the
        file plus ``category``.

    Raises:
        OSError: if ``bench_single`` cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no ``question_id``.
        IndexError: if ``question_id`` maps outside ``CATEGORIES``.
    """
    records = []
    # The context manager closes the handle even when a line fails to parse;
    # the encoding is pinned so the result does not depend on the OS locale.
    with open(bench_single, "r", encoding="utf-8") as fin:
        for line in fin:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            obj = json.loads(line)
            # NOTE(review): ids below 81 give a negative index, which Python
            # wraps around to the end of CATEGORIES - confirm this is intended.
            obj["category"] = CATEGORIES[(obj["question_id"] - 81) // 10]
            records.append(obj)
    return pd.DataFrame(records)

def toggle(res_str):
    """Return the outcome as seen from the opposing side.

    "win" becomes "loss" and "loss" becomes "win"; any other value is
    reported as "tie".
    """
    # Checked in the same order as before: "win" first, then "loss".
    for outcome, opposite in (("win", "loss"), ("loss", "win")):
        if res_str == outcome:
            return opposite
    return "tie"

def get_model_df_pair():
    """Load the pairwise judgment records from ``bench_pair`` into a DataFrame.

    Every non-blank line of the file is parsed as one JSON object. The two
    judgments ``g1_winner`` and ``g2_winner`` are collapsed into one outcome
    from the point of view of ``model_1``:

    * "win"  - both judgments name "model_1"
    * "loss" - both judgments name "model_2"
    * "tie"  - anything else (disagreement or any other value)

    Returns:
        pd.DataFrame: one row per record with the columns ``qid`` and ``turn``
        (both as strings), ``result``, ``category`` and ``model`` (the value
        of ``model_1``).

    Raises:
        OSError: if ``bench_pair`` cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the fields read here.
        IndexError: if ``question_id`` maps outside ``CATEGORIES``.
    """
    rows = []
    # The context manager closes the handle even when a line fails to parse;
    # the encoding is pinned so the result does not depend on the OS locale.
    with open(bench_pair, "r", encoding="utf-8") as fin:
        for line in fin:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            obj = json.loads(line)

            first, second = obj["g1_winner"], obj["g2_winner"]
            if first == "model_1" and second == "model_1":
                outcome = "win"
            elif first == "model_2" and second == "model_2":
                outcome = "loss"
            else:
                # Only a unanimous verdict counts as a win or a loss.
                outcome = "tie"

            rows.append({
                "qid": str(obj["question_id"]),
                "turn": str(obj["turn"]),
                "result": outcome,
                # Ten question ids per category, counted from id 81.
                "category": CATEGORIES[(obj["question_id"] - 81) // 10],
                "model": obj["model_1"],
            })

    return pd.DataFrame(rows)

# Load both judgment tables once: `df` holds the records from bench_single,
# `df_pair` the per-question win/loss/tie outcomes derived from bench_pair.
df = get_model_df()
df_pair = get_model_df_pair()

In [None]:
# Display the pairwise outcome table as the cell output for a visual check.
df_pair

In [2]:
# Every model that appears in the single-answer judgments, in order of first appearance.
all_models = df["model"].unique()
print(all_models)

# Keep only rows with a non-negative score; the rest are score-format
# errors (<1% of cases) and must not enter the averages.
df_scored = df[df["score"] >= 0]

# One record per (model, category) pair holding the mean score.
# A pair without any valid row yields NaN.
scores_all = []
for model in all_models:
    model_rows = df_scored[df_scored["model"] == model]
    for cat in CATEGORIES:
        cat_scores = model_rows.loc[model_rows["category"] == cat, "score"]
        score = cat_scores.mean()

        # Disabled: pairwise win rate / win-or-tie rate per (model, category).
        # res_pair = df_pair[(df_pair["category"]==cat) & (df_pair["model"]==model)]["result"].value_counts()
        # wincnt = res_pair["win"] if "win" in res_pair.index else 0
        # tiecnt = res_pair["tie"] if "tie" in res_pair.index else 0
        # winrate = wincnt/res_pair.sum()
        # winrate_adjusted = (wincnt + tiecnt)/res_pair.sum()
        # scores_all.append({"model": model, "category": cat, "score": score, "winrate": winrate, "wtrate": winrate_adjusted})

        scores_all.append(dict(model=model, category=cat, score=score))

['gpt-4' 'llama3.1-8B-instruct' 'gpt-3.5-turbo' 'mistral-7b-instruct-v0.3'
 'llama2-13B' 'mala-500-10b-v2' 'gemma-2-9b-it'
 'gemma-2-9b-alpaca-mongolian' 'Qwen2.5-7B-Instruct'
 'gemma-2-9b-alpaca-mongolian-5000' 'gemma-2-9b-it-evol-1000'
 'gemma2-9b-it-alpaca-mn' 'gemma-2-9b-alpaca-mn-500s' 'emma-500-llama2-7b'
 'Llama-3.1-8B-Alpaca-MN-SFT-v0.1' 'gemma-2-9b-Alpaca-MN-SFT-v0.1'
 'gpt-4o-mini' 'gpt-4-turbo' 'gemma-2-27b-it']


In [8]:
# Models drawn on the radar chart.
target_models = [
    "gpt-4o-mini",
    "llama3.1-8B-instruct",
    "gemma-2-9b-it",
    # "emma-500-llama2-7b",
    "Llama-3.1-8B-Alpaca-MN-SFT-v0.1",
    "gemma-2-9b-Alpaca-MN-SFT-v0.1",
    # "emma-500-llama2-7b", "Qwen2.5-7B-Instruct",
]

# Raw model identifier -> label shown in the figure legend.
# Identifiers without an entry keep their raw name.
rename_map = {
    "llama-13b": "LLaMA-13B",
    "alpaca-13b": "Alpaca-13B",
    "vicuna-33b-v1.3": "Vicuna-33B",
    "vicuna-13b-v1.3": "Vicuna-13B",
    "gpt-3.5-turbo": "GPT-3.5-turbo",
    "claude-v1": "Claude-v1",
    "gpt-4": "GPT-4",
    "gpt-4-turbo": "GPT-4-Turbo",
    "gpt-4o-mini": "GPT-4o-mini",
    "gemma-2-9b-it": "Gemma-2-9B-IT",
    "gemma-2-27b-it": "Gemma-2-27B-IT",
    "gemma-2-9b-Alpaca-MN-SFT-v0.1": "Gemma-2-9b-Alpaca-MN",
    "Llama-3.1-8B-Instruct": "Llama-3.1-8B-Instruct",
    "Llama-3.1-8B-Alpaca-MN-SFT-v0.1": "Llama-3.1-8B-Alpaca-MN",
    "llama2-13B": "Llama2-13B-Chat",
    "llama3.1-8B-instruct": "Llama3.1-8B-Instruct",
    "mistral-7b-instruct-v0.3": "Mistral-7B-Instruct-v0.3",
    "mala-500-10b-v2": "MaLA-500-10B-v2",
    "Qwen2.5-7B-Instruct": "Qwen2.5-7B-Instruct",
    "gemma-2-9b-alpaca-mongolian": "Gemma-2-9B-SFT",
    "emma-500-llama2-7b": "EMMA-500-llama2-7b",
}

# Keep only the score records of the selected models.
scores_target = [row for row in scores_all if row["model"] in target_models]

# Order the records by reverse position in target_models. The sort is stable,
# so the category order inside each model is left as it was.
scores_target = sorted(scores_target, key=lambda x: target_models.index(x["model"]), reverse=True)

df_score = pd.DataFrame(scores_target)

# Apply the display names to the model column only. Renaming through a
# whole-frame replace would also rewrite any other cell that happens to equal
# a raw model name.
df_score["model"] = df_score["model"].map(lambda name: rename_map.get(name, name))

# One closed polygon per model, with the axes in CATEGORIES order.
fig = px.line_polar(df_score, r='score', theta='category', line_close=True,
                    category_orders={"category": CATEGORIES},
                    color='model', markers=True,
                    color_discrete_sequence=px.colors.qualitative.Pastel)

fig.show()

In [None]:
# fig = px.line_polar(df_score, r = 'wtrate', theta = 'category', line_close = True, category_orders = {"category": CATEGORIES},
#                     color = 'model', markers=True, color_discrete_sequence=px.colors.qualitative.Pastel)
# fig.show()

In [9]:
# Set the figure-wide font size to 18 before exporting.
fig.update_layout(font={"size": 18})

# Save the chart as a PNG next to the notebook (800x600 layout, scale factor 2).
fig.write_image("vicuna-sft.png", width=800, height=600, scale=2)