In [1]:
import pandas as pd
import numpy as np
import ast

df = pd.read_csv("../data/raw/lmsys/chatbot_arena_kaggle2024_train.csv")

# get sum of all appearances of each model
vc_a = df["model_a"].value_counts()
vc_b = df["model_b"].value_counts()
combined_vc = pd.concat([vc_a, vc_b]).groupby(level=0).sum()
combined_vc = combined_vc.sort_values(ascending=False)
print("Model counts (appearances as either model A or B):")
pd.set_option('display.max_rows', 20)
print(combined_vc)
pd.reset_option('display.max_rows')

# select datapoints with subset of models
model_list = ['gpt-4-1106-preview', 'gpt-3.5-turbo-0613', 'claude-2.1', 'gpt-4-0314', 'llama-2-70b-chat', 'llama-2-13b-chat']
df = df[df['model_a'].isin(model_list) & df['model_b'].isin(model_list)]

# remove ties
df = df[df['winner_tie'] == 0]

# get number of prompts and responses per model
def convert_to_list(list_str):
    try:
        return ast.literal_eval(list_str)
    except:
        print("Error parsing:", list_str)
        return []

for col in ["prompt", "response_a", "response_b"]:
    df[col] = df[col].apply(convert_to_list)
    df[col + "_length"] = df[col].apply(len)
    df[col] = df[col].apply(lambda x: x[0] if len(x) > 0 else None)

# filter out row with either prompt/response_a/response_b being len more than 1
df = df[(df["prompt_length"] == 1) & (df["response_a_length"] == 1) & (df["response_b_length"] == 1)]

# add winner_model and loser_model columns
df['winner_model'] = df.apply(lambda x: x['model_a'] if x['winner_model_a'] == 1 else x['model_b'], axis=1)
df['loser_model'] = df.apply(lambda x: x['model_a'] if x['winner_model_a'] == 0 else x['model_b'], axis=1)

# check if each col of prompt, response_a, response_b is encodable in utf-8
# drop rows that are not encodable
for col in ["prompt", "response_a", "response_b"]:
    for i, row in df.iterrows():
        try:
            row[col].encode("utf-8")
        except:
            df.drop(i, inplace=True)


Model counts (appearances as either model A or B):
gpt-4-1106-preview          7387
gpt-3.5-turbo-0613          7083
gpt-4-0613                  6165
claude-2.1                  5583
claude-instant-1            4136
                            ... 
falcon-180b-chat             286
openchat-3.5-0106            244
qwen1.5-7b-chat              208
qwen1.5-4b-chat              200
mistral-7b-instruct-v0.2     100
Name: count, Length: 64, dtype: int64
Error parsing: ["Sure, let me tell you a story. Once upon a time, there was a little girl named Sophie who loved animals. Her favorite was her pet bunny named Fluffy. One day, Sophie was playing in the backyard when she noticed a baby bird that had fallen out of its nest. The poor bird was chirping loudly, looking scared and alone. Sophie carefully picked it up and placed it back in the nest. She felt happy that she could help the baby bird. The next day when Sophie went to play outside again, she noticed the mama bird flying around the yard 

In [2]:
df["winner_model"].value_counts()

winner_model
gpt-4-1106-preview    1104
gpt-4-0314             350
claude-2.1             344
gpt-3.5-turbo-0613     265
llama-2-70b-chat       119
llama-2-13b-chat        69
Name: count, dtype: int64

In [3]:
df["loser_model"].value_counts()

loser_model
claude-2.1            795
gpt-3.5-turbo-0613    583
gpt-4-1106-preview    320
gpt-4-0314            298
llama-2-13b-chat      128
llama-2-70b-chat      127
Name: count, dtype: int64

In [4]:
len(df)

2251

In [5]:
# subselect 2k samples
# set random seed
df_final = df.sample(2000, random_state=42)


In [6]:
import ast

#df_final = df_final.rename(columns={"response_a": "text_a", "response_b": "text_b"})
df_final["text_a"] = df_final["response_a"]
df_final["text_b"] = df_final["response_b"]
df_final["preferred_text"] = df_final.apply(lambda x: "text_a" if x["winner_model_a"] == 1 else "text_b", axis=1)

# format into correct text
# prepend instruction column to both text_a and text_b
for col in ["text_a", "text_b"]:
    df_final[col] = "Instruction:\n" + df_final["prompt"] + "\n\n\nAssistant:\n" + df_final[col]

# shuffle
df_final: pd.DataFrame = df_final.sample(frac=1, random_state=42)

df_final.to_csv("../data/processed/lmsys/chatbot_arena_kaggle2024_train_2krandom.csv", index=False)