In [None]:
import pandas as pd
import numpy as np
import ast

# Load the raw pairwise-comparison data.
df = pd.read_csv("../data/raw/lmsys/chatbot_arena_kaggle2024_train.csv")

print(f"Original dataset size: {len(df)}")

# Total number of appearances of each model, counting both the A and B slots.
vc_a = df["model_a"].value_counts()
vc_b = df["model_b"].value_counts()
combined_vc = pd.concat([vc_a, vc_b]).groupby(level=0).sum()
combined_vc = combined_vc.sort_values(ascending=False)
print("Model counts (appearances as either model A or B):")
# option_context restores whatever display.max_rows was set to before (even if
# printing raises), instead of resetting it to the pandas default.
with pd.option_context('display.max_rows', None):
    print(combined_vc)

# Keep only comparisons where BOTH sides are in the selected model subset.
model_list = ['gpt-4-1106-preview', 'gpt-3.5-turbo-0613', 'claude-2.1', 'claude-2.0','gpt-4-0314', 'llama-2-70b-chat', 'llama-2-13b-chat', 'llama-2-7b-chat', 'mixtral-8x7b-instruct-v0.1', "gpt-4-0613"]
in_model_subset = df['model_a'].isin(model_list) & df['model_b'].isin(model_list)

# Remove ties (keep rows whose winner_tie flag is 0).
not_tie = df['winner_tie'] == 0

# .copy() gives an independent frame so later column assignments do not
# operate on a slice of the raw data (avoids SettingWithCopyWarning).
df = df[in_model_subset & not_tie].copy()

# helper for parsing the stringified list columns (prompt / response_a / response_b)
def convert_to_list(list_str):
    """Parse a string containing a Python list literal into a list.

    Args:
        list_str: Text such as '["a", "b"]'.

    Returns:
        The parsed list (a tuple literal is converted to a list). An empty
        list is returned, and the offending value printed, when the input
        cannot be parsed or does not evaluate to a list/tuple.
    """
    try:
        parsed = ast.literal_eval(list_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the errors literal_eval raises for malformed input are caught,
        # so KeyboardInterrupt / SystemExit still propagate.
        print("Error parsing:", list_str)
        return []
    if not isinstance(parsed, (list, tuple)):
        # A valid literal that is not a sequence (e.g. "5") would break the
        # callers' len() / [0] access, so treat it as a parse failure.
        print("Error parsing:", list_str)
        return []
    return list(parsed)

# Columns that hold stringified lists.
text_cols = ["prompt", "response_a", "response_b"]

# Work on an independent copy: `df` is the result of boolean filtering, and
# assigning columns on such a slice triggers SettingWithCopyWarning.
df = df.copy()

for col in text_cols:
    parsed = df[col].apply(convert_to_list)
    # number of list elements in this column for each row
    df[col + "_length"] = parsed.apply(len)
    # keep only the first element; None when the list is empty (e.g. parse failure)
    df[col] = parsed.apply(lambda x: x[0] if len(x) > 0 else None)

# Keep only rows where prompt, response_a and response_b each have exactly one element.
single_element = (
    (df["prompt_length"] == 1)
    & (df["response_a_length"] == 1)
    & (df["response_b_length"] == 1)
)
df = df[single_element].copy()

# Add winner_model and loser_model columns (vectorised; also works on an empty frame).
# A row counts as "model A won" only when winner_model_a == 1, and as
# "model A lost" only when winner_model_a == 0, exactly as before.
df['winner_model'] = np.where(df['winner_model_a'] == 1, df['model_a'], df['model_b'])
df['loser_model'] = np.where(df['winner_model_a'] == 0, df['model_a'], df['model_b'])


def _is_utf8_encodable(value):
    """Return True if `value` is a str that can be encoded as UTF-8."""
    if not isinstance(value, str):
        # Non-string values cannot be encoded; such rows are dropped.
        return False
    try:
        value.encode("utf-8")
    except UnicodeEncodeError:
        return False
    return True


# Drop rows where any of prompt / response_a / response_b is not UTF-8
# encodable. A boolean mask replaces dropping rows one by one while iterating.
encodable = pd.Series(True, index=df.index)
for col in text_cols:
    encodable &= df[col].map(_is_utf8_encodable).astype(bool)
df = df[encodable].copy()


In [None]:
# How often each model appears as the winner in the filtered data.
df.loc[:, "winner_model"].value_counts()

In [None]:
# How often each model appears as the loser in the filtered data.
df.loc[:, "loser_model"].value_counts()

In [None]:
# Number of rows remaining after all filters.
df.shape[0]

In [5]:
# Use the full filtered dataset (no subsampling is applied).
# To subsample instead, e.g. 2k rows with a fixed seed:
#   df_final = df.sample(2000, random_state=42)
# .copy() makes df_final independent of df, so columns added to df_final
# later do not silently appear on df as well.
df_final = df.copy()


In [6]:
import ast

# Build the comparison columns on a new frame (assign returns a copy) instead
# of mutating the incoming frame in place.
# preferred_text names the column ("text_a" or "text_b") holding the winning
# response; np.where is vectorised and also handles an empty frame.
df_final = df_final.assign(
    text_a=df_final["response_a"],
    text_b=df_final["response_b"],
    preferred_text=np.where(df_final["winner_model_a"] == 1, "text_a", "text_b"),
)

# format into correct text
# prepend the instruction (prompt) to both text_a and text_b
for col in ["text_a", "text_b"]:
    df_final[col] = "Instruction:\n" + df_final["prompt"] + "\n\n\nAssistant:\n" + df_final[col]

# shuffle
# sort_index() first so that re-running this cell (when df_final is already
# shuffled) reproduces the same order; on a first run, where rows are still in
# ascending index order, it is a no-op.
df_final: pd.DataFrame = df_final.sort_index().sample(frac=1, random_state=42)

df_final.to_csv("../data/processed/lmsys/chatbot_arena_kaggle2024_train_5615random_v2.csv", index=False)