In [3]:
import json
from pathlib import Path

import pandas as pd
from datasets import load_dataset, concatenate_datasets

In [4]:
# load raw datasets
#
# Every Hugging Face entry is stored exactly as returned by load_dataset; the
# "prism" entry is a DataFrame built from a local JSON Lines file.

raw_dict = {}

raw_dict["lmsys"] = load_dataset("lmsys/lmsys-chat-1m")
raw_dict["wildchat"] = load_dataset("allenai/WildChat")
raw_dict["hhonline"] = load_dataset("Anthropic/hh-rlhf", data_dir="helpful-online")
raw_dict["sharegpt"] = load_dataset("liyucheng/ShareGPT90K")

# Decode the file explicitly as UTF-8 (open() would otherwise fall back to the
# platform's locale encoding) and parse it line by line, skipping blank lines
# that json.loads would reject.
with open('./conversations.jsonl', encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]
raw_dict["prism"] = pd.DataFrame(data)

Downloading data: 100%|██████████| 247M/247M [00:29<00:00, 8.30MB/s] 
Downloading data: 100%|██████████| 250M/250M [00:34<00:00, 7.23MB/s] 
Downloading data: 100%|██████████| 247M/247M [00:32<00:00, 7.70MB/s] 
Downloading data:  13%|█▎        | 31.5M/246M [00:04<00:27, 7.74MB/s]

KeyboardInterrupt: 

In [None]:
# get prompts from each dataset

def process_lmsys(dataset):
    """Build the prompt table for one lmsys split.

    Args:
        dataset: Object exposing ``to_pandas()``. The resulting frame must
            contain ``conversation``, ``language`` and ``redacted`` columns,
            and every ``conversation`` value must be an indexable sequence
            whose first element has a ``"content"`` key.

    Returns:
        DataFrame with the columns ``id``, ``user_prompt``, ``language`` and
        ``redacted``. ``id`` is ``"lmsys-"`` followed by the row's index
        label; ``user_prompt`` is the content of the first conversation turn.
    """

    def opening_text(turns):
        # Only the very first turn of the conversation is kept.
        return turns[0]["content"]

    frame = dataset.to_pandas()
    frame["user_prompt"] = frame["conversation"].apply(opening_text)

    # ids are the frame's own index labels behind a dataset prefix
    index_labels = frame.index.astype(str)
    frame["id"] = "lmsys-" + index_labels

    wanted = ["id", "user_prompt", "language", "redacted"]
    return frame[wanted]


# Swap the raw "lmsys" entry for the prompt frame built from its "train" split.
# NOTE: this overwrites the entry it reads from, so the raw data must be loaded
# again before this line can be executed a second time.
raw_dict["lmsys"] = process_lmsys(raw_dict["lmsys"]["train"])


def process_wildchat(dataset):
    """Build the prompt table for one wildchat split.

    Args:
        dataset: Object exposing ``to_pandas()``. The resulting frame must
            contain ``conversation``, ``language`` and ``redacted`` columns,
            and every ``conversation`` value must be an indexable sequence
            whose first element has a ``"content"`` key.

    Returns:
        DataFrame with the columns ``id``, ``user_prompt``, ``language`` and
        ``redacted``. ``id`` is ``"wildchat-"`` followed by the row's index
        label; ``user_prompt`` is the content of the first conversation turn.
    """

    def first_turn_content(conversation):
        first_turn = conversation[0]
        return first_turn["content"]

    table = dataset.to_pandas()
    table["user_prompt"] = table["conversation"].apply(first_turn_content)

    # prefix each index label so ids stay unique across datasets
    table["id"] = "wildchat-" + table.index.astype(str)

    output_columns = ["id", "user_prompt", "language", "redacted"]
    return table[output_columns]

# Swap the raw "wildchat" entry for the prompt frame built from its "train" split.
# NOTE: this overwrites the entry it reads from, so the raw data must be loaded
# again before this line can be executed a second time.
raw_dict["wildchat"] = process_wildchat(raw_dict["wildchat"]["train"])


def process_hhonline(dataset):
    """Build the prompt table for the hhonline data.

    The first human turn is taken from each ``chosen`` transcript. The whole
    turn is kept: a prompt spanning several paragraphs is no longer cut at its
    first blank line, and a literal ``"Human: "`` inside the prompt text is no
    longer stripped.

    Args:
        dataset: Object exposing ``to_pandas()``. The resulting frame must
            contain a ``chosen`` column of transcript strings.

    Returns:
        DataFrame with the columns ``id`` and ``user_prompt``. ``id`` is
        ``"hhonline-"`` followed by the row's index label.
    """

    human_tag = "\n\nHuman: "
    # NOTE(review): assumes replies are introduced by "\n\nAssistant:" as in the
    # HH-RLHF transcript layout -- confirm against the data.
    assistant_tag = "\n\nAssistant:"

    def first_human_turn(transcript):
        _, found, after_tag = transcript.partition(human_tag)
        if not found:
            # Unexpected layout: keep the previous best-effort extraction.
            return transcript.split("\n\n")[1].replace("Human: ", "")
        # Everything before the first assistant reply belongs to the prompt,
        # including any blank lines inside it.
        return after_tag.partition(assistant_tag)[0]

    df = dataset.to_pandas()
    df["user_prompt"] = df["chosen"].apply(first_human_turn)

    # create id column combining "hhonline-" and the index
    df["id"] = "hhonline-" + df.index.astype(str)

    return df[["id", "user_prompt"]]

# Pool the "train" and "test" splits into a single dataset, then swap the raw
# "hhonline" entry for the prompt frame built from the pooled data.
raw_dict["hhonline"] = process_hhonline(
    concatenate_datasets([raw_dict["hhonline"][split_name] for split_name in ("train", "test")])
)


def process_sharegpt(dataset):
    """Build the prompt table for one sharegpt split.

    Args:
        dataset: Object exposing ``to_pandas()``. The resulting frame must
            contain a ``conversations`` column whose values can be looked up
            with ``"value"`` to obtain an indexable sequence.

    Returns:
        DataFrame with the columns ``id`` and ``user_prompt``. ``id`` is
        ``"sharegpt-"`` followed by the row's index label; ``user_prompt`` is
        the first element of the conversation's ``"value"`` sequence.
    """

    def leading_value(conversation):
        # NOTE(review): takes the first "value" regardless of who is speaking --
        # confirm that every conversation opens with the user.
        values = conversation["value"]
        return values[0]

    frame = dataset.to_pandas()
    frame["user_prompt"] = frame["conversations"].apply(leading_value)

    # create id column combining "sharegpt-" and the index
    index_labels = frame.index.astype(str)
    frame["id"] = "sharegpt-" + index_labels

    return frame[["id", "user_prompt"]]

# Swap the raw "sharegpt" entry for the prompt frame built from its "train" split.
# NOTE: this overwrites the entry it reads from, so the raw data must be loaded
# again before this line can be executed a second time.
raw_dict["sharegpt"] = process_sharegpt(raw_dict["sharegpt"]["train"])


def process_prism(dataset):
    """Build the prompt table for the prism conversations.

    Args:
        dataset: DataFrame with at least ``opening_prompt`` and
            ``conversation_type`` columns. It is not modified; the work is
            done on the frame returned by ``rename``.

    Returns:
        DataFrame with the columns ``id``, ``user_prompt`` and
        ``conversation_type``. ``user_prompt`` is the renamed
        ``opening_prompt`` column and ``id`` is ``"prism-"`` followed by the
        row's index label.
    """
    # rename() hands back a new frame, so the id column below is added to that
    # frame rather than to the caller's.
    prompt_frame = dataset.rename(columns={"opening_prompt": "user_prompt"})

    index_labels = prompt_frame.index.astype(str)
    prompt_frame["id"] = "prism-" + index_labels

    output_columns = ["id", "user_prompt", "conversation_type"]
    return prompt_frame[output_columns]

# Replace the full "prism" frame with the column subset returned by process_prism.
raw_dict["prism"] = process_prism(raw_dict["prism"])

In [9]:
# export to csv

# Create the target folder first: to_csv does not create missing directories,
# so the export would otherwise fail when ../data/raw does not exist yet.
raw_dir = Path("../data/raw")
raw_dir.mkdir(parents=True, exist_ok=True)

# One file per entry, named after its key; the index is not written.
for key, frame in raw_dict.items():
    frame.to_csv(raw_dir / f"{key}.csv", index=False)