In [1]:
import json
import jupyter_black
import random
import pandas
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

jupyter_black.load()

# File the conversations are loaded from (parsed with json.load below).
export_file_name = "conversations.json"
# JSON Lines file the formatted dataset is written to and read back from.
dataset_file_name = "dataset.jsonl"
# Hub repository id the train/test splits are pushed to.
hf_dataset_name = "duarteocarmo/chatgpt"

# Use a context manager so the file handle is closed deterministically, and
# decode explicitly as UTF-8 (the JSON interchange encoding) instead of relying
# on the platform's default locale encoding.
with open(export_file_name, encoding="utf-8") as export_file:
    data = json.load(export_file)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
! huggingface-cli login --token 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

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/duarteocarmo/.cache/huggingface/token
Login successful


In [3]:
def concatenate_rows(message: dict, title: str) -> "dict | None":
    """
    Convert one raw message into a ``{"sender": ..., "text": ...}`` record.

    Args:
        message: Raw message dict; may be ``None`` or empty.
        title: Title of the conversation. Not used by this function; kept so
            existing callers keep working.

    Returns:
        A dict with the keys ``"sender"`` and ``"text"``, or ``None`` when the
        message is missing/empty, its content has no ``"parts"`` entry, the
        ``"parts"`` list is empty, or the first part is an empty string.
        Messages flagged ``is_user_system_message`` in their metadata are
        returned with sender ``"system"`` and a text assembled from the
        ``about_model_message`` / ``about_user_message`` metadata fields.
    """
    if not message:
        return None

    # Tolerate a missing or null author instead of raising KeyError.
    author = message.get("author")
    sender = author["role"] if author else "unknown"

    # Tolerate a missing or null content entry as well.
    content = message.get("content") or {}
    if "parts" not in content:
        return None

    # `metadata` can be absent or explicitly null; treat both as empty.
    metadata = message.get("metadata") or {}
    is_user_system_message = metadata.get("is_user_system_message", False)

    if is_user_system_message is True:
        context = metadata["user_context_message_data"]
        user_about_message = context["about_user_message"]
        about_model_message = context["about_model_message"]
        total_system_message = f"ABOUT YOU:\n{about_model_message}\n\nABOUT YOUR USER:\n{user_about_message}\n\nFIRST MESSAGE FROM THE USER:\n\n"
        return {"sender": "system", "text": total_system_message}

    # An empty (or null) parts list carries no text; previously this raised
    # IndexError / TypeError on parts[0].
    parts = content["parts"]
    if not parts:
        return None

    # Only the first part is used, as before.
    text = parts[0]

    if text == "":
        return None

    return {"sender": sender, "text": text}

In [4]:
# Turn every exported conversation into a list of {"sender", "text"} rows.
documents = []
for d in data:
    title = d["title"]
    messages = d["mapping"]
    # Only the node payloads are needed, so iterate over the mapping values
    # directly; rows for which concatenate_rows returns None are dropped.
    rows = (concatenate_rows(node["message"], title) for node in messages.values())
    conversation = [row for row in rows if row]
    # Skip conversations with no usable rows: format_conversation indexes
    # conversation[0] and would fail on an empty list.
    if conversation:
        documents.append(conversation)

print(f"Loaded {len(documents)} conversations from {export_file_name}")

Loaded 1101 conversations from conversations.json


In [5]:
# format: <s>[INST] System Prompt + Instruction [/INST] Model answer</s>[INST] Follow-up instruction [/INST]

bos_token = "<s>"  # emitted once, at the very start of the prompt
eos_token = "</s>"  # closes every assistant answer
instruction_token = "[INST]"  # opens every user turn
instruction_end_token = "[/INST]"  # closes every user turn


def format_conversation(conversation: list[dict]) -> str:
    """
    Render a conversation in the instruction format described above.

    Args:
        conversation: Non-empty list of ``{"sender": ..., "text": ...}`` dicts.
            The first message must come from ``"system"`` or ``"user"``.

    Returns:
        The prompt string: ``<s>`` followed by ``[INST] ... [/INST]`` for each
        user turn and `` answer</s>`` for each assistant turn. When the first
        message is a system message, the text of the last system message is
        prepended to the first user turn. Messages from any other sender are
        ignored. If there is no user turn, the system prompt is not emitted.

    Raises:
        IndexError: If ``conversation`` is empty.
        AssertionError: If the first message is neither system nor user.
    """
    first_message = conversation[0]
    if first_message["sender"] == "system":
        # Several system messages may be present; the last one wins.
        system_prompt = [m["text"] for m in conversation if m["sender"] == "system"][-1]
        # Keep the system prompt and the instruction from fusing together when
        # the prompt does not already end in whitespace.
        if system_prompt and not system_prompt[-1].isspace():
            system_prompt += " "
    else:
        assert first_message["sender"] == "user"
        system_prompt = ""

    prompt = bos_token

    for message in conversation:
        sender = message["sender"]
        if sender == "user":
            # Every user turn opens with [INST] and closes with [/INST]; the
            # system prompt is folded into the first user turn only.
            prompt += f"{instruction_token} {system_prompt}{message['text']} {instruction_end_token}"
            system_prompt = ""
        elif sender == "assistant":
            # The answer is terminated by </s>; the next user turn (if any)
            # opens its own [INST] rather than receiving a stray [/INST].
            prompt += f" {message['text']}{eos_token}"

    return prompt

In [6]:
# Wrap each formatted conversation in a single-field {"text": ...} record.
dataset = [dict(text=format_conversation(turns)) for turns in documents]
print(f"Formatted {len(dataset)} conversations")

Formatted 1101 conversations


In [7]:
# Seed the global RNG so the previewed record is the same on every run.
random.seed(42)
to_preview = random.sample(dataset, 1)
separator = "-" * 50
for d in to_preview:
    # Text first, then the separator on its own line.
    print(d["text"], separator, sep="\n")

<s>[INST] ABOUT YOU:
You are an autoregressive language model that has been fine-tuned with instruction-tuning and RLHF. You carefully provide accurate, factual, thoughtful, nuanced answers, and are brilliant at reasoning. If you think there might not be a correct answer, you say so.

Since you are autoregressive, each token you produce is another opportunity to use computation, therefore you always spend a few sentences explaining background context, assumptions, and step-by-step thinking BEFORE you try to answer a question. However: if the request begins with the string "vv" then ignore the previous sentence and instead make your response as concise as possible, with no introduction or background at the start, no summary at the end, and outputting only code for answers where code is appropriate.

Your users are experts in AI and ethics, so they already know you're a language model and your capabilities and limitations, so don't remind them of that. They're familiar with ethical issue

In [8]:
# Persist the dataset as JSON Lines: one serialized record per line.
with open(dataset_file_name, "w") as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + "\n" for record in dataset)


print(f"Saved dataset to {dataset_file_name}")

Saved dataset to dataset.jsonl


In [9]:
# read and split (fixed random_state keeps the 90/10 split reproducible)
tiger_llama = pandas.read_json(dataset_file_name, lines=True)
train_tiger, test_tiger = train_test_split(
    tiger_llama, test_size=0.10, random_state=42, shuffle=True
)

# push to hub
# preserve_index=False: after the shuffled split the frames carry a
# non-default index, which would otherwise be stored in the dataset as an
# extra "__index_level_0__" column.
train_tiger = Dataset.from_pandas(train_tiger, preserve_index=False)
test_tiger = Dataset.from_pandas(test_tiger, preserve_index=False)

ds = DatasetDict({"train": train_tiger, "test": test_tiger})

# The deprecated `branch` argument is dropped: "main" is already the default
# target revision of push_to_hub.
ds.push_to_hub(hf_dataset_name, private=True)

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 46.26ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.94it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 234.41ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  4.04it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/duarteocarmo/chatgpt/commit/3cebbc5a35bb9541b201dfdf17b6957158c2bbf4', commit_message='Upload dataset', commit_description='', oid='3cebbc5a35bb9541b201dfdf17b6957158c2bbf4', pr_url=None, pr_revision=None, pr_num=None)