In [1]:
!pip install datasets



In [2]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
from datasets import load_dataset
import json

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Download (or reuse the cached copy of) the oasst1 dataset and materialise
# its train / validation splits as pandas DataFrames.
ds = load_dataset("OpenAssistant/oasst1")

train = pd.DataFrame(ds["train"])
val = pd.DataFrame(ds["validation"])

def prep_data(df):
    """Build (instruction, output) pairs from a flat table of chat messages.

    Selects the English assistant messages whose ``rank`` is 0.0 (presumably
    the top-ranked reply -- confirm against the dataset card) and pairs each
    one with the text of the prompter message referenced by its ``parent_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``message_id``, ``parent_id``, ``role``,
        ``rank``, ``text`` and ``lang``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``instruction`` (prompter text) and ``output`` (assistant
        text), indexed by the original row labels of the assistant messages.

    Raises
    ------
    KeyError
        If a selected assistant message points at a ``parent_id`` that has no
        prompter row in ``df``.
    """
    # Filter first (including language) so the parent lookup only runs on the
    # rows that are actually kept.
    keep = (df["role"] == "assistant") & (df["rank"] == 0.0) & (df["lang"] == "en")
    replies = df.loc[keep, ["parent_id", "text"]]

    # message_id -> prompt text; assumes message_id values are unique.
    prompt_text = df.loc[df["role"] == "prompter"].set_index("message_id")["text"]

    # Fail loudly on dangling parents instead of silently emitting NaN prompts.
    dangling = ~replies["parent_id"].isin(prompt_text.index)
    if dangling.any():
        raise KeyError(
            f"{int(dangling.sum())} assistant message(s) reference a parent_id "
            "with no prompter row"
        )

    # Build a fresh frame rather than assigning columns onto a filtered slice,
    # which is what triggered pandas' chained-assignment warning.
    return pd.DataFrame(
        {
            "instruction": replies["parent_id"].map(prompt_text),
            "output": replies["text"],
        },
        index=replies.index,
    )

# Run the same preparation over both splits (train first, then validation).
df_train, df_val = (prep_data(split) for split in (train, val))


Downloading readme: 100%|██████████| 10.2k/10.2k [00:00<00:00, 2.54MB/s]


Downloading and preparing dataset None/None to /Users/tsiameh/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 39.5M/39.5M [00:01<00:00, 32.9MB/s]
Downloading data: 100%|██████████| 2.08M/2.08M [00:00<00:00, 26.4MB/s]
Downloading data files: 100%|██████████| 2/2 [00:02<00:00,  1.43s/it]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 750.79it/s]
                                                                                        

Dataset parquet downloaded and prepared to /Users/tsiameh/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100%|██████████| 2/2 [00:00<00:00, 108.09it/s]


In [5]:
# Persist each split as CSV. index=False already leaves the row labels out of
# the file, so the frames are written as they are.
df_train.to_csv("train.csv", index=False)  # train
df_val.to_csv("val.csv", index=False)  # valid

In [6]:
# Train and validation pairs stacked into a single CSV (train rows first).
full_pairs = [df_train, df_val]
pd.concat(full_pairs, ignore_index=True).to_csv("train_full.csv", index=False)

In [8]:
# keep_default_na=False: both columns are free text, so cells such as "NA",
# "null" or an empty string must stay strings instead of being parsed as NaN.
df = pd.read_csv("train_full.csv", keep_default_na=False)
df.head()

Unnamed: 0,instruction,output
0,Can you write a short introduction about the r...,"""Monopsony"" refers to a market structure where..."
1,What can be done at a regulatory level to ensu...,Here are some potential regulatory options to ...
2,Can you explain contrastive learning in machin...,Sure! Let's say you want to build a model whic...
3,I didn't understand how pulling and pushing wo...,It has to do with a process called differentia...
4,I want to start doing astrophotography as a ho...,Getting started in astrophotography can seem d...


In [10]:
# Reload the prepared splits from disk. keep_default_na=False keeps free-text
# cells such as "NA", "null" or "" as strings instead of converting them to NaN.
train = pd.read_csv("train.csv", keep_default_na=False)
val = pd.read_csv("val.csv", keep_default_na=False)

In [11]:
def create_jsonl_from_dataframe(df, output_file):
    """Write one JSON object per DataFrame row to ``output_file`` (JSON Lines).

    Each line has the form
    ``{"prompt": "Human: <instruction> Assistant:", "chosen": <output>}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``instruction`` and ``output``.
    output_file : str or path-like
        Destination path; an existing file is overwritten.

    Raises
    ------
    KeyError
        If either required column is missing (raised before the file is opened).

    Notes
    -----
    Missing values (NaN / None / pd.NA) are written as empty strings. Left
    as-is, ``json.dumps`` would emit the bare token ``NaN``, which is not valid
    JSON, and the prompt would contain the literal text "nan".
    """
    # Select the columns up front so a missing column fails before any I/O.
    pairs = df[["instruction", "output"]]

    # Explicit encoding keeps the output independent of the platform default.
    with open(output_file, "w", encoding="utf-8") as jsonl_file:
        # Stream rows straight to disk; no intermediate list of dicts needed.
        for instruction, output in pairs.itertuples(index=False, name=None):
            if pd.isna(instruction):
                instruction = ""
            if pd.isna(output):
                output = ""
            record = {
                "prompt": f"Human: {instruction} Assistant:",
                "chosen": output,
            }
            jsonl_file.write(json.dumps(record) + "\n")

In [15]:
# Destination paths for the two JSONL exports
train_output_file, val_output_file = "train.jsonl", "val.jsonl"

# Convert each prepared split into its .jsonl file (train first, then val)
create_jsonl_from_dataframe(df=train, output_file=train_output_file)
create_jsonl_from_dataframe(df=val, output_file=val_output_file)

### Test

In [14]:
# Toy two-row frame for illustration only (swap in your actual DataFrame)
data = dict(
    human=['Who are you?', 'What is your purpose?'],
    assistant=["I'm Yi.", "I'm here to assist you."],
)

df = pd.DataFrame(data)

df.head()

Unnamed: 0,human,assistant
0,Who are you?,I'm Yi.
1,What is your purpose?,I'm here to assist you.
