In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/521.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/521.2 kB[0m [31m1.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m512.0/521.2 kB[0m [31m6.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (1

In [None]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
from datasets import load_dataset
import json

In [None]:
# Load the OpenAssistant oasst1 dataset and turn its "train" and
# "validation" splits into pandas DataFrames.
ds = load_dataset("OpenAssistant/oasst1")

train = pd.DataFrame(ds['train'])
val = pd.DataFrame(ds['validation'])

def prep_data(df):
    """Build English (instruction, output) pairs from a message table.

    Selects the assistant messages whose ``rank`` equals 0.0 and pairs each
    one with the text of the prompter message its ``parent_id`` points to.
    Only pairs whose assistant message has ``lang == "en"`` are returned.

    Args:
        df: DataFrame with the columns ``message_id``, ``parent_id``,
            ``role``, ``rank``, ``lang`` and ``text``. It is not modified.
            ``message_id`` is assumed to be unique among prompter rows.

    Returns:
        A new DataFrame with the columns ``instruction`` (text of the parent
        prompter message) and ``output`` (text of the assistant message).
        Rows keep the index labels they had in ``df``.

    Raises:
        KeyError: if a rank-0 assistant message (in any language) has a
            ``parent_id`` that is not the ``message_id`` of a prompter row.
    """
    # Lookup table: prompter message_id -> prompter text.
    prompt_text = df.loc[df["role"] == "prompter"].set_index("message_id")["text"]

    # ``rank`` is bracket-indexed because ``df.rank`` is a DataFrame method.
    top_answers = df.loc[(df["role"] == "assistant") & (df["rank"] == 0.0)]

    # Fail loudly on a dangling parent instead of silently producing NaN.
    orphaned = ~top_answers["parent_id"].isin(prompt_text.index)
    if orphaned.any():
        raise KeyError(top_answers.loc[orphaned, "parent_id"].iloc[0])

    english = top_answers.loc[top_answers["lang"] == "en"]

    # ``assign`` returns a fresh frame, so nothing is written into a filtered
    # slice of ``df`` and no SettingWithCopyWarning can be triggered.
    pairs = english.assign(
        instruction=english["parent_id"].map(prompt_text),
        output=english["text"],
    )
    return pairs[["instruction", "output"]]

# Convert each raw split into instruction/output pairs (train first, then validation).
df_train, df_val = prep_data(train), prep_data(val)


In [None]:
# Save the prepared training and validation pairs as CSV files.
# index=False leaves the row index out of the files, so the index does not
# need to be reset before writing.
df_train.to_csv("train.csv", index=False)
df_val.to_csv("val.csv", index=False)

In [None]:
# Training and validation pairs stacked into one renumbered frame and saved
# as a single CSV.
pd.concat([df_train, df_val], ignore_index=True).to_csv("train_full.csv", index=False)

In [None]:
# Read the combined file back using the same relative path it was written
# to, so the cell works from any working directory, then preview the first rows.
df = pd.read_csv("train_full.csv")
df.head()

Unnamed: 0,instruction,output
0,Can you write a short introduction about the r...,"""Monopsony"" refers to a market structure where..."
1,What can be done at a regulatory level to ensu...,Here are some potential regulatory options to ...
2,Can you explain contrastive learning in machin...,Sure! Let's say you want to build a model whic...
3,I didn't understand how pulling and pushing wo...,It has to do with a process called differentia...
4,I want to start doing astrophotography as a ho...,Getting started in astrophotography can seem d...


In [None]:
# Reload the prepared splits using the same relative paths they were written
# to, rather than an absolute directory that exists only on one machine.
# Note: from here on `train` and `val` hold the instruction/output pairs.
train = pd.read_csv("train.csv")
val = pd.read_csv("val.csv")

In [None]:
def create_jsonl_from_dataframe(df, output_file):
    """Serialize instruction/output rows into a JSON Lines file.

    Every row of ``df`` becomes one JSON object with two keys: ``"prompt"``,
    the row's ``instruction`` value wrapped as ``"Human: ... Assistant:"``,
    and ``"chosen"``, the row's ``output`` value.

    Args:
        df: DataFrame providing the ``instruction`` and ``output`` columns.
        output_file: Path of the file to write. It is opened in write mode,
            so an existing file is overwritten.
    """
    # Build every record before the output file is opened.
    records = [
        {
            "prompt": f"Human: {entry['instruction']} Assistant:",
            "chosen": entry['output'],
        }
        for _, entry in df.iterrows()
    ]

    # One JSON document per line.
    with open(output_file, 'w') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in records)

In [None]:
# Destination of the training split in JSON Lines form
output_file = "train.jsonl"

# Write one prompt/chosen record per row of the training frame
create_jsonl_from_dataframe(df=train, output_file=output_file)

In [None]:
# Destination of the validation split in JSON Lines form
output_file = "val.jsonl"

# Write one prompt/chosen record per row of the validation frame
create_jsonl_from_dataframe(df=val, output_file=output_file)

### Test

In [None]:
# Toy two-row conversation table (swap in a real DataFrame as needed)
data = dict(
    human=['Who are you?', 'What is your purpose?'],
    assistant=["I'm Yi.", "I'm here to assist you."],
)

df = pd.DataFrame(data)

df.head()

Unnamed: 0,human,assistant
0,Who are you?,I'm Yi.
1,What is your purpose?,I'm here to assist you.
