In [None]:
!pip install datasets

In [None]:
import datasets
import pandas as pd
from tqdm import tqdm
from huggingface_hub import notebook_login

In [None]:
# Load the OpenAssistant/oasst1 dataset from the Hugging Face Hub.
oasst1_splits = datasets.load_dataset("OpenAssistant/oasst1")

# Merge the train and validation splits into one dataset (train rows first).
open_assistant = datasets.concatenate_datasets(
    [oasst1_splits[split_name] for split_name in ('train', 'validation')]
)

In [None]:
# Flatten the Hugging Face dataset into a pandas DataFrame (one row per record).
df = open_assistant.to_pandas()

# Distinct ids that appear as some message's parent, and distinct message ids.
all_parents = df['parent_id'].unique()
all_children = df['message_id'].unique()

# A leaf is a message that no other message names as its parent.
# Membership is tested against a set: `x in <numpy array>` rescans the whole
# array for every id, which makes the comprehension quadratic in the number of
# messages. `all_parents` itself is left as an array for later cells.
parent_id_set = set(all_parents)
leaf_ids = [message_id for message_id in tqdm(all_children) if message_id not in parent_id_set]

In [None]:
# Report how many leaf messages were found.
leaf_count = len(leaf_ids)
print(leaf_count)

In [None]:
# Re-collect the leaves, this time keeping the metadata that travels with each
# conversation: (message_id, user_id, created_date, lang).
# `isin` performs the "is this id anybody's parent?" test in a single vectorized
# pass, instead of scanning `all_parents` once per row inside a Python-level
# `iterrows` loop.
is_leaf = ~df['message_id'].isin(all_parents)
leaf_columns = ['message_id', 'user_id', 'created_date', 'lang']

# `name=None` makes itertuples yield plain tuples, in DataFrame row order.
leaf_ids = list(df.loc[is_leaf, leaf_columns].itertuples(index=False, name=None))

print("Length:", len(leaf_ids), " and Content:", leaf_ids[1])

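Check that the dataset has a tree structure: counting how many children each `parent_id` has shows that a single message (for example a root prompt) can be the parent of several replies, and therefore of several conversations.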

In [None]:
# First count how many children each parent has, then count how often each of
# those child-counts occurs.
children_per_parent = df['parent_id'].value_counts()
children_per_parent.value_counts()

In [None]:
# Preview the first 30 rows.
df.head(30)

In [None]:
# Ids that were looked up (directly or as somebody's parent) but have no row in `df`.
not_found = []

# Cache for the message_id -> (parent_id, text, role) lookup. It remembers which
# DataFrame object it was built from, so it is rebuilt whenever `df` is rebound.
_message_lookup = {'frame': None, 'rows': {}}


def _rows_by_message_id():
    """Return a dict mapping each message_id in `df` to its (parent_id, text, role)."""
    if _message_lookup['frame'] is not df:
        rows = {}
        for mid, pid, text, role in zip(df['message_id'], df['parent_id'], df['text'], df['role']):
            # Skip null ids (a lookup by a null id must report "not found") and
            # keep the first row when an id is duplicated.
            if not pd.isna(mid):
                rows.setdefault(mid, (pid, text, role))
        _message_lookup['frame'] = df
        _message_lookup['rows'] = rows
    return _message_lookup['rows']


def find_parents(message_id):
    """Rebuild the conversation that ends at `message_id`.

    Walks from `message_id` up through each `parent_id` until a message without a
    parent is reached, reading the rows of the module-level DataFrame `df`.

    Args:
        message_id: id of the last message of the conversation.

    Returns:
        A list of ``{'content': <text>, 'role': <role>}`` dicts ordered from the
        first message of the conversation to `message_id`. The role 'prompter' is
        reported as 'user'; any other role is passed through unchanged. If an id
        on the way up has no row in `df`, it is appended to `not_found` and the
        chain starts at the last message that could be found (an empty list when
        `message_id` itself is unknown).

    Raises:
        ValueError: if the parent links form a cycle.
    """
    rows = _rows_by_message_id()  # one dict lookup per step instead of a full DataFrame scan
    chain = []  # collected leaf-first, reversed before returning
    visited = set()
    current_id = message_id

    while True:
        row = rows.get(current_id)
        if row is None:
            not_found.append(current_id)
            break
        if current_id in visited:
            raise ValueError(f"cycle detected in parent chain at message {current_id!r}")
        visited.add(current_id)

        parent_id, text, role = row
        chain.append({'content': text, 'role': ('user' if role == 'prompter' else role)})

        # A missing parent (None or NaN, depending on how pandas stores it) marks the root.
        if pd.isna(parent_id):
            break
        current_id = parent_id

    chain.reverse()
    return chain
    
# Rebuild one conversation per leaf. The rows are gathered in a plain list and the
# DataFrame is created once at the end: growing a DataFrame with
# `dataset.loc[len(dataset)] = ...` enlarges it on every single insertion.
conversation_rows = [
    (leaf_id, user_id, created_date, find_parents(leaf_id), lang)
    for leaf_id, user_id, created_date, lang in tqdm(leaf_ids)
]

dataset = pd.DataFrame(
    conversation_rows,
    columns=['conversation_id', 'user_id', 'created_date', 'messages', 'lang'],
    # Use materialized int64 labels instead of a RangeIndex: with its default
    # settings `datasets.Dataset.from_pandas` only writes a non-range index out as
    # the `__index_level_0__` column, and callers may expect that column to exist.
    index=pd.Index(list(range(len(conversation_rows))), dtype='int64'),
)

In [None]:
# Convert the conversations to a Hugging Face dataset. `preserve_index=False`
# keeps the pandas index out of the result, so no `__index_level_0__` column has
# to be removed afterwards (removing it raises when the column does not exist,
# e.g. when the frame carries a plain RangeIndex).
ds = datasets.Dataset.from_pandas(dataset, preserve_index=False)

In [None]:
# Authenticate against the Hugging Face Hub, then upload the conversations.
# NOTE(review): notebook_login() is interactive — confirm the login has completed
# before the upload below runs.
hub_repo_id = "A-Roucher/Open_Assistant_Conversation_Chains"
notebook_login()
ds.push_to_hub(hub_repo_id)