In [4]:
import pandas as pd
import os
import glob

def convert_parquet_to_txt(dataset_name, text_key="text"):
    """Concatenate the parquet shards of one wikitext-103 split into a text file.

    Every file matching
    ``wikitext/wikitext-103-raw-v1/{dataset_name}-*-of-*.parquet`` is read in
    sorted filename order, and the values of column ``text_key`` are written
    back-to-back (no separator is inserted) to
    ``wikitext/wikitext-103-raw-v1-txt/{dataset_name}.txt`` as UTF-8.

    Args:
        dataset_name: Split prefix used in the shard filenames, e.g. "train".
        text_key: Name of the parquet column that holds the text.

    Raises:
        FileNotFoundError: If no shard matches the pattern. Without this check
            a wrong path or split name would silently produce an empty file.
    """
    pattern = f"wikitext/wikitext-103-raw-v1/{dataset_name}-*-of-*.parquet"
    # glob.glob returns matches in arbitrary (filesystem) order; sorting makes
    # the concatenation order deterministic across machines and runs.
    shard_paths = sorted(glob.glob(pattern))
    if not shard_paths:
        raise FileNotFoundError(f"No parquet shards match {pattern!r}")

    output_dir = "wikitext/wikitext-103-raw-v1-txt"
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/{dataset_name}.txt"

    with open(output_path, "w", encoding="utf-8") as f:
        for shard_path in shard_paths:
            # Load only the text column and write it straight out, so at most
            # one shard is held in memory instead of the whole split.
            shard_df = pd.read_parquet(shard_path, columns=[text_key])
            f.writelines(shard_df[text_key])

# Export each wikitext-103 split to its own .txt file.
for split_name in ("train", "validation", "test"):
    convert_parquet_to_txt(split_name)

In [None]:
from datasets import load_dataset, Dataset, DatasetDict

def create_text(entry):
    """Build the training text for one s1K record.

    The text is the question, a newline, then the Gemini thinking trajectory
    immediately followed by the Gemini attempt (nothing is inserted between
    those last two fields).

    Args:
        entry: Mapping with string fields "question",
            "gemini_thinking_trajectory" and "gemini_attempt".

    Returns:
        A dict with the single key "text" holding the concatenated string.
    """
    pieces = (
        entry["question"],
        "\n",
        entry["gemini_thinking_trajectory"],
        entry["gemini_attempt"],
    )
    return {"text": "".join(pieces)}

def split_and_save_ds(ds: "Dataset", ds_name, text_key="text", holdout_size=0.2, seed=42):
    """Split a dataset into train/validation/test and write each split as text.

    ``holdout_size`` is passed as ``test_size`` to the first
    ``train_test_split``; the held-out part is then split again with
    ``test_size=0.5`` into validation and test. With the defaults this is
    intended to give roughly an 80/10/10 split. Both splits use ``seed``.

    For every split, the values of column ``text_key`` are written
    back-to-back (no separator is inserted) to ``{ds_name}/train.txt``,
    ``{ds_name}/validation.txt`` and ``{ds_name}/test.txt`` as UTF-8.

    Args:
        ds: Dataset to split; must provide ``train_test_split`` and column
            access via ``ds[text_key]``.
        ds_name: Output directory; created if it does not exist.
        text_key: Name of the column that holds the text.
        holdout_size: ``test_size`` for the first split (validation + test).
        seed: Seed passed to both ``train_test_split`` calls.
    """
    # First carve off the hold-out, then halve it into validation and test.
    train_holdout = ds.train_test_split(test_size=holdout_size, seed=seed)
    valid_test = train_holdout["test"].train_test_split(test_size=0.5, seed=seed)

    splits = {
        "train": train_holdout["train"],
        "validation": valid_test["train"],
        "test": valid_test["test"],
    }

    os.makedirs(ds_name, exist_ok=True)

    for split_name, split_ds in splits.items():
        with open(f"{ds_name}/{split_name}.txt", "w", encoding="utf-8") as f:
            # writelines emits the examples one after another without first
            # building a second, full-size joined copy of the split in memory.
            f.writelines(split_ds[text_key])

# s1K-1.1: add a "text" column to the train split via create_text, then write
# the train/validation/test text files into the s1/ directory.
ds = load_dataset("simplescaling/s1K-1.1")["train"].map(create_text)
split_and_save_ds(ds, "s1")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 1000/1000 [00:00<00:00, 7599.71 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 3456.22 examples/s]


In [None]:
def create_text(entry):
    """Build the training text for one conversation-style record.

    Concatenates the "value" of every message in ``entry["conversations"]``,
    in order, with nothing inserted between messages.

    Note: this redefines ``create_text`` and shadows the earlier s1 version
    defined in a previous cell of this notebook.

    Args:
        entry: Mapping whose "conversations" field is a sequence of mappings,
            each with a string "value".

    Returns:
        A dict with the single key "text" holding the concatenated string.
    """
    return {"text": "".join(turn["value"] for turn in entry["conversations"])}

# OpenThoughts3-1.2M: add a "text" column to the train split via create_text,
# then write the train/validation/test text files into open_thoughts/.
ds = load_dataset("open-thoughts/OpenThoughts3-1.2M")["train"].map(create_text)
split_and_save_ds(ds, "open_thoughts")

In [None]:
# NOTE(review): this cell repeats the final call of the previous cell with the
# same arguments, so it rewrites the same open_thoughts/*.txt files. It also
# relies on `ds` still being bound in the kernel by that previous cell.
split_and_save_ds(ds, "open_thoughts")