In [1]:
from datasets import load_dataset

# Source dataset on the Hugging Face Hub, opened in streaming mode.
dataset_name = "monology/pile-uncopyrighted"
# Number of records to pull from the front of the stream.
num_samples_to_take = 1_000_000
ds = load_dataset(
    dataset_name,
    split="train",
    streaming=True,
)

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

In [2]:
import tqdm
from itertools import islice

# Materialise the first `num_samples_to_take` records of the streamed dataset.
# islice stops after exactly that many items, so no extra record is pulled from
# the stream just to reach a `break`.
raw_data = []
# The context manager closes the bar when the loop ends (or raises), so the
# final count is flushed instead of being left at a stale intermediate value.
with tqdm.tqdm(total=num_samples_to_take) as progress_bar:
    for sample in islice(ds, num_samples_to_take):
        raw_data.append(sample)
        progress_bar.update(1)

100%|█████████▉| 997054/1000000 [02:27<00:00, 6071.72it/s] 

In [3]:
# Select 16_000 long Pile-CC documents at random (seeded for reproducibility)
# and load the tokenizer used to cut each one down to a fixed-length snippet.
from transformers import AutoTokenizer
import random
random.seed(42)

# Skip samples whose text is 2000 characters or shorter, and anything that
# does not come from the Pile-CC subset.
data = [sample for sample in raw_data if len(sample["text"]) > 2000 and sample["meta"]["pile_set_name"] == "Pile-CC"]
# random.sample would fail with an opaque "Sample larger than population"
# error; report how many documents actually passed the filter instead.
if len(data) < 16_000:
    raise ValueError(
        f"only {len(data)} samples passed the filter, need 16000; "
        "increase num_samples_to_take or relax the filter"
    )
data = random.sample(data, 16_000)

# NOTE: despite the variable name, this loads the Llama-3.1-8B tokenizer.
pythia_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
# Randomly select a contiguous window of `num_tokens` tokens from each sample.
def tokenize_function(examples, num_tokens=96, tokenizer=None):
    """Return the decoded text of a random token window taken from one sample.

    Args:
        examples: Mapping with a "text" entry holding the document string.
        num_tokens: Length of the window, in tokens. Defaults to 96.
        tokenizer: Object exposing `encode(text, add_special_tokens=...)` and
            `decode(ids)`. Defaults to the module-level `pythia_tokenizer`.

    Returns:
        The decoded string for the selected window. If the text encodes to
        fewer than `num_tokens` tokens, the whole text is used.
    """
    if tokenizer is None:
        tokenizer = pythia_tokenizer
    # Encode without special tokens: otherwise a tokenizer that prepends a
    # begin-of-sequence marker would leak that marker into the decoded text
    # whenever the window starts at position 0.
    tokens = tokenizer.encode(examples["text"], add_special_tokens=False)
    n = len(tokens)
    # Clamp the upper bound so short inputs do not hand randint an empty range
    # (which raises ValueError).
    cut_point = random.randint(0, max(0, n - num_tokens))
    window = tokens[cut_point:cut_point + num_tokens]
    return tokenizer.decode(window)

# Cut every selected document down to its random snippet; each entry of
# truncated_data is the decoded text returned by tokenize_function.
truncated_data = [tokenize_function(document) for document in tqdm.tqdm(data)]

100%|██████████| 16000/16000 [01:14<00:00, 214.38it/s]it/s]


In [6]:

# Write the snippets out as JSON Lines: one {"text", "meta"} object per line.
import json

with open("pile-16k-random.jsonl", "w") as out_file:
    out_file.writelines(
        json.dumps({"text": snippet, "meta": {"pile_set_name": "Pile-CC"}}) + "\n"
        for snippet in truncated_data
    )