In [1]:
import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_from_disk # huggingface datasets
# number of worker processes handed to dataset.map() when tokenizing
num_proc = 8
# GPT-2 BPE tokenizer from tiktoken; used by process() to encode each example
enc = tiktoken.get_encoding("gpt2")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory of the on-disk dataset. The original absolute path stays the
# default; set the WIKITEXT_DIR environment variable to point somewhere else
# without editing the notebook.
dataset_dir = os.environ.get(
    "WIKITEXT_DIR", "/data2/mengfanxu/CLOVer/output/wikitext-2-raw-v1"
)
dataset = load_from_disk(dataset_dir)

In [3]:
def process(example):
    """Tokenize one example and terminate it with the end-of-text token.

    Args:
        example: mapping whose 'text' entry is the raw string to encode.

    Returns:
        dict with two entries:
          'ids': the token ids of the text followed by enc.eot_token
          'len': number of entries in 'ids' (the trailing eot included)
    """
    # encode_ordinary does not treat special tokens in the text specially
    text_tokens = enc.encode_ordinary(example['text'])
    # the end-of-text marker (e.g. 50256 for gpt2 bpe) goes after the text
    token_ids = [*text_tokens, enc.eot_token]
    return {'ids': token_ids, 'len': len(token_ids)}
# Rename the 'validation' split to 'val'. The guard keeps this cell
# re-runnable: once the split has been renamed, an unconditional pop()
# would raise KeyError on the second execution.
if 'validation' in dataset:
    dataset['val'] = dataset.pop('validation')

# tokenize every split with process(); the raw 'text' column is dropped
tokenized = dataset.map(
    process,
    remove_columns=['text'],
    desc="tokenizing the splits",
    num_proc=num_proc,
)

In [4]:
# Concatenate all token ids of each split into one flat binary file,
# dataset/<split>.bin, that can later be read back with np.memmap.
out_dir = 'dataset'
# np.memmap(mode='w+') creates the file but not a missing parent directory
os.makedirs(out_dir, exist_ok=True)

for split, dset in tokenized.items():
    # total number of tokens in this split (sum of the per-example 'len' column)
    arr_len = int(np.sum(dset['len'], dtype=np.uint64))
    filename = os.path.join(out_dir, f'{split}.bin')
    dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
    # Never request more shards than there are rows: an empty shard would make
    # np.concatenate raise "need at least one array to concatenate".
    total_batches = min(1024, len(dset))

    idx = 0
    for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
        # Batch together samples for faster write
        batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
        arr_batch = np.concatenate(batch['ids'])
        # Write into mmap
        arr[idx : idx + len(arr_batch)] = arr_batch
        idx += len(arr_batch)
    arr.flush()
    # every expected token must have been written exactly once
    assert idx == arr_len, f'{filename}: wrote {idx} tokens, expected {arr_len}'

writing dataset/test.bin: 100%|██████████| 1024/1024 [00:01<00:00, 749.07it/s]
writing dataset/train.bin: 100%|██████████| 1024/1024 [00:01<00:00, 689.57it/s]
writing dataset/val.bin: 100%|██████████| 1024/1024 [00:01<00:00, 753.31it/s]
