In [1]:
from datasets import load_dataset
from pathlib import Path
from tqdm import tqdm
import tiktoken
import numpy as np
import os
import torch

In [2]:
# Root output directory; write_datafile saves each shard under path/<split>/.
path = Path('/home/ubuntu/data/')

In [3]:
# init the tokenizer
enc = tiktoken.get_encoding('cl100k_base')
# end of text token, read through the public `eot_token` property instead of
# reaching into the private `_special_tokens` mapping
eot = enc.eot_token

def tokenize(doc):
    """Tokenize one document into a 1-D numpy array of uint32 token ids.

    The module-level `eot` id is placed first so that it delimits documents
    once shards are concatenated; the text under `doc['text']` follows,
    encoded with `enc.encode_ordinary`.
    """
    # the special <|endoftext|> token delimits all documents
    ids = np.array([eot, *enc.encode_ordinary(doc['text'])])
    # every id must be representable as uint32 before the cast below
    fits_uint32 = (0 <= ids).all() and (ids < 2**32).all()
    assert fits_uint32, "token dictionary too large for uint32"
    return ids.astype(np.uint32)

In [4]:
def write_datafile(dataset, split, tokens_np, shard_index):
    """Save one shard of tokens to `path/<split>/<split>_<dataset>_<index>.npy`.

    Args:
        dataset: dataset name embedded in the file name.
        split: split name; used both as the sub-directory and file-name prefix.
        tokens_np: numpy array of tokens to persist.
        shard_index: shard number, zero-padded to six digits in the file name.
    """
    filename = path / f'{split}' / f'{split}_{dataset}_{shard_index:06d}'
    # np.save does not create missing directories, so make sure the split
    # directory exists before the first shard is written
    filename.parent.mkdir(parents=True, exist_ok=True)
    # np.save appends the '.npy' extension because the name has none
    np.save(filename, tokens_np)

In [5]:
def tokenize_dataset(data_loader, dataset_name, split, shard_size=int(1e6),max_shards=-1):
    """Tokenize every document in `data_loader` and write fixed-size token shards.

    Documents are tokenized with `tokenize` and packed back to back into a
    preallocated uint32 buffer of `shard_size` tokens. Whenever the buffer is
    full it is written with `write_datafile` and a new shard is started; a
    document that does not fit is split across as many shards as it needs.

    Args:
        data_loader: iterable of examples accepted by `tokenize`.
        dataset_name: dataset name used in file names and progress bar labels.
        split: split name used in file names and progress bar labels.
        shard_size: number of tokens per full shard; must be positive.
        max_shards: maximum number of shards to write. Values <= 0 mean no
            limit. When the limit is hit, processing stops and tokens that
            did not fit into the written shards are discarded.

    Raises:
        ValueError: if `shard_size` is not positive.
    """
    if shard_size <= 0:
        raise ValueError(f'shard_size must be positive, got {shard_size}')

    def open_progress_bar(index):
        # one bar per shard, labelled with the shard being filled
        return tqdm(total=shard_size, unit='tokens', desc=f'{dataset_name}_{split} | Shard {index}')

    shard_index = 0
    # preallocate buffer to hold current shard
    all_tokens_np = np.empty((shard_size,), dtype=np.uint32)
    token_count = 0
    progress_bar = None
    limit_reached = False

    for example in data_loader:
        tokens = tokenize(example)
        offset = 0  # how many tokens of this document are already stored

        # while the rest of the document fills the current shard, top the
        # shard up, write it, and continue with what is left of the document
        while len(tokens) - offset >= shard_size - token_count:
            if progress_bar is None:
                progress_bar = open_progress_bar(shard_index)
            remainder = shard_size - token_count
            all_tokens_np[token_count:shard_size] = tokens[offset:offset + remainder]
            offset += remainder
            progress_bar.update(remainder)
            progress_bar.close()
            progress_bar = None
            # write the file
            write_datafile(dataset_name, split, all_tokens_np, shard_index)
            shard_index += 1
            token_count = 0
            # stop once exactly max_shards shards exist on disk
            if max_shards > 0 and shard_index >= max_shards:
                limit_reached = True
                break
        if limit_reached:
            break

        # whatever is left of the document fits strictly inside the shard
        leftover = len(tokens) - offset
        if leftover > 0:
            if progress_bar is None:
                progress_bar = open_progress_bar(shard_index)
            all_tokens_np[token_count:token_count + leftover] = tokens[offset:]
            token_count += leftover
            progress_bar.update(leftover)

    if progress_bar is not None:
        progress_bar.close()

    # write any remaining tokens as the last (partial) shard; token_count is
    # zero here when the max_shards limit ended the loop
    if token_count != 0:
        write_datafile(dataset_name, split, all_tokens_np[0:token_count], shard_index)
    
    
        

## Fineweb

In [6]:
# Open the train split of the 'sample-10BT' configuration of fineweb-edu.
# streaming=True is passed so the examples can be iterated without first
# materializing the whole dataset (see the `datasets` streaming docs).
fw_tr = load_dataset('HuggingFaceFW/fineweb-edu', name='sample-10BT', split='train', streaming=True)

Resolving data files:   0%|          | 0/2410 [00:00<?, ?it/s]

In [7]:
# Tokenize the fineweb-edu stream into shards of the default shard_size
# (int(1e6) tokens), passing max_shards=10 to stop the run early.
tokenize_dataset(fw_tr, 'fineweb-edu', 'train', max_shards=10)

fineweb-edu_train | Shard 0: 100%|███████████████████████████| 1000000/1000000 [00:01<00:00, 809350.74tokens/s]
fineweb-edu_train | Shard 1: 100%|███████████████████████████| 1000000/1000000 [00:01<00:00, 851218.79tokens/s]
fineweb-edu_train | Shard 2: 100%|███████████████████████████| 1000000/1000000 [00:01<00:00, 816027.37tokens/s]
fineweb-edu_train | Shard 3: 100%|███████████████████████████| 1000000/1000000 [00:01<00:00, 862546.85tokens/s]
fineweb-edu_train | Shard 4: 100%|███████████████████████████| 1000000/1000000 [00:01<00:00, 835805.93tokens/s]
fineweb-edu_train | Shard 5: 100%|███████████████████████████| 1000000/1000000 [00:01<00:00, 815815.96tokens/s]
fineweb-edu_train | Shard 6: 100%|███████████████████████████| 1000000/1000000 [00:01<00:00, 839137.04tokens/s]
fineweb-edu_train | Shard 7: 100%|███████████████████████████| 1000000/1000000 [00:01<00:00, 741277.02tokens/s]
fineweb-edu_train | Shard 8: 100%|███████████████████████████| 1000000/1000000 [00:01<00:00, 944660.51to

## Wikitext

In [8]:
# Stream the wikitext-103 train split and tokenize it with the same settings
# as the fineweb-edu cell (default shard_size, max_shards=10).
wt_tr = load_dataset('iohadrubin/wikitext-103-raw-v1', split='train', streaming=True)
tokenize_dataset(wt_tr, 'wikitext-103', 'train', max_shards=10)

wikitext-103_train | Shard 0: 100%|█████████████████████████| 1000000/1000000 [00:00<00:00, 1586942.16tokens/s]
wikitext-103_train | Shard 1: 100%|█████████████████████████| 1000000/1000000 [00:00<00:00, 1647187.36tokens/s]
wikitext-103_train | Shard 2: 100%|█████████████████████████| 1000000/1000000 [00:00<00:00, 1674751.93tokens/s]
wikitext-103_train | Shard 3: 100%|█████████████████████████| 1000000/1000000 [00:00<00:00, 1596866.50tokens/s]
wikitext-103_train | Shard 4: 100%|█████████████████████████| 1000000/1000000 [00:00<00:00, 1621049.08tokens/s]
wikitext-103_train | Shard 5: 100%|█████████████████████████| 1000000/1000000 [00:00<00:00, 1626793.21tokens/s]
wikitext-103_train | Shard 6: 100%|█████████████████████████| 1000000/1000000 [00:00<00:00, 1589975.92tokens/s]
wikitext-103_train | Shard 7: 100%|█████████████████████████| 1000000/1000000 [00:00<00:00, 1658283.86tokens/s]
wikitext-103_train | Shard 8: 100%|█████████████████████████| 1000000/1000000 [00:00<00:00, 1637016.85to

In [9]:
# Stream the wikitext-103 test split. max_shards is left at its default (-1),
# which disables the shard limit, so the whole split is tokenized.
wt_ts = load_dataset('iohadrubin/wikitext-103-raw-v1', split='test', streaming=True)
tokenize_dataset(wt_ts, 'wikitext-103', 'test')

wikitext-103_test | Shard 0:  28%|███████▍                   | 276095/1000000 [00:00<00:00, 1634496.93tokens/s]


In [None]:
import random
# directory that write_datafile fills when split == 'train'
train_data = Path('/home/ubuntu/data/train')
# sorted() consumes the iterdir() generator directly and returns a list
shards = sorted(train_data.iterdir())
shards