### Download, tokenize, and split datasets


1. Wikitext - `load_dataset("iohadrubin/wikitext-103-raw-v1")`
2. Fineweb - `load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT", split="train", streaming=True)`
3. SlimPajama - `load_dataset("cerebras/SlimPajama-627B")` (switched to the sampled 6B subset, `DKYoon/SlimPajama-6B`)

In [1]:
from datasets import load_dataset
from pathlib import Path
from tqdm import tqdm
import tiktoken
import numpy as np
import os
import torch

In [2]:
# scratch check: build the tokenized-data directory relative to the user's home
# directory and display the expanded result
test = Path("~").expanduser() / 'data/gpt_train_data/tokenized'
test

PosixPath('/Users/djemec/data/gpt_train_data/tokenized')

In [2]:
# root directory for all tokenized shards; resolved from the current user's
# home directory so the notebook is not tied to one machine's absolute path
tok_path = Path("~").expanduser() / 'data/gpt_train_data/tokenized'

### Setup tokenizer

In [3]:
# load the cl100k_base encoding from tiktoken
enc = tiktoken.get_encoding('cl100k_base')

In [6]:
# largest token id in this encoding; the value shown below is above 65535,
# so token ids do not fit in uint16
enc.max_token_value

100276

In [7]:
# inspect the special-token name -> id mapping (private attribute of the encoding)
enc._special_tokens

{'<|endoftext|>': 100257,
 '<|fim_prefix|>': 100258,
 '<|fim_middle|>': 100259,
 '<|fim_suffix|>': 100260,
 '<|endofprompt|>': 100276}

In [3]:
# init the tokenizer
enc = tiktoken.get_encoding('cl100k_base')
# id of the <|endoftext|> token, read through the public accessor instead of
# the private _special_tokens dict
eot = enc.eot_token

def tokenize(doc):
    """Tokenize a single document into a numpy array of uint32 token ids.

    The array starts with the <|endoftext|> id (`eot`), followed by the ids
    that `enc.encode_ordinary` produces for `doc['text']`.

    Args:
        doc: mapping with a 'text' entry holding the document string.

    Returns:
        1-D numpy array of dtype uint32.

    Raises:
        ValueError: if any token id falls outside the uint32 range.
    """
    tokens = [eot]  # the special <|endoftext|> token delimits all documents
    tokens.extend(enc.encode_ordinary(doc['text']))
    tokens_np = np.array(tokens)
    # Validate against the dtype actually used for storage. The cl100k_base ids
    # (including <|endoftext|> itself) exceed 2**16, so a uint16 bound would
    # reject every document.
    if not ((0 <= tokens_np).all() and (tokens_np < 2**32).all()):
        raise ValueError("token dictionary too large for uint32")
    return tokens_np.astype(np.uint32)

In [4]:
def write_datafile(dataset, split, tokens_np, path, shard_index):
    """Save one shard of tokens with np.save.

    The file is placed in `path` and named `<dataset>_<split>_<shard_index>`,
    with the shard index zero-padded to six digits.
    """
    shard_name = f'{dataset}_{split}_{shard_index:06d}'
    target = path / shard_name
    np.save(target, tokens_np)

In [5]:
def tokenize_dataset(data_loader, dataset_name, split, shard_size=int(1e8)):
    """Tokenize every document from data_loader and write fixed-size token shards.

    Shards are written with write_datafile into `tok_path / dataset_name`.
    Every shard holds exactly `shard_size` tokens except possibly the last one,
    which holds whatever is left over. Documents are packed back to back, so a
    document may straddle a shard boundary, or span several shards when it is
    longer than one shard.

    Args:
        data_loader: iterable of examples accepted by tokenize().
        dataset_name: used for the output directory and the shard file names.
        split: split tag used in the shard file names and progress-bar label.
        shard_size: number of tokens per shard.
    """
    path = tok_path / f'{dataset_name}'
    path.mkdir(parents=True, exist_ok=True)

    def _new_bar(index):
        # one progress bar per shard, counting tokens written into it
        return tqdm(total=shard_size, unit='tokens', desc=f'{dataset_name}_{split} | Shard {index}')

    shard_index = 0
    # Preallocate the buffer holding the current shard. uint32 because the
    # cl100k_base token ids go past 65535 and would silently wrap around when
    # assigned into a uint16 buffer.
    all_tokens_np = np.empty((shard_size,), dtype=np.uint32)
    token_count = 0
    progress_bar = None

    for example in data_loader:
        tokens = tokenize(example)
        # create the bar up front so it exists even when the very first
        # document already fills a shard
        if progress_bar is None:
            progress_bar = _new_bar(shard_index)

        # While the pending tokens reach the end of the current shard: fill it,
        # write it out, and carry the rest of the document over. Looping (rather
        # than a single split) handles documents longer than a whole shard.
        while token_count + len(tokens) >= shard_size:
            remainder = shard_size - token_count
            progress_bar.update(remainder)
            all_tokens_np[token_count:token_count + remainder] = tokens[:remainder]
            write_datafile(dataset_name, split, all_tokens_np, path, shard_index)
            # start the next shard with the leftovers of the current doc
            shard_index += 1
            tokens = tokens[remainder:]
            token_count = 0
            progress_bar.close()
            progress_bar = _new_bar(shard_index)

        # whatever is left fits strictly inside the current shard
        all_tokens_np[token_count:token_count + len(tokens)] = tokens
        token_count += len(tokens)
        progress_bar.update(len(tokens))

    # write any remaining tokens as the last (possibly short) shard
    if token_count != 0:
        write_datafile(dataset_name, split, all_tokens_np[0:token_count], path, shard_index)

    if progress_bar is not None:
        progress_bar.close()
    
    
        

In [6]:
def load_tokens(filename):
    """Load a saved token shard and return it as a torch.long tensor.

    The array read by np.load is first converted to int32, then copied into a
    tensor of dtype torch.long.
    """
    shard = np.load(filename).astype(np.int32)
    return torch.tensor(shard, dtype=torch.long)

## Wikitext

In [None]:
# train split of wikitext-103 (loaded with streaming=True), tokenized into
# shards under tok_path / 'wikitext-103'
wt_tr = load_dataset('iohadrubin/wikitext-103-raw-v1', split='train', streaming=True)
tokenize_dataset(wt_tr, 'wikitext-103', 'train')

In [None]:
# test split of wikitext-103, written to the same directory with the 'test' tag
wt_ts = load_dataset('iohadrubin/wikitext-103-raw-v1', split='test', streaming=True)
tokenize_dataset(wt_ts, 'wikitext-103', 'test')

In [None]:
# validation split of wikitext-103; note the shard files use the shorter 'val' tag
wt_val = load_dataset('iohadrubin/wikitext-103-raw-v1', split='validation', streaming=True)
tokenize_dataset(wt_val, 'wikitext-103', 'val')

## Fineweb

In [None]:
# train split of the 'sample-10BT' configuration of fineweb-edu (loaded with streaming=True)
fw_tr = load_dataset('HuggingFaceFW/fineweb-edu', name='sample-10BT', split='train', streaming=True)

In [None]:
# tokenize fineweb-edu into shards under tok_path / 'fineweb-edu'
tokenize_dataset(fw_tr, 'fineweb-edu', 'train')

## SlimPajama

In [None]:
# switched from cerebras/SlimPajama-627B to the sampled 6B subset
sp_tr = load_dataset('DKYoon/SlimPajama-6B', split='train', streaming=True)

In [None]:
# tokenize the SlimPajama-6B train split into shards under tok_path / 'slimpajama-6B'
tokenize_dataset(sp_tr, 'slimpajama-6B', 'train')

In [None]:
# test split of SlimPajama-6B, written to the same directory with the 'test' tag
sp_ts = load_dataset('DKYoon/SlimPajama-6B', split='test', streaming=True)
tokenize_dataset(sp_ts, 'slimpajama-6B', 'test')

In [None]:
# validation split of SlimPajama-6B; shard files use the shorter 'val' tag
sp_val = load_dataset('DKYoon/SlimPajama-6B', split='validation', streaming=True)
tokenize_dataset(sp_val, 'slimpajama-6B', 'val')

## Decoding

In [10]:
# train shard with index 5 of the tokenized SlimPajama-6B set, used as a decoding spot check
path = tok_path / 'slimpajama-6B' / 'slimpajama-6B_train_000005.npy'

In [11]:
# read the shard back as a torch.long tensor
decoding_toks = load_tokens(path)


In [20]:
# the output below matches the default shard_size (1e8 tokens), i.e. a full shard
decoding_toks.shape

torch.Size([100000000])

In [27]:
# decode the first 10,000 token ids back to text to eyeball the stored data
enc.decode(decoding_toks[:10000].tolist())

