In [None]:
from datasets import load_dataset
from pathlib import Path
from tqdm import tqdm
import tiktoken
import numpy as np
import os
import torch

In [None]:
# Root directory under which the tokenized shards are stored.
# Defaults to the original location; set the TOKENIZED_DATA_DIR environment
# variable to point somewhere else without editing the notebook.
path = Path(os.environ.get('TOKENIZED_DATA_DIR', '/home/ubuntu/data/'))

In [None]:
# init the tokenizer
enc = tiktoken.get_encoding('cl100k_base')
# id of the special <|endoftext|> token; read through the public `eot_token`
# property instead of reaching into the private `_special_tokens` mapping
eot = enc.eot_token

def tokenize(doc):
    """Encode ``doc['text']`` and return the token ids as a uint32 numpy array.

    The ids are preceded by the ``eot`` token so that consecutive documents
    stay delimited once they are packed back to back.
    """
    ids = np.array([eot, *enc.encode_ordinary(doc['text'])])
    # every id has to be representable as an unsigned 32-bit integer
    fits_uint32 = (0 <= ids).all() and (ids < 2**32).all()
    assert fits_uint32, "token dictionary too large for uint32"
    return ids.astype(np.uint32)

In [None]:
def write_datafile(dataset, split, tokens_np, shard_index):
    """Save one shard of tokens under ``path / split``.

    The file is named ``{split}_{dataset}_{shard_index:06d}``; ``np.save``
    appends the ``.npy`` extension because the name does not carry one.

    Args:
        dataset: dataset label embedded in the file name.
        split: split label; used both as sub-directory and file-name prefix.
        tokens_np: array of tokens to store.
        shard_index: integer shard number, zero-padded to six digits.
    """
    split_dir = path / f'{split}'
    # np.save does not create missing directories, so make sure the target exists
    split_dir.mkdir(parents=True, exist_ok=True)
    filename = split_dir / f'{split}_{dataset}_{shard_index:06d}'
    np.save(filename, tokens_np)

In [None]:
def tokenize_dataset(data_loader, dataset_name, split, shard_size=int(1e6),max_shards=-1):
    """Tokenize every example from ``data_loader`` and write the tokens in fixed-size shards.

    Each example is passed to ``tokenize`` and the resulting tokens are packed
    back to back into a ``uint32`` buffer of ``shard_size`` entries. Whenever the
    buffer fills up it is handed to ``write_datafile`` and a new shard is started;
    a document that does not fit is split across as many shards as it needs.

    Args:
        data_loader: iterable of examples accepted by ``tokenize``.
        dataset_name: dataset label used in shard file names and progress bars.
        split: split label used in shard file names and progress bars.
        shard_size: number of tokens per full shard; must be positive.
        max_shards: when positive, stop as soon as this many full shards have
            been written (tokens left over at that point are discarded). Zero or
            a negative value consumes the whole ``data_loader`` and also writes
            the trailing partial shard.

    Raises:
        ValueError: if ``shard_size`` is not positive.
    """
    if shard_size <= 0:
        raise ValueError(f'shard_size must be positive, got {shard_size}')

    def open_bar(index):
        # one progress bar per shard, counting tokens up to shard_size
        return tqdm(total=shard_size, unit='tokens', desc=f'{dataset_name}_{split} | Shard {index}')

    shard_index = 0
    # preallocate buffer to hold current shard
    all_tokens_np = np.empty((shard_size,), dtype=np.uint32)
    token_count = 0
    progress_bar = None
    limit_reached = False

    for example in data_loader:
        tokens = tokenize(example)
        # create the bar lazily, but before any branch that needs it, so a first
        # document that already fills a shard does not hit a missing bar
        if progress_bar is None:
            progress_bar = open_bar(shard_index)

        # flush full shards for as long as the (rest of the) document fills the
        # buffer; a loop is required because one document can span several shards
        while token_count + len(tokens) >= shard_size:
            remainder = shard_size - token_count
            all_tokens_np[token_count:shard_size] = tokens[:remainder]
            progress_bar.update(remainder)
            write_datafile(dataset_name, split, all_tokens_np, shard_index)
            progress_bar.close()
            progress_bar = None
            shard_index += 1
            # whatever did not fit carries over into the next shard
            tokens = tokens[remainder:]
            token_count = 0
            # shard_index now equals the number of shards written so far
            if max_shards > 0 and shard_index >= max_shards:
                limit_reached = True
                break
            progress_bar = open_bar(shard_index)
        if limit_reached:
            break

        # the rest of the document fits strictly inside the current shard
        all_tokens_np[token_count:token_count+len(tokens)] = tokens
        token_count += len(tokens)
        progress_bar.update(len(tokens))

    if progress_bar is not None:
        progress_bar.close()

    # write any remaining tokens as the last shard
    if token_count != 0:
        write_datafile(dataset_name, split, all_tokens_np[0:token_count], shard_index)
    
    
        

## FineWeb-Edu

In [None]:
# Open the 'sample-10BT' config of FineWeb-Edu as a streaming train split.
fw_tr = load_dataset(
    'HuggingFaceFW/fineweb-edu',
    name='sample-10BT',
    split='train',
    streaming=True,
)

In [None]:
# Shard the FineWeb-Edu stream into the 'train' split, capped via max_shards.
tokenize_dataset(
    data_loader=fw_tr,
    dataset_name='fineweb-edu',
    split='train',
    max_shards=10,
)

## Wikitext

In [None]:
# Stream the wikitext-103 train split and shard it, capped via max_shards.
wt_tr = load_dataset(
    'iohadrubin/wikitext-103-raw-v1',
    split='train',
    streaming=True,
)
tokenize_dataset(
    data_loader=wt_tr,
    dataset_name='wikitext-103',
    split='train',
    max_shards=10,
)

In [None]:
# Stream the wikitext-103 test split and shard all of it (no max_shards cap).
wt_ts = load_dataset(
    'iohadrubin/wikitext-103-raw-v1',
    split='test',
    streaming=True,
)
tokenize_dataset(
    data_loader=wt_ts,
    dataset_name='wikitext-103',
    split='test',
)

In [None]:
import random  # NOTE(review): not used in the cells visible here
# directory of the 'train' split shards
train_data = Path('/home/ubuntu/data/train')
# iterdir() yields entries in arbitrary order, so sort for a stable listing
shards = sorted(train_data.iterdir())
shards

## Testing iteration over the streamed datasets

In [1]:
from datasets import load_dataset
from datasets import interleave_datasets

In [2]:
# Streaming handle on the FineWeb-Edu 'sample-10BT' train split.
fw_loader = load_dataset(
    'HuggingFaceFW/fineweb-edu',
    name='sample-10BT',
    split='train',
    streaming=True,
)

Resolving data files:   0%|          | 0/2410 [00:00<?, ?it/s]

In [3]:
# Streaming handle on the SlimPajama-627B train split.
sp_loader = load_dataset(
    'cerebras/SlimPajama-627B',
    split='train',
    streaming=True,
)

Resolving data files:   0%|          | 0/59166 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/31428 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/31411 [00:00<?, ?it/s]

In [4]:

# fw_loader and sp_loader are streaming datasets (streaming=True);
# interleave them into a single stream by seeded random sampling.
merged = interleave_datasets(
    [fw_loader, sp_loader],
    probabilities=[0.33, 0.67],  # sampling weights: fineweb 0.33, SlimPajama 0.67
    seed=1667,
    stopping_strategy='first_exhausted',  # end the stream once a source runs out
)

# Preview the first ten merged examples; the counter printed after each
# example starts at 2 and the loop stops once it exceeds 10.
i = 1

for i, example in enumerate(merged, start=2):
    print(type(example['text']))
    print(example['text'])
    print(i)
    if i > 10:
        break

<class 'str'>
J.J. Abrams Returns To Write And Direct 'Star Wars: Episode IX'
09/12/2017 11:11 am ET Updated Sep 12, 2017
The return of the J.J.
By Bill Bradley
UPDATE: 4:00 p.m. ET — In addition to the director news, "Star Wars" announced that the premiere date for "Episode IX" will be Dec. 20, 2019.
Star Wars: Episode IX is scheduled for release on December 20, 2019. pic.twitter.com/rDBqmuHX89
— Star Wars (@starwars) September 12, 2017
The Force was with J.J. Abrams when he launched the new set of "Star Wars" films with "The Force Awakens," so now Disney is bringing him back.
As Deadline reported on Tuesday, and according to a press release on StarWars.com, Abrams will return to write and direct "Star Wars: Episode IX." The statement reads:
A post shared by Star Wars (@starwars) on Sep 12, 2017 at 7:28am PDT
After Disney unexpectedly parted ways with former "Episode IX" director Colin Trevorrow earlier this month, rumors that Rian Johnson, who is directing "Star Wars: The Last Jedi,"

In [5]:
# Fresh streaming handles on both sources, to inspect each one on its own.
fw2 = load_dataset(
    'HuggingFaceFW/fineweb-edu',
    name='sample-10BT',
    split='train',
    streaming=True,
)
sp2 = load_dataset(
    'cerebras/SlimPajama-627B',
    split='train',
    streaming=True,
)

Resolving data files:   0%|          | 0/2410 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/59166 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/31428 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/31411 [00:00<?, ?it/s]

In [6]:
# Print the first five fw2 examples; the counter shown after each example
# starts at 2 and the loop stops once it exceeds 5.
x = 1

for x, example in enumerate(fw2, start=2):
    print(example['text'])
    print(x)
    if x > 5:
        break

The Independent Jane
For all the love, romance and scandal in Jane Austen’s books, what they are really about is freedom and independence. Independence of thought and the freedom to choose.
Elizabeth’s refusal of Mr. Collins offer of marriage showed an independence seldom seen in heroines of the day. Her refusal of Mr. Darcy while triggered by anger showed a level of independence that left him shocked and stunned.
The freedom she exhibited in finally accepting him in direct defiance of Lady Catherine and knowing her father would disapprove was unusual even for Austen. In her last book Anne Elliot is persuaded to refuse Captain Wentworth at Lady Russel’s insistence.
Although Jane played by the rules of the day, all of her writing is infused with how she wanted life to be. She ‘screams’ her outrage at the limitations for women in Emma.
When accosted by Mrs. Elton, Jane Fairfax says,
“Excuse me, ma’am, but this is by no means my intention; I make no inquiry myself, and should be sorry to 

In [7]:
# Print the first five sp2 examples; the counter shown after each example
# starts at 2 and the loop stops once it exceeds 5.
x = 1

for x, example in enumerate(sp2, start=2):
    print(example['text'])
    print(x)
    if x > 5:
        break

J.J. Abrams Returns To Write And Direct 'Star Wars: Episode IX'
09/12/2017 11:11 am ET Updated Sep 12, 2017
The return of the J.J.
By Bill Bradley
UPDATE: 4:00 p.m. ET — In addition to the director news, "Star Wars" announced that the premiere date for "Episode IX" will be Dec. 20, 2019.
Star Wars: Episode IX is scheduled for release on December 20, 2019. pic.twitter.com/rDBqmuHX89
— Star Wars (@starwars) September 12, 2017
The Force was with J.J. Abrams when he launched the new set of "Star Wars" films with "The Force Awakens," so now Disney is bringing him back.
As Deadline reported on Tuesday, and according to a press release on StarWars.com, Abrams will return to write and direct "Star Wars: Episode IX." The statement reads:
A post shared by Star Wars (@starwars) on Sep 12, 2017 at 7:28am PDT
After Disney unexpectedly parted ways with former "Episode IX" director Colin Trevorrow earlier this month, rumors that Rian Johnson, who is directing "Star Wars: The Last Jedi," would take ov