In [1]:
from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())

In [2]:
from tokenizers.pre_tokenizers import Whitespace

tokenizer.pre_tokenizer = Whitespace()

In [3]:
%%time

from tokenizers.trainers import BpeTrainer

data_folder = "../data/TinyStories/"
files = [
    data_folder + "TinyStoriesV2-GPT4-train.txt",
    # data_folder + "TinyStoriesV2-GPT4-valid.txt",
]

trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train(files=files, trainer=trainer)




CPU times: user 12min 4s, sys: 24.2 s, total: 12min 28s
Wall time: 1min 10s


In [4]:
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output.tokens)
# ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]

['Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '?']


In [5]:
tokenizer.get_vocab_size()

30000

In [16]:
%%time

valid_file = "../data/TinyStories/TinyStoriesV2-GPT4-valid.txt"
with open(valid_file, "r", encoding="utf-8") as f:
    valid_text = f.read()


CPU times: user 62.8 ms, sys: 31.4 ms, total: 94.2 ms
Wall time: 90 ms


In [17]:
valid_text[:1000]

'u don\'t have to be scared of the loud dog, I\'ll protect you". The mole felt so safe with the little girl. She was very kind and the mole soon came to trust her. He leaned against her and she kept him safe. The mole had found his best friend.\n<|endoftext|>\nOnce upon a time, in a warm and sunny place, there was a big pit. A little boy named Tom liked to play near the pit. One day, Tom lost his red ball. He was very sad.\nTom asked his friend, Sam, to help him search for the ball. They looked high and low, but they could not find the ball. Tom said, "I think my ball fell into the pit."\nSam and Tom went close to the pit. They were scared, but they wanted to find the red ball. They looked into the pit, but it was too dark to see. Tom said, "We must go in and search for my ball."\nThey went into the pit to search. It was dark and scary. They could not find the ball. They tried to get out, but the pit was too deep. Tom and Sam were stuck in the pit. They called for help, but no one coul

In [8]:
tokenizer.add_special_tokens(["<|endoftext|>", "\n"])

2

In [18]:
output = tokenizer.encode(valid_text[:1000])
print(output.tokens)
# ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]

['u', 'don', "'", 't', 'have', 'to', 'be', 'scared', 'of', 'the', 'loud', 'dog', ',', 'I', "'", 'll', 'protect', 'you', '".', 'The', 'mole', 'felt', 'so', 'safe', 'with', 'the', 'little', 'girl', '.', 'She', 'was', 'very', 'kind', 'and', 'the', 'mole', 'soon', 'came', 'to', 'trust', 'her', '.', 'He', 'leaned', 'against', 'her', 'and', 'she', 'kept', 'him', 'safe', '.', 'The', 'mole', 'had', 'found', 'his', 'best', 'friend', '.', '\n', '<|endoftext|>', '\n', 'Once', 'upon', 'a', 'time', ',', 'in', 'a', 'warm', 'and', 'sunny', 'place', ',', 'there', 'was', 'a', 'big', 'pit', '.', 'A', 'little', 'boy', 'named', 'Tom', 'liked', 'to', 'play', 'near', 'the', 'pit', '.', 'One', 'day', ',', 'Tom', 'lost', 'his', 'red', 'ball', '.', 'He', 'was', 'very', 'sad', '.', '\n', 'Tom', 'asked', 'his', 'friend', ',', 'Sam', ',', 'to', 'help', 'him', 'search', 'for', 'the', 'ball', '.', 'They', 'looked', 'high', 'and', 'low', ',', 'but', 'they', 'could', 'not', 'find', 'the', 'ball', '.', 'Tom', 'said', 

In [10]:
tokenizer.save("TinyStories_tokenizer.json")

In [11]:
tokenizer.from_file("TinyStories_tokenizer.json")

Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[{"id":0, "content":"[UNK]", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":1, "content":"[CLS]", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":2, "content":"[SEP]", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":3, "content":"[PAD]", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":4, "content":"[MASK]", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":30000, "content":"<|endoftext|>", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":30001, "content":"
", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}], normalizer=None, pre_tokenizer=Whitespace(), post_processor=None, decoder=None, model=BPE(

In [12]:
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output.tokens)
# ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]

['Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '?']
