In [1]:
import datasets

dataset = datasets.load_dataset("roneneldan/TinyStories")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace


tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = Whitespace()

In [3]:
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents, Lowercase, BertNormalizer, Replace

tokenizer.normalizer = normalizers.Sequence([
    BertNormalizer(),
    NFD(),
    Replace("’", "'"),
    Replace("‘", "'"),
    Replace("“", '"'),
    Replace("”", '"'),
    Replace("–", "-"),
    Replace("—", "-"),
    Replace("…", "..."),
    Replace("´", "'"),
    Replace("`", "'"),
    Replace("一", "-"),
    # Replace("🌴", ""),
    # Replace("🍌", ""),
    Replace("─", "-"),
    # Replace("€", ""),
    # Replace("™", ""),
    # Replace("œ", ""),
    # Replace("˜", ""),
    # Replace("¦", ""),
    # Replace(r'[\u4E00-\u9FFF]', ''), # Remove all CJK Unified Ideographs (and other non-ASCII)
    # Replace(r'[^\x00-\x7F]', ''), # Remove non-ASCII (Chinese, etc.)
    # Lowercase(),
    # StripAccents()
])

In [4]:
test_string = "Hello, y'all! How are you 😁 <|endoftext|> \n? 奮 些 ä Héllø 中国 123! 巴  恩  和  艾  米  莉  兩  兒  童  在  一  個  玉  米  田  裡  度  過  了  一  整  天"
tokenizer.normalizer.normalize_str(test_string)

"hello, y'all! how are you 😁 <|endoftext|>  ?  奮   些  a hellø  中  国  123!  巴    恩    和    艾    米    莉    兩    兒    童    在    -    個    玉    米    田    裡    度    過    了    -    整    天 "

In [None]:
from tokenizers.trainers import BpeTrainer


# allowed chars chould only be utf-8 basic latin set
allowed_chars = set(
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
    ".,;:!?\"'()[]{}<>@#$%^&*+-=~`|\\/"
    "\n\t\r ¡"
)

trainer = BpeTrainer(
    special_tokens=["<|endoftext|>", "\n", "[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=4096,
    min_frequency=2,
    show_progress=True,
)

def filter_text(text):
    return ''.join(c for c in text if c in allowed_chars)

def batch_iterator(batch_size=1000):
    for i in range(0, len(dataset["train"]), batch_size):
        print(f"Processing batch {i} to {i + batch_size}")
        yield [filter_text(text) for text in dataset["train"][i:i + batch_size]["text"]]

tokenizer.train_from_iterator(
    batch_iterator(),
    trainer=trainer,
)






In [10]:
output = tokenizer.encode("Hello, y'all! How are you 😁 <|endoftext|> \n? 奮 些 ä Héllø 中国 123!")
print(output.tokens)
print(output.ids)

['hello', ',', 'y', "'", 'all', '!', 'how', 'are', 'you', '<|endoftext|>', '\n', '?', 'a', 'hel', 'l', '1', '2', '3', '!']
[820, 18, 66, 13, 161, 7, 289, 172, 114, 0, 1, 37, 42, 225, 53, 23, 24, 25, 7]


In [13]:
vocab_size = tokenizer.get_vocab_size()
print(f"Vocab size: {vocab_size}")

Vocab size: 4096


In [17]:
print(tokenizer.decode(list(range(vocab_size, vocab_size - 100, -1)), skip_special_tokens=False))
print(tokenizer.decode(list(range(0, 100)), skip_special_tokens=False))

blackboard lemonade frank jewels dried spending prepare abig bumped insisted sparkling vanished thumb compl jail teaching none ining harbor prevent icing received socks refrigerator weap supply earlier refriger spots friger snapped betsy beamed saving dare '. bathrobe direct tsy ashtray asht steve gers spade oper invit skills raise plates sandcastle rachel myself relaxed admir bers reaches apron buried strawberries automob ignore autom root surf someday somed float pages cham pastel hurting hearts buckle wandered brain junk laughter calendar sparkles laught oasis expected whip structure understanding safety offic surprises oas pastry bravely moments ollie fallen married festival calend weep vehic
<|endoftext|> 
 [UNK] [CLS] [SEP] [PAD] [MASK] ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ [ \ ] a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ ¡ he the an ed to and in it re ou as on ha er was ar en ay she ing om is il id sa le they


In [21]:
tokenizer.get_vocab()

{'toby': 2650,
 'mar': 681,
 'wolf': 2034,
 'ense': 3537,
 'sor': 402,
 'mum': 602,
 'henry': 3058,
 'in': 79,
 'sty': 2551,
 'once': 203,
 'fall': 1138,
 'bra': 559,
 'nur': 2646,
 'behind': 995,
 'together': 311,
 'miss': 1477,
 'fingers': 2610,
 'tory': 2254,
 'shampoo': 3931,
 'ible': 1436,
 'bel': 1438,
 'important': 766,
 'ate': 395,
 'chicken': 2326,
 'dependable': 2612,
 'igator': 3163,
 'ert': 2470,
 'chase': 2360,
 'wel': 894,
 'sign': 1321,
 'chang': 2301,
 'uring': 2805,
 'lonely': 1469,
 'hot': 1070,
 'comes': 1539,
 'moder': 2700,
 'op': 307,
 'different': 817,
 'bought': 1666,
 'ouch': 3735,
 'ian': 1652,
 'wha': 2323,
 'pain': 859,
 'caterpill': 2100,
 'mirror': 2255,
 'fold': 1823,
 'le': 98,
 'ob': 1761,
 'cube': 2642,
 'avo': 2911,
 'fle': 571,
 'movie': 2546,
 'exclaimed': 2557,
 'volcano': 3361,
 'lila': 844,
 'mail': 1514,
 'sorts': 2918,
 'stra': 1910,
 'doors': 2882,
 'yogurt': 3293,
 'himself': 932,
 'imag': 1613,
 'crack': 2560,
 'xi': 3282,
 'diamond': 2738,


In [None]:
%%time



CPU times: user 8.19 ms, sys: 30.3 ms, total: 38.5 ms
Wall time: 43.6 ms


In [23]:
dataset["validation"][0]

{'text': 'Spot. Spot saw the shiny car and said, "Wow, Kitty, your car is so bright and clean!" Kitty smiled and replied, "Thank you, Spot. I polish it every day."\n\nAfter playing with the car, Kitty and Spot felt thirsty. They found a small pond with clear water. They drank the water and felt very happy. They played together all day and became best friends.'}

In [24]:
output = tokenizer.encode(dataset["validation"][0]["text"])
print(output.tokens)

['spot', '.', 'spot', 'saw', 'the', 'shiny', 'car', 'and', 'said', ',', '"', 'wow', ',', 'kitty', ',', 'your', 'car', 'is', 'so', 'bright', 'and', 'clean', '!"', 'kitty', 'smiled', 'and', 'replied', ',', '"', 'thank', 'you', ',', 'spot', '.', 'i', 'polish', 'it', 'every', 'day', '."', '\n', '\n', 'after', 'playing', 'with', 'the', 'car', ',', 'kitty', 'and', 'spot', 'felt', 'thirsty', '.', 'they', 'found', 'a', 'small', 'pond', 'with', 'clear', 'water', '.', 'they', 'drank', 'the', 'water', 'and', 'felt', 'very', 'happy', '.', 'they', 'played', 'together', 'all', 'day', 'and', 'became', 'best', 'friends', '.']


In [25]:
validation_encodings = tokenizer.encode_batch_fast(dataset["validation"]["text"])

In [26]:
lengths = [len(x.ids) for x in validation_encodings]

In [27]:
import numpy as np

# print stats
print("Mean:", np.mean(lengths))
print("Median:", np.median(lengths))
print("Min:", np.min(lengths))
print("Max:", np.max(lengths))
print("Std:", np.std(lengths))

Mean: 217.36330150068213
Median: 190.0
Min: 15
Max: 1121
Std: 102.0722346708042


In [28]:
tokenizer.save("TinyStories_tokenizer_small_cleaned.json")

In [29]:
from tokenizers import Tokenizer
tokenizer = Tokenizer.from_file("TinyStories_tokenizer_small_cleaned.json")

In [32]:
tokenizer.encode("<|endoftext|>").tokens

['<|endoftext|>']

In [33]:
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output.tokens)

['hello', ',', 'y', "'", 'all', '!', 'how', 'are', 'you', '?']


In [34]:
tokenizer.get_vocab_size()

4096

In [35]:
tokenizer.get_vocab()

{'party': 1264,
 'spoke': 2924,
 'yay': 2870,
 'quiet': 1267,
 'twirled': 3056,
 'rosie': 3664,
 'pile': 1696,
 'leop': 3155,
 'pow': 1379,
 'side': 259,
 'else': 1095,
 'pocket': 1486,
 'hands': 757,
 'swee': 2420,
 'slow': 1005,
 'origin': 2639,
 'touched': 1598,
 'buy': 1037,
 'want': 321,
 'ener': 3074,
 'freezer': 3782,
 'red': 455,
 'umber': 1830,
 'trum': 3131,
 'named': 273,
 'fant': 3844,
 'fre': 2128,
 'que': 1105,
 'relax': 2286,
 'learned': 545,
 'an': 75,
 'gloomy': 3029,
 'mummy': 1358,
 'weap': 4071,
 'tries': 1753,
 'usu': 2036,
 'charming': 2984,
 'wags': 3810,
 'brocco': 3138,
 'sounds': 1535,
 '5': 27,
 'talk': 1171,
 'trem': 3446,
 'clap': 2521,
 'ail': 2454,
 'amy': 1080,
 'pe': 286,
 'about': 407,
 'tow': 821,
 'self': 539,
 'shrimp': 3452,
 'flu': 1224,
 'laughing': 1617,
 'happier': 3256,
 'attach': 3966,
 'jake': 1728,
 'sil': 2358,
 'dest': 1738,
 'breathe': 3618,
 'seum': 3363,
 'large': 1908,
 'tucked': 3858,
 'blinked': 3904,
 'cel': 3379,
 'stars': 1491,
 