In [35]:
import os, requests, zipfile, re
from collections import Counter

In [None]:
dataset_url = "https://github.com/entropicemergence/tiny_llm_server/releases/download/v0.1.0/TinyStoriesV2-GPT4-small.zip"
datset_folder = "stories"
os.makedirs(datset_folder, exist_ok=True)

def download_dataset(url):
    filename = url.split("/")[-1]
    filepath = os.path.join(datset_folder, filename)
    if not os.path.exists(filepath):
        response = requests.get(url)
        with open(filepath, "wb") as f:
            f.write(response.content)
download_dataset(dataset_url)

In [15]:
filepath = "stories/TinyStoriesV2-GPT4-small.zip"
with zipfile.ZipFile(filepath, "r") as zip_ref:
    zip_ref.extractall(datset_folder)
with open("stories/TinyStoriesV2-GPT4-small.txt", "r", encoding="utf-8") as f:
    stories_text = f.read()

In [None]:
class Tokenizer:
    def __init__(self, text_source):
        self.text_source = text_source
        self.vocab_size = 4000
        self.special_tokens = {'<PAD>':0, '<UNKNOWN>':1, '<BOS>':2, '<EOS>':3, 'CHAR_START':4, 'CHAR_END':5}
        self.word_to_id = {}
        self.id_to_word = {}
        self.char_to_id = {}
        self.id_to_char = {}

    def build_vocab(self):
        print (len(self.text_source))
        grouped_stories = []
        story = ""
        for line in self.text_source.split("\n"):
            if line == "<|endoftext|>":
                # story+="\n==============\n"
                grouped_stories.append(story)   
                story = ""
            else:
                story += line + "\n"
        all_words = []
        all_chars = set()
        for story in grouped_stories:
            story = story.lower()
            all_chars.update(story)

            punctuation = r'[.,!?;:"\'\-\(\)\[\]{}]'
            story = re.sub(punctuation, r' \g<0> ', story) #Replace punctuation with space + punctuation + space
            tokens = [token for token in story.split() if token.strip()]
            all_words.extend(tokens)

        
        word_counts = Counter(all_words)
        # print (len(all_words))
        # print (word_counts)
        start_id = len(self.special_tokens)
        for word, count in word_counts.most_common(self.vocab_size):
            self.word_to_id[word] = start_id
            self.id_to_word[start_id] = word
            start_id += 1
        # print (self.word_to_id)
        # print (self.id_to_word)
        # print (all_chars)
        for char in all_chars:
            self.char_to_id[char] = start_id
            self.id_to_char[start_id] = char
            start_id += 1
        # print (self.char_to_id)
        # print (self.id_to_char)
    def encode(self, text):
        text = text.lower()
        text = re.sub(punctuation, r' \g<0> ', text)






simple_tokenizer = Tokenizer(stories_text)
simple_tokenizer.build_vocab()










22493387
5354125
{'0', '4', ',', 't', '9', 'x', 'k', 's', '‘', '!', '8', '6', ' ', 'u', 'd', '\x94', 'f', '’', '…', 'y', '.', '”', 'a', 'z', 'p', '5', '7', 'r', '—', '–', 'n', 'b', 'i', 'l', 'h', 'g', '2', 'o', 'm', 'w', 'q', '`', "'", '/', '1', '"', '=', 'ñ', 'v', '3', '“', '\n', '-', ':', ';', 'j', '\x93', 'é', 'e', '\x92', '?', 'c'}
{'0': 4006, '4': 4007, ',': 4008, 't': 4009, '9': 4010, 'x': 4011, 'k': 4012, 's': 4013, '‘': 4014, '!': 4015, '8': 4016, '6': 4017, ' ': 4018, 'u': 4019, 'd': 4020, '\x94': 4021, 'f': 4022, '’': 4023, '…': 4024, 'y': 4025, '.': 4026, '”': 4027, 'a': 4028, 'z': 4029, 'p': 4030, '5': 4031, '7': 4032, 'r': 4033, '—': 4034, '–': 4035, 'n': 4036, 'b': 4037, 'i': 4038, 'l': 4039, 'h': 4040, 'g': 4041, '2': 4042, 'o': 4043, 'm': 4044, 'w': 4045, 'q': 4046, '`': 4047, "'": 4048, '/': 4049, '1': 4050, '"': 4051, '=': 4052, 'ñ': 4053, 'v': 4054, '3': 4055, '“': 4056, '\n': 4057, '-': 4058, ':': 4059, ';': 4060, 'j': 4061, '\x93': 4062, 'é': 4063, 'e': 4064, '\x92