I am building my own LLM, so the first step is to build a tokenizer of my own.

STEP 1 - CREATE THE TOKENS

In [None]:
# Read the whole book into memory; this text is the corpus the vocabulary is built from.
with open("01 Harry Potter and the Sorcerers Stone.txt", "r", encoding="Utf-8") as book_file:
    raw_text = book_file.read()

total_characters = len(raw_text)
print(total_characters)  # total number of characters in the corpus
print(raw_text[:99])  # preview of the first 99 characters

439478
M r. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly nor


In [None]:
import re  #library for splitting
# Split on punctuation, double dashes and whitespace. The capture group makes
# re.split keep the delimiters, so punctuation marks become tokens of their own.
raw_pieces = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)

# Trim every piece and drop the ones that are empty or whitespace-only.
preprocessed = []
for piece in raw_pieces:
    trimmed = piece.strip()
    if trimmed:
        preprocessed.append(trimmed)

print(preprocessed[:30])  # preprocessed is the list holding every token of the corpus

['M', 'r', '.', 'and', 'Mrs', '.', 'Dursley', ',', 'of', 'number', 'four', ',', 'Privet', 'Drive', ',', 'were', 'proud', 'to', 'say', 'that', 'they', 'were', 'perfectly', 'normal', ',', 'thank', 'you', 'very', 'much', '.']


STEP 2 - CREATE TOKEN IDs FOR THESE TOKENS

In [8]:
# The vocabulary is made of every distinct token, put in sorted order.
unique_tokens = set(preprocessed)
words = sorted(unique_tokens)
vocab_size = len(words)
print(vocab_size)

7572


In [None]:
# vocab maps each token (string) to its token ID, i.e. its position in the sorted word list.
vocab = dict(zip(words, range(len(words))))

In [None]:
# Preview the first 51 (token, token ID) pairs of the vocabulary.
for position, entry in enumerate(vocab.items()):
    if position > 50:
        break
    print(entry)

('!', 0)
("'", 1)
('(', 2)
(')', 3)
('*', 4)
(',', 5)
('-', 6)
('-bodied', 7)
('.', 8)
('1', 9)
('1473', 10)
('1637', 11)
('17', 12)
('1709', 13)
('1945', 14)
('2', 15)
('3', 16)
('31', 17)
('382', 18)
('4', 19)
(':', 20)
(';', 21)
('?', 22)
('A', 23)
('ALBUS', 24)
('ALLEY', 25)
('ALLOWED', 26)
('AM', 27)
('AND', 28)
('ANYTHING', 29)
('ARE', 30)
('AT', 31)
('About', 32)
('According', 33)
('Adalbert', 34)
('Add', 35)
('Adrian', 36)
('Africa', 37)
('African', 38)
('After', 39)
('Against', 40)
('Ages', 41)
('Agrippa', 42)
('Ah', 43)
('Ahead', 44)
('Alberic', 45)
('Albus', 46)
('Albus…”', 47)
('Algie', 48)
('Alicia', 49)
('All', 50)


WE NEED TO REMEMBER TWO CONCEPTS, i.e. THE ENCODER AND THE DECODER

ENCODER -> it takes text as input and gives token IDs as output

DECODER -> it takes token IDs as input and gives text as output

In [11]:
class SimpleTokenizer:
    """Word-level tokenizer backed by a fixed token -> token ID vocabulary.

    Text is split on punctuation, double dashes and whitespace; every
    resulting token must already be present in the vocabulary.
    """

    # Delimiters used when splitting. The capture group keeps them in the
    # result of re.split, so punctuation marks become tokens of their own.
    _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'
    # Punctuation that is glued back onto the preceding token when decoding.
    # ':' and ';' are included because encode() splits them off as well;
    # without them "a; b" would decode as "a ; b".
    _DETACHED_PUNCT_PATTERN = r'\s+([,.:;?!"()\'])'

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse (ID -> token) lookup.

        Args:
            vocab: dict mapping each token string to its integer token ID.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert text into a list of token IDs.

        Args:
            text: the string to tokenize.

        Returns:
            A list with one token ID per token, in order of appearance.
            Empty or whitespace-only text gives an empty list.

        Raises:
            KeyError: if a token of ``text`` is not in the vocabulary.
        """
        pieces = re.split(self._SPLIT_PATTERN, text)
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        ids = []
        for token in tokens:
            try:
                ids.append(self.str_to_int[token])
            except KeyError:
                # Same exception type as the plain dict lookup, but the
                # message says which token was missing.
                raise KeyError(f"token {token!r} is not in the vocabulary") from None
        return ids

    def decode(self, ids):
        """Convert a sequence of token IDs back into text.

        Tokens are joined with single spaces, then the whitespace in front of
        punctuation is removed. The round trip is therefore not exact: the
        original whitespace is normalised, and the space before an opening
        quote or parenthesis is removed too.

        Args:
            ids: iterable of token IDs produced by this vocabulary.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: if an ID is not in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(self._DETACHED_PUNCT_PATTERN, r'\1', text)
        return text

In [12]:
# Instantiate the tokenizer with the corpus vocabulary and encode a sample
# sentence to confirm that it returns token IDs.
tokenizer = SimpleTokenizer(vocab)
text = "It was on the corner of the street that he noticed"
ids = tokenizer.encode(text)
print(ids)

[593, 6597, 4470, 6131, 2269, 4452, 6131, 5888, 6126, 3488, 4427]


In [13]:
# Turn the token IDs back into text; the result should match the sample sentence.
decoded_text = tokenizer.decode(ids)
decoded_text

'It was on the corner of the street that he noticed'

So the encoder and decoder are working as expected.

NOW WE HAVE COMPLETED A SIMPLE TOKENIZER OF OUR OWN, BUT IT HAS A LIMITATION: IT CAN ONLY ENCODE WORDS THAT APPEAR IN THE txt FILE WE USED. IF I ENCODE TEXT CONTAINING ANY OTHER WORD, THERE WILL BE AN ERROR (A KeyError). SO WE NEED SPECIAL CONTEXT TOKENS, WHICH ARE ALSO USED BY GPT MODELS.

Two special tokens will be used: `<|endofText|>`, which separates independent pieces of text, and `<|unk|>`, which stands in for any unknown word.

In [None]:
# Rebuild the vocabulary with the two special tokens appended after the sorted
# corpus tokens, so it grows by two entries (from 7572 to 7574 here).
tokens = sorted(set(preprocessed)) + ["<|endofText|>", "<|unk|>"]
vocab = dict(zip(tokens, range(len(tokens))))
len(vocab)

7574

Let us check that the two special tokens were added at the end of the vocabulary.

In [15]:
# Print the last five (token, token ID) pairs; the two special tokens should be the final entries.
for entry in list(vocab.items())[-5:]:
    print(entry)

('…Oak', 7569)
('…Then', 7570)
('…”', 7571)
('<|endofText|>', 7572)
('<|unk|>', 7573)


In [16]:
class AdvancedTokenizer:
    """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

    Text is split on punctuation, double dashes and whitespace. Tokens that
    are missing from the vocabulary are encoded with the ID of "<|unk|>"
    instead of raising an error.
    """

    # Placeholder token substituted for anything outside the vocabulary.
    _UNKNOWN_TOKEN = "<|unk|>"
    # Delimiters used when splitting. The capture group keeps them in the
    # result of re.split, so punctuation marks become tokens of their own.
    _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'
    # Punctuation that is glued back onto the preceding token when decoding.
    # ':' and ';' are included because encode() splits them off as well;
    # without them "a; b" would decode as "a ; b".
    _DETACHED_PUNCT_PATTERN = r'\s+([,.:;?!"()\'])'

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse (ID -> token) lookup.

        Args:
            vocab: dict mapping each token string to its integer token ID.
                It needs an entry for "<|unk|>" for unknown tokens to be
                encodable.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert text into a list of token IDs.

        Args:
            text: the string to tokenize.

        Returns:
            A list with one token ID per token, in order of appearance;
            unknown tokens get the ID of "<|unk|>". Empty or whitespace-only
            text gives an empty list.

        Raises:
            KeyError: if an unknown token is met and "<|unk|>" itself is not
                in the vocabulary.
        """
        pieces = re.split(self._SPLIT_PATTERN, text)
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        ids = []
        for token in tokens:
            # The fallback is looked up only when needed, so a vocabulary
            # without "<|unk|>" still works for fully known text.
            if token not in self.str_to_int:
                token = self._UNKNOWN_TOKEN
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Convert a sequence of token IDs back into text.

        Tokens are joined with single spaces, then the whitespace in front of
        punctuation is removed. Unknown words come back as "<|unk|>", the
        original whitespace is normalised, and the space before an opening
        quote or parenthesis is removed too.

        Args:
            ids: iterable of token IDs produced by this vocabulary.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: if an ID is not in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(self._DETACHED_PUNCT_PATTERN, r'\1', text)
        return text

Now let us try again with some text that contains words outside the vocabulary.

In [17]:
# Switch to the tokenizer that supports unknown words.
tokenizer = AdvancedTokenizer(vocab)

text1 = "Hello, my name is Subhranil Mondal"
text2 = "I am from Kolkata."

# Join the two independent texts with the end-of-text marker between them.
separator = " <|endofText|> "
text = separator.join([text1, text2])
print(text)

Hello, my name is Subhranil Mondal <|endofText|> I am from Kolkata.


In [18]:
# Encode the combined text and display the resulting token IDs.
encoded_ids = tokenizer.encode(text)
encoded_ids

[7573, 5, 4318, 4332, 3758, 7573, 7573, 7572, 575, 1391, 3164, 7573, 8]

In [19]:
# Round trip: encode the combined text, then decode the IDs back into text.
round_trip_ids = tokenizer.encode(text)
tokenizer.decode(round_trip_ids)

'<|unk|>, my name is <|unk|> <|unk|> <|endofText|> I am from <|unk|>.'