In [3]:
with open(r"./data/the-verdict.txt","r",encoding="utf-8") as f:
    raw_text = f.read()
    
print(f"total num of characters",len(raw_text))
display(raw_text[:100])

total num of characters 20479


'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g'

In [4]:
import re

result=re.split(r"(\s)",raw_text) #this split will include the whitespace and punctuations and characters
result[:10]

['I', ' ', 'HAD', ' ', 'always', ' ', 'thought', ' ', 'Jack', ' ']

In [5]:
result=re.split(r"([,.]|\s)",raw_text) #this split will include the commas, periods and white spaces and punctuations and characters
result[:10]

['I', ' ', 'HAD', ' ', 'always', ' ', 'thought', ' ', 'Jack', ' ']

In [6]:
# removing whitespaces preserves the structure of the text, if this is important then you have to keep it , for coding space matters so we need to keep the spaces
result=[item for item in result if item.strip()]
result

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius--though',
 'a',
 'good',
 'fellow',
 'enough--so',
 'it',
 'was',
 'no',
 'great',
 'surprise',
 'to',
 'me',
 'to',
 'hear',
 'that',
 ',',
 'in',
 'the',
 'height',
 'of',
 'his',
 'glory',
 ',',
 'he',
 'had',
 'dropped',
 'his',
 'painting',
 ',',
 'married',
 'a',
 'rich',
 'widow',
 ',',
 'and',
 'established',
 'himself',
 'in',
 'a',
 'villa',
 'on',
 'the',
 'Riviera',
 '.',
 '(Though',
 'I',
 'rather',
 'thought',
 'it',
 'would',
 'have',
 'been',
 'Rome',
 'or',
 'Florence',
 '.',
 ')',
 '"The',
 'height',
 'of',
 'his',
 'glory"--that',
 'was',
 'what',
 'the',
 'women',
 'called',
 'it',
 '.',
 'I',
 'can',
 'hear',
 'Mrs',
 '.',
 'Gideon',
 'Thwing--his',
 'last',
 'Chicago',
 'sitter--deploring',
 'his',
 'unaccountable',
 'abdication',
 '.',
 '"Of',
 'course',
 "it's",
 'going',
 'to',
 'send',
 'the',
 'value',
 'of',
 'my',
 'picture',
 "'way",
 'up;',
 'but',
 'I',
 "don't"

In [7]:
# final splitter
# raw_text="Hellom world. IS this a -- test?!"

preprocessed=re.split(r'([,.:;?_!"()\']|--|\s)',raw_text) # this will  split eveything better
preprocessed=[item.strip() for item in preprocessed if item.strip()]
preprocessed[:10]

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius']

In [8]:
# create ordered list of unique words
all_words=sorted(set(preprocessed))
print(all_words[:10])
vocab_size=len(all_words)
print(f"lennght of words",vocab_size)


#map words to integers and store in a dicitonary
vocab={token:integer for integer, token in enumerate(all_words)}


#printing some vocabulary
for i, item in enumerate(vocab.items()):
    print(item)
    if i>=50:
        break



['!', '"', "'", '(', ')', ',', '--', '.', ':', ';']
lennght of words 1130
('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [9]:
# based on vocab it will encode and decode the text , if the word is not in the vcabulary it will give an error
class SimpleTokenizerV1:
    def __init__(self,vocab):
        self.str_to_int=vocab
        self.int_to_str={i:s for s,i in vocab.items()}
        
    def encode(self,text):
        preprocessed=re.split(r'([,.:;?_!"()\']|--|\s)',text)        
        preprocessed=[item.strip() for item in preprocessed if item.strip()]
        ids=[self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self,ids):
        text=" ".join([self.int_to_str[i] for i in ids])
        text= re.sub(r'\s+([,.?!"()\'])',r'\1',text)
        return text
    
    
tokenizer=SimpleTokenizerV1(vocab)

text ="what is the of meaning life? why do exist.," 

ids=tokenizer.encode(text)
print("encoding", ids)
        
print(f"decoding {tokenizer.decode(ids)}")



KeyError: 'meaning'

In [10]:
#fixing the bug for new words in the simple tokenizer

all_tokens=sorted(set(preprocessed))
all_tokens.extend(["<|endoftext|>","<|unk|>"])

vocab={token:integer for integer,token in enumerate(all_tokens)}
len(vocab.items())

for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)



('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [11]:
# updating the simple tokenizer V1

class SimpleTokenizerV2:
    def __init__(self,vocab):
        self.str_to_int=vocab
        self.int_to_str={i:s for s,i in vocab.items()}
        
    def encode(self,text):
        preprocessed=re.split(r'([,.:;?_!"()\']|--|\s)',text)        
        preprocessed=[item.strip() for item in preprocessed if item.strip()]
        preprocessed=[item if item in self.str_to_int else "<|unk|>" for item in preprocessed]
        ids=[self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self,ids):
        text=" ".join([self.int_to_str[i] for i in ids])
        text= re.sub(r'\s+([,.?!"()\'])',r'\1',text)
        return text
    
    
tokenizer=SimpleTokenizerV2(vocab)

text1 ="what is the of meaning life? why do exist.," 
text2= "maybe we exist to be better, and make everyone better"

text="<|endoftext|>".join((text1,text2))

print(text)

ids=tokenizer.encode(text)
print("encoding: ", ids)
        
print(f"decoding: {tokenizer.decode(ids)}")



what is the of meaning life? why do exist.,<|endoftext|>maybe we exist to be better, and make everyone better
encoding:  [1089, 584, 988, 722, 1131, 622, 10, 1100, 355, 1131, 7, 5, 1131, 1131, 1131, 1016, 198, 217, 5, 157, 655, 1131, 217]
decoding: what is the of <|unk|> life? why do <|unk|>., <|unk|> <|unk|> <|unk|> to be better, and make <|unk|> better


In [12]:
#chatgpt does not use special tokens like [BOS], [PAD], [UNK].It mostly only used [EOF]
#Chatgpt uses byte pare encoding for unknown tokens

# Byte pair encoding

In [None]:
# !pip install tiktoken

Defaulting to user installation because normal site-packages is not writeable


In [13]:
import importlib
import tiktoken

tokenizer=tiktoken.get_encoding("gpt2")

In [14]:
text

'what is the of meaning life? why do exist.,<|endoftext|>maybe we exist to be better, and make everyone better'

In [16]:
text=(text)
print(text)
integers=tokenizer.encode(text,allowed_special={"<|endoftext|>"})

print(integers)

what is the of meaning life? why do exist.,<|endoftext|>maybe we exist to be better, and make everyone better
[10919, 318, 262, 286, 3616, 1204, 30, 1521, 466, 2152, 1539, 50256, 25991, 356, 2152, 284, 307, 1365, 11, 290, 787, 2506, 1365]


In [18]:
strings=tokenizer.decode(integers)
print(strings)

what is the of meaning life? why do exist.,<|endoftext|>maybe we exist to be better, and make everyone better


In [19]:
integers=tokenizer.encode("asdgas jas",allowed_special={"<|endoftext|>"})

print(integers)

strings=tokenizer.decode(integers)
print(strings)

[292, 67, 22649, 474, 292]
asdgas jas
