In [34]:
import re
with open("data/The_Verdict.txt", "r") as f:
    raw_text = f.read()

raw_text = raw_text[98::]
raw_text = re.sub(r"(\s+)"," ", raw_text)
print("Number of characters {}", len(raw_text))
print(raw_text[:100])

Number of characters {} 21523
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


In [35]:
# Naive splitting
text = "Hello, world. This, is a--test."
result = re.split(r'(\s)', text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a--test.']


In [80]:
text="Hello, world. Is this-- a test?"
result = re.split(r'([\(\),.\s;:>_!]|--)', text)
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test?']


In [102]:
def basic_tokenizer(input: str)->list[str]:
    result = re.split(r'([,.;:?_!""()\']|--|\s)', input)
    result = [item.strip() for item in result if item.strip()]
    return result

In [103]:
basic_tokenizer("this is a simple test?")

['this', 'is', 'a', 'simple', 'test', '?']

In [104]:
len(basic_tokenizer(raw_text))


4903

In [105]:
basic_tokenizer(raw_text)[0:30]

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius',
 '--',
 'though',
 'a',
 'good',
 'fellow',
 'enough',
 '--',
 'so',
 'it',
 'was',
 'no',
 'great',
 'surprise',
 'to',
 'me',
 'to',
 'hear',
 'that',
 ',',
 'in']

In [106]:
preprocessed = basic_tokenizer(raw_text)
all_words = sorted(set(preprocessed))
len(all_words)

1209

In [107]:
vocab = {token:integer for integer,token in enumerate(all_words)}

In [112]:
vocab

{'!': 0,
 '"': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 '*': 5,
 ',': 6,
 '--': 7,
 '.': 8,
 '0': 9,
 '1': 10,
 '1930': 11,
 '4': 12,
 ':': 13,
 ';': 14,
 '?': 15,
 'A': 16,
 'About': 17,
 'Ah': 18,
 'Among': 19,
 'And': 20,
 'Are': 21,
 'Arrt': 22,
 'As': 23,
 'At': 24,
 'Attribution-ShareAlike': 25,
 'Be': 26,
 'Begin': 27,
 'Burlington': 28,
 'But': 29,
 'By': 30,
 'Carlo': 31,
 'Chicago': 32,
 'Claude': 33,
 'Come': 34,
 'Commons': 35,
 'Creative': 36,
 'Croft': 37,
 'Destroyed': 38,
 'Devonshire': 39,
 'Don': 40,
 'Dubarry': 41,
 'During': 42,
 'Emperors': 43,
 'FDL': 44,
 'Florence': 45,
 'For': 46,
 'GNU': 47,
 'Gallery': 48,
 'Gideon': 49,
 'Gisburn': 50,
 'Gisburns': 51,
 'Grafton': 52,
 'Greek': 53,
 'Grindle': 54,
 'Grindles': 55,
 'HAD': 56,
 'Had': 57,
 'Hang': 58,
 'Has': 59,
 'He': 60,
 'Her': 61,
 'Hermia': 62,
 'His': 63,
 'How': 64,
 'I': 65,
 'If': 66,
 'In': 67,
 'It': 68,
 'Jack': 69,
 'January': 70,
 'Jove': 71,
 'Just': 72,
 'Lord': 73,
 'Made': 74,
 'Miss': 75,
 'Money': 

In [127]:


class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {value:key for key,value in vocab.items()}

    def encode(self, text):
        result = re.split(r'([,.;:?_!""()\']|--|\s)', text)
        result = [item.strip() for item in result if item.strip()]
        tokens = [self.str_to_int[word] for word in result]
        return tokens

    def decode(self, ids):
        text = " ".join(self.int_to_str[token] for token in ids)
        text = re.sub(r'\s+([,.?!"()\'])'  , r'\1', text)
        return text
      

        

In [137]:
test = "a cheap genius is he !"
tokenizer_v1 = SimpleTokenizerV1(vocab=vocab)
tokenizer_v1.encode(test)

[134, 281, 533, 632, 580, 0]

In [138]:
test = "a cheap genius is he!"
result = tokenizer_v1.decode(tokenizer_v1.encode(test))
assert test == result

In [139]:
test = "It' s the last he painted, you know"
result = tokenizer_v1.decode(tokenizer_v1.encode(test))
assert test == result