# Play Tokenizer

Write a BPE tokenizer in Python without worrying about performance.

I started to look at the tokenizer. I found [this](https://huggingface.co/learn/llm-course/en/chapter6/5) intro to byte-pair encoding tokenization on Hugging Face and read enough to get the idea.

## Determine tokens

In [112]:
corpus = "The batat and the cat fought over the hat."

### Figure out ideas and helper functions

In [113]:
words = corpus.split(' '); words

['The', 'batat', 'and', 'the', 'cat', 'fought', 'over', 'the', 'hat.']

In [114]:
words = [list(word) for word in words]; words

[['T', 'h', 'e'],
 ['b', 'a', 't', 'a', 't'],
 ['a', 'n', 'd'],
 ['t', 'h', 'e'],
 ['c', 'a', 't'],
 ['f', 'o', 'u', 'g', 'h', 't'],
 ['o', 'v', 'e', 'r'],
 ['t', 'h', 'e'],
 ['h', 'a', 't', '.']]

In [115]:
tokens = set(t for word in words for t in word); tokens

{'.',
 'T',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'n',
 'o',
 'r',
 't',
 'u',
 'v'}

In [116]:
def generate_pairs():
    """Count how often each adjacent token pair occurs in the global ``words``.

    Returns a dict mapping ``(left_token, right_token)`` to the number of
    times that pair appears side by side inside a word. Pairs never span
    two different words.
    """
    counts = {}
    for symbols in words:
        # Line each token up with its right-hand neighbour.
        for left, right in zip(symbols, symbols[1:]):
            key = (left, right)
            counts[key] = counts.get(key, 0) + 1
    return counts

In [117]:
def find_top_pair(pairs):
    """Return the most frequent pair in ``pairs``.

    ``pairs`` maps a pair to its frequency. When several pairs share the
    highest frequency, the one that comes first in the dict wins. Returns
    ``None`` if ``pairs`` is empty or no frequency is greater than zero.
    """
    # max() keeps the first maximal item, matching a strict ">" scan.
    best = max(pairs, key=pairs.get, default=None)
    if best is not None and pairs[best] > 0:
        return best
    return None

In [118]:
pairs = generate_pairs(); pairs

{('T', 'h'): 1,
 ('h', 'e'): 3,
 ('b', 'a'): 1,
 ('a', 't'): 4,
 ('t', 'a'): 1,
 ('a', 'n'): 1,
 ('n', 'd'): 1,
 ('t', 'h'): 2,
 ('c', 'a'): 1,
 ('f', 'o'): 1,
 ('o', 'u'): 1,
 ('u', 'g'): 1,
 ('g', 'h'): 1,
 ('h', 't'): 1,
 ('o', 'v'): 1,
 ('v', 'e'): 1,
 ('e', 'r'): 1,
 ('h', 'a'): 1,
 ('t', '.'): 1}

In [119]:
pair = find_top_pair(pairs); pair

('a', 't')

In [120]:
new_token = ''.join(pair); new_token

'at'

In [121]:
def update_words_with_new_token(new_pair, corresponding_new_token):
    """Merge every occurrence of ``new_pair`` in the global ``words``.

    Each word is scanned left to right; wherever the two tokens of
    ``new_pair`` sit next to each other they are replaced by the single
    token ``corresponding_new_token``. ``words`` is updated in place and
    nothing is returned.
    """
    for position, symbols in enumerate(words):
        merged = []
        cursor = 0
        while cursor < len(symbols):
            has_neighbour = cursor + 1 < len(symbols)
            if has_neighbour and (symbols[cursor], symbols[cursor + 1]) == new_pair:
                merged.append(corresponding_new_token)
                cursor += 2  # both halves of the pair are consumed
            else:
                merged.append(symbols[cursor])
                cursor += 1
        # Only swap in the new list when at least one merge happened.
        if len(merged) != len(symbols):
            words[position] = merged

In [122]:
update_words_with_new_token(pair, new_token)

In [123]:
words

[['T', 'h', 'e'],
 ['b', 'at', 'at'],
 ['a', 'n', 'd'],
 ['t', 'h', 'e'],
 ['c', 'at'],
 ['f', 'o', 'u', 'g', 'h', 't'],
 ['o', 'v', 'e', 'r'],
 ['t', 'h', 'e'],
 ['h', 'at', '.']]

### Put it all together

In [192]:
# Target vocabulary size reached by merging; '<unk>' and ' ' are added on top.
N_TOKENS = 20
# Start again from single characters so earlier experiments don't leak in.
words = corpus.split(' ')
words = [list(word) for word in words]
tokens = set(t for word in words for t in word)
while len(tokens) < N_TOKENS:
    pairs = generate_pairs()
    pair = find_top_pair(pairs)
    if pair is None:
        # Every word is already a single token, so there is nothing left to
        # merge. Stop here instead of failing on ''.join(None) when N_TOKENS
        # is larger than what this corpus can produce.
        break
    new_token = ''.join(pair)
    tokens.add(new_token)
    update_words_with_new_token(pair, new_token)
# Special tokens: '<unk>' stands in for unknown characters when encoding,
# and ' ' is the separator emitted between words.
tokens.add('<unk>')
tokens.add(' ')
# NOTE: ids follow the set's iteration order, which is not guaranteed to be
# the same from one interpreter run to the next.
token_to_id = {t:i for i, t in enumerate(tokens)}
id_to_token = {i:t for t, i in token_to_id.items()}

In [193]:
words

[['The'],
 ['b', 'at', 'at'],
 ['a', 'n', 'd'],
 ['the'],
 ['c', 'at'],
 ['f', 'o', 'u', 'g', 'h', 't'],
 ['o', 'v', 'e', 'r'],
 ['the'],
 ['h', 'at', '.']]

In [194]:
tokens

{' ',
 '.',
 '<unk>',
 'T',
 'The',
 'a',
 'at',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'he',
 'n',
 'o',
 'r',
 't',
 'the',
 'u',
 'v'}

In [195]:
token_to_id

{'d': 0,
 'The': 1,
 'h': 2,
 'n': 3,
 ' ': 4,
 'o': 5,
 'g': 6,
 'at': 7,
 'e': 8,
 'v': 9,
 't': 10,
 'a': 11,
 'f': 12,
 'u': 13,
 'he': 14,
 '.': 15,
 'r': 16,
 '<unk>': 17,
 'b': 18,
 'c': 19,
 'the': 20,
 'T': 21}

In [196]:
id_to_token

{0: 'd',
 1: 'The',
 2: 'h',
 3: 'n',
 4: ' ',
 5: 'o',
 6: 'g',
 7: 'at',
 8: 'e',
 9: 'v',
 10: 't',
 11: 'a',
 12: 'f',
 13: 'u',
 14: 'he',
 15: '.',
 16: 'r',
 17: '<unk>',
 18: 'b',
 19: 'c',
 20: 'the',
 21: 'T'}

## Tokenize

In [197]:
sentence = "The cat found the hat."

In [199]:
def encode_word(word):
    """Encode one word as a list of token ids using ``token_to_id``.

    At each step the longest prefix of the remaining text that is a known
    token is consumed. If not even the first character is a known token,
    that single character is emitted as ``'<unk>'``.
    """
    ids = []
    remaining = word
    while remaining:
        found = None
        consumed = 1  # an unknown character still uses up one position
        # Try the longest candidate first, shrinking one character at a time.
        for end in range(len(remaining), 0, -1):
            candidate = token_to_id.get(remaining[:end])
            if candidate is not None:
                found = candidate
                consumed = end
                break
        if found is None:
            found = token_to_id['<unk>']
        ids.append(found)
        remaining = remaining[consumed:]
    return ids

In [200]:
encode_word('bat')

[18, 7]

In [201]:
encode_word('batat')

[18, 7, 7]

In [202]:
encode_word('the')

[20]

In [203]:
encode_word('bz')

[18, 17]

In [204]:
encode_word('zw')

[17, 17]

In [218]:
def encode(sentence):
    """Encode a sentence as a flat list of token ids.

    The sentence is split on single spaces, each piece is encoded with
    ``encode_word``, and the id of the ``' '`` token is placed between
    consecutive pieces.
    """
    ids = []
    for piece in sentence.split(' '):
        ids.extend(encode_word(piece))
        ids.append(token_to_id[' '])
    # A separator was appended after every piece, including the final one,
    # so remove the trailing separator.
    del ids[-1:]
    return ids

In [221]:
encoded = encode(sentence); encoded

[1, 4, 19, 7, 4, 12, 5, 13, 3, 0, 4, 20, 4, 2, 7, 15]

In [222]:
def decode(encoded_sentence):
    """Turn a sequence of token ids back into text.

    Each id is looked up in ``id_to_token`` and the resulting strings are
    concatenated in order.
    """
    pieces = [id_to_token[token_id] for token_id in encoded_sentence]
    return ''.join(pieces)

In [223]:
decode(encoded)

'The cat found the hat.'

In [224]:
decode(encode('The zebra lost the hat.'))

'The <unk>ebra <unk>o<unk>t the hat.'

## Questions along the way

- Do spaces or other word separators get included in the words and encoded as usual? (Unlike what I'm doing above.) That seems cleaner.