# Implementing Byte-Pair Encoding (BPE)

In [None]:
import nltk
from collections import Counter

class BPETokenizer:
  """Character-level Byte-Pair Encoding (BPE) tokenizer.

  Spaces in the text are replaced by '#', every distinct character becomes
  one initial token, and `train()` then repeatedly fuses the most frequent
  adjacent pair of tokens into a new token until `vocab_size` tokens exist.

  Attributes:
    vocab_size: Target number of tokens (initial characters + merges).
    text_to_id: Maps a token string to its integer id.
    id_to_text: Maps an integer id to its token string.
    ids_to_merge: The training text as a list of ids; rewritten by `train()`.
    merges: Maps a merged pair `(left_id, right_id)` to the id that replaced
      it. Ids are handed out in increasing order, so a smaller id means the
      merge was learned earlier.
  """

  def __init__(self, text, vocab_size):
    """Build the initial character vocabulary from `text`.

    Args:
      text: Training corpus as a single string.
      vocab_size: Target vocabulary size used by `train()`.
    """
    self.vocab_size = vocab_size
    self.create_initial_vocab(text)

  def create_initial_vocab(self, text):
    """(Re)initialise the vocabulary with one token per distinct character.

    Any previously learned merges are discarded because their ids refer to
    the old vocabulary.

    Args:
      text: Training corpus as a single string.
    """
    symbols = list(text.replace(" ", "#"))
    # Sorted so that id assignment is reproducible between runs; iterating a
    # bare set of strings depends on per-process hash randomisation.
    vocab = sorted(set(symbols))

    self.text_to_id = {symbol: idx for idx, symbol in enumerate(vocab)}
    self.id_to_text = dict(enumerate(vocab))
    self.ids_to_merge = [self.text_to_id[symbol] for symbol in symbols]
    # Defined up front so encode() also works on an untrained tokenizer.
    self.merges = {}

  def merge(self, ids, pair, idx):
    """Return a copy of `ids` with each occurrence of `pair` replaced by `idx`.

    Occurrences are matched left to right and do not overlap.

    Args:
      ids: Sequence of token ids.
      pair: Two-element sequence `(left_id, right_id)` to look for.
      idx: Id written in place of every matched pair.

    Returns:
      A new list of ids; `ids` itself is not modified.
    """
    newids = []
    last = len(ids) - 1
    i = 0
    while i < len(ids):
      # `i < last` guarantees ids[i + 1] exists, so a trailing pair[0] is
      # copied through instead of indexing past the end.
      if i < last and ids[i] == pair[0] and ids[i + 1] == pair[1]:
        newids.append(idx)
        i += 2
      else:
        newids.append(ids[i])
        i += 1
    return newids

  def train(self):
    """Learn merges until the vocabulary holds `vocab_size` tokens.

    Stops early when no mergeable pair is left. Learned merges are added to
    `self.merges`; calling `train()` again keeps the existing ones.
    """
    num_merges = self.vocab_size - len(self.id_to_text)

    for _ in range(num_merges):
      bigrams = zip(self.ids_to_merge, self.ids_to_merge[1:])
      # A pair whose left token contains '#' would span a word boundary,
      # so it is never a merge candidate.
      counts = Counter(
          pair for pair in bigrams if '#' not in self.id_to_text[pair[0]]
      )
      if not counts:
        # Nothing left to merge (e.g. the text collapsed into single tokens).
        break

      candidate = counts.most_common(1)[0][0]
      # Ids are dense (0 .. len - 1), so the next free id is the current size.
      new_id = len(self.id_to_text)
      merged_text = self.id_to_text[candidate[0]] + self.id_to_text[candidate[1]]

      self.ids_to_merge = self.merge(self.ids_to_merge, candidate, new_id)
      self.merges[candidate] = new_id
      self.id_to_text[new_id] = merged_text
      self.text_to_id[merged_text] = new_id

    print('Training Complete!')

  def decode(self, ids):
    """Map token ids back to their token strings.

    Args:
      ids: Iterable of token ids.

    Returns:
      A list with one token string per id (the strings are not joined).
    """
    return [self.id_to_text[i] for i in ids]

  def encode(self, text):
    """Tokenize `text` with the learned merges.

    Spaces become '#', a trailing '#' is appended, and merges are applied in
    the order they were learned until none applies.

    Args:
      text: String to tokenize.

    Returns:
      A list of token ids.

    Raises:
      KeyError: If `text` contains a character absent from the training text.
    """
    symbols = list(text.replace(" ", "#"))
    symbols.append('#')
    ids = [self.text_to_id[symbol] for symbol in symbols]

    while len(ids) >= 2:
      # Pick the adjacent pair that was merged earliest during training;
      # pairs that were never merged rank last.
      pair = min(
          zip(ids, ids[1:]),
          key=lambda p: self.merges.get(p, float("inf")),
      )
      if pair not in self.merges:
        break
      ids = self.merge(ids, pair, self.merges[pair])

    return ids

In [None]:
# Build the from-scratch tokenizer on `clean_book` (defined outside this section)
# with a target vocabulary size of 500, then learn the merges.
tokenizer = BPETokenizer(clean_book, 500)
tokenizer.train()

In [None]:
# Encode a sample sentence into token ids with the tokenizer trained above.
tokens = tokenizer.encode('Romeo, my love.')
print(tokens)
# decode() returns one token string per id; it does not join them into a sentence.
print(tokenizer.decode(list(tokens)))

In [None]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

# Repeat the experiment with the Hugging Face `tokenizers` library.
# NOTE: this rebinds `tokenizer`, replacing the BPETokenizer instance from the cells above.
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()
# Inspect how the pre-tokenizer splits a sample sentence before any BPE merges are applied.
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test pre-tokenization!") # the library marks the START of each word, whereas BPETokenizer above puts its '#' marker at the END of a word.


In [None]:
# Trainer for the library BPE model: the same 500-token vocabulary target as the
# from-scratch tokenizer, with "<|endoftext|>" registered as a special token.
trainer = trainers.BpeTrainer(vocab_size=500, special_tokens=["<|endoftext|>"])

In [None]:
# Tokenizer.train() takes a list of file paths, so save the text to a file first.
# (The tokenizers library also provides Tokenizer.train_from_iterator for in-memory text.)
# The encoding is pinned to UTF-8 so the file's contents do not depend on the platform's default encoding.
with open('clean_book.txt', 'w', encoding='utf-8') as outfile:
  outfile.write(clean_book)

In [None]:
# Train the library tokenizer on the text file written above, using the BPE trainer configured earlier.
tokenizer.train(["clean_book.txt"], trainer=trainer) # train!

In [None]:
# Encode the same sample sentence with the library tokenizer and show its token
# strings, for comparison with the from-scratch tokenizer's output.
encoding = tokenizer.encode("Romeo, my love.")
print(encoding.tokens)