In [46]:
import re
def tokenize(text):
    tokens = re.findall(r"\w+|[^\w\s]", text)
    return tokens

In [53]:
def get_pair_frequencies(corpus):
    pairs = {}
    for word in corpus:
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pair = (symbols[i], symbols[i + 1])
            if pair in pairs:
                pairs[pair] += 1
            else:
                pairs[pair] = 1
    return pairs

def merge_pair(corpus, pair):
    merged_corpus = []
    bigram = ' '.join(pair)
    replacement = ''.join(pair)

    for word in corpus:
        new_word = word.replace(bigram, replacement)
        merged_corpus.append(new_word)
    return merged_corpus

def bpe(corpus, num_iterations):
    corpus = [' '.join(list(word)) for word in corpus]
    vocab = set(' '.join(corpus).split())
    vocab = list(vocab)

    for _ in range(num_iterations):
        pair_frequencies = get_pair_frequencies(corpus)
        if not pair_frequencies:
            break

        most_frequent_pair = max(pair_frequencies, key=pair_frequencies.get)

        corpus = merge_pair(corpus, most_frequent_pair)

        vocab.append(''.join(most_frequent_pair))
    return vocab

In [54]:
def levenshtein_distance(s1, s2, m, n):
    if m == 0:
        return n

    if n == 0:
        return m

    if s1[m - 1] == s2[n - 1]:
        return levenshtein_distance(s1, s2, m - 1, n - 1)

    return 1 + min(levenshtein_distance(s1, s2, m, n - 1),
                   levenshtein_distance(s1, s2, m - 1, n),
                   levenshtein_distance(s1, s2, m - 1, n - 1))
def editDist(s1, s2):
    return levenshtein_distance(s1, s2, len(s1), len(s2))

In [55]:
def pipeline(text):
    tokens = tokenize(text)
    vocab = bpe(tokens,5)
    return tokens, vocab

In [58]:
text = "London is the capital and most populous city of England and the United Kingdom."
print(pipeline(text))
print(editDist("London", "Londinium"))

(['London', 'is', 'the', 'capital', 'and', 'most', 'populous', 'city', 'of', 'England', 'and', 'the', 'United', 'Kingdom', '.'], ['.', 'n', 'U', 'i', 'l', 'u', 't', 'o', 'd', 'g', 'm', 'p', 'c', 'E', 'a', 's', 'L', 'h', 'e', 'K', 'f', 'y', 'nd', 'it', 'and', 'th', 'the'])
4
