In [64]:
import tokenizer as tok_tests
import re
from collections import Counter
from collections import deque
from collections import defaultdict
import itertools

In [41]:
def corpus_common_tokens_counts(list_of_strings, n):
    tokens = []
    for s in list_of_strings:
        tokens += re.findall(r"\w+|[^\w\s]", s)
    
    return Counter(tokens).most_common(n)

def corpus_common_tokens(list_of_strings):
    counts = corpus_common_tokens_counts(list_of_strings, 30000)
    return [t[0] for t in counts]

In [42]:
corpus_common_tokens(['test hello', 'hello', 'foo bar baz'])

['hello', 'test', 'foo', 'bar', 'baz']

In [43]:
tok_tests.test_tokenizer_from_corpus_fn(corpus_common_tokens)

In [51]:
class Tokenizer():
    def __init__(self, token_list):
        self.token_list = token_list
        self.token_dict = {}  #string to int
        for d in token_list:
            self.token_dict[d["piece"]] = d["id"]
        self.id_dict  = {v:k for k, v in self.token_dict.items()} #int to string
        
    def decode(self, ids):
        words = [self.id_dict.get(i, "[UNK]") for i in ids]
        return "".join(words)
    
    def tokenize(self, string):
        words = re.findall(r"\w+|[^\w\s]", string)
        tokens = [self.token_dict.get(w, 3) for w in words]
        return tokens
    
tok_tests.test_tokenizer(Tokenizer)

In [90]:
def pairwise(iterable):
    # pairwise('ABCDEFG') --> AB BC CD DE EF FG
    a, b = itertools.tee(iterable)
    next(b, None)
    return zip(a, b)

class BPETokenizer(Tokenizer):
    def __init__(self):
        pass
        
    def tokenize(self, string, vocab, num_merges=1):
        tokens = deque([c for c in string])
        for _ in range(num_merges):
            final = deque()
            last1 = tokens.popleft()
            while len(tokens) > 0:
                last2 = tokens.popleft()
                if last1 + last2 in vocab:
                    final.append(last1 + last2)
                    if len(tokens) > 0:
                        last1 = tokens.popleft()
                    else:
                        last1 = None
                else:
                    final.append(last1)
                    last1 = last2
            if last1:
                final.append(last1)
            tokens = final
        return tokens
    
    def from_corpus(self, corpus_filename, num_merges=2, num_lines=None):
        f = open(corpus_filename, "r")
        lines = f.readlines()
        if num_lines:
            lines = lines[:num_lines]
        f.close()
        text = " ".join(lines)
        tokens = [t for t in text]
        
        vocab = set(tokens)
        
        for i in range(num_merges):
            final = deque()
            n_seq = defaultdict(int)
            pairs = ["".join(p) for p in pairwise(tokens) if " " not in p and "\n" not in p]
            counts = Counter(pairs)
            vocab.add(counts.most_common(1)[0][0])
            tokens = self.tokenize(text, vocab, i)
            
        return vocab            
        
    
# t = BPETokenizer(["aa", "bb", "c", "dd", "d", "ddd"], 3)

# t.tokenize("aabbcccccccdddddabcaabb")

t = BPETokenizer()
t.from_corpus('shakespeare.txt', 100, 500)

{'\n',
 ' ',
 '!',
 '"',
 '#',
 '%',
 "'",
 "'s",
 '(',
 ')',
 '*',
 '**',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 'A',
 'AN',
 'AR',
 'B',
 'C',
 'CO',
 'D',
 'E',
 'ER',
 'ES',
 'F',
 'G',
 'H',
 'I',
 'IN',
 'IS',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'ON',
 'OR',
 'P',
 'Pro',
 'Q',
 'R',
 'S',
 'T',
 'TE',
 'TH',
 'Th',
 'U',
 'V',
 'W',
 'Wh',
 'X',
 'Y',
 '[',
 ']',
 '_',
 'a',
 'al',
 'all',
 'an',
 'and',
 'ar',
 'at',
 'aut',
 'b',
 'be',
 'by',
 'c',
 'ce',
 'ch',
 'co',
 'ct',
 'd',
 'de',
 'do',
 'e',
 'e,',
 'ect',
 'ed',
 'en',
 'er',
 'es',
 'et',
 'etext',
 'ext',
 'f',
 'for',
 'g',
 'gh',
 'ght',
 'h',
 'ha',
 'i',
 'ill',
 'in',
 'ing',
 'ion',
 'ir',
 'is',
 'it',
 'ith',
 'ive',
 'j',
 'ject',
 'k',
 'l',
 'la',
 'ld',
 'le',
 'li',
 'll',
 'lo',
 'm',
 'ma',
 'me',
 'n',
 'nd',
 'ne',
 'no',
 'nt',
 'o',
 'of',
 'on',
 'or',
 'ou',
 'p',
 'pro',
 'q',
 'r',
 'ra',
 're',
 'rea',