# Lecture 2: Tokenization

### Regex for tokenization 

In [1]:
import re
import typing

In [2]:
# Sample passage for the regex experiments: it contains a price, sentence
# punctuation, a double space and line breaks.
text = (
    "Good muffins cost $3.88\n"
    "in New York.  Please buy me\n"
    "two of them!\n"
    "\n"
    "Thanks!"
)
print(text)

Good muffins cost $3.88
in New York.  Please buy me
two of them!

Thanks!


In [3]:
# Attempt 1: maximal runs of ASCII letters only (digits and punctuation are dropped).
# The upper-case range must be `A-Z`: `A-z` would also match the ASCII
# characters that sit between 'Z' and 'a', i.e. [ \ ] ^ _ and the backtick.
regex = r'[a-zA-Z]+'
re.findall(regex, text)

['Good',
 'muffins',
 'cost',
 'in',
 'New',
 'York',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 'Thanks']

In [4]:
# Attempt 2: `\w+` takes maximal runs of word characters (letters, digits,
# underscore), so the price splits into '3' and '88' and punctuation is lost.
regex = r'\w+'
re.findall(pattern=regex, string=text)

['Good',
 'muffins',
 'cost',
 '3',
 '88',
 'in',
 'New',
 'York',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 'Thanks']

In [5]:
# Attempt 3: `\S+` takes maximal runs of non-whitespace, i.e. a whitespace
# split. '$3.88' survives, but punctuation stays glued on ('York.', 'them!').
regex = r'\S+'
re.findall(pattern=regex, string=text)

['Good',
 'muffins',
 'cost',
 '$3.88',
 'in',
 'New',
 'York.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them!',
 'Thanks!']

In [6]:
# Final (pretty good) tokenization.
# Alternatives are tried left to right at each position:
#   \w+        a run of word characters
#   \$[\d\.]+  a dollar amount such as $3.88
#   \S+        any other run of non-whitespace (here: the punctuation marks)
regex = r'\w+|\$[\d\.]+|\S+'
re.findall(pattern=regex, string=text)

['Good',
 'muffins',
 'cost',
 '$3.88',
 'in',
 'New',
 'York',
 '.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 '!',
 'Thanks',
 '!']

### BPE algorithm 

In [7]:
from collections import defaultdict, Counter
# defaultdict and Counter are dict subclasses provided by the `collections` module.
# They are not built-in types, but they behave like dictionaries with additional functionality.

`defaultdict` is a subclass of the built-in dict. It provides a default value for a nonexistent key, allowing you to avoid `KeyError`.

In [8]:
# `list` is the default factory: looking up a missing key stores and returns list(), i.e. []
dd = defaultdict(list)

In [9]:
# 'a' is not a key yet: the lookup creates an empty list for it, which append() then fills
dd['a'].append(1)
print(dd)

defaultdict(<class 'list'>, {'a': [1]})


In [10]:
# Counter is a dict specialised for tallies: a missing key reads as 0,
# so it can be incremented without being initialised first.
cc = Counter()
cc['a'] = cc['a'] + 1
print(cc)

Counter({'a': 1})


In [11]:
# A str is already an iterable of its characters, so join can consume it directly.
word = "great"
chars = ' '.join(word)
chars

'g r e a t'

In [12]:
class BPE:
    """Minimal byte-pair-encoding (BPE) token learner for demonstration.

    State:
        vocab: set of token strings learned so far.
        current_corpus: Counter mapping a space-separated token sequence
            (e.g. ``"l o w _"``) to the frequency of that word.

    Words must not contain spaces, because a single space is used as the
    separator between tokens inside ``current_corpus`` keys.
    """

    def __init__(self):
        self.vocab = set()
        self.current_corpus = Counter()

    def init_vocab(self, corpus: dict):
        """Split every word into characters and add those characters to the vocab.

        Args:
            corpus: mapping from word to its frequency.

        Frequencies are added to whatever ``current_corpus`` already holds,
        so calling this twice on the same instance accumulates counts.
        """
        for word, freq in corpus.items():
            # Corpus entry: the word with its characters separated by spaces
            self.current_corpus[" ".join(word)] += freq
            # Every distinct character is an initial vocabulary item
            self.vocab.update(word)

    def get_stats(self):
        """Count adjacent token pairs across the corpus, weighted by word frequency.

        Returns:
            A defaultdict mapping ``(left, right)`` token tuples to their
            total frequency. It is empty when no word has two or more tokens.
        """
        pairs = defaultdict(int)
        for word, freq in self.current_corpus.items():
            toks = word.split(" ")  # the tokens built so far for this word
            for left, right in zip(toks, toks[1:]):
                pairs[(left, right)] += freq
        return pairs

    def merge_vocab(self, pair: tuple):
        """Fuse every adjacent occurrence of ``pair`` into a single token.

        Args:
            pair: tuple of token strings; the merged token is their concatenation.

        The merged token is added to ``vocab`` and ``current_corpus`` is
        replaced by a new Counter with the merge applied.
        """
        # Update vocab
        replacement = "".join(pair)
        self.vocab.add(replacement)

        # Match the pair only where it lines up with whole tokens. A plain
        # str.replace of "e r" would also hit the tail of "ne r" and fuse
        # across a token boundary, producing "ner".
        bigram = re.compile(r"(?<!\S)" + re.escape(" ".join(pair)) + r"(?!\S)")

        # Update corpus. A callable replacement keeps backslashes in tokens literal.
        new_corpus = Counter()
        for word, freq in self.current_corpus.items():
            new_corpus[bigram.sub(lambda _match: replacement, word)] += freq

        self.current_corpus = new_corpus

    def token_learner(self, corpus: dict, k: int, verbose=True, interactive=True):
        """Learn up to ``k`` merges from ``corpus``.

        Args:
            corpus: mapping from word to its frequency.
            k: maximum number of merges; learning stops early once no
                adjacent pair is left to merge.
            verbose: print the vocab and corpus before the first merge and
                after every merge.
            interactive: when ``verbose`` is also true, wait for Enter after
                each step. Pass ``False`` to run unattended (e.g. Run All).
                Nothing is prompted when ``verbose`` is false.
        """
        self.init_vocab(corpus)
        if verbose:
            print("Step 0")
            print("Vocab: ", self.vocab)
            print("Corpus: ", self.current_corpus)
            print("===="*15)

        for i in range(k):
            pairs = self.get_stats()
            if not pairs:
                break  # nothing left to merge
            # On ties, max() keeps the pair that get_stats encountered first
            best_pair = max(pairs, key=pairs.get)
            the_count = pairs[best_pair]
            self.merge_vocab(best_pair)

            if verbose:
                print(f"Step {i + 1}: Merged {best_pair} -> {''.join(best_pair)}, Count:{the_count}")
                print("Vocab: ", self.vocab)
                print("Corpus: ", self.current_corpus)
                print("===="*15)
                if interactive:
                    input("Next? ")

In [13]:
# Word -> frequency table for the BPE demo.
# NOTE(review): the recorded output below shows no "!" in the vocabulary, so it
# appears to come from a run on words without "!" - re-run the cell to refresh it.
training_corpus = dict([
    ('low!_', 5),
    ('lowest!_', 2),
    ('newer!_', 6),
    ('wider!_', 3),
    ('new!_', 2),
])

tokenizer = BPE()

# Learn at most 10 merges, printing the vocab and corpus after each one
tokenizer.token_learner(training_corpus, k=10)

Step 0
Vocab:  {'d', 'i', 's', 'n', 'o', 'w', 'l', 't', 'e', 'r', '_'}
Corpus:  Counter({'n e w e r _': 6, 'l o w _': 5, 'w i d e r _': 3, 'l o w e s t _': 2, 'n e w _': 2})
Step 1: Merged ('e', 'r') -> er, Count:9
Vocab:  {'d', 'er', 'i', 's', 'n', 'o', 'w', 'l', 't', 'e', 'r', '_'}
Corpus:  {'l o w _': 5, 'l o w e s t _': 2, 'n e w er _': 6, 'w i d er _': 3, 'n e w _': 2}
Next? y
Step 2: Merged ('er', '_') -> er_, Count:9
Vocab:  {'d', 'er', 'er_', 'i', 's', 'n', 'o', 'w', 'l', 't', 'e', 'r', '_'}
Corpus:  {'l o w _': 5, 'l o w e s t _': 2, 'n e w er_': 6, 'w i d er_': 3, 'n e w _': 2}
Next? y
Step 3: Merged ('n', 'e') -> ne, Count:8
Vocab:  {'d', 'er', 'er_', 'i', 's', 'n', 'o', 'w', 'l', 't', 'ne', 'e', 'r', '_'}
Corpus:  {'l o w _': 5, 'l o w e s t _': 2, 'ne w er_': 6, 'w i d er_': 3, 'ne w _': 2}
Next? y
Step 4: Merged ('ne', 'w') -> new, Count:8
Vocab:  {'d', 'er', 'er_', 'i', 's', 'n', 'o', 'w', 'l', 'new', 't', 'ne', 'e', 'r', '_'}
Corpus:  {'l o w _': 5, 'l o w e s t _': 2, 

### Demo of GPT-2 tokenizer

In [14]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [15]:
from transformers import GPT2Tokenizer

# Pretrained GPT-2 tokenizer (GPT-3's tokenizer is similar in approach)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Sentence to run through the tokenizer
text = "Unhappiness in neuralink's hyper-fast supercomputing"

# Two views of the same text: integer token IDs and the subword strings
tokens = tokenizer.encode(text)
tokenized_words = tokenizer.tokenize(text)

print(f"Token IDs: {tokens}\n")
print(f"Tokenized Words/Subwords: {tokenized_words}\n")


Token IDs: [3118, 71, 42661, 287, 17019, 676, 338, 8718, 12, 7217, 2208, 785, 48074]

Tokenized Words/Subwords: ['Un', 'h', 'appiness', 'Ġin', 'Ġneural', 'ink', "'s", 'Ġhyper', '-', 'fast', 'Ġsuper', 'com', 'puting']



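The `Ġ` character at the start of some tokens (like `Ġin`, `Ġneural`) stands for a space. It is part of the token itself rather than a separate special token, and it indicates that the token begins a new word that follows a space in the input text. Tokens without it (like `h`, `appiness`, `ink`) either continue the preceding token or, like `Un`, start the text.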