# Bottom-up Tokenization

## Develop an algorithm

### Import libraries

In [12]:
import json
import random
import re
from collections import Counter, defaultdict
import nltk
from nltk.tokenize import sent_tokenize, wordpunct_tokenize
nltk.download('punkt_tab')

from tokenizers import BertWordPieceTokenizer
import os, json, glob
from collections import defaultdict

# Set random seed for reproducibility
random.seed(42)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/davepipon/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


### Load sample data

In [13]:
# Set the path to your dataset directory
path = os.path.expanduser("/Users/davepipon/Desktop/DS397 Data/coleridgeinitiative-show-us-the-data/train/*.json")

# Load training file names
files = glob.glob(path)

# Randomly select 2000 files
sample_files = random.sample(files, min(2000, len(files)))

# Read and store randomly sampled documents
sample_data = []
for path in sample_files:
    with open(path) as f:
        data = json.load(f)
        for entry in data:
            sample_data.append(entry.get("text", ""))

print("Number of sampled JSON files:", len(sample_files))
print("Number of texts stored:", len(sample_data))
print("Snippet of first doc:\n", sample_data[0][:300], "...")

Number of sampled JSON files: 2000
Number of texts stored: 37728
Snippet of first doc:
 alzheimer's disease and other types of dementia are the top cause for disabilities in later life and various types of experiments have been performed to understand the underlying mechanisms of the disease with the aim of coming up with potential drug targets. these experiments have been carried out  ...


### Pre-processing

#### Segment to sentences

In [14]:
# Segment to sentences
all_sentences = []
for doc in sample_data:
    sents = sent_tokenize(doc)
    all_sentences.extend(sents)

# Ensure there are no empty sentences
all_sentences = [sent for sent in all_sentences if len(sent) > 0]

print("Number of sentences:", len(all_sentences))

Number of sentences: 599606


#### Generate corpus

In [15]:
# Simple regex tokenizer (split on non-alphabetic chars)
tokens = []
for doc in sample_data:
    words = re.findall(r"\b\w+\b", doc.lower())  # lowercase for consistency
    tokens.extend(words)

# Remove tokens that has special characters or numbers
tokens = re.findall(r'[a-zA-Z]+', ' '.join(tokens))

# Generate unique tokens with frequency
token_freq = Counter(tokens)

# Sort tokens by frequency
corpus = sorted(token_freq.items(), key=lambda x: x[1], reverse=True)

print("Number of unique tokens:", len(token_freq))
print("Top 10 tokens by frequency:", corpus[:10])

Number of unique tokens: 115330
Top 10 tokens by frequency: [('the', 863626), ('of', 503701), ('and', 454964), ('in', 362744), ('to', 316623), ('a', 234964), ('for', 172715), ('is', 135841), ('that', 131452), ('with', 113159)]


#### Create initial vocabulary

In [16]:
# Extract unique list of letters from corpus as initial vocab
unique_letters = re.sub(r'[^a-zA-Z\s]', '', ''.join(tokens))
vocab = list(set(unique_letters))

print("Initial vocab size:", len(vocab))
print("Initial vocab:", vocab)

Initial vocab size: 26
Initial vocab: ['e', 'y', 'u', 'v', 'd', 'p', 'h', 'o', 'n', 'b', 'l', 't', 'k', 'x', 'w', 's', 'i', 'g', 'f', 'z', 'a', 'r', 'j', 'c', 'm', 'q']


### Implement bottom-up tokenization

#### Byte-pairing encoding

In [17]:
# Define function to count frequency of adjacent symbol pairs
def get_pair_stats(corpus):
    pair_freq = defaultdict(int)
    for symbols, freq in corpus:
        for i in range(len(symbols) - 1):
            pair = (symbols[i], symbols[i + 1])
            pair_freq[pair] += freq
    return pair_freq

# Define function to merge the most frequent pair in the corpus
def merge_pair(pair, corpus):
    a, b = pair
    new_corpus = []
    for symbols, freq in corpus:
        new_syms = []
        i = 0
        while i < len(symbols):
            if i < len(symbols)-1 and symbols[i] == a and symbols[i+1] == b:
                new_syms.append(a+b)   # merge
                i += 2
            else:
                new_syms.append(symbols[i])
                i += 1
        new_corpus.append((new_syms, freq))
    return new_corpus

# Main function to perform byte-pair encoding
def byte_pair_encoding(corpus, vocab, num_merges=100):
    """
    corpus: list of (word, freq) where word is a string.
    vocab: initial vocab.
    """
    # initialize corpus as list of (symbols, freq)
    corpus = [(list(word), freq) for word, freq in corpus]

    for _ in range(num_merges):
        pair_freq = get_pair_stats(corpus)
        if not pair_freq:
            break
        best_pair = max(pair_freq, key=pair_freq.get)   # most frequent
        vocab.append(''.join(best_pair))                # add to vocab
        corpus = merge_pair(best_pair, corpus)          # update corpus

    return vocab, corpus

In [18]:
# Implement BPE
num_merges = 1000
expanded_vocab, final_corpus = byte_pair_encoding(corpus, vocab, num_merges=num_merges)
print("BPE vocab size:", len(expanded_vocab))
print("BPE vocab snippet:", expanded_vocab[:50])
print(expanded_vocab[:-50])

BPE vocab size: 1026
BPE vocab snippet: ['e', 'y', 'u', 'v', 'd', 'p', 'h', 'o', 'n', 'b', 'l', 't', 'k', 'x', 'w', 's', 'i', 'g', 'f', 'z', 'a', 'r', 'j', 'c', 'm', 'q', 'th', 'in', 're', 'the', 'on', 'at', 'an', 'er', 'en', 'al', 'st', 'or', 'ed', 'es', 'ion', 'of', 'and', 'ar', 'as', 'ic', 'it', 'ro', 'ing', 'is']
['e', 'y', 'u', 'v', 'd', 'p', 'h', 'o', 'n', 'b', 'l', 't', 'k', 'x', 'w', 's', 'i', 'g', 'f', 'z', 'a', 'r', 'j', 'c', 'm', 'q', 'th', 'in', 're', 'the', 'on', 'at', 'an', 'er', 'en', 'al', 'st', 'or', 'ed', 'es', 'ion', 'of', 'and', 'ar', 'as', 'ic', 'it', 'ro', 'ing', 'is', 'ent', 'to', 'le', 'ch', 'ct', 'co', 'se', 've', 'ation', 'de', 'for', 'we', 'im', 'ly', 'ou', 'su', 'be', 'lo', 'ig', 'ce', 'ra', 'con', 'il', 'me', 'pro', 'ab', 'ol', 'di', 'res', 'ge', 'are', 'mp', 'te', 'un', 'mo', 'wi', 'ad', 'ver', 'ul', 'with', 'that', 'ment', 'wh', 'us', 'ci', 'ud', 'ter', 'ate', 'fe', 'ur', 'per', 'id', 'no', 'ex', 'ma', 'po', 'ity', 'ir', 'ive', 'pe', 'ac', 'ut', 'so', 'cl

# Compare with HuggingFace WordPiece

## Tokenization using WordPiece

In [20]:
# Define a generator to yield single-token sentences for WordPiece tokenizer
def corpus_iterator(corpus):
    for word, freq in corpus:
        for _ in range(freq):
            yield [word]  # single-token sentence

# Initialize tokenizer
tokenizer = BertWordPieceTokenizer(lowercase=True)
tokenizer.train_from_iterator(
    corpus_iterator(corpus),
    vocab_size=1000,  # Set desired vocab size
    min_frequency=1,  # Minimum frequency for a token to be included
)

print("WordPiece vocab size:", tokenizer.get_vocab_size())
print("WordPiece vocab snippet:", list(tokenizer.get_vocab().items())[:50])
print(list(tokenizer.get_vocab().items())[-50:])




WordPiece vocab size: 1000
WordPiece vocab snippet: [('pri', 728), ('##get', 842), ('through', 722), ('the', 60), ('patients', 956), ('trans', 535), ('wo', 564), ('nor', 721), ('pres', 504), ('##eld', 775), ('##ult', 481), ('new', 683), ('##tiv', 411), ('##ci', 816), ('##ly', 101), ('conf', 974), ('some', 516), ('##ub', 491), ('distrib', 885), ('period', 892), ('##vel', 174), ('##onom', 837), ('data', 206), ('pl', 413), ('total', 739), ('##ind', 399), ('##ame', 613), ('##ro', 76), ('scores', 953), ('aff', 851), ('go', 598), ('ro', 650), ('school', 208), ('##ilar', 698), ('b', 6), ('##ul', 98), ('perform', 595), ('me', 239), ('how', 381), ('##ency', 803), ('i', 13), ('##ass', 287), ('##ith', 117), ('with', 123), ('land', 776), ('cr', 690), ('##ss', 418), ('##ases', 544), ('effects', 691), ('character', 727)]
[('y', 29), ('model', 281), ('some', 516), ('percent', 386), ('##ens', 294), ('sign', 365), ('bet', 269), ('##ol', 103), ('##uch', 372), ('rates', 906), ('provide', 928), ('colle

# Conclusion

Comparing the developed BPE and WordPiece tokenizer, the difference in the token size relies on the hyperparamaters. In particular, vocab size for BPE tokenizer is dependent on the number of merges k while WordPiece tokenizer is dependent on the vocab size. Notably, the vocabulary from the token learner at lower number of merges or vocab size. The learner should have large iterations to give enough space to cover most frequent words. In this exercise, most of the vocab composition are characters and subwords.