<a href="https://colab.research.google.com/github/davidisinta/AI/blob/main/WordPiece_tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WordPiece tokenization

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [49]:
!pip install datasets evaluate transformers[sentencepiece]



In [50]:
corpus = []

with open("court_of_appeal.txt", "r", encoding="utf-8") as file:
    for line in file:
        cleaned_line = line.strip()  # Remove leading/trailing whitespace
        if cleaned_line:  # Ignore empty lines
            corpus.append(cleaned_line)


In [51]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [52]:
from collections import defaultdict

word_freqs = defaultdict(int)
for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1

In [53]:
alphabet = []
for word in word_freqs.keys():
    if word[0] not in alphabet:
        alphabet.append(word[0])
    for letter in word[1:]:
        if f"##{letter}" not in alphabet:
            alphabet.append(f"##{letter}")

alphabet.sort()
alphabet

print(alphabet)

['"', '##0', '##1', '##2', '##3', '##4', '##5', '##6', '##7', '##8', '##9', '##A', '##B', '##C', '##D', '##E', '##F', '##H', '##I', '##J', '##L', '##M', '##O', '##P', '##R', '##S', '##T', '##U', '##V', '##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##j', '##k', '##l', '##m', '##n', '##o', '##p', '##q', '##r', '##s', '##t', '##u', '##v', '##w', '##x', '##y', '##z', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z']


In [54]:
vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + alphabet.copy()

print(len(vocab))

136


In [55]:
splits = {
    word: [c if i == 0 else f"##{c}" for i, c in enumerate(word)]
    for word in word_freqs.keys()
}

In [56]:
def compute_pair_scores(splits):
    letter_freqs = defaultdict(int)
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        if len(split) == 1:
            letter_freqs[split[0]] += freq
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            letter_freqs[split[i]] += freq
            pair_freqs[pair] += freq
        letter_freqs[split[-1]] += freq

    scores = {
        pair: freq / (letter_freqs[pair[0]] * letter_freqs[pair[1]])
        for pair, freq in pair_freqs.items()
    }
    return scores

In [57]:
def merge_pair(a, b, splits):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue
        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                merge = a + b[2:] if b.startswith("##") else a + b
                split = split[:i] + [merge] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits

In [58]:
def tokenize(text):
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]
    encoded_words = [encode_word(word) for word in pre_tokenized_text]
    return sum(encoded_words, [])

In [59]:
vocab_size = 5000

print(len(vocab))

while len(vocab) < vocab_size:
    scores = compute_pair_scores(splits)
    best_pair, max_score = "", None
    for pair, score in scores.items():
        if max_score is None or max_score < score:
            best_pair = pair
            max_score = score
    splits = merge_pair(*best_pair, splits)
    new_token = (
        best_pair[0] + best_pair[1][2:]
        if best_pair[1].startswith("##")
        else best_pair[0] + best_pair[1]
    )
    vocab.append(new_token)


print(len(vocab))

136
5000


In [60]:
print(vocab)

['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '"', '##0', '##1', '##2', '##3', '##4', '##5', '##6', '##7', '##8', '##9', '##A', '##B', '##C', '##D', '##E', '##F', '##H', '##I', '##J', '##L', '##M', '##O', '##P', '##R', '##S', '##T', '##U', '##V', '##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##j', '##k', '##l', '##m', '##n', '##o', '##p', '##q', '##r', '##s', '##t', '##u', '##v', '##w', '##x', '##y', '##z', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', '##HE', '##PM', '##LJ', '##UM', '##LUM', '##OLUM', '##US', '##ST', '##OR', '##STR', '##LR', '##RD', '##ER', '##LRA', '##USA', '##AA', '##CU', '##RCU', '##RDC', '##ERC', '##CT', 'NRDC', 'OF',

In [61]:
def encode_word(word):
    tokens = []
    while len(word) > 0:
        i = len(word)
        while i > 0 and word[:i] not in vocab:
            i -= 1
        if i == 0:
            return ["[UNK]"]
        tokens.append(word[:i])
        word = word[i:]
        if len(word) > 0:
            word = f"##{word}"
    return tokens

In [62]:
print(encode_word("Hugging"))
print(encode_word("H+gging"))

['Hu', '##gg', '##ing']
['[UNK]']


In [63]:
tokenize("This is a test, this is very nice")

['This',
 'i',
 '##s',
 'a',
 't',
 '##e',
 '##s',
 '##t',
 ',',
 'th',
 '##i',
 '##s',
 'i',
 '##s',
 'v',
 '##e',
 '##r',
 '##y',
 'n',
 '##ic',
 '##e']

In [64]:
with open("court_of_appeal.txt", "r", encoding="utf-8") as file:
    text = file.read()


court_of_appeal_tokens = tokenize(text)

i = 0
for token in court_of_appeal_tokens:
    if i < 10:
        print(token)
        i += 1


Unit
##e
##d
S
##t
##a
##t
##e
##s
Court


In [65]:
def populate_corpus(file_path):
  corpus = []
  with open(file_path, "r", encoding="utf-8") as file:
    for line in file:
        cleaned_line = line.strip()  # Remove leading/trailing whitespace
        if cleaned_line:  # Ignore empty lines
            corpus.append(cleaned_line)
  print(corpus)

  return corpus

In [66]:
def analyze_word_freqs(corpus):
  word_freqs = defaultdict(int)
  for text in corpus:
      words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
      new_words = [word for word, offset in words_with_offsets]
      for word in new_words:
          word_freqs[word] += 1
  print(word_freqs)
  return word_freqs


In [67]:
def generate_alphabet(word_freqs):
  alphabet = []
  for word in word_freqs.keys():
      if word[0] not in alphabet:
          alphabet.append(word[0])
      for letter in word[1:]:
          if f"##{letter}" not in alphabet:
              alphabet.append(f"##{letter}")

  alphabet.sort()
  alphabet
  print(alphabet)
  return alphabet

In [68]:
def generate_vocab(alphabet):
  vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + alphabet.copy()
  print(len(vocab))
  return vocab

In [69]:
def generate_splits(word_freqs):
  splits = {
    word: [c if i == 0 else f"##{c}" for i, c in enumerate(word)]
    for word in word_freqs.keys()
}
  return splits

In [82]:
def train_model(vocab, vocab_size, splits):

  while len(vocab) < vocab_size:
      scores = compute_pair_scores(splits)
      best_pair, max_score = "", None
      for pair, score in scores.items():
          if max_score is None or max_score < score:
              best_pair = pair
              max_score = score
      splits = merge_pair(*best_pair, splits)
      new_token = (
          best_pair[0] + best_pair[1][2:]
          if best_pair[1].startswith("##")
          else best_pair[0] + best_pair[1]
      )
      vocab.append(new_token)

  return vocab

In [83]:
#training and utilization of wizard of oz
corpus = populate_corpus("wizard_of_oz.txt")
word_freqs = analyze_word_freqs(corpus)
alphabet = generate_alphabet(word_freqs)
vocab = generate_vocab(alphabet)
splits = generate_splits(word_freqs)
wizard_oz_model = train_model(vocab, 5000, splits)

['The Project Gutenberg eBook of The Wonderful Wizard of Oz', 'This ebook is for the use of anyone anywhere in the United States and', 'most other parts of the world at no cost and with almost no restrictions', 'whatsoever. You may copy it, give it away or re-use it under the terms', 'of the Project Gutenberg License included with this ebook or online', 'at www.gutenberg.org. If you are not located in the United States,', 'you will have to check the laws of the country where you are located', 'before using this eBook.', 'Title: The Wonderful Wizard of Oz', 'Author: L. Frank Baum', 'Release date: February 1, 1993 [eBook #55]', 'Most recently updated: December 29, 2024', 'Language: English', '*** START OF THE PROJECT GUTENBERG EBOOK THE WONDERFUL WIZARD OF OZ ***', '[Illustration]', 'The Wonderful Wizard of Oz', 'by L. Frank Baum', 'This book is dedicated to my good friend & comrade', 'My Wife', 'L.F.B.', 'Contents', 'Introduction', 'Chapter I. The Cyclone', 'Chapter II. The Council with

In [72]:
with open("wizard_of_oz.txt", "r", encoding="utf-8") as file:
    text = file.read()

wizard_of_oz_tokens = tokenize(text)

print(wizard_of_oz_tokens)

['Th', '##e', 'Proj', '##e', '##ct', 'Gut', '##e', '##nb', '##e', '##r', '##g', 'eBook', 'of', 'Th', '##e', 'Wond', '##e', '##rful', 'Wizard', 'of', 'Oz', 'This', 'ebook', 'is', 'for', 'th', '##e', 'us', '##e', 'of', 'anyon', '##e', 'anywh', '##e', '##r', '##e', 'in', 'th', '##e', 'Unit', '##e', '##d', 'Stat', '##e', '##s', 'and', 'most', 'oth', '##e', '##r', 'part', '##s', 'of', 'th', '##e', 'world', 'a', '##t', 'no', 'cost', 'and', 'with', 'almost', 'no', 'r', '##e', '##strictions', 'whatso', '##e', '##v', '##e', '##r', '.', 'You', 'may', 'copy', 'it', ',', 'giv', '##e', 'it', 'away', 'or', 'r', '##e', '-', 'us', '##e', 'it', 'und', '##e', '##r', 'th', '##e', 't', '##e', '##r', '##m', '##s', 'of', 'th', '##e', 'Proj', '##e', '##ct', 'Gut', '##e', '##nb', '##e', '##r', '##g', 'Lic', '##e', '##n', '##s', '##e', 'includ', '##e', '##d', 'with', 'this', 'ebook', 'or', 'onlin', '##e', 'a', '##t', 'www', '.', 'gut', '##e', '##nb', '##e', '##r', '##g', '.', 'org', '.', 'If', 'you', 'a', '##r

In [73]:
print(type(wizard_of_oz_tokens))

<class 'list'>


In [74]:
print(len(wizard_of_oz_tokens))

95456


In [75]:
print(len(court_of_appeal_tokens))

47927


In [76]:
for i in range(10):
  print(wizard_of_oz_tokens[i])

Th
##e
Proj
##e
##ct
Gut
##e
##nb
##e
##r


In [77]:
unique_tokens_oz = set(wizard_of_oz_tokens)
print(len(unique_tokens_oz))

2351


In [78]:
unique_tokens_court = set(court_of_appeal_tokens)
print(len(unique_tokens_court))

2309


In [79]:
common_tokens = unique_tokens_oz & unique_tokens_court
print(f"Tokens in both sets ({len(common_tokens)}):")
i = 0
for token in common_tokens:
    if i < 10:
        print(token)
        i += 1

Tokens in both sets (621):
surviv
cold
locat
und
opinion
injur
##asonabl
employ
compr
chos


In [80]:
# Tokens only in Wizard of Oz
only_in_oz = unique_tokens_oz - unique_tokens_court
print(f"\nTokens only in Wizard of Oz ({len(only_in_oz)}):")
# for token in only_in_oz:
#     print(token)


Tokens only in Wizard of Oz (1730):


In [40]:
# Tokens only in Court of Appeal
only_in_court = unique_tokens_court - unique_tokens_oz
print(f"\nTokens only in Court of Appeal ({len(only_in_court)}):")
# for token in only_in_court:
#     print(token)


Tokens only in Court of Appeal (1688):
