<a href="https://colab.research.google.com/github/davidisinta/AI/blob/main/WordPiece_tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WordPiece tokenization

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [3]:
!pip install datasets evaluate transformers[sentencepiece]



In [4]:
corpus = []

with open("court_of_appeal.txt", "r", encoding="utf-8") as file:
    for line in file:
        cleaned_line = line.strip()  # Remove leading/trailing whitespace
        if cleaned_line:  # Ignore empty lines
            corpus.append(cleaned_line)

print(corpus)


['United States Court of Appeals', 'FOR THE DISTRICT OF COLUMBIA CIRCUIT', 'Argued November 7, 2000   Decided September 21, 2001', 'No. 00-5016', 'The Honorable John H. McBryde,', 'United States District Judge for the', 'Northern District of Texas,', 'Appellant', 'v.', 'Committee to Review Circuit Council Conduct and', 'Disability Orders of the Judicial Conference', 'of the United States, et al.,', 'Appellees', 'Appeal from the United States District Court', 'for the District of Columbia', '(No. 98cv02457)', 'David Broiles and Arnon D. Siegel argued the cause and', 'filed the briefs for appellant.', 'William B. Schultz, Deputy Assistant Attorney General,', 'U.S. Department of Justice, argued the cause for appellee', 'United States of America.  David W. Ogden, Assistant Attor-', 'ney General, Mark B. Stern and Scott R. McIntosh, Attor-', 'neys, and Wilma A. Lewis, U.S. Attorney at the time the', 'brief was filed, were on the brief. Thomas W. Millet, Attor-', 'ney, U.S. Department of Jus

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [6]:
from collections import defaultdict

word_freqs = defaultdict(int)
for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1

word_freqs

defaultdict(int,
            {'United': 35,
             'States': 35,
             'Court': 51,
             'of': 725,
             'Appeals': 3,
             'FOR': 1,
             'THE': 1,
             'DISTRICT': 1,
             'OF': 1,
             'COLUMBIA': 1,
             'CIRCUIT': 1,
             'Argued': 1,
             'November': 1,
             '7': 7,
             ',': 1088,
             '2000': 7,
             'Decided': 1,
             'September': 5,
             '21': 5,
             '2001': 7,
             'No': 22,
             '.': 1575,
             '00': 1,
             '-': 530,
             '5016': 1,
             'The': 82,
             'Honorable': 2,
             'John': 7,
             'H': 12,
             'McBryde': 183,
             'District': 12,
             'Judge': 203,
             'for': 153,
             'the': 1051,
             'Northern': 9,
             'Texas': 6,
             'Appellant': 1,
             'v': 84,
             'Committ

In [7]:
alphabet = []
for word in word_freqs.keys():
    if word[0] not in alphabet:
        alphabet.append(word[0])
    for letter in word[1:]:
        if f"##{letter}" not in alphabet:
            alphabet.append(f"##{letter}")

alphabet.sort()
alphabet

print(alphabet)

['"', '##0', '##1', '##2', '##3', '##4', '##5', '##6', '##7', '##8', '##9', '##A', '##B', '##C', '##D', '##E', '##F', '##H', '##I', '##J', '##L', '##M', '##O', '##P', '##R', '##S', '##T', '##U', '##V', '##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##j', '##k', '##l', '##m', '##n', '##o', '##p', '##q', '##r', '##s', '##t', '##u', '##v', '##w', '##x', '##y', '##z', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z']


In [8]:
vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + alphabet.copy()

print(len(vocab))

136


In [9]:
splits = {
    word: [c if i == 0 else f"##{c}" for i, c in enumerate(word)]
    for word in word_freqs.keys()
}

In [10]:
def compute_pair_scores(splits):
    letter_freqs = defaultdict(int)
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        if len(split) == 1:
            letter_freqs[split[0]] += freq
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            letter_freqs[split[i]] += freq
            pair_freqs[pair] += freq
        letter_freqs[split[-1]] += freq

    scores = {
        pair: freq / (letter_freqs[pair[0]] * letter_freqs[pair[1]])
        for pair, freq in pair_freqs.items()
    }
    return scores

In [11]:
pair_scores = compute_pair_scores(splits)
for i, key in enumerate(pair_scores.keys()):
    print(f"{key}: {pair_scores[key]}")
    if i >= 5:
        break

('U', '##n'): 6.0421723145616355e-05
('##n', '##i'): 5.236974780177983e-06
('##i', '##t'): 2.0350499933512214e-05
('##t', '##e'): 1.3292421033714365e-05
('##e', '##d'): 2.16152093767852e-05
('S', '##t'): 1.98626118417836e-05


In [12]:
best_pair = ""
max_score = None
for pair, score in pair_scores.items():
    if max_score is None or max_score < score:
        best_pair = pair
        max_score = score

print(best_pair, max_score)

('##H', '##E') 0.5


In [13]:
# vocab.append("HE")

In [14]:
def merge_pair(a, b, splits):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue
        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                merge = a + b[2:] if b.startswith("##") else a + b
                split = split[:i] + [merge] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits

In [15]:
def tokenize(text):
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]
    encoded_words = [encode_word(word) for word in pre_tokenized_text]
    return sum(encoded_words, [])

In [16]:
# splits = merge_pair("a", "##b", splits)
# splits["about"]

In [17]:
vocab_size = 5000

print(len(vocab))

while len(vocab) < vocab_size:
    # print("helloo")
    scores = compute_pair_scores(splits)
    best_pair, max_score = "", None
    for pair, score in scores.items():
        if max_score is None or max_score < score:
            best_pair = pair
            max_score = score
    splits = merge_pair(*best_pair, splits)
    new_token = (
        best_pair[0] + best_pair[1][2:]
        if best_pair[1].startswith("##")
        else best_pair[0] + best_pair[1]
    )
    # print("new token" + new_token)
    vocab.append(new_token)


print(len(vocab))

136
5000


In [18]:
print(vocab)

['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '"', '##0', '##1', '##2', '##3', '##4', '##5', '##6', '##7', '##8', '##9', '##A', '##B', '##C', '##D', '##E', '##F', '##H', '##I', '##J', '##L', '##M', '##O', '##P', '##R', '##S', '##T', '##U', '##V', '##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##j', '##k', '##l', '##m', '##n', '##o', '##p', '##q', '##r', '##s', '##t', '##u', '##v', '##w', '##x', '##y', '##z', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', '##HE', '##PM', '##LJ', '##UM', '##LUM', '##OLUM', '##US', '##ST', '##OR', '##STR', '##LR', '##RD', '##ER', '##LRA', '##USA', '##AA', '##CU', '##RCU', '##RDC', '##ERC', '##CT', 'NRDC', 'OF',

In [20]:
def encode_word(word):
    tokens = []
    while len(word) > 0:
        i = len(word)
        while i > 0 and word[:i] not in vocab:
            i -= 1
        if i == 0:
            return ["[UNK]"]
        tokens.append(word[:i])
        word = word[i:]
        if len(word) > 0:
            word = f"##{word}"
    return tokens

In [21]:
print(encode_word("Hugging"))
print(encode_word("H+gging"))

['Hu', '##gg', '##ing']
['[UNK]']


In [22]:
tokenize("This is a test, this is very nice")

['This',
 'i',
 '##s',
 'a',
 't',
 '##e',
 '##s',
 '##t',
 ',',
 'th',
 '##i',
 '##s',
 'i',
 '##s',
 'v',
 '##e',
 '##r',
 '##y',
 'n',
 '##ic',
 '##e']

In [23]:
with open("court_of_appeal.txt", "r", encoding="utf-8") as file:
    text = file.read()


court_of_appeal_tokens = tokenize(text)

print(court_of_appeal_tokens)


['Unit', '##e', '##d', 'S', '##t', '##a', '##t', '##e', '##s', 'Court', 'of', 'App', '##e', '##al', '##s', 'FOR', 'THE', 'DISTRICT', 'OF', 'COLUMBIA', 'CIRCUIT', 'Argu', '##e', '##d', 'Nov', '##e', '##mb', '##e', '##r', '7', ',', '2000', 'D', '##e', '##cid', '##e', '##d', 'S', '##e', '##p', '##t', '##e', '##mb', '##e', '##r', '21', ',', '2001', 'No', '.', '00', '-', '5016', 'Th', '##e', 'Honorabl', '##e', 'John', 'H', '.', 'McBryd', '##e', ',', 'Unit', '##e', '##d', 'S', '##t', '##a', '##t', '##e', '##s', 'District', 'Judg', '##e', 'for', 'th', '##e', 'No', '##r', '##th', '##e', '##r', '##n', 'District', 'of', 'T', '##e', '##xas', ',', 'App', '##e', '##ll', '##a', '##n', '##t', 'v', '.', 'Committ', '##e', '##e', 'to', 'R', '##e', '##vi', '##e', '##w', 'Circuit', 'Council', 'Conduct', 'and', 'Disability', 'Ord', '##e', '##r', '##s', 'of', 'th', '##e', 'Judicial', 'Conf', '##e', '##r', '##e', '##nc', '##e', 'of', 'th', '##e', 'Unit', '##e', '##d', 'S', '##t', '##a', '##t', '##e', '##s', 

In [24]:
def populate_corpus(file_path):
  corpus = []
  with open(file_path, "r", encoding="utf-8") as file:
    for line in file:
        cleaned_line = line.strip()  # Remove leading/trailing whitespace
        if cleaned_line:  # Ignore empty lines
            corpus.append(cleaned_line)
  print(corpus)

  return corpus

In [25]:
def analyze_word_freqs(corpus):
  word_freqs = defaultdict(int)
  for text in corpus:
      words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
      new_words = [word for word, offset in words_with_offsets]
      for word in new_words:
          word_freqs[word] += 1
  print(word_freqs)
  return word_freqs


In [26]:
def generate_alphabet(word_freqs):
  alphabet = []
  for word in word_freqs.keys():
      if word[0] not in alphabet:
          alphabet.append(word[0])
      for letter in word[1:]:
          if f"##{letter}" not in alphabet:
              alphabet.append(f"##{letter}")

  alphabet.sort()
  alphabet
  print(alphabet)
  return alphabet

In [27]:
def generate_vocab(alphabet):
  vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + alphabet.copy()
  print(len(vocab))
  return vocab

In [28]:
def generate_splits(word_freqs):
  splits = {
    word: [c if i == 0 else f"##{c}" for i, c in enumerate(word)]
    for word in word_freqs.keys()
}
  return splits

In [29]:
def train_model(vocab, vocab_size, splits):

  while len(vocab) < vocab_size:
      # print("helloo")
      scores = compute_pair_scores(splits)
      best_pair, max_score = "", None
      for pair, score in scores.items():
          if max_score is None or max_score < score:
              best_pair = pair
              max_score = score
      splits = merge_pair(*best_pair, splits)
      new_token = (
          best_pair[0] + best_pair[1][2:]
          if best_pair[1].startswith("##")
          else best_pair[0] + best_pair[1]
      )
      print("new token" + new_token)
      vocab.append(new_token)

  return vocab

In [30]:
#training and utilization of wizard of oz
corpus = populate_corpus("wizard_of_oz.txt")
word_freqs = analyze_word_freqs(corpus)
alphabet = generate_alphabet(word_freqs)
vocab = generate_vocab(alphabet)
splits = generate_splits(word_freqs)
train_model(vocab, 5000, splits)

['The Project Gutenberg eBook of The Wonderful Wizard of Oz', 'This ebook is for the use of anyone anywhere in the United States and', 'most other parts of the world at no cost and with almost no restrictions', 'whatsoever. You may copy it, give it away or re-use it under the terms', 'of the Project Gutenberg License included with this ebook or online', 'at www.gutenberg.org. If you are not located in the United States,', 'you will have to check the laws of the country where you are located', 'before using this eBook.', 'Title: The Wonderful Wizard of Oz', 'Author: L. Frank Baum', 'Release date: February 1, 1993 [eBook #55]', 'Most recently updated: December 29, 2024', 'Language: English', '*** START OF THE PROJECT GUTENBERG EBOOK THE WONDERFUL WIZARD OF OZ ***', '[Illustration]', 'The Wonderful Wizard of Oz', 'by L. Frank Baum', 'This book is dedicated to my good friend & comrade', 'My Wife', 'L.F.B.', 'Contents', 'Introduction', 'Chapter I. The Cyclone', 'Chapter II. The Council with

['[PAD]',
 '[UNK]',
 '[CLS]',
 '[SEP]',
 '[MASK]',
 '!',
 '#',
 '##0',
 '##1',
 '##2',
 '##3',
 '##4',
 '##5',
 '##6',
 '##7',
 '##8',
 '##9',
 '##A',
 '##B',
 '##C',
 '##D',
 '##E',
 '##F',
 '##G',
 '##H',
 '##I',
 '##J',
 '##K',
 '##L',
 '##M',
 '##N',
 '##O',
 '##P',
 '##Q',
 '##R',
 '##S',
 '##T',
 '##U',
 '##V',
 '##W',
 '##X',
 '##Y',
 '##Z',
 '##a',
 '##b',
 '##c',
 '##d',
 '##e',
 '##f',
 '##g',
 '##h',
 '##i',
 '##j',
 '##k',
 '##l',
 '##m',
 '##n',
 '##o',
 '##p',
 '##q',
 '##r',
 '##s',
 '##t',
 '##u',
 '##v',
 '##w',
 '##x',
 '##y',
 '##z',
 '##™',
 '$',
 '%',
 '&',
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 

In [31]:
with open("wizard_of_oz.txt", "r", encoding="utf-8") as file:
    text = file.read()

wizard_of_oz_tokens = tokenize(text)

print(wizard_of_oz_tokens)

['Th', '##e', 'Proj', '##e', '##ct', 'Gut', '##e', '##nb', '##e', '##r', '##g', 'eBook', 'of', 'Th', '##e', 'Wond', '##e', '##rful', 'Wizard', 'of', 'Oz', 'This', 'ebook', 'is', 'for', 'th', '##e', 'us', '##e', 'of', 'anyon', '##e', 'anywh', '##e', '##r', '##e', 'in', 'th', '##e', 'Unit', '##e', '##d', 'Stat', '##e', '##s', 'and', 'most', 'oth', '##e', '##r', 'part', '##s', 'of', 'th', '##e', 'world', 'a', '##t', 'no', 'cost', 'and', 'with', 'almost', 'no', 'r', '##e', '##strictions', 'whatso', '##e', '##v', '##e', '##r', '.', 'You', 'may', 'copy', 'it', ',', 'giv', '##e', 'it', 'away', 'or', 'r', '##e', '-', 'us', '##e', 'it', 'und', '##e', '##r', 'th', '##e', 't', '##e', '##r', '##m', '##s', 'of', 'th', '##e', 'Proj', '##e', '##ct', 'Gut', '##e', '##nb', '##e', '##r', '##g', 'Lic', '##e', '##n', '##s', '##e', 'includ', '##e', '##d', 'with', 'this', 'ebook', 'or', 'onlin', '##e', 'a', '##t', 'www', '.', 'gut', '##e', '##nb', '##e', '##r', '##g', '.', 'org', '.', 'If', 'you', 'a', '##r

In [32]:
print(type(wizard_of_oz_tokens))

<class 'list'>


In [33]:
print(len(wizard_of_oz_tokens))

95456


In [34]:
print(len(court_of_appeal_tokens))

47927


In [35]:
for i in range(10):
  print(wizard_of_oz_tokens[i])

Th
##e
Proj
##e
##ct
Gut
##e
##nb
##e
##r


In [36]:
unique_tokens_oz = set(wizard_of_oz_tokens)
print(len(unique_tokens_oz))

2351


In [37]:
unique_tokens_court = set(court_of_appeal_tokens)
print(len(unique_tokens_court))

2309


In [42]:
common_tokens = unique_tokens_oz & unique_tokens_court
print(f"Tokens in both sets ({len(common_tokens)}):")
for token in common_tokens:
    if i < 100:
        print(token)
        i += 1

Tokens in both sets (621):
surviv
cold
locat
und
opinion
injur
##asonabl
employ
compr
chos
hint
1993
modifi
making
no
discov
suff
W
##port
60
dir
just
##rsion
[
in
harshly
law
##iz
On
woman
impli
each
This
curr
r
##h
##gally
to
Of
##d
Evid
manif
T
possibly
##thing
un
but
##dicat
possibl
##ginning
past
##ith
##quir
##for
V
,
quick
h
##ph
job
public
also
O
fair
echo
##cial
turning
qu
oil
always
hav
Qu
enjoy
Com
30
##stions
minut
moving
family
ag
hours
constant
##b
probl
7
complianc
common
But
My
physical


In [43]:
# Tokens only in Wizard of Oz
only_in_oz = unique_tokens_oz - unique_tokens_court
print(f"\nTokens only in Wizard of Oz ({len(only_in_oz)}):")
for token in only_in_oz:
    print(token)


Tokens only in Wizard of Oz (1730):
ask
sitting
information
tiny
shaggy
donations
glad
swim
MERCHANTABILITY
tip
##rribl
READ
awfully
Call
Making
bank
Room
Poppy
Pow
##distribution
whisp
Fortunat
uniform
succ
inquir
##rmor
basin
gaz
domain
clouds
roar
glow
Em
clapping
corn
sprinkl
wagg
brims
float
DONATIONS
prim
disp
nonpropri
BEFORE
cracks
REFUND
imm
UNDER
Gold
monst
Jok
gross
subscrib
##asuring
glitt
tongu
thos
##ro
trodd
OWNER
said
Mission
misfortun
Thick
wat
wir
plagu
roof
Can
bigg
pac
sulkily
soul
bowl
crowns
flooding
scratch
hug
dust
whoso
gladly
warm
fl
strang
crowd
glist
slipp
gloomy
HAVE
Anyon
awok
countl
grav
EBOOK
royalti
wailing
satisfaction
hall
brightly
accordanc
storm
siz
glar
pigs
gold
astonish
mourn
distanc
awkward
shaking
closing
afr
##xity
rob
Aunt
finish
polishing
poison
laid
gnash
##rything
fairi
scar
Oth
Illustration
answ
promoting
striking
innoc
blow
additions
odd
whirlwinds
cookstov
OTHER
forgiv
education
poss
crown
##rvic
Truly
kill
obtain
small
kitch
scatt
bra

In [40]:
# Tokens only in Court of Appeal
only_in_court = unique_tokens_court - unique_tokens_oz
print(f"\nTokens only in Court of Appeal ({len(only_in_court)}):")
# for token in only_in_court:
#     print(token)


Tokens only in Court of Appeal (1688):
