# Chapter 3 Fundamentals of Large Language Models

## 3.6 Tokenization

### 3.6.1 Byte Pair Encoding

In [1]:
# Toy corpus: each word mapped to the number of times it occurs
word_freqs = {
    "たのしい": 6,  # "fun"
    "たのしさ": 2,  # "fun-ness"
    "うつくしい": 4,  # "beautiful"
    "うつくしさ": 1,  # "beauty"
}
# The starting vocabulary is every distinct character in the corpus, sorted
vocab = sorted({char for word in word_freqs for char in word})
# Every word starts out split into its individual characters
splits = {word: list(word) for word in word_freqs}

In [2]:
from collections import Counter
from typing import Optional

def compute_most_frequent_pair(
    splits: dict[str, list[str]],
    freqs: Optional[dict[str, int]] = None,
) -> tuple[str, str]:
    """
    Compute the most frequently occurring adjacent subword pair

    Args:
        splits: Current split state, mapping each word to its list of
            subwords.
        freqs: Mapping from each word to its occurrence count. When omitted,
            the module-level ``word_freqs`` is used, so existing
            ``compute_most_frequent_pair(splits)`` calls keep working.

    Returns:
        The ``(left, right)`` pair with the highest frequency-weighted count.
        When several pairs share the highest count, the one encountered
        first wins.

    Raises:
        IndexError: If no word is split into two or more subwords, i.e.
            there is nothing left to merge.
        KeyError: If a word in ``freqs`` has no entry in ``splits``.
    """
    if freqs is None:
        # Backward-compatible fallback to the notebook-level word counts
        freqs = word_freqs
    pair_freqs = Counter()  # Counter for pairs of subwords
    for word, freq in freqs.items():  # Process all words
        split = splits[word]  # Get the current split state of the word
        # Process all adjacent pairs of subwords
        for pair in zip(split, split[1:]):
            # Add the word's frequency to the frequency of the subword pair
            pair_freqs[pair] += freq
    if not pair_freqs:
        # Same exception type the bare `most_common(1)[0]` lookup would raise,
        # but with a message that explains what went wrong
        raise IndexError("no adjacent subword pairs left to merge")
    # Get the most frequent subword pair from the counter
    pair, _ = pair_freqs.most_common(1)[0]
    return pair

def merge_pair(
    target_pair: tuple[str, str], splits: dict[str, list[str]]
) -> dict[str, list[str]]:
    """
    Merge a pair of subwords

    Every non-overlapping, left-to-right occurrence of ``target_pair`` as two
    adjacent subwords is replaced by their concatenation, in every word of
    ``splits``.

    Args:
        target_pair: The ``(left, right)`` subwords to merge.
        splits: Current split state, mapping each word to its list of
            subwords. The words to process are taken from this mapping
            itself, so no other notebook state is needed.

    Returns:
        The same ``splits`` dict, updated in place so that each word maps to
        its new list of subwords.
    """
    l_str, r_str = target_pair
    merged_token = l_str + r_str
    # Only values of existing keys are reassigned, so iterating the dict
    # while updating it is safe
    for word, split in splits.items():  # Process all words
        merged_split = []
        i = 0
        # Build the new split in a single left-to-right pass
        while i < len(split):
            # Merge if the subword pair matches the target pair
            if (
                i + 1 < len(split)
                and split[i] == l_str
                and split[i + 1] == r_str
            ):
                merged_split.append(merged_token)
                i += 2  # Skip both halves of the merged pair
            else:
                merged_split.append(split[i])
                i += 1
        splits[word] = merged_split  # Update the current merge state
    return splits

In [3]:
# Number of merge operations to learn. With the four words in `word_freqs`,
# nine merges are exactly enough to collapse every word into a single token;
# one more iteration would fail because no adjacent pairs would remain.
NUM_MERGES = 9

for step in range(NUM_MERGES):
    # Compute the most frequently occurring adjacent subword pair
    target_pair = compute_most_frequent_pair(splits)
    # Merge the pair of subwords
    splits = merge_pair(target_pair, splits)
    # Add the pair of subwords to the vocabulary. The pair is appended as a
    # (left, right) tuple, so `vocab` ends up holding the initial characters
    # as strings followed by the learned merges as tuples.
    vocab.append(target_pair)

In [4]:
# Show the final vocabulary: the initial characters followed by the learned
# merge pairs, in the order they were added
print(vocab)

['い', 'う', 'く', 'さ', 'し', 'た', 'つ', 'の', ('し', 'い'), ('た', 'の'), ('たの', 'しい'), ('う', 'つ'), ('うつ', 'く'), ('うつく', 'しい'), ('し', 'さ'), ('たの', 'しさ'), ('うつく', 'しさ')]


### 3.6.3 Handling Japanese

In [5]:
!pip -q install transformers[ja,sentencepiece,torch]

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m600.9/600.9 kB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m66.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.4/47.4 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/71.7 MB[0m [31m9.9 MB/s[0m eta 

In [6]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer for the multilingual BERT checkpoint
mbert_tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-multilingual-cased"
)
# Tokenize the Japanese phrase "自然言語処理"
# (which translates to "Natural Language Processing").
# In the recorded output every kanji becomes its own token.
print(mbert_tokenizer.tokenize("自然言語処理"))

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

['自', '然', '言', '語', '処', '理']


In [7]:
# Tokenize the Japanese phrase "自然言語処理にディープラーニングを使う"
# (which translates to "Using deep learning in natural language processing")
# with the same multilingual BERT tokenizer as in the previous cell.
# In the recorded output the kanji are again split character by character.
print(mbert_tokenizer.tokenize("自然言語処理にディープラーニングを使う"))

['自', '然', '言', '語', '処', '理', 'に', '##ディ', '##ープ', '##ラー', '##ニング', '##を', '使', 'う']


In [8]:
# Load the pretrained tokenizer for the XLM-RoBERTa base checkpoint
xlmr_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
# Tokenize the Japanese phrase "自然言語処理にディープラーニングを使う"
# (which translates to "Using deep learning in natural language processing").
# In the recorded output multi-character pieces such as '自然' appear,
# unlike the per-character split produced by `mbert_tokenizer`.
print(xlmr_tokenizer.tokenize("自然言語処理にディープラーニングを使う"))

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

['▁', '自然', '言語', '処理', 'に', 'ディー', 'プラ', 'ー', 'ニング', 'を使う']


In [9]:
# Tokenize the Japanese phrase "私は日本で生まれました"
# (which translates to "I was born in Japan") with the XLM-RoBERTa tokenizer.
# In the recorded output some pieces span more than one word,
# e.g. '▁私は' and '日本で'.
print(xlmr_tokenizer.tokenize("私は日本で生まれました"))

['▁私は', '日本で', '生まれ', 'ました']


In [10]:
# Tokenize the Japanese phrase "本日はよろしくお願いいたします"
# (which translates to "Thank you for your cooperation today" or
# "Please treat me well today") with the XLM-RoBERTa tokenizer.
# In the recorded output "本日" is split across two pieces ('▁本', '日は')
# while the rest of the phrase comes out as a single piece.
print(xlmr_tokenizer.tokenize("本日はよろしくお願いいたします"))

['▁本', '日は', 'よろしくお願いいたします']


In [11]:
# Load the pretrained tokenizer for the Japanese BERT checkpoint
bert_ja_tokenizer = AutoTokenizer.from_pretrained(
    "cl-tohoku/bert-base-japanese-v3"
)
# Tokenize the Japanese phrase "自然言語処理にディープラーニングを使う"
# (which translates to "Using deep learning in natural language processing").
# In the recorded output the pieces are whole words such as '自然', '言語'
# and '使う'.
print(
    bert_ja_tokenizer.tokenize("自然言語処理にディープラーニングを使う")
)

tokenizer_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

['自然', '言語', '処理', 'に', 'ディープ', 'ラー', '##ニング', 'を', '使う']


In [12]:
# Tokenize the Japanese phrase "私は日本で生まれました"
# (which translates to "I was born in Japan") with the Japanese BERT
# tokenizer. In the recorded output particles such as 'は' and 'で' are
# separate tokens.
print(bert_ja_tokenizer.tokenize("私は日本で生まれました"))

['私', 'は', '日本', 'で', '生まれ', 'まし', 'た']
