In [28]:
from tokenizers import Tokenizer

# Basic Tokenization Task
Write a script to tokenize a simple sentence using basic whitespace and punctuation splitting.

In [29]:
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

In [30]:
sentence = "Hello, my dog is cute."
res = tokenizer.encode(sentence)
res

Encoding(num_tokens=9, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [31]:
from tokenizers import Encoding

for attr in ('ids', 'type_ids', 'tokens', 'offsets', 'attention_mask', 'special_tokens_mask', 'overflowing'):
    print(attr, ':', getattr(res, attr))
    

ids : [101, 7592, 1010, 2026, 3899, 2003, 10140, 1012, 102]
type_ids : [0, 0, 0, 0, 0, 0, 0, 0, 0]
tokens : ['[CLS]', 'hello', ',', 'my', 'dog', 'is', 'cute', '.', '[SEP]']
offsets : [(0, 0), (0, 5), (5, 6), (7, 9), (10, 13), (14, 16), (17, 21), (21, 22), (0, 0)]
attention_mask : [1, 1, 1, 1, 1, 1, 1, 1, 1]
special_tokens_mask : [1, 0, 0, 0, 0, 0, 0, 0, 1]
overflowing : []


In [32]:
print(tokenizer.encode("我今天去公园散步。").tokens)
print(tokenizer.encode("Wǒ jīntiān qù gōngyuán sànbù.").tokens)

['[CLS]', '我', '[UNK]', '天', '[UNK]', '公', '[UNK]', '[UNK]', '[UNK]', '。', '[SEP]']
['[CLS]', 'wo', 'jin', '##tian', 'qu', 'gong', '##yuan', 'san', '##bu', '.', '[SEP]']


# Custom Tokenizer
Implement a custom tokenizer from scratch that handles edge cases like contractions (e.g., "don't"), hyphenated words, and acronyms.

In [33]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import NFC, Lowercase, BertNormalizer

In [34]:
sentence = "I don't care about byte-pair SOTA tokenizers 😁."
t = Tokenizer(model=BPE())

res = t.encode(sentence)
res.tokens

[]

In [35]:
for normalizer in (NFC(), Lowercase(), BertNormalizer()):
    print(normalizer.normalize_str(sentence))

t.normalizer = BertNormalizer()
t.encode(sentence).tokens

I don't care about byte-pair SOTA tokenizers 😁.
i don't care about byte-pair sota tokenizers 😁.
i don't care about byte-pair sota tokenizers 😁.


[]

In [36]:
from tokenizers.pre_tokenizers import WhitespaceSplit, BertPreTokenizer

for pre_tokenizer in (WhitespaceSplit(), BertPreTokenizer()):
    s_norm = t.normalizer.normalize_str(sentence)
    print(pre_tokenizer.pre_tokenize_str(s_norm))

t.pre_tokenizer = BertPreTokenizer()
t.encode(sentence).tokens

[('i', (0, 1)), ("don't", (2, 7)), ('care', (8, 12)), ('about', (13, 18)), ('byte-pair', (19, 28)), ('sota', (29, 33)), ('tokenizers', (34, 44)), ('😁.', (45, 47))]
[('i', (0, 1)), ('don', (2, 5)), ("'", (5, 6)), ('t', (6, 7)), ('care', (8, 12)), ('about', (13, 18)), ('byte', (19, 23)), ('-', (23, 24)), ('pair', (24, 28)), ('sota', (29, 33)), ('tokenizers', (34, 44)), ('😁', (45, 46)), ('.', (46, 47))]


[]

In [37]:
t.model = BPE()
t.encode(sentence).tokens

[]

In [38]:
from tokenizers.processors import BertProcessing
t.post_processor = BertProcessing(("[CLS]", tokenizer.token_to_id("[CLS]")),
                                  ("[SEP]", tokenizer.token_to_id("[SEP]")))
t.encode(sentence).tokens

['[SEP]', '[CLS]']

In [39]:
from datasets import load_dataset
dataset = load_dataset('squad', split='train')

In [40]:
print(len(dataset))
print(dataset[0]['context'])

87599
Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.


In [41]:
from tokenizers.trainers import BpeTrainer
trainer = BpeTrainer(
    vocab_size=10000,
    special_tokens=["[CLS]", "[SEP]", "[PAD]", "[UNK]", "[MASK]"],
)

my_custom_dataset = (el['context'] for el in dataset)
t.train_from_iterator(my_custom_dataset, trainer=trainer)
t.save("custom_bpe_tokenizer.json")




In [42]:
print(t.encode("Hello, my dog is cute.").tokens)
print(t.encode('Hey I don\'t care about byte-pair SOTA tokenizers 😁.').tokens)

# I went to the park for a walk today.
print(t.encode("我今天去公园散步。").tokens)
print(t.encode("Wǒ jīntiān qù gōngyuán sànbù.").tokens)

['[SEP]', 'hel', 'lo', ',', 'my', 'dog', 'is', 'cu', 'te', '[CLS]']
['[SEP]', 'he', 'y', '[CLS]']
['[SEP]', '今', '天', '公', '。', '[CLS]']
['[SEP]', 'wo', 'j', 'int', 'ian', 'qu', 'g', 'ong', 'yuan', 'san', 'bu', '[CLS]']


# Word Tokenization Using NLTK
Use the NLTK library to tokenize sentences and words from a document. Compare the results with your custom tokenizer.

# Subword Tokenization (BPE)
Implement Byte Pair Encoding (BPE) to tokenize a text corpus into subword units. Test this with different vocab sizes.

# Pre-trained Tokenizer (Hugging Face)
Load a pre-trained tokenizer from the Hugging Face library (e.g., BERT or GPT) and tokenize text inputs. Explore different encoding options (padding, truncation).

In [43]:
import tiktoken

t = tiktoken.get_encoding("cl100k_base")
t1 = tiktoken.encoding_for_model("gpt-4o")

In [44]:
print(t.encode(sentence))
print(t1.encode(sentence))

[40, 1541, 956, 2512, 922, 5027, 2320, 1334, 328, 37644, 4037, 12509, 27623, 223, 13]
[40, 4128, 2631, 1078, 9239, 3161, 1517, 336, 61390, 6602, 24223, 22861, 223, 13]


In [45]:
# generate text based on tokens
print(t.decode(t.encode(sentence)))
print(t.decode([1, 2, 3, 4, 5, 1000, 1001, 1002, 1003, 1004, 1005, 5000, 5001, 5002, 5003]))

I don't care about byte-pair SOTA tokenizers 😁.
"#$%&indowlementpectash[i use WebDesBCancial


In [46]:
for s in ("我今天去公园散步。", "Wǒ jīntiān qù gōngyuán sànbù."):
    print(t.encode(s))
    print(t.decode(t.encode(s)))

[37046, 37271, 36827, 86436, 35417, 9921, 255, 8067, 96, 65782, 1811]
我今天去公园散步。
[54, 131, 240, 503, 61711, 406, 72, 31757, 77, 2874, 15273, 342, 56761, 983, 41101, 11644, 274, 6496, 18571, 15273, 13]
Wǒ jīntiān qù gōngyuán sànbù.


# SentencePiece Tokenization
Use the SentencePiece library to build a subword tokenizer and tokenize a text file. Compare the performance and output with BPE.

In [47]:
import sentencepiece as spm

In [48]:
my_custom_dataset = (el['context'] for el in dataset)
spm.SentencePieceTrainer.train(sentence_iterator=my_custom_dataset, model_prefix='custom_sp_model', vocab_size=1000, user_defined_symbols=['foo', 'bar'])

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input_format: 
  model_prefix: custom_sp_model
  model_type: UNIGRAM
  vocab_size: 1000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: foo
  user_defined_symbols: bar
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 

In [49]:
sp = spm.SentencePieceProcessor(model_file='custom_sp_model.model')
print(sp.encode("This is a test sentence.", out_type=str))

['▁This', '▁is', '▁a', '▁', 't', 'est', '▁', 's', 'ent', 'ence', '.']


In [50]:
# I went to the park for a walk today.
print(sp.encode("我今天去公园散步。", out_type=str))
print(sp.encode("Wǒ jīntiān qù gōngyuán sànbù.", out_type=str))

['▁', '我今天去公园散步。']
['▁W', 'ǒ', '▁', 'j', 'ī', 'n', 'ti', 'ā', 'n', '▁', 'q', 'ù', '▁g', 'ō', 'n', 'g', 'y', 'u', 'á', 'n', '▁', 's', 'à', 'n', 'b', 'ù', '.']


# Fast Tokenization (Tokenizers Library)
Use the Hugging Face tokenizers library to tokenize text at high speed. Experiment with different tokenization algorithms (WordPiece, BPE, etc.).

# Encoding and Decoding
Encode a piece of text using a tokenizer and then decode it back to the original form. Verify that the decoding process produces the same text.

# Tokenization on a Multilingual Dataset
Tokenize text in multiple languages (e.g., English, French, Chinese). Analyze the differences in how the tokenizer handles different languages.

# Build a Tokenizer Pipeline
Chain multiple steps of a tokenizer pipeline (e.g., text cleaning, tokenization, padding, and special token addition) using a framework like Hugging Face's transformers. Test it on a text classification task.