In [11]:
from transformers import AutoTokenizer

# Sentence used as the running example by the cells that follow.
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."

# Load the tokenizer that belongs to the cased BERT checkpoint
# (its files are fetched the first time this runs).
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Calling the tokenizer on a string yields a BatchEncoding object.
encoding = tokenizer(example)
print(type(encoding))


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [15]:
# Print, one value per line:
#   - whether the tokenizer is a "fast" tokenizer,
#   - whether the encoding it produced is "fast",
#   - the tokens of the example sentence,
#   - for every token, the index of the word it belongs to
#     (None marks the special tokens [CLS] and [SEP]).
print(
    tokenizer.is_fast,
    encoding.is_fast,
    encoding.tokens(),
    encoding.word_ids(),
    sep="\n",
)

True
True
['[CLS]', 'My', 'name', 'is', 'S', '##yl', '##va', '##in', 'and', 'I', 'work', 'at', 'Hu', '##gging', 'Face', 'in', 'Brooklyn', '.', '[SEP]']
[None, 0, 1, 2, 3, 3, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, None]


In [17]:
# word_to_chars(3) gives the (start, end) character span of word index 3 in the
# original string; slicing `example` with it recovers the whole word even though
# the tokenizer split that word into several sub-word tokens.
s, e = encoding.word_to_chars(3)
print(example[s:e])

Sylvain


In [2]:
# A generator expression can be consumed only once: after the first full pass
# it is exhausted and yields nothing more.
gen = (n for n in range(5))

# First pass prints every item, second pass prints an empty list.
for attempt in range(2):
    print(list(gen))

[0, 1, 2, 3, 4]
[]


## Regex

In [18]:
import re
from tokenizers import Regex

text = "Hello 世界 123!"

# Python's built-in `re` module does not understand Unicode property classes
# such as \p{L}, so using the pattern below fails with re.error.
# The exception is bound to `err` rather than `e`: Python unbinds the `as`
# target when the handler finishes, which would otherwise delete the
# notebook-level `e` assigned by an earlier cell.
try:
    re.split(r"(\p{L}+)", text)  # raises re.error
except re.error as err:
    print("re 报错:", err)  # bad escape \p at position 1

# tokenizers.Regex accepts Unicode property classes.
# Intended split: runs of letters, runs of digits, runs of anything else.
regex = Regex(r"(\p{L}+|\p{N}+|[^\p{L}\p{N}]+)")

re 报错: bad escape \p at position 1


In [21]:
from tokenizers import Tokenizer, Regex, pre_tokenizers, models

# GPT-4-style split regex (handles multilingual text). Alternatives, in order:
#   1. English contraction suffixes ('s, 'd, 'm, 't, 're, 've, 'll)
#   2. a run of letters, optionally preceded by one character that is not a
#      letter, digit, CR or LF (e.g. a leading space)
#   3. one or two digits
#   4. a run of characters that are neither whitespace, letters nor digits
SPLIT_PATTERN = r"""'(?:[sdmt]|re|ve|ll)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}|[^\s\p{L}\p{N}]+"""

# Bare tokenizer around a BPE model; pre-tokenization is configured
# independently of the model.
# NOTE: this rebinds `tokenizer`, replacing the transformers tokenizer that was
# loaded earlier in the notebook.
tokenizer = Tokenizer(models.BPE())

# "isolated": every regex match becomes its own pre-token; text between matches
# (such as the lone space in front of the digits) is kept as separate pieces too.
tokenizer.pre_tokenizer = pre_tokenizers.Split(Regex(SPLIT_PATTERN), behavior="isolated")

# Try it on mixed-script input.
text = "Hello 世界! I have 123 apples. $45.67"
print(tokenizer.pre_tokenizer.pre_tokenize_str(text))

[('Hello', (0, 5)), (' 世界', (5, 8)), ('!', (8, 9)), (' I', (9, 11)), (' have', (11, 16)), (' ', (16, 17)), ('12', (17, 19)), ('3', (19, 20)), (' apples', (20, 27)), ('.', (27, 28)), (' ', (28, 29)), ('$', (29, 30)), ('45', (30, 32)), ('.', (32, 33)), ('67', (33, 35))]


In [22]:
# Built-in pre-tokenizer that splits on whitespace only (works like str.split()).
# Note: pre_tokenizers.Whitespace() is NOT the same thing -- it splits with the
# regex \w+|[^\w\s]+ and therefore also separates punctuation from words.
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
# The same whitespace-only behaviour written as an explicit Regex split
# (this assignment replaces the one above):
tokenizer.pre_tokenizer = pre_tokenizers.Split(
    Regex(r"\s+"),
    behavior="removed",  # the whitespace matches are dropped, not kept as tokens
)

print(tokenizer.pre_tokenizer.pre_tokenize_str("aaa bbb ccc"))

[('aaa', (0, 3)), ('bbb', (4, 7)), ('ccc', (8, 11))]
