In [17]:
import os
import sys
sys.path.insert(0, os.path.abspath('.'))
import json
from collections import defaultdict
from utils import parse_label

def dump_json(data, path):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

seq_file = (
    'E:/donny/code/school/research/chujian/data/sequences/seq_texts.json')
seqs = json.load(open(seq_file, 'r', encoding='utf-8'))
seqs = [seq['text'] for seq in seqs]
seqs = [[parse_label(c, use_comb_token=False) for c in seq] for seq in seqs]
print(seqs[:10])

[['上', '不', '乍', '二', '伐', '伐', '兄', '光', '兵', '出', '吉', '君', '君', '坪', '居', '居', '居', '左', '己', '𠭁', '𠭁', '𠭁', '𠭁', '旬', '旬', '星', '是', '是', '東', '畜', '相', '箸', '胃', '自', '雨', '雨', '首', '{文一}', '{文一}', '{文一}', '{文一}', '{車丙}', '[UNK]', '[UNK]'], ['一', '三', '不', '乙', '二', '亥', '亥', '以', '以', '以', '以', '八', '出', '出', '利', '利', '利', '可', '可', '壬', '女', '女', '女', '子', '居', '左', '𠭁', '必', '日', '旬', '木', '欠', '死', '甬', '甲', '甲', '發', '白', '色', '行', '視', '軍', '量', '黃', '{一戈月}', '{井田土}', '[UNK]', '[UNK]', '[UNK]', '[UNK]'], ['其', '二', '在', '長', '{尾少}'], ['絇'], ['[UNK]'], ['[UNK]', '重', '鎰', '[UNK]', '足', '[UNK]', '重', '八', '鎰', '[UNK]', '鎰', '一', '銖'], ['十', '月', '乙', '丑'], [], ['之', '上', '與', '𫺕', '哲', '王', '之', '威', '俈', '{辶卜}', '尹', '郘', '逯', '㠯', '王', '命', '賜', '舒', '方', '御', '歲', '愲'], ['[UNK]']]


# Pick words to add to vocabulary

For words that are not in the original vocabulary, if it appear less than $k=10$ times, map it to "？" (fullwidth, because it's more frequent in the pretraining data), else, add it to the vocabulary. The "？" is already in the vocabulary, so we only add words that appear more than $k$ times.

In [18]:
# Load tokenizer
from transformers import AutoTokenizer
model_name = "KoichiYasuoka/roberta-classical-chinese-base-char"
# model_name = "ethanyt/guwenbert-base"
cache_dir = "E:/.cache/huggingface"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

In [23]:
from typing import List

def iter_seqs(seqs: List[List[str]]):
    for seq in seqs:
        for c in seq:
            yield c

word_cnt = defaultdict(int)
for c in iter_seqs(seqs):
    word_cnt[c] += 1

k = 10
orig_vocab = tokenizer.vocab
new_tokens = {'[COMB]'}
for c in iter_seqs(seqs):
    if c not in orig_vocab and word_cnt[c] >= k:
        new_tokens.add(c)
new_tokens = list(new_tokens)
print(f"Num of new tokens: {len(new_tokens)}")
dump_json(new_tokens, 'new_vocab.json')


Num of new tokens: 101


In [24]:
# Update the vocab of tokenizer
tokenizer.add_tokens(new_tokens)
print('Saving tokenizer to ../tokenization/tokenizer')
tokenizer.save_pretrained('../tokenization/tokenizer')

Saving tokenizer to ../tokenization/tokenizer


('../tokenization/tokenizer\\tokenizer_config.json',
 '../tokenization/tokenizer\\special_tokens_map.json',
 '../tokenization/tokenizer\\vocab.txt',
 '../tokenization/tokenizer\\added_tokens.json',
 '../tokenization/tokenizer\\tokenizer.json')

In [25]:
# Test the tokenizer
text = "綊墿？箄{竹巫口}弩弓[COMB]𡄹{弓口二}？？[COMB]"
print(tokenizer.tokenize(text))

['綊', '墿', '？', '箄', '{ 竹  巫  口 }', '弩', '弓', '[comb]', ' 𡄹 ', '{ 弓  口  二 }', '？', '？', '[comb]']
