*   外字の変換手順
    *   青空文庫テキストデータにおいて，外字は `※［＃「王＋甘」、第4水準2-80-65］` のように表現されている
    *   まず `※※［＃` で始まる特殊なパターンを手動で変換
    *   その後，正規表現 `※［＃[^［］]*?］` にマッチするパターンを自動的に変換

In [67]:
from pathlib import Path
import re

from tqdm import tqdm

In [None]:
# Every .txt file under cards/ (searched recursively), in sorted order.
paths = sorted(Path('cards').glob('**/*.txt'))
len(paths)

In [69]:
def jis_to_char(men, ku, ten):
    """Convert a JIS men-ku-ten (plane-row-cell) code to its character.

    The code is turned into a two-byte sequence, wrapped in the escape
    sequences of the ``iso2022_jp_2004`` codec, and decoded with that codec.

    Args:
        men: Plane number, 1 or 2.
        ku: Row number, 1 to 94.
        ten: Cell number, 1 to 94.

    Returns:
        The string produced by decoding the given position.

    Raises:
        ValueError: If ``men``, ``ku`` or ``ten`` is out of range. The codec
            may also raise ``UnicodeDecodeError`` (a ``ValueError`` subclass)
            for a position it cannot decode.
    """
    # Explicit checks instead of ``assert`` so that validation still happens
    # when Python runs with -O.
    if men not in (1, 2):
        raise ValueError(f'men must be 1 or 2, got {men!r}')
    if not 1 <= ku <= 94:
        raise ValueError(f'ku must be in 1..94, got {ku!r}')
    # Cells 0 and 95 would yield the bytes 0x20 / 0x7F, which lie outside the
    # 94x94 code grid, so they are rejected here as well.
    if not 1 <= ten <= 94:
        raise ValueError(f'ten must be in 1..94, got {ten!r}')

    # Row and cell are each offset by 0x20 to form one byte of the code.
    code_bytes = bytes([0x20 + ku, 0x20 + ten])

    # ESC $ ( Q is used for plane 1 and ESC $ ( P for plane 2; ESC ( B
    # switches back to ASCII afterwards.
    designator = b'\x1b$(Q' if men == 1 else b'\x1b$(P'
    jis_bytes = designator + code_bytes + b'\x1b(B'

    return jis_bytes.decode('iso2022_jp_2004')

In [73]:
def _jis_to_char_or_none(men, ku, ten, chuuki):
    """Decode a men-ku-ten code via ``jis_to_char``; print ``chuuki`` and return None on failure."""
    try:
        return jis_to_char(men, ku, ten)
    except Exception:
        # Deliberately broad: this is a best-effort conversion, and any
        # failure simply means the annotation is reported and left as is.
        print(chuuki)
        return None


def get_gaiji_char(chuuki: str):
    """Return the character described by a gaiji annotation, if any.

    Three notations are tried in order:

    1. ``第X水準M-K-T`` -- a men-ku-ten code preceded by a level marker.
    2. ``U+XXXX`` -- a Unicode code point written with 4 to 6 hex digits.
    3. A bare ``M-K-T`` triple, which is not necessarily a JIS men-ku-ten
       code.

    Args:
        chuuki: The annotation text to inspect.

    Returns:
        The converted character, or None when no notation matches or the
        matched code cannot be converted. In the latter case ``chuuki`` is
        printed so that the annotation can be inspected manually.
    """
    # Detect "第X水準X-X-XX".
    match_obj = re.search(r'第(\d|三|四)水準\s*(\d+)-(\d+)-(\d+)', chuuki)
    if match_obj:
        # The first group (the level) is not needed for the conversion.
        men, ku, ten = map(int, match_obj.groups()[1:])
        return _jis_to_char_or_none(men, ku, ten, chuuki)

    # Detect "U+XXXX".
    match_obj = re.search(r'U\+[0-9A-Fa-f]{4,6}', chuuki)
    if match_obj:
        codepoint = int(match_obj.group()[2:], 16)
        try:
            return chr(codepoint)
        except ValueError:
            # Six hex digits can exceed 0x10FFFF, which chr() rejects.
            print(chuuki)
            return None

    # Detect "X-XX-XX" (not necessarily a JIS men-ku-ten code).
    match_obj = re.search(r'(\d+)-(\d+)-(\d+)', chuuki)
    if match_obj:
        men, ku, ten = map(int, match_obj.groups())
        return _jis_to_char_or_none(men, ku, ten, chuuki)

    return None

In [None]:
# List up the gaiji annotations.

# Distinct annotation strings found anywhere in the corpus.
chuukis = set()

for path in tqdm(paths):
    with open(path) as file:
        text = file.read()
    chuukis.update(
        found.group() for found in re.finditer(r'※?［＃[^［］]*?］', text)
    )

print(len(chuukis))

# One "character,annotation" line per annotation that could be converted.
with open('gaiji_list.txt', 'w') as output_file:
    for chuuki in chuukis:
        char = get_gaiji_char(chuuki)
        if not char:
            continue
        output_file.write(f'{char},{chuuki}\n')

# IPython shell escape: sort gaiji_list.txt and write the result to gaiji_list_sorted.txt.
!sort gaiji_list.txt > gaiji_list_sorted.txt

In [None]:
# Convert the gaiji annotations.

def replace_gaiji(match_obj):
    """Return the replacement text for one matched gaiji annotation.

    Intended as the replacement callback for ``re.sub``/``re.subn``: the
    matched annotation is passed to ``get_gaiji_char``, and the annotation
    itself is returned unchanged when no character is obtained.
    """
    annotation = match_obj.group()
    return get_gaiji_char(annotation) or annotation


# Rewrite each file in place, touching only files where something was replaced.
for path in tqdm(paths):
    with open(path) as file:
        source_text = file.read()

    converted_text, replaced_count = re.subn(
        r'※［＃[^［］]*?］', replace_gaiji, source_text
    )
    if replaced_count == 0:
        continue

    with open(path, 'w') as file:
        file.write(converted_text)