通用函数

In [1]:
import re
from collections import defaultdict

# Two-letter initials and the single key each one maps to in the
# Ziranma (自然码) double-pinyin scheme.
first = {'ch': 'i',
         'sh': 'u',
         'zh': 'v'}

# Finals and the key each one maps to. Several finals share a key.
second = {
    'ua': 'w',
    'ei': 'z',
    'e': 'e',
    'ou': 'b',
    'iu': 'q',
    've': 't',
    'ue': 't',
    'u': 'u',
    'i': 'i',
    'o': 'o',
    'uo': 'o',
    'ie': 'x',
    'a': 'a',
    'ong': 's',
    'iong': 's',
    'ai': 'l',
    'ing': 'y',
    'uai': 'y',
    'ang': 'h',
    'uan': 'r',
    'an': 'j',
    'en': 'f',
    'ia': 'w',
    'iang': 'd',
    'uang': 'd',
    'eng': 'g',
    'in': 'n',
    'ao': 'k',
    'v': 'v',
    'ui': 'v',
    'un': 'p',
    'iao': 'c',
    'ian': 'm'
}

# Zero-initial syllables (final only, at most 3 letters long).
# One-letter finals are doubled, two-letter finals are typed as-is, and
# three-letter finals become "first letter + key of the final".
special = {
    'a': 'aa',
    'ai': 'ai',
    'an': 'an',
    'ang': 'ah',
    'ao': 'ao',
    'e': 'ee',
    'ei': 'ei',
    'en': 'en',
    # 'eng' was missing: it used to fall through to the generic branch and
    # come back as the three-letter string 'eng'.
    'eng': 'eg',
    'er': 'er',
    'o': 'oo',
    'ou': 'ou',

    # Original author's note: the mantra 唵嘛呢叭咪吽 ("Om mani padme hum")
    'ong': 'os'
}

def to_double(s: str) -> str:
    """
    Convert the full-pinyin spelling of one Chinese character into its
    Ziranma double-pinyin code.

    :param s: full pinyin of a single syllable (lower case, ``v`` for ``ü``)
    :return: the double-pinyin code; ``''`` for an empty input. A final that
        is not listed in ``second`` is appended unchanged, so the result can
        be longer than two letters for unknown spellings.
    """
    if not s:
        # Nothing to convert; indexing s[0] below would raise IndexError.
        return ''

    # Special case: zero-initial syllables such as a, an, ang.
    if len(s) <= 3 and s[0] in ('a', 'e', 'o'):
        if s in special:
            return special[s]
        # Unknown zero-initial spelling: report it, then fall through to the
        # generic initial + final handling below.
        print('未知情况1', s)

    # General case: initial + final.
    if s[:2] in first:
        # Longest shape: two-letter initial + up to four-letter final,
        # e.g. chuang = ch + uang.
        initial, final = first[s[:2]], s[2:]
    else:
        # Shorter shape: one-letter initial + up to four-letter final,
        # e.g. h + uang, x + iang.
        initial, final = s[0], s[1:]

    # Keep an unmapped final verbatim in both shapes instead of silently
    # dropping it after a two-letter initial.
    return initial + second.get(final, final)


# Matches text made up only of ASCII letters and double-quote characters.
pattern = re.compile(r'^[a-zA-Z"]+$')
def is_en(text):
    """Tell whether ``text`` matches the module-level ``pattern``."""
    found = pattern.match(text)
    return found is not None

# Chinese text is split character by character; an English word stays whole.
def split_zh_en(text):
    """Tokenise ``text``: each run of ASCII letters is one token, every other
    character is a token of its own."""
    tokens = []
    for hit in re.finditer(r'[^a-zA-Z]|[a-zA-Z]+', text):
        tokens.append(hit.group(0))
    return tokens

构造自然码+形码码表

In [2]:
# Characters listed in dicts/8105.dict.yaml; auxiliary (shape) codes are only
# collected for these.
standard = set()

# The dictionaries contain CJK text, so the encoding is given explicitly
# instead of relying on the platform default.
with open('dicts/8105.dict.yaml', encoding='utf-8') as f:
    for line in f:
        line = line.rstrip()
        # Entry lines look like "<char>\t<pinyin>\t<freq>".
        if re.match(r'^.\t\w+\t\d+$', line):
            char, _, _ = line.split()
            standard.add(char)

# char -> set of auxiliary codes found for it
xcodes = defaultdict(set)

# Alternative source (disabled): flypy_zrmfast dictionary
# with open('./dicts/flypy_zrmfast.dict.yaml', 'r', encoding='utf-8') as f:
#     for line in f:
#         line = line.rstrip()
#         if re.match(r'^.\t[\w\[]+.*$', line):
#             char, code = line.split('\t')[:2]
#             if char not in standard:
#                 continue
#             code = code[code.rfind('[')+1:]
#             if code:
#                 xcodes[char].add(code)

# Active source: 手心辅易学码9.txt, one "<char>=<code>" entry per line
with open('./dicts/手心辅易学码9.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.rstrip()
        if re.match(r'^.=.*$', line):
            # Split on the first '=' only, so a code that itself contains '='
            # does not break the unpacking.
            char, code = line.split('=', 1)
            if char not in standard:
                continue
            if code:
                xcodes[char].add(code)

missing = set()   # characters with no auxiliary code
multiple = set()  # characters with more than one auxiliary code
# Header written verbatim at the top of the generated dictionary.
predefined = """---
name: zrm
version: "1.0"
sort: by_weight  
use_preset_vocabulary: true
min_phrase_weight: 100
import_tables:
  - ./dicts/custom
  - ./dicts/base
  - ./dicts/ext
...

"""

with open('../zrm.dict.yaml', 'w', encoding='utf-8') as f1, \
        open('dicts/8105.dict.yaml', encoding='utf-8') as f2:
    f1.write(predefined)
    for line in f2:
        line = line.rstrip()
        if re.match(r'^.\t\w+\t\d+$', line):
            char, pinyin, freq = line.split()
            if char in xcodes:
                if len(xcodes[char]) > 1:
                    multiple.add(char)
                # Sets have no stable order; sort so that the generated file
                # is identical from one run to the next.
                for code in sorted(xcodes[char]):
                    f1.write(f"{char}\t{to_double(pinyin)};{code}\t{freq}\n")
            else:
                # No auxiliary code known: emit the phonetic part only.
                missing.add(char)
                f1.write(f"{char}\t{to_double(pinyin)};\t{freq}\n")

print(f"{len(missing)}个字的辅助码缺失：{missing}")
print(f"{len(multiple)}个字存在多个辅助码：{multiple}")

1081个字的辅助码缺失：{'漴', '祲', '涘', '琯', '伾', '褟', '郤', '蔀', '偭', '椀', '玙', '𫗴', '渰', '𬯀', '镕', '朓', '䴕', '蒨', '珣', '鹒', '㥄', '沘', '萳', '宬', '沨', '珹', '鄀', '塆', '虸', '媭', '衒', '鲬', '訚', '绖', '椸', '桯', '砄', '垏', '烔', '䃎', '翈', '𬇹', '㟃', '𬴃', '㮾', '埫', '姞', '洴', '屃', '洓', '牥', '夬', '槚', '挦', '𥔲', '棻', '𬜯', '峃', '甦', '蓏', '浭', '耇', '鳉', '䲟', '晐', '𬸦', '鲖', '琔', '𬳵', '婠', '滫', '晢', '𪟝', '祊', '荙', '碈', '玒', '燚', '硍', '鸼', '焆', '瑳', '垚', '嫄', '㺄', '晱', '嫽', '喤', '𬭚', '鄌', '羕', '叚', '麑', '楩', '鲹', '垿', '㛃', '秬', '剅', '嵅', '棓', '璘', '陞', '浥', '圌', '𫫇', '伈', '塝', '浰', '潟', '冇', '㳇', '轪', '郚', '瘆', '𬃊', '硿', '𫓹', '垟', '𬶟', '𩾃', '纮', '崚', '𠙶', '镃', '玓', '𬊤', '鄠', '姈', '𧿹', '𫚖', '訄', '濋', '榖', '佽', '焌', '柖', '燊', '洿', '瑑', '𬟁', '𬘭', '溍', '昽', '臑', '晫', '𫘜', '淜', '俵', '侁', '鹐', '瑨', '卬', '髎', '玘', '汭', '琟', '蟏', '鲃', '嫕', '滧', '鄘', '鸮', '朏', '豮', '鞳', '枹', '嵲', '帡', '𨐈', '玤', '鞁', '憕', '藠', '淟', '巇', '筜', '陑', '茓', '锧', '𬘩', '颙', '嵎', '𫟹', '珸', '褯', '汫', '鹴', '掞', '矞', '櫆', '䦃', '昳', '沺', '桹', '祼', '觭', '

In [3]:
# full code ("<phonetic>;<auxiliary>") -> set of characters sharing it
val_dict = defaultdict(set)

# Explicit encoding: the generated dictionary contains CJK text.
with open('../zrm.dict.yaml', encoding='utf-8') as f:
    for line in f:
        line = line.rstrip()
        if re.match(r'^.\t[a-z;]+\t\d+$', line):
            char, code, _ = line.split()
            val_dict[code].add(char)

# Report codes shared by several characters; keys of length <= 3 are left out.
dups = [f"{value}[{key}]" for key, value in val_dict.items() if len(value) > 1 and len(key) > 3]
# default=0 keeps the cell from raising when no entry line was matched.
max_dup = max(map(len, val_dict.values()), default=0)
print(len(dups), max_dup)
print('\n'.join(dups))

729 24
{'哟', '唷'}[yo;ky]
{'哎', '嗳'}[ai;ka]
{'𩽾', '胺'}[an;ya]
{'熬', '聱'}[ao;ae]
{'璈', '敖'}[ao;wa]
{'爸', '捌', '扒', '把'}[ba;fb]
{'吧', '叭'}[ba;kb]
{'芭', '菝'}[ba;cb]
{'摆', '捭'}[bl;fb]
{'伯', '佰'}[bl;rb]
{'湴', '半'}[bj;da]
{'斑', '班'}[bj;ww]
{'搬', '拌'}[bj;fb]
{'扳', '扮'}[bj;ff]
{'瘢', '癍'}[bj;bb]
{'谤', '膀'}[bh;yp]
{'爆', '炮'}[bk;hb]
{'胞', '鲍', '雹', '褓'}[bk;yb]
{'苞', '葆'}[bk;cb]
{'背', '臂'}[bz;by]
{'毕', '毙'}[bi;bu]
{'避', '舭', '跸'}[bi;zb]
{'臂', '璧'}[bi;by]
{'壁', '坒'}[bi;bt]
{'蓖', '蔽', '萆', '荜', '芘'}[bi;cb]
{'婢', '妣'}[bi;nb]
{'弊', '痹'}[bi;ba]
{'庳', '髀', '庇'}[bi;gb]
{'襞', '裨'}[bi;yb]
{'吡', '哔'}[bi;kb]
{'筚', '篦'}[bi;vb]
{'皕', '毖'}[bi;bb]
{'滗', '泌', '濞'}[bi;db]
{'秘', '秕'}[bi;hb]
{'鳊', '褊'}[bm;yb]
{'苄', '萹'}[bm;cb]
{'杓', '标'}[bc;mu]
{'幖', '镖'}[bc;jp]
{'熛', '彪'}[bc;hp]
{'膘', '鳔'}[bc;yp]
{'殡', '滨'}[bn;db]
{'柄', '槟'}[by;mb]
{'饼', '屏'}[by;ub]
{'拨', '播'}[bo;ff]
{'渤', '泊'}[bo;db]
{'铂', '钵'}[bo;jb]
{'饽', '孛'}[bo;uz]
{'舶', '箔'}[bo;vb]
{'不', '卜'}[bu;ad]
{'材', '睬'}[cl;mc]
{'喳', '嚓'}[ia;ki]
{'馇', '碴'}[ia;ui]
{'槎', '

词库转换准备

In [4]:
# char -> list of double-pinyin codes read from the generated dictionary
zrm_phones = defaultdict(list)
# char -> auxiliary code (the last entry read for a character wins)
zrm_codes = {}
zrm_path = '../zrm.dict.yaml'

# Explicit encoding: the generated dictionary contains CJK text.
with open(zrm_path, encoding='utf-8') as f:
  for line in f:
    # Only "<char>\t<code>\t<freq>" entry lines have exactly three fields;
    # this skips the YAML header and blank lines.
    if len(line.split('\t')) != 3:
      continue
    char, code = line.rstrip().split('\t')[:2]
    # code is "<phonetic>;<auxiliary>"
    yin, xing = code.split(';')
    zrm_phones[char].append(yin)
    zrm_codes[char] = xing

def convert_pinyin(infile, outfile):
  """
  Rewrite a full-pinyin phrase dictionary as a double-pinyin dictionary.

  Lines that do not look like "<text>\\t<pinyin>[\\t<freq>]" are copied
  through unchanged. For entry lines:

  * if the text contains an English token, English syllables are kept as-is,
    the others are converted with ``to_double`` and everything is joined
    without separators (no auxiliary codes);
  * otherwise every syllable becomes "<double pinyin>;<auxiliary code>" and
    the syllables are joined with spaces. The entry is skipped when a
    character or its reading is not present in ``zrm_phones``, or when the
    number of characters differs from the number of syllables.

  :param infile: path of the source dictionary (read as UTF-8)
  :param outfile: path of the dictionary to write (UTF-8, overwritten)
  """
  # Explicit encoding: the dictionaries contain CJK text.
  with open(infile, encoding='utf-8') as f1, open(outfile, 'w', encoding='utf-8') as f2:
    for line in f1:
      if not re.match(r'^.+\t.+(\t\d+)*$', line):
        f2.write(line)
        continue
      parts = line.rstrip().split('\t')
      chars, phones = split_zh_en(parts[0]), parts[1].split(' ')
      # NOTE(review): the original comment here said "exclude single
      # characters", but `< 1` only rejects an empty token list — confirm
      # whether `< 2` was intended. Behaviour is left unchanged.
      if len(chars) < 1:
        continue

      if any(is_en(char) for char in chars):  # contains English
        doubled = (phone if is_en(char) else to_double(phone) for char, phone in zip(chars, phones))
        doubled_code = ''.join(doubled)
      else:
        # zip() would silently truncate on a length mismatch and emit a
        # partial code for the full word, so such entries are skipped.
        if len(chars) != len(phones):
          continue
        codes = []
        for char, phone in zip(chars, phones):
          if char not in zrm_phones:
            break
          yin = to_double(phone)  # converted once, reused for check and output
          if yin not in zrm_phones[char]:
            break
          codes.append(yin + ';' + zrm_codes[char])
        if len(codes) != len(chars):
          # A reading in this dictionary disagrees with the character table.
          continue
        doubled_code = ' '.join(codes)

      if len(parts) < 3:
        f2.write(''.join(chars) + '\t' + doubled_code + '\n')
      else:
        freq = parts[2]
        f2.write(''.join(chars) + '\t' + doubled_code + '\t' + freq + '\n')

自定义词库转换

In [5]:
# Convert the custom phrase dictionary.
convert_pinyin(
    infile='./dicts/custom.dict.yaml',
    outfile='../dicts/custom.dict.yaml',
)

其他词库转换

In [6]:
# Convert the remaining phrase dictionaries, in the same order as before.
for source, target in (
    ('./dicts/base.dict.yaml', '../dicts/base.dict.yaml'),
    ('./dicts/ext.dict.yaml', '../dicts/ext.dict.yaml'),
):
    convert_pinyin(source, target)