通用函数

In [1]:
import re
from collections import defaultdict

first = {'ch': 'i',
         'sh': 'u',
         'zh': 'v'}

second = {
    'ua': 'w',
    'ei': 'z',
    'e': 'e',
    'ou': 'b',
    'iu': 'q',
    've': 't',
    'ue': 't',
    'u': 'u',
    'i': 'i',
    'o': 'o',
    'uo': 'o',
    'ie': 'x',
    'a': 'a',
    'ong': 's',
    'iong': 's',
    'ai': 'l',
    'ing': 'y',
    'uai': 'y',
    'ang': 'h',
    'uan': 'r',
    'an': 'j',
    'en': 'f',
    'ia': 'w',
    'iang': 'd',
    'uang': 'd',
    'eng': 'g',
    'in': 'n',
    'ao': 'k',
    'v': 'v',
    'ui': 'v',
    'un': 'p',
    'iao': 'c',
    'ian': 'm'
}

# 特殊，只有䪨母，且总长不过 3
# 零声母，单双三䪨母
special = {
    'a': 'aa',
    'ai': 'ai',
    'an': 'an',
    'ang': 'ah',
    'ao': 'ao',
    'e': 'ee',
    'ei': 'ei',
    'en': 'en',
    'er': 'er',
    'o': 'oo',
    'ou': 'ou',

    # 唵嘛呢叭咪吽
    'ong': 'os'
}

def to_double(s: str) -> str:
    """
    传入单汉字的全拼编码，反回其自然码双拼编码

    :param s: 全拼编码
    :return: 双拼编码
    """
    new_s = ''
    # 特列情况: 无声母，a, an, ang
    if len(s) <= 3 and s[0] in ['a', 'e', 'o']:
        if s in special.keys():
            return special[s]
        else:
            print('未知情况1', s)

    # 一般: 声母 + 䪨母

    # 最长的情况：first+second，例如 chuang = ch + uang
    # 2 位声母 + 最多 4 位韵母
    if s[:2] in first.keys():
        new_s += first[s[:2]]
        # 最多 4 位䪨母
        if s[2:] in second.keys():
            new_s += second[s[2:]]
    # 较短的情况：second+second，例如 h uang, x iang
    # 1 位声母 + 最多 4 位䪨母
    else:
        new_s += s[0]  # 1 位声母
        # 最多 4 位䪨母
        if s[1:] in second.keys():
            new_s += second[s[1:]]
        else:
            new_s += s[1:]

    return new_s


pattern = re.compile(r'^[a-zA-Z"]+$')
def is_en(text):
    return bool(pattern.match(text))

# 中文分成每一个字，英文分成一个单词
def split_zh_en(text):
    pattern = re.compile(r'[^a-zA-Z]|[a-zA-Z]+')
    return pattern.findall(text)

构造自然码+形码码表

In [2]:
standard = set()

with open('dicts/8105.dict.yaml') as f:
    for line in f:
        line = line.rstrip()
        if re.match(r'^.\t\w+\t\d+$', line):
            char, _, freq = line.split()
            standard.add(char)

# 鹤形
# xcodes = defaultdict(set)
# with open('./dicts/flypy_flypy.dict.yaml', 'r', encoding='utf-8') as f:
#     for line in f:
#         line = line.rstrip()
#         if re.match(r'^.\t[\w\[]+.*$', line):
#             char, code = line.split('\t')[:2]
#             if char not in standard:
#                 continue
#             code = code[code.rfind('[')+1:]
#             if code:
#                 xcodes[char].add(code)

# 键道6
parts = {
    '一': 'a', # jd6默认 v
    '贝': 'b', # jd6默认 ao
    '草': 'c', # jd6默认 ii
    '丶': 'd',
    # e
    '手': 'f', # jd6默认 iu
    # g
    # i=ch
    '金': 'j', # jd6默认 io
    '口': 'k', # jd6默认 o
    '丨': 'l',
    '木': 'm', # jd6默认 v
    '日': 'o', # jd6默认 oi
    '丿': 'p',
    # q
    '人': 'r', # jd6默认 r
    '土': 't', # jd6默认 vo
    '士': 't', # jd6默认 vo
    # u=sh
    '水': 'u', # jd6默认 a
    '十': 'x', # jd6默认 ui
    '乛': 'v',
    '月': 'y', # jd6默认 u
}

def get_chaizi(path):
    with open(path, 'r', encoding='utf-8') as f:
        chaizi = {}
        for line in f:
            line = line.rstrip()
            if not line or line.startswith('#'):
                continue
            char, parts = line.split('\t')[:2]
            if char not in standard or parts == '未知':
                continue
            chaizi[char] = parts
        return chaizi

chaizi_jd6 = get_chaizi('./dicts/chaizi_jd6.txt')
chaizi_jd6['搬'] = "手丿丿乛"
xcodes = {key: [''.join([parts[c] if len(parts[c]) == 1 else to_double(parts[c]) for c in value])] for key, value in chaizi_jd6.items()}

missing = set()
multiple = set()
predefined = """---
name: zrm
version: "1.0"
sort: by_weight  
use_preset_vocabulary: true
min_phrase_weight: 100
import_tables:
  - ./dicts/custom
  - ./dicts/base
  - ./dicts/ext
...

"""

with open('../zrm.dict.yaml', 'w') as f1, open('dicts/8105.dict.yaml') as f2:
    f1.write(predefined)
    for line in f2:
        line = line.rstrip()
        if re.match(r'^.\t\w+\t\d+$', line):
            char, pinyin, freq = line.split()
            if char in xcodes:
                if len(xcodes[char]) > 1:
                    multiple.add(char)
                for code in xcodes[char]:
                    f1.write(f"{char}\t{to_double(pinyin)};{code}\t{freq}\n")
            else:
                missing.add(char)
                f1.write(f"{char}\t{to_double(pinyin)};\t{freq}\n")

print(f"{len(missing)}个字的辅助码缺失：{missing}")
print(f"{len(multiple)}个字存在多个辅助码：{multiple}")

1125个字的辅助码缺失：{'螣', '媂', '𫐓', '疐', '朏', '𬬿', '蹐', '𬍤', '辿', '皭', '僇', '瑳', '砵', '𩾌', '媭', '𬶍', '蝘', '菼', '柊', '𫚕', '哱', '硔', '瑔', '涘', '𫘜', '扅', '埼', '訄', '戭', '崒', '𫐄', '靰', '鬶', '甗', '蓏', '𫓶', '鹡', '璪', '嫕', '虷', '硿', '勔', '𫔍', '尨', '瑃', '刬', '荙', '𫔎', '齉', '燊', '㟃', '濋', '憺', '浲', '禔', '掞', '瑀', '㳘', '垾', '𬤊', '䏲', '䗪', '棬', '婤', '偭', '芔', '㬊', '𦰡', '𬍡', '篯', '𬣙', '黇', '斠', '秬', '阇', '昫', '慥', '礌', '珫', '匜', '筶', '凓', '瓀', '焌', '儳', '㺄', '湉', '鼫', '嗐', '𬭛', '榑', '璒', '嶓', '潽', '鿏', '屃', '藦', '袯', '崌', '僰', '恔', '鹢', '晱', '衎', '嵅', '𦝼', '𬴃', '𣗋', '碃', '霅', '汋', '㕮', '翃', '嫪', '杻', '炌', '佁', '𬟁', '龂', '龢', '𬶋', '漈', '𬟽', '𡎚', '澂', '杧', '鄫', '谼', '圐', '溞', '洈', '峱', '楪', '栴', '秾', '𥕢', '貆', '𦒍', '襚', '𫮃', '岊', '䲠', '闿', '浰', '墡', '褯', '窎', '扊', '𬊤', '姱', '葎', '瑑', '珇', '膙', '烔', '洴', '玚', '爟', '漷', '纼', '𫫇', '喤', '㬚', '祼', '猰', '𬬻', '汈', '嫽', '梽', '蘘', '𪤗', '罶', '枍', '筜', '𬬭', '韨', '徛', '𠙶', '炆', '叚', '𬸪', '滍', '赪', '咍', '盉', '𬂩', '𫐐', '𬭤', '墣', '鹀', '赗', '弢', '𬶏', '漻', '𬹼', '垟', '𬒔', '

重码

In [6]:
code2chars = defaultdict(set)
charcode2freq = dict()

with open('../zrm.dict.yaml') as f:
    for line in f:
        line = line.rstrip()
        if re.match(r'^.\t[a-z;]+\t\d+$', line):
            char, code, freq = line.split()
            code2chars[code].add(char)
            charcode2freq[char + code] = int(freq)

with open('../zrm.dict.yaml', 'r') as file:
    lines = file.readlines()

with open('../zrm.dict.yaml', 'r') as file:
    for line in lines:
        line = line.rstrip()
        if re.match(r'^.\t[a-z;]+\t\d+$', line):
            char, code, freq = line.split()
            if len(code2chars[code]) > 1:
                print(code2chars[code])
        #         fs = [charcode2freq[c + code] for c in code2chars[code]]
        #         fs.sort(reverse=True)
        #         code = code + fs.index(int(freq)) * 'j'
        #         line = '\t'.join([char, code, freq])
        # file.write(line + '\n')

{'薙', '趯', '媞', '遆', '䏲', '𫘨', '擿', '䴘', '瑅'}
{'衃', '珮'}
{'叆', '瑷'}
{'叆', '瑷'}
{'嶅', '螯'}
{'嶅', '螯'}
{'粑', '羓'}
{'蚆', '岜'}
{'蚆', '岜'}
{'粑', '羓'}
{'瘢', '瓣'}
{'瘢', '瓣'}
{'梆', '棒'}
{'梆', '棒'}
{'饱', '鸨', '鲍'}
{'饱', '鸨', '鲍'}
{'饱', '鸨', '鲍'}
{'襞', '辟'}
{'哔', '吡'}
{'哔', '吡'}
{'芘', '荜'}
{'蓖', '萆'}
{'筚', '秕'}
{'筚', '秕'}
{'芘', '荜'}
{'襞', '辟'}
{'蓖', '萆'}
{'馝', '秘'}
{'馝', '秘'}
{'遍', '褊'}
{'遍', '褊'}
{'𨚕', '抃', '萹'}
{'𨚕', '抃', '萹'}
{'𨚕', '抃', '萹'}
{'儦', '瀌', '脿', '幖', '藨'}
{'儦', '瀌', '脿', '幖', '藨'}
{'儦', '瀌', '脿', '幖', '藨'}
{'儦', '瀌', '脿', '幖', '藨'}
{'儦', '瀌', '脿', '幖', '藨'}
{'蛃', '昺'}
{'蛃', '昺'}
{'钵', '镈'}
{'欂', '浡', '袯', '𬭛', '哱', '僰', '嶓', '砵'}
{'钵', '镈'}
{'欂', '浡', '袯', '𬭛', '哱', '僰', '嶓', '砵'}
{'欂', '浡', '袯', '𬭛', '哱', '僰', '嶓', '砵'}
{'欂', '浡', '袯', '𬭛', '哱', '僰', '嶓', '砵'}
{'欂', '浡', '袯', '𬭛', '哱', '僰', '嶓', '砵'}
{'欂', '浡', '袯', '𬭛', '哱', '僰', '嶓', '砵'}
{'欂', '浡', '袯', '𬭛', '哱', '僰', '嶓', '砵'}
{'欂', '浡', '袯', '𬭛', '哱', '僰', '嶓', '砵'}
{'埗', '𬷕', '蔀'}
{'埗', '𬷕', '蔀'}
{'埗', '𬷕', '蔀'}
{'芔', '𥕢', 

词库转换准备

In [3]:
zrm_phones = defaultdict(list)
zrm_codes = {}
zrm_path = '../zrm.dict.yaml'

with open(zrm_path) as f:
  for line in f:
    if len(line.split('\t')) != 3:
      continue
    char, code = line.rstrip().split('\t')[:2]
    yin, xing = code.split(';')
    zrm_phones[char].append(yin)
    zrm_codes[char] = xing

def convert_pinyin(infile, outfile, override_freq=None):
  with open(infile) as f1, open(outfile, 'w') as f2:
    for line in f1:
      if not re.match(r'^.+\t.+(\t\d+)*$', line):
        f2.write(line)
        continue
      parts = line.rstrip().split('\t')
      chars, phones = split_zh_en(parts[0]), parts[1].split(' ')
      if len(chars) < 1:  # 排除单字
        continue

      if any(is_en(char) for char in chars): # 存在英语
        doubled = (phone if is_en(char) else to_double(phone) for char,phone in zip(chars, phones))
        doubled_code = ''.join(doubled)
      elif all((char in zrm_phones and to_double(phone) in zrm_phones[char]) for char,phone in zip(chars, phones)):
        doubled = (phone if is_en(char) else to_double(phone) + ';' + zrm_codes[char] for char,phone in zip(chars, phones))
        # doubled = (phone if is_en(char) else to_double(phone) for char,phone in zip(chars, phones))
        doubled_code = ' '.join(doubled)
      else: # 存在词库读音与字库设置读音不符
        continue

      if len(parts) < 3:
        if override_freq:
          f2.write(''.join(chars) + '\t' + doubled_code + '\t' + override_freq + '\n')
        else:
          f2.write(''.join(chars) + '\t' + doubled_code + '\n')
      else:
        freq = parts[2]
        f2.write(''.join(chars) + '\t' + doubled_code + '\t' + freq + '\n')

自定义词库转换

In [5]:
convert_pinyin('./dicts/custom.dict.yaml', '../dicts/custom.dict.yaml', override_freq="1000")

其他词库转换

In [6]:
convert_pinyin('./dicts/base.dict.yaml', '../dicts/base.dict.yaml')
convert_pinyin('./dicts/ext.dict.yaml', '../dicts/ext.dict.yaml')