通用函数

In [1]:
import re
from collections import defaultdict
from itertools import chain

# Ziranma (自然码) double-pinyin layout.
#
# Two-letter initials and the single key each one is typed with.
first = {'ch': 'i',
         'sh': 'u',
         'zh': 'v'}

# Finals and the single key each one is typed with.
second = {
    'ua': 'w',
    'ei': 'z',
    'e': 'e',
    'ou': 'b',
    'iu': 'q',
    've': 't',
    'ue': 't',
    'u': 'u',
    'i': 'i',
    'o': 'o',
    'uo': 'o',
    'ie': 'x',
    'a': 'a',
    'ong': 's',
    'iong': 's',
    'ai': 'l',
    'ing': 'y',
    'uai': 'y',
    'ang': 'h',
    'uan': 'r',
    'an': 'j',
    'en': 'f',
    'ia': 'w',
    'iang': 'd',
    'uang': 'd',
    'eng': 'g',
    'in': 'n',
    'ao': 'k',
    'v': 'v',
    'ui': 'v',
    'un': 'p',
    'iao': 'c',
    'ian': 'm'
}

# Special cases: syllables that consist of a final only (no initial) and are
# at most 3 letters long. Covers one-, two- and three-letter finals.
special = {
    'a': 'aa',
    'ai': 'ai',
    'an': 'an',
    'ang': 'ah',
    'ao': 'ao',
    'e': 'ee',
    'ei': 'ei',
    'en': 'en',
    'er': 'er',
    'o': 'oo',
    'ou': 'ou',

    # Needed for 唵嘛呢叭咪吽
    'ong': 'os',
    # Needed for 嗯
    'eng': 'eg',
}


def to_double(s: str) -> str:
    """
    Convert the full-pinyin spelling of one syllable to its Ziranma
    double-pinyin code.

    Lookup order:

    1. Syllables of at most 3 letters starting with ``a``/``e``/``o`` are
       looked up in ``special``. If the syllable is not listed there, a
       warning is printed and the general rule below is applied.
    2. Otherwise the syllable is split into initial + final. A two-letter
       initial (``ch``/``sh``/``zh``) is mapped through ``first``; any other
       first letter is kept as the initial unchanged. The final is mapped
       through ``second``; a final that is not listed there is kept verbatim.

    :param s: full-pinyin spelling of a single syllable
    :return: double-pinyin code; ``''`` when ``s`` is empty
    """
    # Nothing to convert; also avoids an IndexError on s[0] below.
    if not s:
        return ''

    # Zero-initial syllables: a, an, ang, ...
    if len(s) <= 3 and s[0] in 'aeo':
        if s in special:
            return special[s]
        print('未知情况1', s)

    # General case: initial + final.
    # Longest shape is 2-letter initial + up to 4-letter final (ch + uang);
    # otherwise 1-letter initial + up to 4-letter final (h + uang, x + iang).
    if s[:2] in first:
        initial, final = first[s[:2]], s[2:]
    else:
        initial, final = s[0], s[1:]

    # An unlisted final is kept as-is for both initial lengths, so it is
    # never silently dropped.
    return initial + second.get(final, final)


# Matches text made up solely of ASCII letters and double-quote characters.
pattern = re.compile(r'^[a-zA-Z"]+$')


def is_en(text):
    """Return True when *text* is non-empty and matches ``pattern``."""
    return pattern.match(text) is not None

def split_zh_en(text):
    """
    Split *text* into tokens: every maximal run of ASCII letters becomes one
    token (an English word) and every other character becomes a token of its
    own (one Chinese character each).
    """
    return re.findall(r'[^a-zA-Z]|[a-zA-Z]+', text)

构造自然码+形码码表

In [2]:
# Whether to also load the large 41448-character table.
enable_40k = True

# Set of every character found in the enabled character tables.
standard = set()

_char_table_paths = ['dicts/8105.dict.yaml']
if enable_40k:
    _char_table_paths.append('dicts/41448.dict.yaml')

for _char_table_path in _char_table_paths:
    # The tables contain Chinese text, so decode explicitly as UTF-8 instead
    # of relying on the platform's default encoding.
    with open(_char_table_path, encoding='utf-8') as f:
        for line in f:
            line = line.rstrip()
            # Entry lines look like "<char>\t<pinyin>\t<freq>"; everything
            # else (YAML header, comments, blanks) is ignored.
            if re.match(r'^.\t\w+\t\d+$', line):
                # The regex guarantees the character is the first code point.
                standard.add(line[0])

# 鹤形
# xcodes = defaultdict(set)
# with open('./dicts/flypy_flypy.dict.yaml', 'r', encoding='utf-8') as f:
#     for line in f:
#         line = line.rstrip()
#         if re.match(r'^.\t[\w\[]+.*$', line):
#             char, code = line.split('\t')[:2]
#             if char not in standard:
#                 continue
#             code = code[code.rfind('[')+1:]
#             if code:
#                 xcodes[char].add(code)

# Jiandao 6 (jd6): maps each component used in the decomposition table to the
# letter written for it in the shape code. The trailing comment on an entry
# records the jd6 default for that component.
parts = {
    '一': 'a', # jd6 default: v
    '贝': 'b', # jd6 default: ao
    '草': 'c', # jd6 default: ii
    '丶': 'd',
    # e
    '手': 'f', # jd6 default: iu
    # g
    # i=ch
    '金': 'j', # jd6 default: io
    '口': 'k', # jd6 default: o
    '丨': 'l',
    '木': 'm', # jd6 default: v
    '日': 'o', # jd6 default: oi
    '丿': 'p',
    # q
    '人': 'r', # jd6 default: r
    '土': 't', # jd6 default: vo
    '士': 't', # jd6 default: vo
    # u=sh
    '水': 'u', # jd6 default: a
    '十': 'x', # jd6 default: ui
    '乛': 'v',
    '月': 'y', # jd6 default: u
}

def get_chaizi(path):
    """
    Load a character-decomposition table from the UTF-8 text file *path*.

    Blank lines and lines starting with ``#`` are skipped. Every other line
    is tab-separated: the first field is the character, the second its
    decomposition. An entry is kept only when the character is in the global
    ``standard`` set and its decomposition is not the placeholder ``未知``.

    :param path: path of the decomposition file
    :return: dict mapping character -> decomposition string
    """
    table = {}
    with open(path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            entry = raw.rstrip()
            if entry == '' or entry.startswith('#'):
                continue
            char, decomposition = entry.split('\t')[:2]
            if char in standard and decomposition != '未知':
                table[char] = decomposition
    return table

def _shape_code(components):
    """Join the code of every component in *components* into one shape code."""
    return ''.join(
        parts[c] if len(parts[c]) == 1 else to_double(parts[c])
        for c in components
    )


chaizi_jd6 = get_chaizi('./dicts/chaizi_jd6.txt')
# Manual decomposition entry for 搬, set after loading the file.
chaizi_jd6['搬'] = "手丿丿乛"

# character -> single-element list holding its shape code
xcodes = {
    char: [_shape_code(components)]
    for char, components in chaizi_jd6.items()
}

# Characters with no shape code / with more than one shape code.
missing = set()
multiple = set()

# YAML header written at the top of the generated dictionary.
predefined = """---
name: zrm
version: "1.0"
sort: by_weight  
use_preset_vocabulary: true
min_phrase_weight: 100
import_tables:
  - ./dicts/custom
  - ./dicts/base
  - ./dicts/ext
...

"""

# Entry lines look like "<char>\t<pinyin>\t<freq>".
_char_entry = re.compile(r'^.\t\w+\t\d+$')

# Only open the large table when it is actually enabled.
_char_sources = ['dicts/8105.dict.yaml']
if enable_40k:
    _char_sources.append('dicts/41448.dict.yaml')

# All files hold Chinese text: read and write them explicitly as UTF-8 rather
# than with the platform's default encoding.
with open('../zrm.dict.yaml', 'w', encoding='utf-8') as f1:
    f1.write(predefined)
    for _char_source in _char_sources:
        with open(_char_source, encoding='utf-8') as f2:
            for line in f2:
                line = line.rstrip()
                if not _char_entry.match(line):
                    continue
                char, pinyin, freq = line.split()
                if char in xcodes:
                    if len(xcodes[char]) > 1:
                        multiple.add(char)
                    # One output line per shape code: "<double pinyin>;<shape>".
                    for code in xcodes[char]:
                        f1.write(f"{char}\t{to_double(pinyin)};{code}\t{freq}\n")
                else:
                    # No shape code known: leave the part after ';' empty.
                    missing.add(char)
                    f1.write(f"{char}\t{to_double(pinyin)};\t{freq}\n")

print(f"{len(missing)}个字的辅助码缺失：{missing}")
print(f"{len(multiple)}个字存在多个辅助码：{multiple}")

1129个字的辅助码缺失：{'耇', '虒', '潟', '𬭚', '𬶟', '祇', '巇', '䓛', '酂', '烔', '𫞩', '䢼', '殣', '嘚', '囷', '𫐄', '𬤊', '垏', '垞', '幪', '珖', '潏', '䐃', '䣘', '溦', '𫘜', '珕', '洓', '濩', '祂', '酺', '翀', '琀', '翯', '糵', '𤫉', '瑖', '㕮', '穙', '嬿', '蛃', '赗', '瑢', '玓', '㭕', '禋', '玙', '㙍', '溇', '𪨊', '厾', '扊', '𬣳', '𫖮', '𬶋', '𬍡', '瑓', '䢺', '璠', '𬺓', '耤', '玕', '嶦', '骦', '翈', '洑', '牻', '惙', '𬘬', '焞', '犨', '峿', '纴', '巉', '旵', '烜', '泃', '𬭛', '脿', '迺', '腨', '𤩽', '蝲', '盉', '滪', '㴔', '䗛', '衃', '𬘘', '𬶮', '喤', '暶', '𬬿', '燋', '靰', '愃', '苧', '𬜯', '倓', '垱', '𬬮', '嫪', '鹀', '勚', '蔊', '梌', '棤', '禘', '䓖', '鄌', '苾', '烶', '芔', '崿', '珷', '吽', '㑊', '哃', '蒱', '𨱇', '𫟷', '挼', '硿', '𫵷', '藠', '蓇', '𬳶', '劄', '𬤝', '㺄', '摛', '颎', '䴓', '洈', '𡐓', '翷', '洣', '郪', '屮', '鞨', '嵁', '蒐', '鹴', '宧', '晙', '𬍤', '疢', '墘', '蕰', '鬘', '煃', '峱', '捯', '𬶠', '斶', '𬭤', '腒', '𬇙', '涐', '簉', '韨', '鼫', '禛', '浉', '畯', '屼', '𬇕', '榃', '暕', '䏝', '𨱏', '硁', '耏', '暅', '穄', '罍', '圲', '沺', '堾', '肸', '𫠜', '婤', '徛', '筦', '龂', '訄', '炣', '哢', '峧', '皦', '袗', '峂', '淯', '欻', '䲠', '玱', '喆', '

重码

In [12]:
# code -> set of characters sharing that full code ("<double pinyin>;<shape>")
code2chars = defaultdict(set)
# "<char><code>" -> frequency
charcode2freq = dict()

# Entry lines of the generated table look like "<char>\t<code>\t<freq>".
_zrm_entry = re.compile(r'^.\t[a-z;]+\t\d+$')

# Read the generated table once, explicitly as UTF-8; both passes below work
# on the in-memory lines.
with open('../zrm.dict.yaml', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Pass 1: group characters by code and remember every entry's frequency.
for line in lines:
    line = line.rstrip()
    if _zrm_entry.match(line):
        char, code, freq = line.split()
        code2chars[code].add(char)
        charcode2freq[char + code] = int(freq)

# Pass 2: for every entry whose code is shared, print the colliding group.
# A group is printed once per entry that belongs to it.
for line in lines:
    line = line.rstrip()
    if _zrm_entry.match(line):
        char, code, freq = line.split()
        if len(code2chars[code]) > 1:
            print(code2chars[code])
        # Disabled experiment: append one 'j' per higher-frequency collision
        # to the code and write the table back. Re-enabling it requires
        # opening the file for writing as `file`.
        #         fs = [charcode2freq[c + code] for c in code2chars[code]]
        #         fs.sort(reverse=True)
        #         code = code + fs.index(int(freq)) * 'j'
        #         line = '\t'.join([char, code, freq])
        # file.write(line + '\n')

{'䏲', '䴘', '媞', '薙', '擿', '遆', '𫘨', '趯', '瑅'}
{'卻', '埆', '𬒈', '碏'}
{'鞳', '阘', '牠', '褟', '鿎', '祂'}
{'葖', '㻬', '梌', '稌', '凃', '𬳿', '腯'}
{'翛', '涍', '𫍲', '洨', '滧', '敩', '蟏', '咲'}
{'梣', '骎', '螓'}
{'衃', '珮'}
{'㬊', '貆', '睆', '荁', '峘', '𬘫', '𤩽', '澴'}
{'㨃', '祋'}
{'滆', '鿔'}
{'麑', '薿', '𫠜', '𫐐', '𨺙', '鿭'}
{'朸', '𫵷', '凓', '浰', '醨', '㰀', '珕', '𫟷', '浬', '𬍛'}
{'瑱', '鿬', '沺', '湉', '晪', '黇', '盷', '淟'}
{'瑷', '叆'}
{'瑷', '叆'}
{'螯', '嶅'}
{'螯', '嶅'}
{'粑', '羓'}
{'蚆', '岜'}
{'蚆', '岜'}
{'粑', '羓'}
{'瘢', '瓣'}
{'瘢', '瓣'}
{'棒', '梆'}
{'棒', '梆'}
{'鲍', '鸨', '饱'}
{'鲍', '鸨', '饱'}
{'鲍', '鸨', '饱'}
{'辟', '襞'}
{'吡', '哔'}
{'吡', '哔'}
{'荜', '芘'}
{'萆', '蓖'}
{'秕', '筚'}
{'秕', '筚'}
{'荜', '芘'}
{'辟', '襞'}
{'萆', '蓖'}
{'秘', '馝'}
{'秘', '馝'}
{'遍', '褊'}
{'遍', '褊'}
{'𨚕', '萹', '抃'}
{'𨚕', '萹', '抃'}
{'𨚕', '萹', '抃'}
{'瀌', '脿', '藨', '幖', '儦'}
{'瀌', '脿', '藨', '幖', '儦'}
{'瀌', '脿', '藨', '幖', '儦'}
{'瀌', '脿', '藨', '幖', '儦'}
{'瀌', '脿', '藨', '幖', '儦'}
{'昺', '蛃'}
{'昺', '蛃'}
{'钵', '镈'}
{'哱', '袯', '𬭛', '嶓', '砵', '僰', '浡', '欂'}
{'钵', '镈'}
{'哱', '袯', '𬭛',

词库转换准备

In [6]:
# character -> list of double-pinyin readings found in the generated table
zrm_phones = defaultdict(list)
# character -> shape code (the last entry seen for the character wins)
zrm_codes = {}
zrm_path = '../zrm.dict.yaml'

# The table holds Chinese text: decode it explicitly as UTF-8 rather than
# with the platform's default encoding.
with open(zrm_path, encoding='utf-8') as f:
    for line in f:
        # Only "<char>\t<code>\t<freq>" lines carry entries; the YAML header
        # and blank lines are skipped.
        if len(line.split('\t')) != 3:
            continue
        char, code = line.rstrip().split('\t')[:2]
        # The code is "<double pinyin>;<shape code>".
        yin, xing = code.split(';')
        zrm_phones[char].append(yin)
        zrm_codes[char] = xing

def convert_pinyin(infile, outfile, override_freq=None):
    """
    Convert a full-pinyin phrase dictionary to double-pinyin codes.

    Lines that do not look like ``<text>\\t<pinyin>[\\t<freq>]`` (for example
    the YAML header) are copied to *outfile* unchanged. For entry lines:

    * If the text contains an English word, English tokens keep their
      spelling, other syllables are converted with ``to_double``, and the
      pieces are concatenated without separator and without shape codes.
    * Otherwise every character must be known in ``zrm_phones`` with the
      converted reading; each syllable becomes ``<double pinyin>;<shape>``
      and syllables are joined with spaces.
    * Entries whose readings disagree with the character table, or whose
      number of characters differs from the number of syllables, are dropped.

    :param infile: path of the source dictionary (UTF-8)
    :param outfile: path of the converted dictionary to write (UTF-8)
    :param override_freq: frequency written for entries that have no
        frequency column; ``None`` leaves such entries without one
    """
    # Both files hold Chinese text: use UTF-8 explicitly rather than the
    # platform's default encoding.
    with open(infile, encoding='utf-8') as f1, \
            open(outfile, 'w', encoding='utf-8') as f2:
        for line in f1:
            if not re.match(r'^.+\t.+(\t\d+)*$', line):
                f2.write(line)
                continue
            parts = line.rstrip().split('\t')
            chars, phones = split_zh_en(parts[0]), parts[1].split(' ')
            # NOTE(review): the original comment here said "exclude single
            # characters", but this test only rejects an empty split, so
            # single-character entries are converted too. Confirm the
            # intended threshold before changing it.
            if len(chars) < 1:
                continue

            pairs = list(zip(chars, phones))
            if any(is_en(char) for char in chars):
                # Contains English: adding shape codes here makes the input
                # box prompt misbehave, so emit the sound codes only.
                doubled_code = ''.join(
                    phone if is_en(char) else to_double(phone)
                    for char, phone in pairs
                )
            elif len(chars) == len(phones) and all(
                char in zrm_phones and to_double(phone) in zrm_phones[char]
                for char, phone in pairs
            ):
                # The length check stops zip() from silently truncating an
                # entry whose characters and syllables do not line up.
                doubled_code = ' '.join(
                    to_double(phone) + ';' + zrm_codes[char]
                    for char, phone in pairs
                )
            else:
                # A reading in the phrase dictionary does not match the
                # character table: drop the entry.
                continue

            fields = [''.join(chars), doubled_code]
            if len(parts) >= 3:
                fields.append(parts[2])
            elif override_freq is not None:
                fields.append(str(override_freq))
            f2.write('\t'.join(fields) + '\n')

自定义词库转换

In [7]:
# Convert the custom phrase dictionary; entries that carry no frequency
# column are written with frequency 10000.
convert_pinyin('./dicts/custom.dict.yaml', '../dicts/custom.dict.yaml', override_freq=10000)

其他词库转换

In [16]:
# Convert the remaining phrase dictionaries from ./dicts into ../dicts,
# keeping whatever frequency column each entry already has.
convert_pinyin('./dicts/base.dict.yaml', '../dicts/base.dict.yaml')
convert_pinyin('./dicts/ext.dict.yaml', '../dicts/ext.dict.yaml')
convert_pinyin('./dicts/others.dict.yaml', '../dicts/others.dict.yaml')