通用函数

In [1]:
import re
from collections import defaultdict
from itertools import chain

# Two-letter initials and the single key each one is typed with.
first = {'ch': 'i',
         'sh': 'u',
         'zh': 'v'}

# Finals and the single key each one is typed with.
second = {
    'ua': 'w',
    'ei': 'z',
    'e': 'e',
    'ou': 'b',
    'iu': 'q',
    've': 't',
    'ue': 't',
    'u': 'u',
    'i': 'i',
    'o': 'o',
    'uo': 'o',
    'ie': 'x',
    'a': 'a',
    'ong': 's',
    'iong': 's',
    'ai': 'l',
    'ing': 'y',
    'uai': 'y',
    'ang': 'h',
    'uan': 'r',
    'an': 'j',
    'en': 'f',
    'ia': 'w',
    'iang': 'd',
    'uang': 'd',
    'eng': 'g',
    'in': 'n',
    'ao': 'k',
    'v': 'v',
    'ui': 'v',
    'un': 'p',
    'iao': 'c',
    'ian': 'm'
}

# Special case: syllables made of a final only (zero initial), at most three
# letters long. Covers one-, two- and three-letter finals.
special = {
    'a': 'aa',
    'ai': 'ai',
    'an': 'an',
    'ang': 'ah',
    'ao': 'ao',
    'e': 'ee',
    'ei': 'ei',
    'en': 'en',
    'er': 'er',
    'o': 'oo',
    'ou': 'ou',

    # As used in the six-syllable mantra "om mani padme hum".
    'ong': 'os',
    # The interjection character U+55EF.
    'eng': 'eg',
}


def to_double(s: str) -> str:
    """Convert the full-pinyin spelling of one syllable to its Ziranma code.

    Lookup order:

    1. Zero-initial syllables (at most three letters, starting with
       ``a``/``e``/``o``) are looked up in ``special``.
    2. Otherwise the syllable is split into an initial (``zh``/``ch``/``sh``
       via ``first``, or its first letter) and a final (via ``second``).

    A final that is not listed in ``second`` is kept verbatim for every kind
    of initial, so unknown input is never silently shortened.

    :param s: full-pinyin spelling of a single syllable
    :return: the double-pinyin code; ``''`` for an empty input
    """
    if not s:
        return ''

    # Zero-initial syllables have dedicated two-key codes.
    if len(s) <= 3 and s[0] in ('a', 'e', 'o'):
        if s in special:
            return special[s]
        # Not fatal: report the syllable and fall back to the generic split.
        print('未知情况1', s)

    # Longest form is a two-letter initial plus a final of up to four
    # letters (e.g. "chuang" = "ch" + "uang"); otherwise the initial is the
    # first letter (e.g. "h" + "uang", "x" + "iang").
    if s[:2] in first:
        initial, final = first[s[:2]], s[2:]
    else:
        initial, final = s[0], s[1:]

    return initial + second.get(final, final)


# Accepts text made up only of ASCII letters and double-quote characters.
pattern = re.compile(r'^[a-zA-Z"]+$')


def is_en(text):
    """Return True when *text* consists solely of ASCII letters and '"'.

    The empty string is rejected. Because the pattern ends in ``$``, one
    trailing newline after the letters is tolerated.
    """
    return pattern.match(text) is not None

def split_zh_en(text):
    """Split *text* into tokens: letter runs stay whole, the rest is per char.

    Each maximal run of ASCII letters becomes one token (an English word);
    every other character, CJK or not, becomes a token of its own.
    """
    return re.findall(r'[^a-zA-Z]|[a-zA-Z]+', text)

构造自然码+形码码表

In [2]:
# Whether to also use the large 41448-character table.
enable_40k = False

# Tables whose characters are accepted; the large table is optional.
dict_paths = ['dicts/8105.dict.yaml']
if enable_40k:
    dict_paths.append('dicts/41448.dict.yaml')

# Every character found in an entry line of the tables above.
standard = set()

for dict_path in dict_paths:
    with open(dict_path, encoding='utf-8') as f:
        for line in f:
            line = line.rstrip()
            # Entry lines look like "<char>\t<pinyin>\t<frequency>".
            if not re.match(r'^.\t\w+\t\d+$', line):
                continue
            char, _, freq = line.split()
            standard.add(char)

# 鹤形
# xcodes = defaultdict(set)
# with open('./dicts/flypy_flypy.dict.yaml', 'r', encoding='utf-8') as f:
#     for line in f:
#         line = line.rstrip()
#         if re.match(r'^.\t[\w\[]+.*$', line):
#             char, code = line.split('\t')[:2]
#             if char not in standard:
#                 continue
#             code = code[code.rfind('[')+1:]
#             if code:
#                 xcodes[char].add(code)

# Jiandao 6 (jd6)
# Maps a character component (stroke or radical) to the single letter used
# for it in the shape code. The trailing comments record the jd6 default
# code for the same component; the bare-letter comments mark keys that have
# no component assigned here (presumably left unused on purpose).
parts = {
    '一': 'a', # jd6 default: v
    '贝': 'b', # jd6 default: ao
    '草': 'c', # jd6 default: ii
    '丶': 'd',
    # e
    '手': 'f', # jd6 default: iu
    # g
    # i=ch
    '金': 'j', # jd6 default: io
    '口': 'k', # jd6 default: o
    '丨': 'l',
    '木': 'm', # jd6 default: v
    '日': 'o', # jd6 default: oi
    '丿': 'p',
    # q
    '人': 'r', # jd6 default: r
    '土': 't', # jd6 default: vo
    '士': 't', # jd6 default: vo
    # u=sh
    '水': 'u', # jd6 default: a
    '十': 'x', # jd6 default: ui
    '乛': 'v',
    '月': 'y', # jd6 default: u
}

def get_chaizi(path):
    """Load a character-decomposition table from a tab-separated file.

    Blank lines and lines starting with ``#`` are ignored. Of each remaining
    line only the first two tab-separated fields are used: the character and
    its component string. Entries are kept only when the character is in the
    module-level ``standard`` set and the components are not the placeholder
    '未知'.

    :param path: path of the decomposition file (UTF-8)
    :return: dict mapping character -> component string
    """
    mapping = {}
    with open(path, 'r', encoding='utf-8') as source:
        for raw in source:
            entry = raw.rstrip()
            if entry == '' or entry.startswith('#'):
                continue
            char, components = entry.split('\t')[:2]
            if components != '未知' and char in standard:
                mapping[char] = components
    return mapping

def _shape_code(components):
    """Spell a component string as letters, one ``parts`` lookup per component."""
    letters = []
    for component in components:
        letter = parts[component]
        # Single letters are used as-is; anything longer goes through to_double.
        letters.append(letter if len(letter) == 1 else to_double(letter))
    return ''.join(letters)


chaizi_jd6 = get_chaizi('./dicts/chaizi_jd6.txt')
# Manual decomposition entry for this character.
chaizi_jd6['搬'] = "手丿丿乛"
# character -> one-element list holding its shape code
xcodes = {char: [_shape_code(components)] for char, components in chaizi_jd6.items()}

# Characters written without a shape code because none was available.
missing = set()
# Characters that have more than one shape code.
multiple = set()
# YAML header written verbatim at the top of the generated dictionary file.
predefined = """---
name: zrm
version: "1.0"
sort: by_weight  
use_preset_vocabulary: true
min_phrase_weight: 100
import_tables:
  - ./dicts/custom
  - ./dicts/base
  - ./dicts/ext
  - ./dicts/fix
...

"""

# Source tables to convert, in output order. The 41448 table is opened only
# when it is enabled, so the default run does not need that file to exist.
_source_paths = ['dicts/8105.dict.yaml']
if enable_40k:
    _source_paths.append('dicts/41448.dict.yaml')

# Entry lines look like "<char>\t<pinyin>\t<frequency>".
_source_entry_re = re.compile(r'^.\t\w+\t\d+$')

with open('../zrm.dict.yaml', 'w', encoding='utf-8') as f1:
    f1.write(predefined)
    for _source_path in _source_paths:
        with open(_source_path, encoding='utf-8') as f2:
            for line in f2:
                line = line.rstrip()
                if not _source_entry_re.match(line):
                    continue
                char, pinyin, freq = line.split()
                if char in xcodes:
                    if len(xcodes[char]) > 1:
                        multiple.add(char)
                    # One output line per shape code: "<char>\t<yin>;<xing>\t<freq>".
                    for code in xcodes[char]:
                        f1.write(f"{char}\t{to_double(pinyin)};{code}\t{freq}\n")
                else:
                    # No shape code known: keep the entry with an empty shape part.
                    missing.add(char)
                    f1.write(f"{char}\t{to_double(pinyin)};\t{freq}\n")

print(f"{len(missing)}个字的辅助码缺失：{missing}")
print(f"{len(multiple)}个字存在多个辅助码：{multiple}")

1119个字的辅助码缺失：{'屼', '畯', '𦒍', '𫘧', '秾', '爟', '靰', '𬣳', '帡', '琭', '𬳽', '瘆', '砄', '䥽', '嶦', '纴', '蔃', '𬀪', '傉', '㑇', '郈', '枅', '䗛', '珷', '冮', '滧', '梽', '𨚕', '婤', '夬', '崡', '𬭁', '斶', '齇', '葴', '𬶟', '酦', '桯', '朏', '朸', '蓇', '甦', '圢', '咥', '㠇', '爔', '郃', '𪤗', '垺', '婍', '耤', '腨', '嬥', '洢', '泃', '祏', '榰', '翃', '薿', '萩', '瑨', '菉', '堉', '漴', '祋', '洨', '𬭩', '夐', '廋', '潾', '𬭶', '柷', '腒', '粿', '𬘭', '赪', '珫', '蹐', '㨃', '蒄', '踒', '𫰛', '翙', '澽', '咍', '𬬹', '𬨎', '炆', '𬸚', '僔', '璠', '淜', '璆', '瑱', '坋', '觟', '簉', '薸', '涘', '郚', '狉', '膙', '𬬿', '𪨊', '璘', '繄', '鞬', '矼', '𠳐', '叇', '䴗', '盷', '惙', '髎', '峂', '睎', '埵', '觭', '焞', '蝘', '硁', '靬', '禤', '硚', '凘', '鄜', '𬶨', '笯', '堌', '玭', '龢', '稌', '忳', '藦', '杄', '鼫', '浛', '荖', '璲', '𪾢', '萣', '𬇹', '艎', '浲', '穟', '昡', '墕', '橞', '埪', '媆', '𫖳', '拤', '伣', '璪', '𫠜', '薳', '珖', '䦃', '扞', '𬍛', '刬', '篯', '炁', '佺', '栒', '哢', '剅', '蟏', '鹲', '苧', '鞳', '汧', '梌', '𫄸', '𫄷', '堨', '𥖨', '垕', '癿', '𫶇', '㭎', '䗪', '姞', '欸', '鹠', '璱', '朓', '燏', '湑', '鹢', '暅', '𬤝', '姶', '䁖', '蚲', '胣', '鬒', '

重码

In [3]:
# code -> set of characters sharing that code
code2chars = defaultdict(set)
# "<char><code>" -> frequency
charcode2freq = dict()

# Read the generated dictionary once; both passes below work on these lines.
with open('../zrm.dict.yaml', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Entry lines look like "<char>\t<code>\t<frequency>".
_dict_entry_re = re.compile(r'^.\t[a-z;]+\t\d+$')

# Pass 1: group characters by code and remember each entry's frequency.
for line in lines:
    line = line.rstrip()
    if _dict_entry_re.match(line):
        char, code, freq = line.split()
        code2chars[code].add(char)
        charcode2freq[char + code] = int(freq)

# Pass 2: for every entry whose code is shared, print the colliding group
# (so a group of k characters is printed k times, once per entry).
for line in lines:
    line = line.rstrip()
    if _dict_entry_re.match(line):
        char, code, freq = line.split()
        if len(code2chars[code]) > 1:
            print(code2chars[code])
        # Disabled draft: disambiguate colliding codes by appending one 'j'
        # per frequency rank and writing every line back. Re-enabling it
        # needs an output file opened for writing.
        #     fs = [charcode2freq[c + code] for c in code2chars[code]]
        #     fs.sort(reverse=True)
        #     code = code + fs.index(int(freq)) * 'j'
        #     line = '\t'.join([char, code, freq])
        # file.write(line + '\n')

{'瑷', '叆'}
{'瑷', '叆'}
{'嶅', '螯'}
{'嶅', '螯'}
{'粑', '羓'}
{'粑', '羓'}
{'蚆', '岜'}
{'蚆', '岜'}
{'瘢', '瓣'}
{'瘢', '瓣'}
{'梆', '棒'}
{'梆', '棒'}
{'鲍', '鸨', '饱'}
{'鲍', '鸨', '饱'}
{'鲍', '鸨', '饱'}
{'哔', '吡'}
{'哔', '吡'}
{'芘', '荜'}
{'萆', '蓖'}
{'辟', '襞'}
{'萆', '蓖'}
{'芘', '荜'}
{'辟', '襞'}
{'筚', '秕'}
{'筚', '秕'}
{'馝', '秘'}
{'馝', '秘'}
{'褊', '遍'}
{'萹', '抃', '𨚕'}
{'萹', '抃', '𨚕'}
{'褊', '遍'}
{'萹', '抃', '𨚕'}
{'藨', '脿', '幖', '儦', '瀌'}
{'藨', '脿', '幖', '儦', '瀌'}
{'藨', '脿', '幖', '儦', '瀌'}
{'藨', '脿', '幖', '儦', '瀌'}
{'藨', '脿', '幖', '儦', '瀌'}
{'蛃', '昺'}
{'蛃', '昺'}
{'镈', '钵'}
{'镈', '钵'}
{'僰', '袯', '浡', '哱', '𬭛', '砵', '嶓', '欂'}
{'僰', '袯', '浡', '哱', '𬭛', '砵', '嶓', '欂'}
{'僰', '袯', '浡', '哱', '𬭛', '砵', '嶓', '欂'}
{'僰', '袯', '浡', '哱', '𬭛', '砵', '嶓', '欂'}
{'僰', '袯', '浡', '哱', '𬭛', '砵', '嶓', '欂'}
{'僰', '袯', '浡', '哱', '𬭛', '砵', '嶓', '欂'}
{'僰', '袯', '浡', '哱', '𬭛', '砵', '嶓', '欂'}
{'僰', '袯', '浡', '哱', '𬭛', '砵', '嶓', '欂'}
{'𬷕', '埗', '蔀'}
{'𬷕', '埗', '蔀'}
{'𬷕', '埗', '蔀'}
{'𥕢', '芔', '屮'}
{'𥕢', '芔', '屮'}
{'𥕢', '芔', '屮'}
{'垞', '𥻗', '侘', '嵖',

词库转换准备

In [4]:
zrm_path = '../zrm.dict.yaml'
# character -> list of sound codes (one per entry line for that character)
zrm_phones = defaultdict(list)
# character -> shape code (the last entry line for a character wins)
zrm_codes = {}

with open(zrm_path, encoding='utf-8') as f:
    for line in f:
        # Only lines with exactly three tab-separated fields are entries.
        if line.count('\t') == 2:
            char, code = line.rstrip().split('\t')[:2]
            # Codes are stored as "<sound>;<shape>".
            yin, xing = code.split(';')
            zrm_phones[char].append(yin)
            zrm_codes[char] = xing

def convert_pinyin(infile, outfile, override_freq=None):
    """Convert a full-pinyin phrase dictionary to double-pinyin codes.

    Lines that do not look like "<phrase>\\t<pinyin>[\\t<freq>]" (for example
    the YAML header) are copied unchanged. For each entry line:

    * a phrase containing an English token gets its syllable codes joined
      without separators and without shape codes (per the original note,
      shape codes break the input prompt when English is present);
    * otherwise every character must be known in ``zrm_phones`` with exactly
      this reading; the phrase is then written as space-separated
      "<sound>;<shape>" codes, and skipped when any reading does not match.

    :param infile: path of the full-pinyin dictionary to read (UTF-8)
    :param outfile: path of the converted dictionary to write (UTF-8)
    :param override_freq: frequency written for entries that carry none;
        ``None`` (the default) writes such entries without a frequency
    """
    entry_re = re.compile(r'^.+\t.+(\t\d+)*$')

    with open(infile, encoding='utf-8') as f1, open(outfile, 'w', encoding='utf-8') as f2:
        for line in f1:
            if not entry_re.match(line):
                f2.write(line)
                continue

            parts = line.rstrip().split('\t')
            chars, phones = split_zh_en(parts[0]), parts[1].split(' ')
            # NOTE(review): the original comment here said "exclude single
            # characters", but this check only drops empty phrases - confirm
            # which behavior is intended before changing it.
            if len(chars) < 1:
                continue

            pairs = list(zip(chars, phones))
            if any(is_en(char) for char in chars):
                # English present: no shape codes, no separators.
                doubled_code = ''.join(
                    phone if is_en(char) else to_double(phone)
                    for char, phone in pairs
                )
            else:
                # Convert each syllable once and stop at the first character
                # whose reading is unknown to the character table.
                syllables = []
                for char, phone in pairs:
                    if char not in zrm_phones:
                        break
                    yin = to_double(phone)
                    if yin not in zrm_phones[char]:
                        break
                    syllables.append(yin + ';' + zrm_codes[char])
                if len(syllables) != len(pairs):
                    # Phrase reading disagrees with the character table.
                    continue
                doubled_code = ' '.join(syllables)

            fields = [''.join(chars), doubled_code]
            if len(parts) >= 3:
                fields.append(parts[2])
            elif override_freq is not None:
                # "is not None" so that an explicit 0 is honored as well.
                fields.append(str(override_freq))
            f2.write('\t'.join(fields) + '\n')

自定义词库转换

In [5]:
# Custom phrases carry no frequency of their own, so give them a fixed one.
convert_pinyin(
    infile='./dicts/custom.dict.yaml',
    outfile='../dicts/custom.dict.yaml',
    override_freq=10000,
)
# Fix entries are converted as they are.
convert_pinyin(
    infile='./dicts/fix.dict.yaml',
    outfile='../dicts/fix.dict.yaml',
)

其他词库转换

In [7]:
# Convert the remaining phrase dictionaries, base first and then ext.
for source, target in (
    ('./dicts/base.dict.yaml', '../dicts/base.dict.yaml'),
    ('./dicts/ext.dict.yaml', '../dicts/ext.dict.yaml'),
):
    convert_pinyin(source, target)