通用函数

In [69]:
import re
from collections import defaultdict

# Two-letter initials and the single key each one is typed with.
first = {'ch': 'i',
         'sh': 'u',
         'zh': 'v'}

# Finals and the single key each one is typed with.
second = {
    'ua': 'w',
    'ei': 'z',
    'e': 'e',
    'ou': 'b',
    'iu': 'q',
    've': 't',
    'ue': 't',
    'u': 'u',
    'i': 'i',
    'o': 'o',
    'uo': 'o',
    'ie': 'x',
    'a': 'a',
    'ong': 's',
    'iong': 's',
    'ai': 'l',
    'ing': 'y',
    'uai': 'y',
    'ang': 'h',
    'uan': 'r',
    'an': 'j',
    'en': 'f',
    'ia': 'w',
    'iang': 'd',
    'uang': 'd',
    'eng': 'g',
    'in': 'n',
    'ao': 'k',
    'v': 'v',
    'ui': 'v',
    'un': 'p',
    'iao': 'c',
    'ian': 'm'
}

# Zero-initial syllables (final only, at most three letters) and their
# complete two-key codes: one-letter finals are doubled, two-letter finals
# are typed as spelled, three-letter finals are first letter + final key.
special = {
    'a': 'aa',
    'ai': 'ai',
    'an': 'an',
    'ang': 'ah',
    'ao': 'ao',
    'e': 'ee',
    'ei': 'ei',
    'en': 'en',
    # 'eng' follows the same first-letter + final-key rule as 'ang' / 'ong'.
    'eng': 'eg',
    'er': 'er',
    'o': 'oo',
    'ou': 'ou',

    # Rare reading; the original note cites the mantra 唵嘛呢叭咪吽.
    'ong': 'os'
}

def to_double(s: str) -> str:
    """
    Convert the full-pinyin spelling of one syllable to its Ziranma
    double-pinyin code.

    :param s: full-pinyin spelling of a single syllable, e.g. ``'zhuang'``
    :return: the double-pinyin code, or ``''`` when ``s`` is empty. An
        initial or final that is in neither lookup table is copied through
        unchanged.
    """
    if not s:
        # Nothing to convert; indexing s[0] below would raise IndexError.
        return ''

    # Zero-initial syllables (a, an, ang, ...) have fixed two-key codes.
    if len(s) <= 3 and s[0] in 'aeo':
        if s in special:
            return special[s]
        # Unknown zero-initial spelling: report it, then fall back to the
        # generic initial + final handling below.
        print('未知情况1', s)

    # Regular syllable: initial + final. A two-letter initial (zh/ch/sh) takes
    # precedence, e.g. chuang = ch + uang; otherwise the first letter is the
    # initial, e.g. huang = h + uang.
    if s[:2] in first:
        initial, final = first[s[:2]], s[2:]
    else:
        initial, final = s[0], s[1:]

    # Unknown finals pass through unchanged in both cases.
    return initial + second.get(final, final)


# Whole-string match for text built only from ASCII letters and '"' characters.
pattern = re.compile(r'^[a-zA-Z"]+$')
def is_en(text):
    """Return True if ``text`` is a non-empty run of ASCII letters and ``"`` characters.

    Because the pattern ends in ``$``, one trailing newline is tolerated.
    """
    found = pattern.match(text)
    return found is not None

def split_zh_en(text):
    """Split ``text`` into tokens.

    Each run of ASCII letters becomes one token (an English word); every
    other character (Chinese characters, digits, punctuation, whitespace)
    becomes a token of its own.
    """
    return re.findall(r'[^a-zA-Z]|[a-zA-Z]+', text)

构造自然码+鹤形码表

In [None]:
# Build ../zrm.dict.yaml: double-pinyin codes from to_double() combined with
# the auxiliary codes read from dicts/flypy_flypy.dict.yaml.

# Characters listed in the 8105 table; only these receive auxiliary codes.
standard = set()

# All dictionary files are read and written as UTF-8 explicitly, so the
# result does not depend on the platform's default encoding.
with open('dicts/8105.dict.yaml', encoding='utf-8') as f:
    for line in f:
        line = line.rstrip()
        # Entry lines look like "<char>\t<pinyin>\t<weight>"; anything else
        # (YAML header, blank lines, ...) is ignored.
        if re.match(r'^.\t\w+\t\d+$', line):
            char, _, _ = line.split()
            standard.add(char)

# char -> set of auxiliary codes found for it
xhe_codes = defaultdict(set)

with open('dicts/flypy_flypy.dict.yaml', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.rstrip()
        if re.match(r'^.\t[\w\[]+.*$', line):
            char, code = line.split('\t')[:2]
            if char not in standard:
                continue
            # Keep only what follows the last '[' of the code column
            # (the whole column when it contains no '[').
            code = code[code.rfind('[')+1:]
            if code:
                xhe_codes[char].add(code)

missing = set()   # characters for which no auxiliary code was found
multiple = set()  # characters that have more than one auxiliary code
predefined = """---
name: zrm
version: "1.0"
sort: by_weight  
use_preset_vocabulary: true
min_phrase_weight: 100
import_tables:
  - ./dicts/custom
  - ./dicts/base
  - ./dicts/ext
...

"""

with open('../zrm.dict.yaml', 'w', encoding='utf-8') as f1, open('dicts/8105.dict.yaml', encoding='utf-8') as f2:
    f1.write(predefined)
    for line in f2:
        line = line.rstrip()
        if re.match(r'^.\t\w+\t\d+$', line):
            char, pinyin, freq = line.split()
            if char in xhe_codes:
                codes = xhe_codes[char]
                if len(codes) > 1:
                    multiple.add(char)
                # Sorted so the generated file is identical from run to run;
                # plain set iteration order varies between interpreter runs.
                for code in sorted(codes):
                    f1.write(f"{char}\t{to_double(pinyin)};{code}\t{freq}\n")
            else:
                missing.add(char)
                # No auxiliary code: the entry is written with the
                # double-pinyin code only (no ';<aux>' suffix).
                f1.write(f"{char}\t{to_double(pinyin)}\t{freq}\n")

print(f"{len(missing)}个字的辅助码缺失：{missing}")
print(f"{len(multiple)}个字存在多个辅助码：{multiple}")

构造自然码+易学码表

In [70]:
# Build ../zrm.dict.yaml: double-pinyin codes from to_double() combined with
# the two-letter auxiliary codes read from dicts/手心辅易学码9.txt.

# Characters listed in the 8105 table; only these receive auxiliary codes.
standard = set()

# All dictionary files are read and written as UTF-8 explicitly, so the
# result does not depend on the platform's default encoding.
with open('dicts/8105.dict.yaml', encoding='utf-8') as f:
    for line in f:
        line = line.rstrip()
        # Entry lines look like "<char>\t<pinyin>\t<weight>"; anything else
        # (YAML header, blank lines, ...) is ignored.
        if re.match(r'^.\t\w+\t\d+$', line):
            char, _, _ = line.split()
            standard.add(char)

# char -> set of auxiliary codes found for it
yx_codes = defaultdict(set)

with open('dicts/手心辅易学码9.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.rstrip()
        # Accepted lines have the form "<text>=<two lowercase letters>".
        if re.match(r'^.+=[a-z]{2}$', line):
            char, code = line.split('=')[:2]
            if char not in standard:
                continue
            if code:
                yx_codes[char].add(code)

missing = set()   # characters for which no auxiliary code was found
multiple = set()  # characters that have more than one auxiliary code
predefined = """---
name: zrm
version: "1.0"
sort: by_weight  
use_preset_vocabulary: true
min_phrase_weight: 100
import_tables:
  - ./dicts/custom
  - ./dicts/base
  - ./dicts/ext
...

"""

with open('../zrm.dict.yaml', 'w', encoding='utf-8') as f1, open('dicts/8105.dict.yaml', encoding='utf-8') as f2:
    f1.write(predefined)
    for line in f2:
        line = line.rstrip()
        if re.match(r'^.\t\w+\t\d+$', line):
            char, pinyin, freq = line.split()
            if char in yx_codes:
                codes = yx_codes[char]
                if len(codes) > 1:
                    multiple.add(char)
                # Sorted so the generated file is identical from run to run;
                # plain set iteration order varies between interpreter runs.
                for code in sorted(codes):
                    f1.write(f"{char}\t{to_double(pinyin)};{code}\t{freq}\n")
            else:
                # Characters without an auxiliary code are only recorded,
                # not written; the disabled line below would emit them with
                # an empty auxiliary code.
                missing.add(char)
                # f1.write(f"{char}\t{to_double(pinyin)};\t{freq}\n")

print(f"{len(missing)}个字的辅助码缺失：{missing}")
print(f"{len(multiple)}个字存在多个辅助码：{multiple}")

1075个字的辅助码缺失：{'疐', '叚', '㬊', '橞', '梣', '畬', '钘', '鹝', '峧', '拤', '㭕', '蘘', '侁', '艹', '鄠', '菍', '轪', '罍', '胠', '燚', '厾', '嬿', '䲟', '楒', '㙍', '揳', '湲', '𥖨', '薙', '媱', '碏', '㫰', '攽', '陑', '熥', '婼', '堨', '偭', '𬞟', '斶', '荁', '焜', '汭', '佽', '鼱', '𬉼', '襫', '琀', '洿', '荙', '峱', '梿', '矞', '䦃', '垙', '𪨶', '扂', '炌', '嶲', '俵', '埆', '暲', '䗪', '鬒', '𬶍', '杕', '肸', '铻', '磜', '惇', '鿍', '阇', '匼', '觟', '鳒', '潽', '𫘜', '𬴃', '屾', '襕', '傒', '荓', '祼', '䓫', '牤', '𫍽', '浲', '䅟', '鸮', '叆', '詟', '岞', '簉', '𠅤', '猇', '珋', '鲹', '蛃', '珺', '椪', '鲀', '𬭎', '黡', '饸', '硊', '柈', '鳁', '䴙', '玘', '蝘', '逴', '卬', '鄑', '橦', '杧', '縠', '𬬭', '磻', '帡', '晅', '滍', '飐', '婳', '觿', '邘', '愐', '蓏', '𬹼', '靬', '牻', '菉', '崡', '浰', '栐', '筀', '嬬', '𫖯', '𫓶', '琟', '僇', '铚', '炆', '琭', '𬬱', '𫍯', '邲', '悢', '鬷', '棪', '屄', '垱', '菂', '𬭸', '郿', '亹', '坥', '鳤', '偡', '鳀', '淜', '𬮿', '𬳶', '𬪩', '湜', '𬭚', '蒻', '甡', '䢺', '纴', '𫚕', '㳇', '倴', '烠', '睄', '滃', '𫭼', '𬊈', '疢', '欂', '偁', '爟', '飔', '潵', '咍', '訄', '䥽', '辿', '蒟', '𨭉', '嵁', '堃', '芃', '溹', '朓', '㬎', '嫕', '𬭯', '

构造反查表（弃用）

In [None]:
# Build ../reverse.dict.yaml: one "<char>\t<double-pinyin code>" line per
# distinct character of the 8105 table, in the table's original order.
predefined = """---
name: reverse
version: "1.0"
sort: original  
use_preset_vocabulary: false
columns:
  - text
  - code
...

"""

# Both files are opened as UTF-8 explicitly so the result does not depend on
# the platform's default encoding.
with open('../reverse.dict.yaml', 'w', encoding='utf-8') as f1, open('dicts/8105.dict.yaml', encoding='utf-8') as f2:
    f1.write(predefined)
    chars = set()  # characters already written
    for line in f2:
        line = line.rstrip()
        # Entry lines look like "<char>\t<pinyin>\t<weight>".
        if re.match(r'^.\t\w+\t\d+$', line):
            char, pinyin, _ = line.split()
            # Only the first listed reading of each character is emitted.
            if char not in chars:
                chars.add(char)
                f1.write(f"{char}\t{to_double(pinyin)}\n")

词库转换准备

In [71]:
# Load the generated character table so that phrase dictionaries can be
# converted with the same readings and auxiliary codes.
zrm_phones = defaultdict(list)  # char -> double-pinyin codes listed for it
zrm_codes = {}                  # char -> auxiliary code (the last one listed wins)
zrm_path = '../zrm.dict.yaml'

# Read as UTF-8 explicitly; the platform default encoding may differ.
with open(zrm_path, encoding='utf-8') as f:
    for line in f:
        # Entries have exactly three tab-separated columns; header lines
        # and anything else are skipped.
        if len(line.split('\t')) != 3:
            continue
        char, code = line.rstrip().split('\t')[:2]
        # The code column is "<double pinyin>;<auxiliary code>". partition()
        # also accepts a code without ';' (empty auxiliary code) instead of
        # raising ValueError on unpacking.
        yin, _, xing = code.partition(';')
        zrm_phones[char].append(yin)
        zrm_codes[char] = xing

def convert_pinyin(infile, outfile):
    """
    Convert a full-pinyin phrase dictionary to double-pinyin codes.

    Lines that do not look like "<text>\\t<code>[\\t<weight>]" are copied to
    ``outfile`` unchanged. For entry lines:

    * if the text contains an English token, each Chinese syllable is
      converted with ``to_double`` and the pieces are concatenated without
      separators and without auxiliary codes;
    * otherwise every syllable must be a reading listed for its character in
      ``zrm_phones``; the code becomes space-separated
      "<double pinyin>;<auxiliary code>" items. Entries with an unlisted
      character or reading are dropped.

    :param infile: path of the source dictionary (read as UTF-8)
    :param outfile: path of the converted dictionary (written as UTF-8)
    """
    # Explicit UTF-8: the dictionaries contain Chinese text and the platform
    # default encoding is not guaranteed to be UTF-8.
    with open(infile, encoding='utf-8') as f1, open(outfile, 'w', encoding='utf-8') as f2:
        for line in f1:
            if not re.match(r'^.+\t.+(\t\d+)*$', line):
                f2.write(line)
                continue
            parts = line.rstrip().split('\t')
            chars, phones = split_zh_en(parts[0]), parts[1].split(' ')
            # NOTE(review): the original comment said "exclude single
            # characters", but this test only drops entries with empty text.
            # Confirm whether `< 2` was intended.
            if len(chars) < 1:
                continue

            # NOTE(review): zip() stops at the shorter sequence, so a
            # token/syllable count mismatch is silently truncated.
            pairs = list(zip(chars, phones))

            if any(is_en(char) for char in chars):  # contains English
                doubled_code = ''.join(
                    phone if is_en(char) else to_double(phone)
                    for char, phone in pairs
                )
            elif all(char in zrm_phones and to_double(phone) in zrm_phones[char]
                     for char, phone in pairs):
                # No token is English in this branch, so every syllable gets
                # its auxiliary code appended.
                doubled_code = ' '.join(
                    to_double(phone) + ';' + zrm_codes[char]
                    for char, phone in pairs
                )
            else:  # a reading in the phrase dictionary is not in the character table
                continue

            # The weight column is kept only when the source line had one.
            fields = [''.join(chars), doubled_code] + parts[2:3]
            f2.write('\t'.join(fields) + '\n')

搜狗词库转换

In [None]:
# Run the helper script scel.py through the IPython shell escape.
# NOTE(review): presumably it produces ./out/luna_pinyin.sogou.dict.yaml, which the following cell reads; confirm.
!python ./scel.py

In [None]:
# Convert the Sogou-derived dictionary and write the result to ../dicts/sogou.dict.yaml.
convert_pinyin('./out/luna_pinyin.sogou.dict.yaml', '../dicts/sogou.dict.yaml')

自定义词库转换

In [72]:
# Convert the custom dictionary and write the result to ../dicts/custom.dict.yaml.
convert_pinyin('./dicts/custom.dict.yaml', '../dicts/custom.dict.yaml')

其他词库转换

In [73]:
# Convert the remaining dictionaries, base first and then ext.
for src, dst in (
    ('./dicts/base.dict.yaml', '../dicts/base.dict.yaml'),
    ('./dicts/ext.dict.yaml', '../dicts/ext.dict.yaml'),
):
    convert_pinyin(src, dst)