## 1. 写一个方法，实现从一个字串经过哪些操作后能得到另一个字串？

In [1]:
from functools import lru_cache
import copy

In [3]:
@lru_cache(maxsize=2**10) # 调用下面的函数时，若找到相同的输入，那么直接返回相同的输出结果
def edit_distance(string1, string2):
    
    if len(string1) == 0: return len(string2), []
    if len(string2) == 0: return len(string1), []
    
    tail_s1 = string1[-1]
    tail_s2 = string2[-1]
    
    min_distance1, operations1 = edit_distance(string1[:-1], string2)
    min_distance1 += 1
    operations1 = copy.copy(operations1)
    operations1.append('DEL p{}_{}'.format(len(string1) - 1 , tail_s1),)  
    
    min_distance2, operations2 = edit_distance(string1, string2[:-1])
    operations2 = copy.copy(operations2)
    operations2.append('ADD p{}_{}'.format(len(string2) - 1 , tail_s2), )  
    min_distance2 += 1
    
    if tail_s1 == tail_s2:
        min_distance3, operations3 = edit_distance(string1[:-1], string2[:-1])
    else:
        min_distance3, operations3 = edit_distance(string1[:-1], string2[:-1])
        min_distance3 += 1
        operations3 = copy.copy(operations3)
        operations3.append('SUB p{}_{} => p{}_{}'.format(len(string1) - 1 ,tail_s1, len(string2) - 1, tail_s2))
    
    candidates = [
        (min_distance1, operations1),
        (min_distance2, operations2),
        (min_distance3, operations3),
    ] 
    
    min_distance, operations = min(candidates, key=lambda x: x[0])
    
    return min_distance, operations


In [4]:
edit_distance('ABCDECG','ABCCEF')

(3, ['SUB p3_D => p3_C', 'SUB p5_C => p5_F', 'DEL p6_G'])

In [5]:
edit_distance('特朗普称来自中国物资非常好','特朗普说中国物资都是垃圾')

(7,
 ['SUB p3_称 => p3_说',
  'DEL p4_来',
  'DEL p5_自',
  'SUB p10_非 => p8_都',
  'SUB p11_常 => p9_是',
  'SUB p12_好 => p10_垃',
  'ADD p11_圾'])

## 2. 如何让输入法输入字符串后在不带空格的时候完成修正？如何完成拼音的自动分割？

提示：使用第一节课提到的语言模型。

In [6]:
# import pinyin # 不知道为什么，安装后不能使用

先找个好用的拼音库：

In [3]:
from xpinyin import Pinyin
pinyin = Pinyin()

In [8]:
pinyin.get_pinyin('我们是共产主义接班人')

# 默认以'-'为分割符
a = pinyin.get_pinyin(u"上海")
print(a)

# 默认以'-'分割,显示音调
b = pinyin.get_pinyin(u'上海', tone_marks='marks')
print(b)

# 默认以'-'分割,显示音调
b = pinyin.get_pinyin(u'上海', tone_marks='numbers')
print(b)

# 删除分割符
c = pinyin.get_pinyin(u'上海', '')
print(c)

# 设置空白格为分割符
d = pinyin.get_pinyin(u'上海', ' ')
print(d)

d1 = pinyin.get_initial(u'上')
print(d1)

d2 = pinyin.get_initials(u'上海')
print(d2)

d3 = pinyin.get_initials(u'上海', u'')
print(d3)

d4 = pinyin.get_initials(u'上海', u' ')
print(d4)

wordvalue = '上海'
s = pinyin.get_initials(wordvalue, u'').lower()
print(s)

# 多音字
print(pinyin.get_pinyin('重', tone_marks='numbers'))
print(pinyin.get_pinyin('和', tone_marks='numbers'))

shang-hai
shàng-hǎi
shang4-hai3
shanghai
shang hai
S
S-H
SH
S H
sh
zhong4
he2


In [2]:
from pypinyin import pinyin, lazy_pinyin, Style, load_phrases_dict, load_single_dict
from pypinyin.style import register

In [5]:
print(pinyin('你好'))  # [['nǐ'], ['hǎo']]
print(pinyin('中心中心', heteronym=True))  # 启用多音字模式  # [['zhōng', 'zhòng'], ['xīn']]
print(pinyin('中心中心', style=Style.FIRST_LETTER))  # 设置拼音风格，第一个字母 [['z'], ['x']]
print(pinyin('中心中心', style=Style.TONE2, heteronym=True))  # [['zho1ng', 'zho4ng'], ['xi1n']]
print(lazy_pinyin('中心'))  # 不考虑多音字的情况 # ['zhong', 'xin']

[['nǐ'], ['hǎo']]
[['zhōng', 'zhòng'], ['xīn'], ['zhōng', 'zhòng'], ['xīn']]
[['z'], ['x'], ['z'], ['x']]
[['zho1ng', 'zho4ng'], ['xi1n'], ['zho1ng', 'zho4ng'], ['xi1n']]
['zhong', 'xin']


In [6]:
##########处理不包含拼音的字符
# default (默认行为): 不做任何处理，原样返回:
print(lazy_pinyin('你好☆d☆你好'))  # ['ni', 'hao', '☆☆']
# ignore : 忽略该字符
print(lazy_pinyin('你好☆c☆你好', errors='ignore'))  # ['ni', 'hao']
# replace : 替换为去掉 \u 的 unicode 编码
print(lazy_pinyin('你好☆b☆你好', errors='replace'))  # ['ni', 'hao', '26062606']
# callable 对象 : 提供一个回调函数，接受无拼音字符(串)作为参数, 支持的返回值类型: unicode 或 list ([unicode, …]) 或 None 。
print(lazy_pinyin('你好☆a☆你好', errors=lambda x: 'star'))  # ['ni', 'hao', 'star']

########### 自定义拼音库
print(lazy_pinyin('还没', style=Style.TONE2))
load_phrases_dict({'桔子': [['jú'], ['zǐ']]})  # 增加 "桔子" 词组，可以自己定义
print(lazy_pinyin('桔子', style=Style.TONE2))

load_single_dict({ord('还'): 'hái,huán'})  # 调整 "还" 字的拼音顺序
print(lazy_pinyin('还没', style=Style.TONE2))

['ni', 'hao', '☆d☆', 'ni', 'hao']
['ni', 'hao', 'ni', 'hao']
['ni', 'hao', '2606622606', 'ni', 'hao']
['ni', 'hao', 'star', 'ni', 'hao']
['ha2i', 'me2i']
['ju2', 'zi3']
['ha2i', 'me2i']


In [7]:
###########自定义拼音风格
@register('kiss')
def kiss(mypinyin, **kwargs):
    return '😘 {0}'.format(mypinyin)

print(lazy_pinyin('么么哒', style='kiss'))
print(lazy_pinyin('么么哒'))

['😘 me', '😘 me', '😘 dá']
['me', 'me', 'da']


还是pypinyin比较好，功能齐全！

接下来实现功能：

思路一： 训练时目标是构建一个字典，key是拼音，value是子字典（key是可能的词，value是 词库词频和用户输入次数等的数据对象），
训练时进行容错处理，应用时词选取根据词频和用户数据等信息计算后进行排序并显示给用户。

训练过程：

1. 用分词工具把语料切割成词，然后去重并统计每个词出现的次数。

2. 过滤掉非汉字字符。

3. 循环所有字和词的拼音以及容错性处理的拼音及它们的声母，建立这样一个字典。

4*. 字典的持久化保存。

推理过程：

1. 如果用户输入的拼音能在字典中找到则直接计算出结果显示。

2. 如果用户输入的拼音太长则使用动态规划对拼音进行切割后执行1再拼接。

这一思路是本学生感觉最容易实现的一种方式。训练过程容易实现，容错容易处理，推理阶段计算实时高效，但内存可能消耗较大。

In [3]:
# 分词
import thulac
import pandas as pd
from collections import Counter
import numpy as np
import re
import math

wordCuter = thulac.thulac(seg_only=True)

Model loaded succeed


In [4]:
def getCsvArticles(filename, contentColName, encoding='gb18030'):
    content = pd.read_csv(filename, encoding=encoding)
    articles = content[contentColName].tolist()
    return articles

content = pd.read_csv('../Lesson10/Data/新华社数据.csv', encoding='gb18030')
articles = content['content'].tolist()

In [38]:
# 分词过程
words = []
for i, article in enumerate(articles):
    if not isinstance(article, str):
        continue
    if i % 1000 == 999:
        print('{0}/{1}'.format(i + 1, len(articles)))
    _words = [ t1  for t1,t2 in wordCuter.cut(article, text=False)] 
    words += _words
    
wordsCounter = Counter(words)
print(wordsCounter)

1000/89611
2000/89611
3000/89611
4000/89611
5000/89611
6000/89611
7000/89611
8000/89611
12000/89611
13000/89611
14000/89611
15000/89611
16000/89611
17000/89611
18000/89611
19000/89611
20000/89611
21000/89611
22000/89611
23000/89611
24000/89611
25000/89611
26000/89611
27000/89611
28000/89611
29000/89611
30000/89611
31000/89611
32000/89611
33000/89611
34000/89611
35000/89611
36000/89611
37000/89611
38000/89611
39000/89611
40000/89611
41000/89611
42000/89611
43000/89611
44000/89611
45000/89611
46000/89611
47000/89611
48000/89611
49000/89611
50000/89611
51000/89611
52000/89611
53000/89611
54000/89611
55000/89611
56000/89611
57000/89611
58000/89611
59000/89611
60000/89611
61000/89611
62000/89611
63000/89611
64000/89611
65000/89611
66000/89611
67000/89611
68000/89611
69000/89611
70000/89611
71000/89611
72000/89611
73000/89611
74000/89611
75000/89611
76000/89611
77000/89611
78000/89611
79000/89611
80000/89611
81000/89611
82000/89611
83000/89611
84000/89611
85000/89611
86000/89611
87000/89611


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [39]:
# 过滤掉非汉字字符

newWordsCounter = {}
for i, (word, wCount) in enumerate(wordsCounter.items()):
    if i % 1000 == 999:
        print('{0}/{1}'.format(i + 1, len(wordsCounter)))

    _words = re.findall('[\u4e00-\u9fa5]+', word)
    if len(_words) == 0 : continue 
    for _word in _words:
        if _word != '':
            if _word in newWordsCounter:
                newWordsCounter[_word] += wCount
            else:
                newWordsCounter[_word] = wCount
            
print(newWordsCounter)

1000/296844
2000/296844
3000/296844
4000/296844
5000/296844
6000/296844
7000/296844
8000/296844
9000/296844
10000/296844
11000/296844
12000/296844
13000/296844
14000/296844
15000/296844
16000/296844
17000/296844
18000/296844
19000/296844
20000/296844
21000/296844
22000/296844
23000/296844
24000/296844
25000/296844
26000/296844
27000/296844
28000/296844
29000/296844
30000/296844
31000/296844
32000/296844
33000/296844
34000/296844
35000/296844
36000/296844
37000/296844
38000/296844
39000/296844
40000/296844
41000/296844
42000/296844
43000/296844
44000/296844
45000/296844
46000/296844
47000/296844
48000/296844
49000/296844
50000/296844
51000/296844
52000/296844
53000/296844
54000/296844
55000/296844
56000/296844
57000/296844
58000/296844
59000/296844
60000/296844
61000/296844
62000/296844
63000/296844
64000/296844
65000/296844
66000/296844
67000/296844
68000/296844
69000/296844
70000/296844
71000/296844
72000/296844
73000/296844
74000/296844
75000/296844
76000/296844
77000/296844
78000/29

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [40]:
import json

In [41]:
# 写入 JSON 数据
with open('data.json', 'w') as f:
    json.dump(newWordsCounter, f)
    

In [None]:
# 读取数据
with open('data.json', 'r') as f:
    newWordsCounter = json.load(f)

In [7]:
# 容错表, key1: 正确的拼音，key2：可能的错误的拼音，value：权重值
faultTolerantMap = {
    'ShengMu':{ # 声母
        'z':{'zh':0.6},
        'c':{'ch':0.6},
        's':{'sh':0.6},
        'zh':{'z':0.6},
        'ch':{'c':0.6},
        'sh':{'s':0.6},
        'x':{'th':0.5},
        'q':{'th':0.4},
        'k':{'q':0.5},
        'j':{'g':0.5},
        'g':{'j':0.5},
    },
    'YunMu':{ # 韵母
        'ong':{'on':0.7, 'og':0.5, 'o':0.3},
        'ang':{'an':0.7, 'ag':0.5},
        'an':{'ong':0.7, 'a':0.3},
        'ing':{'in':0.7, 'i':0.3},
        'iang':{'ian':0.7, 'iag':0.5, 'ia':0.3},
        'iong':{'ion':0.7, 'iog':0.5, 'io':0.5},
        'in':{'ing':0.7, 'i':0.3},
        'eng':{'en':0.7, 'eg':0.5},
        'en':{'eng':0.7},
    },
}

In [14]:
print(str.find('jjjsssjjjiiie', 'jjj'))
print(str.find('jjjsssjjjiiie', 'ss'))
print(str.find('jjjsssjjjiiie', 's'))
print(str.find('jjjsssjjjiiie', 'e'))
print(str.find('jjjsssjjjiiie', 'i'))
print(str.find('jjjsssjjjiiie', 'ii'))
print(str.rfind('jjjsssjjjiiie', 'jjj'))
print(str.rfind('jjjsssjjjiiie', 'ss'))
print(str.rfind('jjjsssjjjiiie', 's'))
print(str.rfind('jjjsssjjjiiie', 'e'))
print(str.rfind('jjjsssjjjiiie', 'i'))
print(str.rfind('jjjsssjjjiiie', 'ii'))
print(re.findall('jjj', 'jjjsssjjjiiie'))
print(re.finditer('jjj', 'jjjsssjjjiiie'))

0
3
3
12
9
9
6
4
5
12
11
10
['jjj', 'jjj']
<callable_iterator object at 0x000001AC88477048>


In [24]:
# for i, (word, wCount) in enumerate(newWordsCounter.items()):
#     if i >5000 and i < 6000:
#         print(word)

print(str.rfind('z', 'hh'))
print(str.rfind('zhh', 'hh'))
print(not str.rfind('z', 'hh'))
print(not str.rfind('zhh', 'hh'))

-1
1
False
False


In [42]:
model = {
    
}

# 添加子项到模型中
def addItemToModel(model, pinyinStr, word, wCount, weight = 1):
    if pinyinStr not in model:
        model[pinyinStr] = {word: [weight, wCount]} #[[wordData, weight]]
    elif word in model[pinyinStr]:
        wordData = model[pinyinStr][word]
        wordData[1] += wCount 
        if len(wordData) == 2: wordData.append(1)
        wordData[0] = (wordData[0] * wordData[2] + weight) / (wordData[2] + 1) # 求平均值
        wordData[2] += 1
    else:
        # model['pinyin'][pinyinStr].append([wordData, weight])
        model[pinyinStr][word] = [weight, wCount]
        
def isComplete(allPY, indexes):
    for __i, pys1 in enumerate(allPY):
        if indexes[__i] != len(allPY[__i]) - 1:
            return False
    return True

def addOne(allPY, indexes):
    for __i, pys1 in enumerate(allPY):
        if indexes[__i] < len(allPY[__i]) - 1:
            indexes[__i] += 1
            for _j in range(__i):
                indexes[_j] = 0
            break

print("newWordsCounter length : ", len(newWordsCounter))
for i, (word, wCount) in enumerate(newWordsCounter.items()):
    if i % 1000 == 999:
        print('{0}/{1}'.format(i + 1, len(newWordsCounter)))
        # break
    wordPYs = lazy_pinyin(word, errors='ignore')
    # https://pypi.org/project/pypinyin/  strict=False 可以让 y, w, u 成为声母
    wordPYs_initial = lazy_pinyin(word, errors='ignore', style=Style.INITIALS, strict=False)
    wordPYs_finals = lazy_pinyin(word, errors='ignore', style=Style.FINALS, strict=False)
    # 先将所有可能性都记录下来
    allPY = []
    allPyWeight = []
    for j, w in enumerate(word):
        # 原拼音
        allPY.append([wordPYs[j]])
        allPyWeight.append([1])
        # 原声母 (注意：可能没有声母，没有声母是，字符串为''字符串)
        if wordPYs_initial[j] != '':
            allPY[j].append(wordPYs_initial[j]) 
            allPyWeight[j].append(1)
        
        if len(word) <= 3: # 长度大于等于4的词就不处理了太费内存了。
            if wordPYs_initial[j] != '':
                for rpy, ftpys in faultTolerantMap['ShengMu'].items():
                    for ftpy, weight in ftpys.items():
                        newShmu = str.replace(wordPYs_initial[j], rpy, ftpy, 1)
                        if newShmu != wordPYs_initial[j] and str.rfind(newShmu, 'hh') < 0:
                            # 新声母
                            # print('新声母', newShmu)
                            allPY[j].append(newShmu)
                            allPyWeight[j].append(weight)
                            # 新声母 + 原韵母
                            # print('新声母 + 原韵母', newShmu + wordPYs_finals[j])
                            allPY[j].append(newShmu + wordPYs_finals[j])
                            allPyWeight[j].append(weight)
                            for rpy2, ftpys2 in faultTolerantMap['YunMu'].items():
                                for ftpy2, weight2 in ftpys2.items(): 
                                    newYunMu = str.replace(wordPYs_finals[j], rpy2, ftpy2, 1)
                                    if newYunMu != wordPYs_finals[j]:
                                        # print('韵母', 'new:', newYunMu, 'old:',  wordPYs_finals[j], rpy2, ftpy2)
                                        # 新声母 + 新韵母
                                        # print('新声母 + 新韵母', newShmu + newYunMu)
                                        allPY[j].append(newShmu + newYunMu)
                                        allPyWeight[j].append(min([weight, weight2]))
                for rpy2, ftpys2 in faultTolerantMap['YunMu'].items():
                    for ftpy2, weight2 in ftpys2.items(): 
                        newYunMu = str.replace(wordPYs_finals[j], rpy2, ftpy2, 1)
                        if newYunMu != wordPYs_finals[j]:
                            # 原声母 + 新韵母
                            # print('原声母 + 新韵母', wordPYs_initial[j] + newYunMu)
                            allPY[j].append(wordPYs_initial[j] + newYunMu)
                            allPyWeight[j].append(min([1, weight2]))
            else:
                for rpy2, ftpys2 in faultTolerantMap['YunMu'].items():
                    for ftpy2, weight2 in ftpys2.items(): 
                        newYunMu = str.replace(wordPYs_finals[j], rpy2, ftpy2, 1)
                        if newYunMu != wordPYs_finals[j]:
                            allPY[j].append(newYunMu)
                            allPyWeight[j].append(weight2)
    # 将所有可能性组合起来
    indexes = np.zeros(len(allPY)).astype('int').tolist()

    while not isComplete(allPY, indexes):
        newPy = ''
        newWeights = []
        for _i, pys in enumerate(allPY):
            newPy += pys[indexes[_i]]
            newWeights.append(allPyWeight[_i][indexes[_i]])
            
        addItemToModel(model, newPy, word, wCount, min(newWeights))
            
        addOne(allPY, indexes)

newWordsCounter length :  247150
1000/247150
2000/247150
3000/247150
4000/247150
5000/247150
6000/247150
7000/247150
8000/247150
9000/247150
10000/247150
11000/247150
12000/247150
13000/247150
14000/247150
15000/247150
16000/247150
17000/247150
18000/247150
19000/247150
20000/247150
21000/247150
22000/247150
23000/247150
24000/247150
25000/247150
26000/247150
27000/247150
28000/247150
29000/247150
30000/247150
31000/247150
32000/247150
33000/247150
34000/247150
35000/247150
36000/247150
37000/247150
38000/247150
39000/247150
40000/247150
41000/247150
42000/247150
43000/247150
44000/247150
45000/247150
46000/247150
47000/247150
48000/247150
49000/247150
50000/247150
51000/247150
52000/247150
53000/247150
54000/247150
55000/247150
56000/247150
57000/247150
58000/247150
59000/247150
60000/247150
61000/247150
62000/247150
63000/247150
64000/247150
65000/247150
66000/247150
67000/247150
68000/247150
69000/247150
70000/247150
71000/247150
72000/247150
73000/247150
74000/247150
75000/247150
7

In [44]:
# 写入 JSON 数据
with open('model.json', 'w') as f:
    json.dump(model, f)
    

In [None]:
# 读取数据
with open('model.json', 'r') as f:
    model = json.load(f)

推理过程：

In [43]:
# 用户输入的拼音能在字典中找到则直接计算出结果显示。
def getWordsByPinYin(model:dict, pinyinStr, wordCount = 5, isIgnoreCase = False):
    if isIgnoreCase: pinyinStr = str.lower(pinyinStr)
    if pinyinStr not in model:
        return [pinyinStr]
    else:
        allWords : dict = model[pinyinStr]
        wordData = [[key, value[0], value[1]]  for key,value in allWords.items()]
        res = sorted(wordData, key=lambda x:math.log(x[1] * x[2]))
        return [wordData[0] for i, wordData in enumerate(res) if i < wordCount]
        
print(getWordsByPinYin(model, 'gongchengshi'))
print(getWordsByPinYin(model, 'gcs'))
print(getWordsByPinYin(model, 'mi'))
print(getWordsByPinYin(model, 'shouji'))
print(getWordsByPinYin(model, 'Shouji'))
print(getWordsByPinYin(model, ''))
print(getWordsByPinYin(model, 'tianceng'))

['工程师']
['激次数', '锯齿山', '接触史', '继承税', '减仓式']
['幂', '糜', '咪', '醚', '谧']
['受静', '手巾', '手金', '受惊', '兽迹']
['Shouji']
['']
['天橙', '甜橙', '天秤', '天宸', '添乘']


In [None]:
# 如果用户输入的拼音太长则使用动态规划对拼音进行切割后执行1再拼接。
