# 1. Cut Rod Problem

In [1]:
from functools import wraps
from collections import defaultdict  

In [2]:
original_price = [1, 5, 8, 9, 10, 17, 17, 20, 24, 30]
price = defaultdict(int)
for i,p in enumerate(original_price):
    price[i+1] = p

In [3]:
def memo(f):
    memo.already_computed={}
    @wraps(f)
    def _wrap(arg):
        if arg in memo.already_computed:
            result = memo.already_computed[arg]
        else:
            result = f(arg)
            memo.already_computed[arg] = result
        return result
    return _wrap

In [4]:
solution={}
@memo
def r_op(n):
    """
    Args: n is the iron length
    Return: the max revenue 
    """
    max_price,max_split = max(
            [(price[n],0)] + [(r_op(i) + r_op(n-i), i) for i in range(1,n)],key=lambda x:x[0]
    )
    solution[n] = (n-max_split,max_split)
    return max_price

In [5]:
def parse_solution(n):
    left_split, right_split = solution[n]
    if right_split == 0: return [left_split]
    return parse_solution(left_split) + parse_solution(right_split)

In [6]:
r_op(20)

60

In [7]:
solution

{1: (1, 0),
 2: (2, 0),
 3: (3, 0),
 4: (2, 2),
 5: (3, 2),
 6: (6, 0),
 7: (6, 1),
 8: (6, 2),
 9: (6, 3),
 10: (10, 0),
 11: (10, 1),
 12: (10, 2),
 13: (10, 3),
 14: (12, 2),
 15: (13, 2),
 16: (10, 6),
 17: (16, 1),
 18: (16, 2),
 19: (16, 3),
 20: (10, 10)}

In [8]:
parse_solution(17)

[10, 6, 1]

# 2. Edit Distance

In [9]:
from functools import lru_cache

In [10]:
solution = {}
@lru_cache(maxsize=2**10) #max_size为缓存结果的个数，避免内存爆炸
def edit_distance(string1, string2):
    
    if len(string1) == 0: return len(string2) #s1='',s2='word',删掉s2里的字母
    if len(string2) == 0: return len(string1) #s1='word',s2='',插入s1里的字母
    
    tail_s1 = string1[-1]  #s1='word', tail_s1='d' -> 'wor'+'d'
    tail_s2 = string2[-1]  #s2='work', tail_s2='k' -> 'wor'+'k'
    
    candidates = [
        (edit_distance(string1[:-1], string2) + 1, 'DEL {}'.format(tail_s1)),  #edit_distance('wor','work')  
        # string 1 delete tail
        (edit_distance(string1, string2[:-1]) + 1, 'ADD {}'.format(tail_s2)), #edit_distance('word','wor')
        # string 1 add tail of string2
    ]
    
    #替换操作
    if tail_s1 == tail_s2:
        both_forward = (edit_distance(string1[:-1], string2[:-1]) + 0, '')
    else:
        both_forward = (edit_distance(string1[:-1], string2[:-1]) + 1, 'SUB {} => {}'.format(tail_s1, tail_s2))

    candidates.append(both_forward) #candidates = [d(DEL),d(INSERT),d(SUB,NOSUB)]
    
    min_distance, operation = min(candidates, key=lambda x: x[0]) #x[0]指根据distance排序
    
    solution[(string1, string2)] = operation 
    
    return min_distance

### Todo: Parse Solution is our homework

In [11]:
def delete_tail(string1,string2):
    if string1[-1] != string2[-1]: return string1,string2
    if string1 == string2: return string1,string2
    else: return delete_tail(string1[:-1],string2[:-1])

In [12]:
delete_tail('ABC','ABC')

('ABC', 'ABC')

In [13]:
def parse_solution_string(string1,string2,solution):
    if string1[-1] == string2[-1]: 
        string1,string2 = delete_tail(string1,string2)
    if string1 == string2 : return []
    else:
        s = solution[(string1,string2)]
        if s != '':
            op = s.split(" ")
            if op[0] == 'ADD':
                return [s] + parse_solution_string(string1+string2[-1],string2,solution)
            elif op[0] == 'DEL':
                return [s] + parse_solution_string(string1[:-1],string2,solution)
            elif op[0] == 'SUB':
                return [s] + parse_solution_string(string1[:-1]+string2[-1],string2,solution)

In [14]:
edit_distance('ABCDECG','ABCCEF')
parse_solution_string('ABCDECG','ABCCEF',solution)

['DEL G', 'SUB C => F', 'SUB D => C']

# 3.Pinyin Auto Correction Problem

In [15]:
import pinyin
import re
from collections import Counter,defaultdict

In [16]:
chinese_dataset='article_9k.txt'
CHINESE_CHARATERS = open(chinese_dataset).read()

In [17]:
def chinese_to_pinyin(character):
    "Convert Chinese characters to Pinyin"
    return pinyin.get(character,format='strip',delimiter=' ')

def tokens(text):
    "List all the pinyin characters, remove numbers"
    return re.findall('[a-z]+',text.lower())

In [85]:
CHINESE_CHARATERS_COPYS = chinese_to_pinyin(CHINESE_CHARATERS)
PINYIN_TOKEN = tokens(CHINESE_CHARATERS_COPYS)
PINYIN_COUNT = Counter(PINYIN_TOKEN)

In [19]:
def correct(word):
    'Find the most possible pinyin based on edit distance'
    # Prefer edit distance 0, then 1, then 2; otherwist default to word itself
    candidates = (known(edits0(word)) or #known语料库里的拼音集合
                  known(edits1(word)) or
                  known(edits2(word)) or
                  [word]) 
    return max(candidates,key=PINYIN_COUNT.get) 

def known(words):
    'Return the pinyin in our data'
    return {w for w in words if w in PINYIN_COUNT}

def edits0(word): #编辑距离为0 = 原词
    'Return all strings that are zero edits away from word (i.e., just word itself).'
    return {word}

def edits2(word): #编辑距离为2 = 编辑距离1的词上再加一个编辑距离
    'Return all strings that are two edits away from this pinyin.'
    return {e2 for e1 in edits1(word) for e2 in edits1(e1)}

In [20]:
alphabet = 'abcdefghijklmnopqrstuvwxyz'

def splits(word): #一个词的所有可能拆解方式
    'Return a list of all possible (first, rest) pairs that comprise pinyin.'
    return [(word[:i], word[i:]) for i in range(len(word)+1)]

def edits1(word):
    'Return all strings that are one edit away from this pinyin.'
    pairs = splits(word)
    deletes = [a+b[1:] for (a,b) in pairs if b] #删除[inyin,pnyin,piyin,pinin,pinyn,pinyi]
    transposes = [a+b[1]+b[0]+b[2:] for (a,b) in pairs if len(b) > 1] #翻转[ipnyin,]
    replaces = [a+c+b[1:] for (a,b) in pairs for c in alphabet if b] #代替
    inserts = [a+c+b for (a,b) in pairs for c in alphabet] #插入
    return set(deletes + transposes + replaces + inserts)

In [21]:
def correct_sentences_pinyin(text_pinyin):
    return ' '.join(map(correct, text_pinyin.split()))

In [22]:
correct_sentences_pinyin('zhe sih yi ge ce sho')

'zhe shi yi ge ce shi'

In [23]:
correct_sentences_pinyin('wo xiang shagn qinng hua da xue')

'wo xiang shang qing hua da xue'

In [24]:
correct_sentences_pinyin('zhe jiang gogn ye da xue')

'zhe jiang gong ye da xue'

### 思考题：如何在不带空格的时候完成自动修整？--> 如何完成拼音的自动分割？（提示：使用第一节课提到的语言模型!）

In [25]:
import numpy as np
import pinyin
import re
from collections import Counter,defaultdict
from functools import lru_cache

In [119]:
def get_prob1(word):
    if word in PINYIN_TOKEN:
        return PINYIN_COUNT[word]/len(PINYIN_TOKEN)
    else:
        return 0

In [120]:
solution={}
@lru_cache(maxsize=2**10)
def r_pinyin(words):
    """
    Args: n is the iron length
    Return: the max revenue 
    """
    if len(words) == 1: solution[words] = 0
    max_prob, max_split = max(
        [(get_prob1(words), 0)] + [((r_pinyin(words[:i])*r_pinyin(words[i:])), i) for i in range(1, len(words))], key=lambda x: x[0]
    )
    solution[words] = max_split
    return max_prob

In [121]:
def parse(words):
    max_prob = r_pinyin(words)
    i = solution[words]
    if i == 0: return [words]
    left,right = words[:i],words[i:]
    return parse(left) + parse(right)

Test：

[1] words1 = 'pinyin'

In [122]:
words1 = 'pinyin'
r_pinyin(words1)

6.7808865436114305e-06

In [123]:
solution

{'p': 0,
 'i': 0,
 'n': 0,
 'y': 0,
 'in': 1,
 'yi': 0,
 'yin': 0,
 'ny': 1,
 'nyi': 1,
 'nyin': 1,
 'iny': 1,
 'inyi': 1,
 'inyin': 1,
 'pi': 0,
 'pin': 0,
 'piny': 3,
 'pinyi': 3,
 'pinyin': 3}

In [124]:
parse(words1)

['pin', 'yin']

[2] words2 = 'zheshiyigeceshi'

In [125]:
words2 = 'zheshiyigeceshi'
r_pinyin(words2)

6.570359835839265e-13

In [126]:
solution

{'p': 0,
 'i': 0,
 'n': 0,
 'y': 0,
 'in': 1,
 'yi': 0,
 'yin': 0,
 'ny': 1,
 'nyi': 1,
 'nyin': 1,
 'iny': 1,
 'inyi': 1,
 'inyin': 1,
 'pi': 0,
 'pin': 0,
 'piny': 3,
 'pinyi': 3,
 'pinyin': 3,
 'z': 0,
 'h': 0,
 'e': 0,
 's': 0,
 'g': 0,
 'c': 0,
 'hi': 1,
 'sh': 1,
 'shi': 0,
 'es': 1,
 'esh': 1,
 'eshi': 1,
 'ce': 0,
 'ces': 2,
 'cesh': 2,
 'ceshi': 2,
 'ec': 1,
 'ece': 1,
 'eces': 1,
 'ecesh': 1,
 'eceshi': 1,
 'ge': 0,
 'gec': 2,
 'gece': 2,
 'geces': 2,
 'gecesh': 2,
 'geceshi': 2,
 'ig': 1,
 'ige': 1,
 'igec': 1,
 'igece': 1,
 'igeces': 1,
 'igecesh': 1,
 'igeceshi': 1,
 'yig': 2,
 'yige': 2,
 'yigec': 4,
 'yigece': 4,
 'yigeces': 6,
 'yigecesh': 7,
 'yigeceshi': 6,
 'iy': 1,
 'iyi': 1,
 'iyig': 1,
 'iyige': 1,
 'iyigec': 1,
 'iyigece': 1,
 'iyigeces': 1,
 'iyigecesh': 1,
 'iyigeceshi': 1,
 'hiy': 1,
 'hiyi': 2,
 'hiyig': 1,
 'hiyige': 1,
 'hiyigec': 1,
 'hiyigece': 1,
 'hiyigeces': 2,
 'hiyigecesh': 1,
 'hiyigeceshi': 8,
 'shiy': 3,
 'shiyi': 3,
 'shiyig': 3,
 'shiyige': 3,
 

In [127]:
parse(words2)

['zhe', 'shi', 'yi', 'ge', 'ce', 'shi']

[3] words3 = 'woxiangshangqinghuadaxue'

In [128]:
words3 = 'woxiangshangqinghuadaxue'
r_pinyin(words3)

5.114632668805757e-17

In [129]:
solution

{'p': 0,
 'i': 0,
 'n': 0,
 'y': 0,
 'in': 1,
 'yi': 0,
 'yin': 0,
 'ny': 1,
 'nyi': 1,
 'nyin': 1,
 'iny': 1,
 'inyi': 1,
 'inyin': 1,
 'pi': 0,
 'pin': 0,
 'piny': 3,
 'pinyi': 3,
 'pinyin': 3,
 'z': 0,
 'h': 0,
 'e': 0,
 's': 0,
 'g': 0,
 'c': 0,
 'hi': 1,
 'sh': 1,
 'shi': 0,
 'es': 1,
 'esh': 1,
 'eshi': 1,
 'ce': 0,
 'ces': 2,
 'cesh': 2,
 'ceshi': 2,
 'ec': 1,
 'ece': 1,
 'eces': 1,
 'ecesh': 1,
 'eceshi': 1,
 'ge': 0,
 'gec': 2,
 'gece': 2,
 'geces': 2,
 'gecesh': 2,
 'geceshi': 2,
 'ig': 1,
 'ige': 1,
 'igec': 1,
 'igece': 1,
 'igeces': 1,
 'igecesh': 1,
 'igeceshi': 1,
 'yig': 2,
 'yige': 2,
 'yigec': 4,
 'yigece': 4,
 'yigeces': 6,
 'yigecesh': 7,
 'yigeceshi': 6,
 'iy': 1,
 'iyi': 1,
 'iyig': 1,
 'iyige': 1,
 'iyigec': 1,
 'iyigece': 1,
 'iyigeces': 1,
 'iyigecesh': 1,
 'iyigeceshi': 1,
 'hiy': 1,
 'hiyi': 2,
 'hiyig': 1,
 'hiyige': 1,
 'hiyigec': 1,
 'hiyigece': 1,
 'hiyigeces': 2,
 'hiyigecesh': 1,
 'hiyigeceshi': 8,
 'shiy': 3,
 'shiyi': 3,
 'shiyig': 3,
 'shiyige': 3,
 

In [130]:
parse(words3)

['wo', 'xiang', 'shang', 'qing', 'hua', 'da', 'xue']

截止目前，没有错误的语句可以分割

[4] words4 = 'zhesihyigecesho'

In [131]:
words4 = 'zhesihyigecesho'
r_pinyin(words3)

5.114632668805757e-17

In [132]:
solution

{'p': 0,
 'i': 0,
 'n': 0,
 'y': 0,
 'in': 1,
 'yi': 0,
 'yin': 0,
 'ny': 1,
 'nyi': 1,
 'nyin': 1,
 'iny': 1,
 'inyi': 1,
 'inyin': 1,
 'pi': 0,
 'pin': 0,
 'piny': 3,
 'pinyi': 3,
 'pinyin': 3,
 'z': 0,
 'h': 0,
 'e': 0,
 's': 0,
 'g': 0,
 'c': 0,
 'hi': 1,
 'sh': 1,
 'shi': 0,
 'es': 1,
 'esh': 1,
 'eshi': 1,
 'ce': 0,
 'ces': 2,
 'cesh': 2,
 'ceshi': 2,
 'ec': 1,
 'ece': 1,
 'eces': 1,
 'ecesh': 1,
 'eceshi': 1,
 'ge': 0,
 'gec': 2,
 'gece': 2,
 'geces': 2,
 'gecesh': 2,
 'geceshi': 2,
 'ig': 1,
 'ige': 1,
 'igec': 1,
 'igece': 1,
 'igeces': 1,
 'igecesh': 1,
 'igeceshi': 1,
 'yig': 2,
 'yige': 2,
 'yigec': 4,
 'yigece': 4,
 'yigeces': 6,
 'yigecesh': 7,
 'yigeceshi': 6,
 'iy': 1,
 'iyi': 1,
 'iyig': 1,
 'iyige': 1,
 'iyigec': 1,
 'iyigece': 1,
 'iyigeces': 1,
 'iyigecesh': 1,
 'iyigeceshi': 1,
 'hiy': 1,
 'hiyi': 2,
 'hiyig': 1,
 'hiyige': 1,
 'hiyigec': 1,
 'hiyigece': 1,
 'hiyigeces': 2,
 'hiyigecesh': 1,
 'hiyigeceshi': 8,
 'shiy': 3,
 'shiyi': 3,
 'shiyig': 3,
 'shiyige': 3,
 

In [133]:
parse(words4)

['zhe', 'si', 'h', 'yi', 'ge', 'ce', 's', 'h', 'o']

<font color='red'>无法对错误单词进行分割，需结合自动纠错方法</font>