## Lecture-11 Dynamic Programming

In [1]:
# Price table for the rod-cutting example: index i holds the price of a rod of length i + 1.
original_price = [1, 5, 8, 9, 10, 17, 17, 20, 24, 30, 35]

In [2]:
from collections import defaultdict

In [3]:
# Maps rod length -> price. defaultdict(int) yields 0 for any length that was never assigned.
price = defaultdict(int)

In [4]:
# Register every table entry under its 1-based rod length.
price.update(enumerate(original_price, start=1))

In [5]:
price[11]

35

## Get the best split by enumeration

In [8]:
def r(n):
    """Return the maximum revenue for a rod of length n by brute-force enumeration.

    The rod is either sold whole for price[n], or cut at every position
    1..n-1 with both pieces solved recursively. Nothing is cached, so the
    same sub-lengths are recomputed many times.
    """
    best = price[n]
    for cut in range(1, n):
        best = max(best, r(cut) + r(n - cut))
    return best

In [9]:
r(10)

30

In [10]:
r(15)

45

In [None]:
r(20) 

## Analysis: How to optimize

![](imgs/rod.png)

## A Simpler Problem

![](imgs/f.png)

#### We can make the Fibonacci problem fast very easily!

In [13]:
# code here

In [17]:
from functools import wraps

In [18]:
def memo(f):
    """Memoize a function of one hashable argument.

    Every decorated function gets its own cache, so decorating a second
    function neither wipes the results of the first nor returns them for
    the same argument.

    Args:
        f: a function taking a single hashable argument.
    Returns:
        A wrapper that calls ``f`` at most once per distinct argument.
        The wrapper's cache is exposed as ``wrapper.already_computed``.
    """
    cache = {}
    # Backward compatibility: memo.already_computed keeps referring to the
    # cache of the most recently decorated function.
    memo.already_computed = cache

    @wraps(f)
    def _wrap(arg):
        if arg not in cache:
            cache[arg] = f(arg)
        return cache[arg]

    _wrap.already_computed = cache
    return _wrap

## We use this method to solve the Cut Rod problem

In [21]:
# Filled in by r(n): maps a rod length n to (n - split, split); a split of 0 means the rod is sold uncut.
solution = {}


In [22]:
@memo
def r(n):
    """
    Memoized rod cutting.

    Args: n is the iron length
    Return: the max revenue

    Side effect: stores the chosen split in solution[n] as
    (n - split, split), where split == 0 means the rod is sold whole.
    """
    # The uncut option comes first, so it wins ties against any split.
    options = [(price[n], 0)]
    for cut in range(1, n):
        options.append((r(cut) + r(n - cut), cut))

    best_revenue, best_cut = max(options, key=lambda option: option[0])

    solution[n] = (n - best_cut, best_cut)
    return best_revenue

In [25]:
r(20)

60

In [26]:
solution

{1: (1, 0),
 2: (2, 0),
 3: (3, 0),
 4: (2, 2),
 5: (3, 2),
 6: (6, 0),
 7: (6, 1),
 8: (6, 2),
 9: (6, 3),
 10: (10, 0),
 11: (11, 0),
 12: (11, 1),
 13: (11, 2),
 14: (11, 3),
 15: (13, 2),
 16: (14, 2),
 17: (11, 6),
 18: (17, 1),
 19: (17, 2),
 20: (17, 3)}

## How do we parse solution?

In [27]:
def parse_solution(n):
    """Expand solution[n] into the flat list of uncut piece lengths, left to right.

    Each solution entry is a (left, right) pair; right == 0 marks a piece
    that is sold whole. Raises KeyError if a needed length is missing from
    solution.
    """
    pieces = []
    pending = [n]
    while pending:
        left_part, right_part = solution[pending.pop()]
        if right_part == 0:
            pieces.append(left_part)
        else:
            # Push right first so the left part is expanded first.
            pending.append(right_part)
            pending.append(left_part)
    return pieces

In [28]:
r(234)

743

In [29]:
parse_solution(234)

[11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 3]

## Edit Distance

In [31]:
## Edit Distance

In [32]:
# Start from an empty dict: edit_distance records the chosen operation for each (string1, string2) pair here.
solution = {}

In [34]:
from functools import lru_cache

In [37]:
@lru_cache(maxsize=2**10)
def edit_distance(string1, string2):
    """Return the minimum number of single-character deletions, insertions
    and substitutions needed to turn string1 into string2.

    Side effect: the operation chosen for the last characters is stored in
    solution[(string1, string2)] ('' when the tails already match). Because
    the function is wrapped in lru_cache, this only happens when the body
    actually runs, not on a cache hit.
    """
    if not string1:
        return len(string2)
    if not string2:
        return len(string1)

    head1, last1 = string1[:-1], string1[-1]
    head2, last2 = string2[:-1], string2[-1]

    # Drop the tail of string1.
    delete_option = (edit_distance(head1, string2) + 1, 'DEL {}'.format(last1))
    # Append the tail of string2 to string1.
    insert_option = (edit_distance(string1, head2) + 1, 'ADD {}'.format(last2))

    # Consume one character from both strings: free if they match.
    diagonal = edit_distance(head1, head2)
    if last1 == last2:
        align_option = (diagonal + 0, '')
    else:
        align_option = (diagonal + 1, 'SUB {} => {}'.format(last1, last2))

    # min() keeps the first minimal entry, so ties prefer DEL, then ADD.
    best_cost, best_operation = min(
        [delete_option, insert_option, align_option],
        key=lambda option: option[0],
    )

    solution[(string1, string2)] = best_operation
    return best_cost

In [38]:
edit_distance('ABCDE', 'ABCCEF')

2

In [39]:
solution

{('A', 'A'): '',
 ('A', 'AB'): 'ADD B',
 ('A', 'ABC'): 'ADD C',
 ('A', 'ABCC'): 'ADD C',
 ('A', 'ABCCE'): 'ADD E',
 ('A', 'ABCCEF'): 'ADD F',
 ('AB', 'A'): 'DEL B',
 ('AB', 'AB'): '',
 ('AB', 'ABC'): 'ADD C',
 ('AB', 'ABCC'): 'ADD C',
 ('AB', 'ABCCE'): 'ADD E',
 ('AB', 'ABCCEF'): 'ADD F',
 ('ABC', 'A'): 'DEL C',
 ('ABC', 'AB'): 'DEL C',
 ('ABC', 'ABC'): '',
 ('ABC', 'ABCC'): 'ADD C',
 ('ABC', 'ABCCE'): 'ADD E',
 ('ABC', 'ABCCEF'): 'ADD F',
 ('ABCD', 'A'): 'DEL D',
 ('ABCD', 'AB'): 'DEL D',
 ('ABCD', 'ABC'): 'DEL D',
 ('ABCD', 'ABCC'): 'SUB D => C',
 ('ABCD', 'ABCCE'): 'ADD E',
 ('ABCD', 'ABCCEF'): 'ADD F',
 ('ABCDE', 'A'): 'DEL E',
 ('ABCDE', 'AB'): 'DEL E',
 ('ABCDE', 'ABC'): 'DEL E',
 ('ABCDE', 'ABCC'): 'DEL E',
 ('ABCDE', 'ABCCE'): '',
 ('ABCDE', 'ABCCEF'): 'ADD F'}

## TODO: Parsing the solution is left as homework

## Problem Case 3: Pinyin Auto Correction Problem

In [40]:
# Corpus file name; a relative path, so it is resolved against the current working directory.
chinese_dataset = 'article_9k.txt'

In [41]:
# Load the whole corpus into memory; the with-block closes the file handle
# as soon as the read completes instead of leaving it open.
# NOTE(review): the file is decoded with the platform default encoding —
# confirm whether encoding='utf-8' should be passed explicitly.
with open(chinese_dataset) as corpus_file:
    CHINESE_CHARATERS = corpus_file.read()

In [44]:
CHINESE_CHARATERS[:10]

'此外自本周6月12日'

In [45]:
import pinyin

In [46]:
pinyin.get('你好', format="strip", delimiter=" ")

'ni hao'

In [47]:
def chinese_to_pinyin(character):
    """Convert Chinese text to pinyin via pinyin.get, using format="strip" and a single space as delimiter."""
    options = {"format": "strip", "delimiter": " "}
    return pinyin.get(character, **options)

In [50]:
# Convert the whole corpus text to pinyin in a single call.
CHINESE_PINYIN_CORPYS = chinese_to_pinyin(CHINESE_CHARATERS)

In [51]:
len(CHINESE_PINYIN_CORPYS)

129433034

In [57]:
import re

In [58]:
def tokens(text):
    """List every pinyin token: each maximal run of the letters a-z in the lowercased text."""
    lowered = text.lower()
    return [match.group() for match in re.finditer('[a-z]+', lowered)]

In [76]:
CHINESE_PINYIN_CORPYS[:100]

'ci wai zi ben zhou 6 yue 1 2 ri qi chu xiao mi shou ji 6 deng 1 5 kuan ji xing wai qi yu ji xing yi '

In [60]:
tokens(CHINESE_PINYIN_CORPYS[:100])

['ci',
 'wai',
 'zi',
 'ben',
 'zhou',
 'yue',
 'ri',
 'qi',
 'chu',
 'xiao',
 'mi',
 'shou',
 'ji',
 'deng',
 'kuan',
 'ji',
 'xing',
 'wai',
 'qi',
 'yu',
 'ji',
 'xing',
 'yi']

In [52]:
from collections import Counter, defaultdict

In [80]:
# Occurrence count of every pinyin token found in the corpus.
PINYIN_COUNT = Counter(tokens(CHINESE_PINYIN_CORPYS))

In [81]:
def correct(word):
    """Return the most frequent known pinyin closest to ``word``.

    Candidates are searched by increasing edit distance (0, then 1, then 2)
    and the first non-empty set of known pinyin is used. If nothing known
    lies within two edits, ``word`` itself is returned.

    Among the candidates, the one with the highest count in PINYIN_COUNT
    wins. The candidates are sorted first so that equal counts are resolved
    alphabetically instead of by set iteration order, which for strings can
    differ from one interpreter run to the next.
    """
    candidates = (known(edits0(word)) or
                  known(edits1(word)) or
                  known(edits2(word)) or
                  [word])
    # max() returns the first maximal item, so sorting makes ties deterministic.
    return max(sorted(candidates), key=PINYIN_COUNT.get)

In [82]:
def known(words):
    """Return the set of candidates from words that appear in PINYIN_COUNT."""
    seen = set()
    for candidate in words:
        if candidate in PINYIN_COUNT:
            seen.add(candidate)
    return seen

def edits0(word):
    """Return the set of strings zero edits away from word, i.e. a set holding only word."""
    unchanged = set()
    unchanged.add(word)
    return unchanged

def edits2(word):
    """Return every string obtained by applying edits1 twice to word."""
    reachable = set()
    for first_edit in edits1(word):
        reachable.update(edits1(first_edit))
    return reachable

## $edits1()$ --> How?

In [83]:
def edits1(word):
    """Return the set of strings one edit away from word.

    An edit is a deletion, a transposition of adjacent characters, a
    replacement, or an insertion of a letter from alphabet. Replacing a
    character with itself is allowed, so word is part of the result
    whenever it is non-empty.
    """
    results = set()
    for head, tail in splits(word):
        # Insert a letter at this position (valid even at the very end).
        for letter in alphabet:
            results.add(head + letter + tail)
        if not tail:
            continue
        # Delete the character at this position.
        results.add(head + tail[1:])
        # Replace the character at this position.
        for letter in alphabet:
            results.add(head + letter + tail[1:])
        # Swap this character with the next one.
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])
    return results

def splits(word):
    """Return every (first, rest) pair with first + rest == word, from the empty prefix to the full word."""
    pairs = []
    for cut in range(len(word) + 1):
        pairs.append((word[:cut], word[cut:]))
    return pairs

# Letters tried by edits1() when generating replacements and insertions.
alphabet = 'abcdefghijklmnopqrstuvwxyz'

In [84]:
splits('pinyin')

[('', 'pinyin'),
 ('p', 'inyin'),
 ('pi', 'nyin'),
 ('pin', 'yin'),
 ('piny', 'in'),
 ('pinyi', 'n'),
 ('pinyin', '')]

In [85]:
print(edits0('pinyin'))

{'pinyin'}


In [86]:
print(edits1('pinyin'))

{'pinylin', 'pinyitn', 'pinyinl', 'pinyjin', 'piuyin', 'pinyib', 'minyin', 'pibyin', 'pineyin', 'winyin', 'pfnyin', 'pinyihn', 'tinyin', 'pinyiln', 'pinyiv', 'pinyiyn', 'qpinyin', 'ponyin', 'pinkyin', 'pinfin', 'pinyiu', 'ainyin', 'pinoin', 'pinnyin', 'pinywin', 'pijnyin', 'pinain', 'pinyiny', 'pinyiwn', 'pifyin', 'pinyif', 'pizyin', 'pindin', 'piznyin', 'pinyrin', 'pinyzn', 'pinyiw', 'pingyin', 'pinyih', 'sinyin', 'epinyin', 'zinyin', 'pinyln', 'pinybn', 'pinyen', 'upinyin', 'cinyin', 'cpinyin', 'pinyicn', 'pidyin', 'pilyin', 'pinyis', 'gpinyin', 'hinyin', 'ptnyin', 'pirnyin', 'pinyqin', 'pinryin', 'pinmin', 'pinyijn', 'prnyin', 'pinykin', 'pinypn', 'pikyin', 'ginyin', 'piwyin', 'piinyin', 'pinyine', 'pinyizn', 'pinyinc', 'pihnyin', 'piynin', 'pivyin', 'peinyin', 'pintyin', 'pinyinn', 'pivnyin', 'pinyia', 'pinywn', 'pynyin', 'pilnyin', 'ninyin', 'piniyn', 'phnyin', 'pinyifn', 'pinygn', 'pbnyin', 'pinlyin', 'pinrin', 'zpinyin', 'oinyin', 'picyin', 'pinymin', 'pqinyin', 'pifnyin', 'kiny

In [87]:
print(len(edits1('pinyin')))

338


In [97]:
print(len(edits2('pinyin')))

52168


## It's time to show the power!

In [98]:
correct('yin')

'yin'

In [99]:
correct('yign')

'ying'

In [100]:
correct('yinn')

'ying'

In [101]:
def correct_sequence_pinyin(text_pingyin):
    """Correct each whitespace-separated pinyin token and rejoin the results with single spaces."""
    corrected = [correct(token) for token in text_pingyin.split()]
    return ' '.join(corrected)

## 这是一个测试！

In [104]:
correct_sequence_pinyin('zhe sih yi ge ce sho')

'zhe shi yi ge ce shi'

## 我想上清华大学~

In [107]:
correct_sequence_pinyin('wo xiang shagn qinng hua da xue')

'wo xiang shang qing hua da xue'

### 思考题？ 如何在不带空格的时候完成自动修整？--> 如何完成拼音的自动分割？