In [1]:
import numpy as np
import functools


def load_words_freq_pinyin(file_name):
    init_words_freq = {}
    emit_pinyin_freq = {}
    transmission_freq = {}
    with open(file_name, "r") as words:
        for line in words:
            fields = line.split()
            freq = int(fields[0])
            is_first = True
            last_word = ""
            for word_pinyin in fields[1:]:
                word, pinyin = word_pinyin.split("|")

                word_emit_pinyin_freq = emit_pinyin_freq.get(word, {})
                word_emit_pinyin_freq[pinyin] = word_emit_pinyin_freq.get(pinyin, 0) + freq
                emit_pinyin_freq[word] = word_emit_pinyin_freq

                if is_first:
                    init_words_freq[word] = init_words_freq.get(word, 0) + freq
                    is_first = False
                else:
                    word_transmission_freq = transmission_freq.get(last_word, {})
                    word_transmission_freq[word] = word_transmission_freq.get(word, 0) + freq
                    transmission_freq[last_word] = word_transmission_freq

                last_word = word

    return init_words_freq, transmission_freq, emit_pinyin_freq


def normal_probability(freq, total_freq):
    return freq * 1.0 / total_freq


def build_hmm(init_words_freq, transmission_freq, emit_pinyin_freq):
    init_words_probability = {}
    transmission_probability = {}
    reverse_emit_pinyin_probability = {}

    init_word_total_freq = np.sum(list(init_words_freq.values()))

    for init_word, freq in init_words_freq.items():
        init_words_probability[init_word] = normal_probability(freq, init_word_total_freq)

    reverse_transmit_dict = {}
    for from_word, to_words_freq in transmission_freq.items():
        for to_word in to_words_freq.keys():
            word_reverse_transmit_set = reverse_transmit_dict.get(to_word, set({}))
            word_reverse_transmit_set.add(from_word)
            reverse_transmit_dict[to_word] = word_reverse_transmit_set

    for from_word, to_words_freq in transmission_freq.items():
        to_word_total_freqs = np.sum(list(to_words_freq.values()))
        for to_word, to_word_freq in to_words_freq.items():
            words_transmission_probability = transmission_probability.get(from_word, {})
            words_transmission_probability[to_word] = normal_probability(to_word_freq, to_word_total_freqs)
            transmission_probability[from_word] = words_transmission_probability

    for word, pinyins_freq in emit_pinyin_freq.items():
        total_pinyin_freq = np.sum(list(pinyins_freq.values()))
        for pinyin, freq in pinyins_freq.items():
            pinyin_words_probability = reverse_emit_pinyin_probability.get(pinyin, {})
            pinyin_words_probability[word] = normal_probability(freq, total_pinyin_freq)
            reverse_emit_pinyin_probability[pinyin] = pinyin_words_probability

    return init_words_probability, transmission_probability, reverse_emit_pinyin_probability

In [2]:
# 如果其他人跑这个例子，需要把路径替换下
init_words_freq, transmission_freq, emit_pinyin_freq = load_words_freq_pinyin(
        "/$path_to_file/code/kata-hmm/data/global_words_freq_pinyin.txt")

init_words_probability, transmission_probability, reverse_emit_pinyin_probability = build_hmm(init_words_freq,
                                                                                                  transmission_freq,
                                                                                                  emit_pinyin_freq)

In [3]:
reverse_emit_pinyin_probability["hai'xing"]

{'海星': 1.0, '海兴': 1.0, '海行': 1.0, '海幸': 1.0, '氦星': 1.0}

In [4]:
reverse_emit_pinyin_probability["zhong'guo"]

{'中国': 1.0, '种果': 1.0, '钟国': 1.0, '仲果': 1.0, '仲国': 1.0, '钟果': 1.0, '重国': 1.0}

In [5]:
reverse_emit_pinyin_probability["ren'min"]

{'人民': 1.0, '任民': 1.0, '任敏': 1.0, '任岷': 1.0, '任闽': 1.0}

In [6]:
reverse_emit_pinyin_probability["xiang'mu"]

{'项目': 1.0, '橡木': 1.0, '向慕': 1.0, '香木': 1.0, '相睦': 1.0}

In [7]:
reverse_emit_pinyin_probability["guan'li"]

{'管理': 1.0,
 '惯例': 1.0,
 '官吏': 1.0,
 '观礼': 1.0,
 '关里': 1.0,
 '官莉': 1.0,
 '关丽': 1.0,
 '关力': 1.0,
 '官立': 1.0,
 '官理': 1.0,
 '官利': 1.0,
 '官李': 1.0,
 '关莉': 1.0,
 '关立': 1.0,
 '官里': 1.0,
 '关礼': 1.0,
 '管李': 1.0,
 '管力': 1.0,
 '关李': 1.0,
 '官隶': 1.0,
 '管里': 1.0,
 '管立': 1.0,
 '观里': 1.0,
 '管莉': 1.0}

In [8]:
reverse_emit_pinyin_probability["xiang'mu'guan'li"]

{'项目管理': 1.0}

In [9]:
init_words_probability["的确"]

6.16053799425101e-05

In [10]:
reverse_emit_pinyin_probability["xi'an"]

{'西安': 1.0,
 '西岸': 1.0,
 '希安': 1.0,
 '锡安': 1.0,
 '溪岸': 1.0,
 '烯胺': 1.0,
 '喜按': 1.0,
 '西庵': 1.0}

In [11]:
init_words_probability['西安']

4.2566374027998134e-05

In [12]:
init_words_probability['仙']

4.718907802603711e-05

In [13]:
"西安人"  in transmission_probability

False

In [14]:
transmission_probability['现任']

{'者': 0.4823529411764706, '业': 0.41568627450980394, '法': 0.10196078431372549}

In [15]:
import numpy as np


def get_output_sequence_probability_by_forward(output_sequence, init_state_probability,
                                                    transmission_probability,
                                                    reverse_emission_probability):
    current_output_probability = {}

    for current_state, emit_observation_probability in reverse_emission_probability.get(output_sequence[0],
                                                                                        {}).items():
        hidden_state_probability = init_state_probability.get(current_state, -1)
        if hidden_state_probability < 0:
            continue
        current_output_probability[current_state] = hidden_state_probability * emit_observation_probability

    if len(current_output_probability) == 0:
        return 0, 0

    last_words_output_probability = current_output_probability
    next_to_match_index = 1

    for index, output in enumerate(output_sequence[1:], 1):
        current_output_probability = {}
        for current_state, emit_observation_probability in reverse_emission_probability.get(output, {}).items():
            to_current_state_probability = 0
            for last_state, last_probability in last_words_output_probability.items():
                the_transmission_probability = transmission_probability.get(last_state, {}).get(current_state, -1)
                if the_transmission_probability < 0:
                    continue
                print(last_state, current_state, the_transmission_probability)
                to_current_state_probability += last_probability * the_transmission_probability
            if to_current_state_probability != 0:
                current_output_probability[current_state] = to_current_state_probability * emit_observation_probability

        last_words_output_probability = current_output_probability

        if len(current_output_probability) == 0:
            next_to_match_index = index
            break
        else:
            next_to_match_index = index + 1

    return np.sum(list(last_words_output_probability.values())), next_to_match_index




get_output_sequence_probability_by_forward(["xi'an", "ren'min"], init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

西安 人民 0.006269110383383692


(2.66853297401917e-07, 2)

In [16]:

get_output_sequence_probability_by_forward(["xian'ren"], init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

(2.2841498038741894e-05, 1)

In [17]:

get_output_sequence_probability_by_forward(["xiang'mux", "guan'li'zhe", "ni'hao"], init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

(0, 0)

In [18]:

get_output_sequence_probability_by_forward([ "ni'hao"], init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

(3.5544470878565256e-05, 1)

In [19]:
transmission_probability["项目"]["管理者"]

0.03953361267990527

In [20]:

def get_max_probability_states_by_outputs(output_sequence, init_state_probability, transmission_probability,
                                          reverse_emission_probability):
    output_probability_list = []

    current_output_probability = {}
    for current_state, emit_observation_probability in reverse_emission_probability.get(output_sequence[0],
                                                                                        {}).items():
        hidden_state_probability = init_state_probability.get(current_state, -1)
        if hidden_state_probability < 0:
            continue
        current_output_probability[current_state] = (hidden_state_probability * emit_observation_probability, None)

    if len(current_output_probability) == 0:
        return [], 0, 0

    output_probability_list.append(current_output_probability)
    next_process_index = 1

    last_state_output_probability = current_output_probability

    for index, output in enumerate(output_sequence[1:], 1):

        current_state_output_probability = {}

        for current_state, emit_observation_probability in reverse_emission_probability.get(output, {}).items():
            max_to_current_word_probability = 0
            parent_word = None

            for last_state, (last_output_probability, _) in last_state_output_probability.items():

                the_transmission_probability = transmission_probability.get(last_state, {}).get(current_state, -1)
                if the_transmission_probability < 0:
                    continue
                if last_output_probability * the_transmission_probability > max_to_current_word_probability:
                    max_to_current_word_probability = last_output_probability * the_transmission_probability
                    parent_word = last_state

            if max_to_current_word_probability != 0:
                word_output_probability = max_to_current_word_probability * emit_observation_probability
                current_state_output_probability[current_state] = (word_output_probability, parent_word)

        if len(current_state_output_probability) == 0:
            # 匹配失败，返回未匹配的序列的起始位置和已经匹配的结果
            break
        else:
            next_process_index = index + 1
            output_probability_list.append(current_state_output_probability)
            last_state_output_probability = current_state_output_probability

    top_state_output_probability = sorted(last_state_output_probability.items(), key=lambda kv: kv[1][0])[-1]
    # print(top_state_output_probability)
    max_probability_states = []
    max_probability_states.insert(0, top_state_output_probability[0])
    parent_word = top_state_output_probability[1][1]

    for output_probability in reversed(output_probability_list[0:-1]):
        # print(output_probability, output_probability.get(parent_word))
        max_probability_states.insert(0, parent_word)
        parent_word = output_probability.get(parent_word)[1]

    return max_probability_states, top_state_output_probability[1][0], next_process_index

get_max_probability_states_by_outputs(["xiang'mu", "guan'li'zhe"], init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

(['项目', '管理者'], 1.4815228889966823e-05, 2)

In [21]:
get_max_probability_states_by_outputs(["ming'que", "xiang'mu", "guan'li'fei"], init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

(['明确', '项目', '管理费'], 9.666320445311179e-09, 3)

In [22]:
reverse_emit_pinyin_probability["ren'min"]

{'人民': 1.0, '任民': 1.0, '任敏': 1.0, '任岷': 1.0, '任闽': 1.0}

In [23]:
init_words_probability['人民']

0.0003681877957097658

In [24]:
get_max_probability_states_by_outputs(["ren'min", "jiao'shi"], init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

(['人民'], 0.0003681877957097658, 1)

In [25]:
get_max_probability_states_by_outputs(["jiao'shi"], init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

(['教师'], 0.00011095383412473691, 1)

In [26]:
get_max_probability_states_by_outputs(["shang'hai", "fei'ji'chang"], init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

(['上海'], 0.00040481335462104543, 1)

In [27]:
get_max_probability_states_by_outputs(["shang'hai", "ji'chang"], init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

(['上海', '机场'], 5.385845179331931e-06, 2)

In [28]:
get_max_probability_states_by_outputs(["wo","kan", "hai", "xing"], init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

(['我'], 0.009121321887644876, 1)

In [29]:
get_max_probability_states_by_outputs(["xiang'mu","guan'li"], init_words_probability, transmission_probability,
                            reverse_emit_pinyin_probability)

(['项目'], 0.0003747501906775479, 1)

In [30]:
get_max_probability_states_by_outputs(["xiang'mu'guan'li"], init_words_probability, transmission_probability,  reverse_emit_pinyin_probability)

(['项目管理'], 2.985241692885244e-06, 1)

In [31]:
get_max_probability_states_by_outputs(["ai'guo"], init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

(['爱国'], 1.8340590092600513e-05, 1)

In [32]:
get_max_probability_states_by_outputs(["ai", "guo"], init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

(['爱'], 0.0007445421728480498, 1)

In [33]:
get_max_probability_states_by_outputs(['ang1', 'mu', 'gu', 'an', 'li'], init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

([], 0, 0)

In [34]:

def get_hanzi_by_pinyin(pinyin, init_words_probability, transmission_probability, reverse_emit_pinyin_probability):
    hanzi_list = []
    hangzi_probability_list = []

    next_process_index = 0

    while next_process_index != len(pinyin):
        max_probability_states, word_probability,  rest_next_process_index = \
            get_max_probability_states_by_outputs(pinyin[next_process_index:], init_words_probability,
                                                  transmission_probability,
                                                  reverse_emit_pinyin_probability)
        if len(max_probability_states) == 0:
            break

        hanzi_list += max_probability_states
        hangzi_probability_list.append(word_probability)
        next_process_index += rest_next_process_index

    if next_process_index != len(pinyin):
        hanzi_list += pinyin[next_process_index:]

    return "-".join(hanzi_list), np.sum(np.log(hangzi_probability_list)), next_process_index

get_hanzi_by_pinyin(["xiang'mu", "guan'li"], init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

('项目-管理', -15.266582870311279, 2)

In [35]:
get_hanzi_by_pinyin(["ai", "guo"], init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

('爱-过', -13.95656809394384, 2)

In [36]:
get_hanzi_by_pinyin(["ming'que", "xiang'mu", "guan'li'fei"], init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

('明确-项目-管理费', -18.454618112271746, 3)

In [37]:
get_hanzi_by_pinyin(['ang', 'mu', 'gu', 'an', 'li'], init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

('昂-木-股-按-里', -42.96642988781565, 5)

In [38]:
PINYIN_SET = set()
# 如果其他人跑这个例子，需要把路径替换下
with open('/$path_to_file/code/kata-hmm/data/intact_pinyin.txt', 'r', encoding='utf-8') as f:
    PINYIN_SET = set(s for s in f.read().split('\n'))

MAX_PINYIN_LEN = np.max(list(len(s) for s in PINYIN_SET))


def cut_pinyin(pinyin_str):
    pinyin_list = []
    for pinyin_len in range(MAX_PINYIN_LEN, 0, -1):
        if len(pinyin_str) >= pinyin_len and pinyin_str[0:pinyin_len] in PINYIN_SET:
            if len(pinyin_str) != pinyin_len:
                rest_pinyin_list = cut_pinyin(pinyin_str[pinyin_len:])
                if len(rest_pinyin_list) > 0:
                    for rest_pinyin_list in rest_pinyin_list:
                        pinyin_list.append([pinyin_str[0:pinyin_len]] + rest_pinyin_list)
            else:
                pinyin_list.append([pinyin_str[0:pinyin_len]])

    return pinyin_list

In [39]:

def get_hanzi_by_pinyin(pinyin_list, init_words_probability, transmission_probability, reverse_emit_pinyin_probability):
    hanzi_list = []
    hangzi_list_probability = 1

    next_process_index = 0

    while next_process_index != len(pinyin_list):
        max_probability_states, word_probability, rest_next_process_index = \
            get_max_probability_states_by_outputs(pinyin_list[next_process_index:], init_words_probability,
                                                  transmission_probability,
                                                  reverse_emit_pinyin_probability)
        if len(max_probability_states) == 0:
            break

        hanzi_list += max_probability_states
        hangzi_list_probability *= word_probability
        next_process_index += rest_next_process_index

    if next_process_index != len(pinyin_list):
        hanzi_list += pinyin_list[next_process_index:]

    return "-".join(hanzi_list), hangzi_list_probability, next_process_index == len(pinyin_list)


def get_all_pinyin_list(pinyin_list):
    all_pinyin_list = []
    max_word_len = 4
    for word_len in range(max_word_len, 0, -1):
        if len(pinyin_list) > word_len:
            rest_pinyin_list = get_all_pinyin_list(pinyin_list[word_len:])
            if len(rest_pinyin_list) > 0:
                for rest_pinyin_list in rest_pinyin_list:
                    all_pinyin_list.append(["'".join(pinyin_list[0:word_len])] + rest_pinyin_list)
        elif len(pinyin_list) == word_len:
            all_pinyin_list.append(["'".join(pinyin_list[0:word_len])])

    return all_pinyin_list

def get_all_hangzi_by_pinyin(pinyin_str, init_words_probability, transmission_probability,
                             reverse_emit_pinyin_probability):
    all_hanzi_list = []

    for raw_pinyin_list in cut_pinyin(pinyin_str):
        for combined_pinyin_list in get_all_pinyin_list(raw_pinyin_list):
            all_hanzi_list.append(
                get_hanzi_by_pinyin(combined_pinyin_list, init_words_probability, transmission_probability,
                                    reverse_emit_pinyin_probability))

    def my_comparator(hanzi_list_a, hanzi_list_b):
        if hanzi_list_a[2] == hanzi_list_b[2]:
            return hanzi_list_a[1] - hanzi_list_b[1]
        else:
            return hanzi_list_a[2] - hanzi_list_b[2]

    all_hanzi_list.sort(key=functools.cmp_to_key(my_comparator), reverse=True)

    return all_hanzi_list



In [40]:
cut_pinyin('dengfengshaolinsi')

[['deng', 'feng', 'shao', 'lin', 'si'],
 ['deng', 'feng', 'sha', 'o', 'lin', 'si']]

In [41]:
get_all_pinyin_list(['deng', 'feng', 'shao', 'lin', 'si'])

[["deng'feng'shao'lin", 'si'],
 ["deng'feng'shao", "lin'si"],
 ["deng'feng'shao", 'lin', 'si'],
 ["deng'feng", "shao'lin'si"],
 ["deng'feng", "shao'lin", 'si'],
 ["deng'feng", 'shao', "lin'si"],
 ["deng'feng", 'shao', 'lin', 'si'],
 ['deng', "feng'shao'lin'si"],
 ['deng', "feng'shao'lin", 'si'],
 ['deng', "feng'shao", "lin'si"],
 ['deng', "feng'shao", 'lin', 'si'],
 ['deng', 'feng', "shao'lin'si"],
 ['deng', 'feng', "shao'lin", 'si'],
 ['deng', 'feng', 'shao', "lin'si"],
 ['deng', 'feng', 'shao', 'lin', 'si']]

In [42]:
get_all_hangzi_by_pinyin('wokanhaixingya', init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

[('我-看-海星-呀', 3.2629639027664655e-15, True),
 ('我-看-还-行-呀', 1.9127736991627e-15, True),
 ('我-看-还-兴亚', 5.043242702514251e-16, True),
 ('我-坎海-行-呀', 1.0260496073713046e-18, True),
 ('我-坎海-兴亚', 2.705295036761582e-19, True),
 ("wo'kan'hai'xing-ya", 1, False),
 ("wo'kan'hai-xing'ya", 1, False),
 ("wo'kan'hai-xing-ya", 1, False),
 ("wo'kan-hai'xing'ya", 1, False),
 ("wo'kan-hai'xing-ya", 1, False),
 ("wo'kan-hai-xing'ya", 1, False),
 ("wo'kan-hai-xing-ya", 1, False),
 ("我-kan'hai'xing'ya", 0.009121321887644876, False),
 ("我-kan'hai'xing-ya", 0.009121321887644876, False),
 ("我-看-hai'xing'ya", 1.4102206887847843e-05, False)]

In [43]:
get_all_hangzi_by_pinyin('xiangmuguanli', init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

[('项目管理', 2.985241692885244e-06, True),
 ('项目-管理', 2.3431889029020788e-07, True),
 ('想-木-管理', 1.1673002354046274e-10, True),
 ('项目-管-里', 7.218419111655347e-11, True),
 ('项目-股-案例', 3.185566634552571e-12, True),
 ('项目-固安-里', 5.791772061884458e-14, True),
 ('想-募股-案例', 3.9743946628514626e-14, True),
 ('项目-股-按-里', 3.8335485484504726e-14, True),
 ('想-木-管-里', 3.595980809677243e-14, True),
 ('想-木-股-案例', 1.5869453281401694e-15, True),
 ('想-募股-按-里', 4.7828335233940285e-16, True),
 ('系-昂-木-管理', 3.6482395681983146e-16, True),
 ('锡昂-木-管理', 8.813583080655662e-17, True),
 ('想-木-固安-里', 2.8852718117922057e-17, True),
 ('想-木-股-按-里', 1.909748769080915e-17, True),
 ('系-昂-募股-案例', 1.2421434887850298e-19, True),
 ('系-昂-木-管-里', 1.1238753388753342e-19, True),
 ('锡昂-募股-案例', 3.0008267362521045e-20, True),
 ('锡昂-木-管-里', 2.7151091605449496e-20, True),
 ('系-昂-木-股-案例', 4.959783749792152e-21, True),
 ('系-昂-募股-按-里', 1.4948101592820326e-21, True),
 ('锡昂-木-股-案例', 1.1982071167126487e-21, True),
 ('锡昂-募股-按-里', 3.611230370

In [44]:
get_all_hangzi_by_pinyin('ceshiyixia', init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

[('测试-一下', 2.592692338102422e-08, True),
 ('测试-意-下', 4.863770470524443e-09, True),
 ('侧-是-一下', 2.371240249117562e-10, True),
 ('侧-事宜-下', 2.163985365018632e-12, True),
 ('侧-是-一-下', 6.4185964643025e-13, True),
 ('测试-依稀-啊', 3.5874314851579074e-13, True),
 ('侧-是-依稀-啊', 3.2810147982247695e-15, True),
 ('测试-以-希阿', 6.066057623030774e-16, True),
 ('侧-事宜-西-阿', 3.0778472928163987e-16, True),
 ('测试-奕-希-啊', 3.0771364130615394e-16, True),
 ('侧-是-一-西-阿', 9.129202105839685e-17, True),
 ('侧-事宜-希阿', 7.0637305306567e-18, True),
 ('侧-是-一-希阿', 2.095172940712949e-18, True),
 ("ce'shi'yi'xia", 1, False),
 ("ce'shi'yi-xia", 1, False),
 ("ce'shi'yi'xi-a", 1, False),
 ("ce'shi'yi-xi'a", 1, False),
 ("ce'shi'yi-xi-a", 1, False),
 ("测试-yi'xi'a", 8.11575769138903e-05, False),
 ("侧-shi'yi'xia", 8.012317198329584e-05, False),
 ("侧-shi'yi'xi'a", 8.012317198329584e-05, False),
 ("侧-shi'yi'xi-a", 8.012317198329584e-05, False),
 ("侧-是-yi'xi'a", 7.422558784584514e-07, False)]

In [45]:
get_all_hangzi_by_pinyin('haikeyi', init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

[('还-可以', 1.697512553259192e-05, True),
 ('还-可-意', 7.005086692155357e-09, True),
 ('海客-以', 2.6192965158066247e-10, True),
 ("hai'ke'yi", 1, False)]

In [46]:
get_all_hangzi_by_pinyin('maerkefu', init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

[('马尔科夫', 4.1783260951606174e-08, True),
 ('马尔科-副', 7.31368766040317e-11, True),
 ('吗-而-克服', 3.5517864187800054e-11, True),
 ('马尔-克服', 6.982750090584918e-12, True),
 ('吗-儿科-副', 6.217604372346498e-13, True),
 ('吗-尔-可-副', 3.1635664938815894e-13, True),
 ('马尔-珂-芙', 8.363127176924516e-14, True),
 ("吗-er'ke'fu", 0.0005909742585494344, False)]

In [47]:
get_all_hangzi_by_pinyin('gailvlun', init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

[('概率论', 4.7641532590181884e-07, True),
 ('概率-轮', 1.6419555972333708e-09, True),
 ('该-氯纶', 8.273427816729808e-12, True),
 ('该-率-轮', 6.59144271254014e-12, True)]

In [48]:
get_all_hangzi_by_pinyin('suiji', init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

[('随即', 3.0253073172075254e-05, True), ('岁-及', 3.401922974165271e-07, True)]

In [49]:
get_all_hangzi_by_pinyin('guocheng', init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

[('过程', 0.0002975277246658821, True),
 ('过-成', 6.418007454872767e-07, True),
 ('股-哦-成', 7.916073946007374e-11, True),
 ("gu'o'cheng", 1, False),
 ("gu'o-cheng", 1, False),
 ("股-o'cheng", 0.0004138486617353796, False)]

In [50]:
get_all_hangzi_by_pinyin('hangzhouhuochezhan', init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

[('杭州-火车站', 1.1163515586734768e-06, True),
 ('航-州-火车站', 1.2200903104550078e-11, True),
 ('杭州-或-车站', 1.4115238495802154e-12, True),
 ('杭州-火车-占', 8.657306244382922e-13, True),
 ('杭州-或-车-占', 9.797344425229581e-15, True),
 ('杭州-湖-哦-车站', 1.1364299239399804e-15, True),
 ('航-周-火车-占', 1.2777384230502287e-16, True),
 ('杭州-湖-哦-车-占', 7.887925792602566e-18, True),
 ('航-周虎-哦-车站', 9.23490567486323e-21, True),
 ('航-周-和-车站', 1.3051541551689966e-21, True),
 ('航-周虎-哦-车-占', 6.40992014821784e-23, True),
 ('航-周-和-车-占', 9.0590355876828e-24, True),
 ('航-周-和-哦-车站', 8.909104492862172e-26, True),
 ('航-周-和-哦-车-占', 6.183782531403177e-28, True),
 ("hang'zhou'huo'che-zhan", 1, False),
 ("hang'zhou'huo-che'zhan", 1, False),
 ("hang'zhou'huo-che-zhan", 1, False),
 ("hang'zhou'hu'o-che'zhan", 1, False),
 ("hang'zhou'hu'o-che-zhan", 1, False),
 ("hang'zhou'hu-o'che'zhan", 1, False),
 ("hang'zhou'hu-o'che-zhan", 1, False),
 ("hang'zhou'hu-o-che'zhan", 1, False),
 ("hang'zhou'hu-o-che-zhan", 1, False),
 ("杭州-hu'o'che'zha

In [51]:
get_all_hangzi_by_pinyin('dajiatuniandajidali', init_words_probability, transmission_probability,
                              reverse_emit_pinyin_probability)

[('大家-兔年-大吉大利', 2.2215280590741e-15, True),
 ('大家-图-年-大吉大利', 2.0926630067755857e-15, True),
 ('大-家兔-年-大吉大利', 4.664401212895058e-17, True),
 ('大家-兔年-打击-大力', 3.18850330187041e-18, True),
 ('大家-图-年-打击-大力', 3.003546536156288e-18, True),
 ('大家-图-你-按-大吉大利', 7.2054703597592735e-19, True),
 ('打击-啊-兔年-大吉大利', 3.8601209725710834e-19, True),
 ('打击-啊-图-年-大吉大利', 3.63620541634971e-19, True),
 ('大-甲-兔年-大吉大利', 2.4074675554400197e-19, True),
 ('大-甲-图-年-大吉大利', 2.267816637608238e-19, True),
 ('大家-兔年-打击-达-里', 9.33745134412335e-20, True),
 ('大家-图-年-打击-达-里', 8.795810129698719e-20, True),
 ('大家-兔年-大-极大-里', 7.620522732160143e-20, True),
 ('大家-图-年-大-极大-里', 7.178476071344557e-20, True),
 ('大-家兔-年-打击-大力', 6.694697646431194e-20, True),
 ('大家-图-尼庵-大吉大利', 4.996495099906325e-20, True),
 ('大家-兔年-妲-己-大力', 3.615157532427428e-20, True),
 ('大家-图-年-妲-己-大力', 3.405451666872079e-20, True),
 ('打击-阿图-年-大吉大利', 2.1763960180900233e-20, True),
 ('大-家兔-你-按-大吉大利', 1.6060495443710373e-20, True),
 ('妲-己-啊-兔年-大吉大利', 4.376644490813394e-2