In [9]:
# _*_ coding: utf-8 _*_
import re
import time
# Matches one or more consecutive characters in U+4E00..U+9FA5, the code-point
# range this file treats as Chinese characters.
CHPattern = re.compile(u'[\u4e00-\u9fa5]+')


def load_dictionary(dict_str):
    '''
    Build the segmentation dictionary from newline-separated text.

    Each line of ``dict_str`` is one word. Surrounding whitespace (including
    a stray carriage return) is stripped and blank lines are skipped, so the
    empty string never ends up in the dictionary.

    :param dict_str: string         dictionary text, one word per line
    :return dictionary: set<string> the distinct words
    :return maxLen: int             length of the longest word (1 if no words)
    '''
    startTime = time.time()
    stripped = (line.strip() for line in dict_str.split('\n'))
    dictionary = {word for word in stripped if word}
    # Every kept word has length >= 1; the default only applies to an empty set.
    maxLen = max((len(word) for word in dictionary), default=1)
    endTime = time.time()
    print ("load dictionary costs:", endTime-startTime, "s")
    return dictionary, maxLen


# Forward Maximum Matching (FMM)
def FMM(sentences, dicionary, maxLen):
    '''
    Segment ``sentences`` left to right, always taking the longest prefix
    (at most ``maxLen`` characters) that is found in the dictionary. When no
    prefix longer than one character matches, a single character is emitted.

    :param sentences: string    text to segment
    :param dicionary: set/list  dictionary words (anything supporting ``in``);
                                the parameter name is kept as-is for callers
    :param maxLen: int          length of the longest dictionary word;
                                values below 1 are treated as 1
    :return FMMResult: string   the segments, each followed by a backslash
    :return singleNumber: int   number of one-character segments that match
                                CHPattern
    '''
    startTime = time.time()
    # A non-positive window would produce an empty candidate that never
    # advances through the text, so clamp it to one character.
    window = max(1, maxLen)
    pieces = []
    singleNumber = 0
    total = len(sentences)
    pos = 0
    while pos < total:
        size = min(window, total - pos)
        # Shrink the candidate from the right until it is a dictionary word
        # or only one character is left.
        while size > 1 and sentences[pos:pos + size] not in dicionary:
            size -= 1
        word = sentences[pos:pos + size]
        pieces.append(word)
        if size == 1 and CHPattern.search(word):
            singleNumber += 1
        pos += size
    # Join once at the end instead of growing a string inside the loop.
    FMMResult = ''.join(piece + '\\' for piece in pieces)
    endTime = time.time()
    print ("FMM costs:", endTime-startTime, "s")
    return FMMResult, singleNumber

# Reverse Maximum Matching (RMM)
def RMM(sentences,dictionary,maxLen):
    '''
    Segment ``sentences`` right to left, always taking the longest suffix
    (at most ``maxLen`` characters) that is found in the dictionary. When no
    suffix longer than one character matches, a single character is emitted.

    :param sentences: string    text to segment
    :param dictionary: set/list dictionary words (anything supporting ``in``)
    :param maxLen: int          length of the longest dictionary word;
                                values below 1 are treated as 1
    :return RMMResult: string   the segments in reading order, each followed
                                by a backslash
    :return singleNumber: int   number of one-character segments that match
                                CHPattern
    '''
    startTime = time.time()
    # A non-positive window would produce an empty candidate that never
    # advances through the text, so clamp it to one character.
    window = max(1, maxLen)
    pieces = []
    singleNumber = 0
    end = len(sentences)
    while end > 0:
        size = min(window, end)
        # Shrink the candidate from the left until it is a dictionary word
        # or only one character is left.
        while size > 1 and sentences[end - size:end] not in dictionary:
            size -= 1
        word = sentences[end - size:end]
        pieces.append(word)
        if size == 1 and CHPattern.search(word):
            singleNumber += 1
        end -= size
    # Segments were collected back to front; restore reading order and join
    # once instead of prepending to a string inside the loop.
    pieces.reverse()
    RMMResult = ''.join(piece + '\\' for piece in pieces)
    endTime = time.time()
    print ("RMM costs:", endTime-startTime, "s")
    return RMMResult, singleNumber

# Bi-directional Maximum Matching (BMM)
def BMM(sentences1, sentences2, count1, count2):
    '''
    Choose between two segmentation results.

    The shorter result string wins. For FMM/RMM outputs of the same text,
    where every segment is followed by one separator, the shorter string is
    the one with fewer segments. If the lengths are equal, ``sentences1`` is
    returned when the two results are identical or when ``count1`` is not
    greater than ``count2``; otherwise ``sentences2`` is returned.

    :param sentences1: string   first segmentation result
    :param sentences2: string   second segmentation result
    :param count1: int          single-character count of the first result
    :param count2: int          single-character count of the second result
    :return: string             the selected segmentation result
    '''
    size1, size2 = len(sentences1), len(sentences2)
    if size1 < size2:
        return sentences1
    if size1 > size2:
        return sentences2
    # Same length: identical results, or a tie on single characters, favour
    # the first argument.
    if sentences1 == sentences2 or count1 <= count2:
        return sentences1
    return sentences2

In [5]:
# Sample dictionary text, one word per line; main() passes it to
# load_dictionary().
dict_str = """昆明
理工
大学
信息
学院
计算机
昆明理工大学
开设
课程
"""

In [10]:
# Main function
def main():
    '''
    Demo driver: load the sample dictionary, segment one sample sentence with
    FMM and RMM, pick a result with BMM, and print every step.
    '''
    words, longest = load_dictionary(dict_str)
    text = "昆明理工大学信息工程与自动化学院计算机系开设课程如下："

    # Text to be segmented.
    print ('=============待分词的文本=========================')
    print (text)
    print ('=================================================')

    # Forward maximum matching.
    forward, forwardSingles = FMM(text, words, longest)
    print ('============正向最大匹配算法分词结果:=============')
    print (forward)
    print ('单字个数：', forwardSingles)
    print ('=================================================')

    # Reverse maximum matching.
    backward, backwardSingles = RMM(text, words, longest)
    print ('============逆向最大匹配算法分词结果:==============')
    print (backward)
    print ('单字个数：', backwardSingles)
    print ('=================================================')

    # Bi-directional selection between the two results.
    chosen = BMM(forward, backward, forwardSingles, backwardSingles)
    print ('============双向最大匹配算法分词结果:==============')
    print (chosen)
    print ('=================================================')


# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()

load dictionary costs: 1.4781951904296875e-05 s
昆明理工大学信息工程与自动化学院计算机系开设课程如下：
FMM costs: 0.00012040138244628906 s
昆明理工大学\信息\工\程\与\自\动\化\学院\计算机\系\开设\课程\如\下\：\
单字个数： 9
RMM costs: 0.00012493133544921875 s
昆明理工大学\信息\工\程\与\自\动\化\学院\计算机\系\开设\课程\如\下\：\
单字个数： 9
昆明理工大学\信息\工\程\与\自\动\化\学院\计算机\系\开设\课程\如\下\：\
