In [106]:
import operator
import numpy as np

In [130]:
class MM(object):
    def __init__(self, dict_path):
        """读取词典文件，并设置最大词条字符数
        
        Args: 
            dict_path 词典路径
        """
        self.dictionary = set()
        self.maximum = 0
        
        with open(dict_path, 'r', encoding = 'utf8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                self.dictionary.add(line)
                if self.maximum < len(line):
                    self.maximum = len(line)
        
        
    def rmm_cut(self, text):
        """对text按照词典按照逆向最大匹配进行分词

        Args: 
            text 待分词的词典
        Returns:
            分词后的字符串
        """
        result = []
        index = len(text)

        while index > 0:
            word = None
            for size in range(self.maximum, 0, -1):
                if index - size < 0:
                    continue
                piece = text[(index - size):index]
                if piece in self.dictionary:
                    word = piece
                    result.append(word)
                    index -= size
                    break
            if word is None:
                index -= 1
        return result[::-1]
       
        
    def mm_cut(self, text, dictionary=None):
        """对text按照正向最大匹配进行分词
        """
        if dictionary is None:
            dictionary = self.dictionary
        
        result = []
        index = len(text)
        start = 0
        
        while start < index:
            word = None
            for size in range(self.maximum, 0, -1):
                piece = text[start: start + size]
                if piece in dictionary:
                    word = piece
                    result.append(word)
                    start = start + size
                    break
            if word is None:
                start = start + 1
        return result[::1]
    
    def reverse_dict(self, result):
        """对词典逆序
        """
        result_inverse = set()
        for value in result:
            result_inverse.add(value[::-1])
        return result_inverse  
        
    def rmm_cut_2(self, text):
        """逆向最大匹配，通过正向来计算
        
        处理思路：将词逆序，将词典逆序，然后调用正向最大匹配方法
        """
        dictionary_inverse = self.reverse_dict(self.dictionary)
        text = text[::-1]
        result_inverse = self.mm_cut(text, dictionary_inverse)
        # 对结果词对做逆序
        result = self.reverse_dict(result_inverse)
        return result
    
    def single_character_quantitiy(self, result_list):
        count = 0
        for value in result_list:
            if len(value) == 1:
                count += 1
        return count
    
    def bi_direction_mm(self, text):
        """双向最大匹配，取正向和逆向匹配结果词对较少的作为结果
        
        如果正向和逆向的结果词对数不同，返回词对数少的。
        如果正向和逆向的结果词对数不同：
            如果结果完全一致，则返回任一个；
            如果结果不一致，返回单字个数少的。
        """
        
        result_mm = self.mm_cut(text)
        result_imm = self.rmm_cut_2(text)
        
        if operator.eq(result_mm, result_imm):
            return result_mm
        else:
            mm_single_count = self.single_character_quantitiy(result_mm)
            imm_single_count = self.single_character_quantitiy(result_imm)
            
            # 取单字最少的
            result = result_mm if np.min([mm_single_count, imm_single_count]) == mm_single_count else result_imm
            return result
        

In [131]:
data = './data/imm_dict.data'
tokenizer = MM(data)
text = "南京市长江大桥"

In [132]:
tokenizer.rmm_cut(text)

['南京市', '长江大桥']

In [133]:
tokenizer.mm_cut(text)

['南京市长', '江', '大桥']

In [134]:
tokenizer.rmm_cut_2(text)

{'南京市', '长江大桥'}

In [135]:
tokenizer.bi_direction_mm(text)

{'南京市', '长江大桥'}