In [1]:
# 前向最大匹配
# sentence: 我们经常有意见分歧
# vocab: [我们，经常，有，有意见，意见，分歧]

# 前向最大匹配 
def forward_match(sentence, vocab, max_window_len=3):
    """Segment ``sentence`` with forward maximum matching.

    Starting at the left edge, the longest window (at most
    ``max_window_len`` characters) that is contained in ``vocab`` is taken
    as the next word, and the scan continues right after it.

    Args:
        sentence: The string to segment.
        vocab: Container of known words; only ``in`` membership is used.
        max_window_len: Maximum number of characters tried for one word.

    Returns:
        list[str]: The segments in order. Joined together they reproduce
        ``sentence``.
    """
    word_list = []
    seq_len = len(sentence)
    start_index = 0
    while start_index < seq_len:
        # Try the widest window first and shrink until a known word is found.
        for m_len in range(max_window_len, 0, -1):
            candidate = sentence[start_index:start_index + m_len]
            if candidate in vocab:
                break
        else:
            # No window matched (out-of-vocabulary character, or a window
            # size below 1). Emit the single character so the scan always
            # advances; without this the loop would never terminate.
            candidate = sentence[start_index]
        word_list.append(candidate)
        # Advance by the real length: near the end of the sentence the slice
        # can be shorter than the window that produced it.
        start_index += len(candidate)
    return word_list

In [2]:
# Demo input: a sample sentence and the vocabulary used to segment it.
# Both names stay at module level because later cells reuse them.
sentence = '我们经常有意见分歧'
vocab = [
    '我们', '经常', '有',
    '有意见', '意见', '分歧',
]

# Segment the sentence with forward maximum matching and show the result.
word_list = forward_match(sentence, vocab)
print(word_list)

['我们', '经常', '有意见', '分歧']


In [3]:
# 问题分步骤解决
# 1.首先构建中文Trie树

class Trie_Node:
    """One node of a character trie.

    Attributes are set in ``__init__``:
        char: The character this node stands for, as passed by the caller.
        child: Dict mapping a character to the child ``Trie_Node`` for it.
        isLeaf: Flag, initially ``False``; set to ``True`` by whoever marks
            this node as the end of a stored word.
    """

    def __init__(self, char):
        # A fresh node is not the end of any word and has no children yet.
        self.isLeaf = False
        self.child = {}
        self.char = char

# 构建trie树
class Trie:
    """Character trie used as the word dictionary.

    Each edge is one character. A node whose ``isLeaf`` flag is set marks the
    end of an inserted word; such a node may still have children when a
    longer word shares the same prefix.
    """

    def __init__(self):
        # The root stands for the empty prefix and therefore carries no character.
        self.root = Trie_Node(None)

    def insert_word(self, word):
        """Add ``word`` to the trie, creating missing nodes along its path."""
        cur_node = self.root
        for w_char in word:
            if w_char not in cur_node.child:
                cur_node.child[w_char] = Trie_Node(w_char)
            cur_node = cur_node.child[w_char]

        # The node reached by the last character marks the end of a word.
        cur_node.isLeaf = True

    def full_match(self, word):
        """Return ``True`` if ``word`` was inserted as a whole word, else ``False``.

        A string that is only a prefix of stored words yields ``False``.
        """
        cur_node = self.root
        for w_char in word:
            if w_char not in cur_node.child:
                return False
            cur_node = cur_node.child[w_char]

        # The node was reached by following ``word`` character by character,
        # so only the end-of-word flag decides the result. Returning the flag
        # directly also guarantees a bool on every path.
        return cur_node.isLeaf

    def insert_vocab(self, vocab):
        """Insert every word of the iterable ``vocab``."""
        for word in vocab:
            self.insert_word(word)

    def preTraverse(self):
        """Print the characters of all nodes in pre-order (parent before children).

        The root has no character and is skipped. Children are visited in the
        iteration order of each node's ``child`` dict.
        """
        pre_list = []

        def preTraverse_helper(cur_node):
            if cur_node is None:
                return
            if cur_node.char is not None:
                pre_list.append(cur_node.char)
            for child_node in cur_node.child.values():
                preTraverse_helper(child_node)

        preTraverse_helper(self.root)
        print(pre_list)

    def get_root(self):
        """Return the root node of the trie."""
        return self.root

In [4]:
def test_trie():
    """Smoke-check the trie: build it from a small vocabulary, print its
    pre-order traversal, then print the ``full_match`` result for one string
    that is not a stored word and one that is."""
    words = ['我们','经常','有','有意见','意见','分歧']
    demo_trie = Trie()
    demo_trie.insert_vocab(words)
    demo_trie.preTraverse()
    # First query runs past a stored word, second one is a stored word.
    for query in ('我们经', '有意见'):
        print(demo_trie.full_match(query))

In [5]:
# Run the trie smoke check; it prints the traversal and the two match results.
test_trie()

['我', '们', '经', '常', '有', '意', '见', '意', '见', '分', '歧']
False
True


In [8]:
# 说说思路
# 首先从第一个词扩张，然后逐渐增大，判断子词是否在trie中
def cut_sentence(sentence, vocab, max_window_len=3):
    """Segment ``sentence`` by forward maximum matching over a trie.

    The vocabulary is loaded into a ``Trie``. At every position the trie is
    walked along the sentence, and the longest prefix that ends on a word
    node becomes the next segment.

    Args:
        sentence: The string to segment.
        vocab: Iterable of known words.
        max_window_len: Maximum number of characters of a single segment.
            ``None`` removes the limit.

    Returns:
        list[str]: The segments in order. Joined together they reproduce
        ``sentence``.
    """
    trie = Trie()
    trie.insert_vocab(vocab)
    root = trie.get_root()

    word_list = []
    seq_len = len(sentence)
    start_index = 0

    while start_index < seq_len:
        # Right boundary (exclusive) of the window examined from this position.
        if max_window_len is None:
            window_end = seq_len
        else:
            window_end = min(seq_len, start_index + max_window_len)

        node = root
        match_len = 0
        for pos in range(start_index, window_end):
            node = node.child.get(sentence[pos])
            if node is None:
                # No stored word continues with this character.
                break
            if node.isLeaf:
                # Remember the longest complete word seen so far and keep
                # extending, since a longer word may share this prefix.
                match_len = pos - start_index + 1

        if match_len == 0:
            # Nothing in the vocabulary starts here: emit the single
            # character so the scan always advances and cannot loop forever.
            match_len = 1

        word_list.append(sentence[start_index:start_index + match_len])
        start_index += match_len
    return word_list

In [9]:
# Segment the sample sentence; `sentence` and `vocab` come from the earlier demo cell.
cut_sentence(sentence,vocab)

['我们', '经常', '有意见', '分歧']