In [1]:
class TrieNode:
    """One node of a character trie.

    Every instance gets its own fresh containers, so nodes never share
    state with each other.
    """

    def __init__(self):
        # Outgoing edges: maps a single character to the child node it leads to.
        self.children: "dict[str, TrieNode]" = {}
        # Flag that is switched on by the trie once a word terminates here.
        self.is_end_of_word: bool = False
        # Holds every word that terminates at this node.
        self.words: "list[str]" = []

class Trie:
    """Character trie used to look up keywords that start at a text position."""

    def __init__(self):
        # The root represents the empty prefix; it never corresponds to a character.
        self.root = TrieNode()

    def insert(self, word):
        """Add ``word`` to the trie.

        Missing nodes along the path are created on demand. Inserting the
        same word more than once is harmless: it is recorded only once, so
        ``search`` never reports the same match twice.

        Args:
            word: The keyword to store, walked character by character.
        """
        node = self.root
        for char in word:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end_of_word = True
        # Guard against duplicate keywords; without it a repeated insert
        # would make search() return duplicate match tuples.
        if word not in node.words:
            node.words.append(word)

    def search(self, text, start_index):
        """Collect every stored word that begins at ``text[start_index]``.

        The trie is walked along ``text`` from ``start_index`` until a
        character has no matching child or the text ends. Matches are
        appended in order of increasing length. Negative ``start_index``
        values are not handled specially.

        Args:
            text: The text to scan.
            start_index: Index in ``text`` where candidate words must start.

        Returns:
            A list of ``(start, end, word)`` tuples, where ``end`` is the
            inclusive index of the word's last character, or ``None`` when
            no stored word starts at ``start_index``.
        """
        node = self.root
        matches = []
        for i in range(start_index, len(text)):
            char = text[i]
            if char not in node.children:
                break
            node = node.children[char]
            if node.is_end_of_word:
                for word in node.words:
                    matches.append((start_index, start_index + len(word) - 1, word))

        # Callers rely on a falsy None (not an empty list) for "no match".
        return matches or None

def annotate_text(text, keywords):
    """Locate keyword occurrences in ``text`` with a left-to-right scan.

    A trie is built from ``keywords``. At each position the match with the
    largest end index is kept, and scanning resumes right after it, so the
    reported matches never overlap. Positions without a match are skipped
    one character at a time.

    Args:
        text: The text to annotate.
        keywords: Iterable of keyword strings to look for.

    Returns:
        A list of ``(start, end, word)`` tuples as produced by
        ``Trie.search``, in order of appearance.
    """
    # Build the trie from the keyword list.
    trie = Trie()
    for keyword in keywords:
        trie.insert(keyword)

    # Walk the text and record the best match at each position.
    annotations = []
    position = 0
    text_length = len(text)
    while position < text_length:
        candidates = trie.search(text, position)
        if not candidates:
            # Nothing starts here; move on by a single character.
            position += 1
            continue

        # Prefer the candidate that ends furthest to the right.
        best = max(candidates, key=lambda candidate: candidate[1])
        annotations.append(best)
        _, _, matched_word = best
        # Jump past the matched word so later matches cannot overlap it.
        position += len(matched_word)

    return annotations

# Example usage: annotate a sample sentence with two keywords, one of which
# is a prefix of the other, then print the resulting (start, end, word) tuples.
text = "甲状腺是一个重要的器官，甲状腺疾病很常见，甲状腺功能亢进是一种常见的病症。"
keywords = ["甲状腺", "甲状"]
annotations = annotate_text(text=text, keywords=keywords)
print(annotations)


[(0, 2, '甲状腺'), (12, 14, '甲状腺'), (21, 23, '甲状腺')]
