In [61]:
import ahocorasick

# String matching based on the Aho-Corasick algorithm

In [62]:
# Create an empty Automaton instance; the following cells add keys to it
# and later convert it for Aho-Corasick search.
automaton = ahocorasick.Automaton()
# Bare last expression: the notebook displays the object's repr as the cell output.
automaton

<ahocorasick.Automaton at 0x2be75d58e30>

In [63]:
# The Automaton can be used as a plain trie.
# Each key string is stored together with a value; here the value is a
# tuple of (insertion index, original string).
words = 'he her hers she'.split()
for position, word in enumerate(words):
    automaton.add_word(word, (position, word))

In [64]:
# Membership test against the trie; 'HER' differs from the stored 'her'
# only by case, so the second check shows whether lookup is case-sensitive.
for candidate in ('he', 'HER'):
    print(candidate in automaton)

True
False


In [65]:
# The dict-like get() method returns the value stored for a key.
print(automaton.get('he'))
print(automaton.get('she'))
# A second argument is returned as the default when the key is missing.
print(automaton.get('cat', 'not exists'))
# Without a default, a missing key raises KeyError. Catch it and show it,
# so the demonstration stays visible but the notebook still survives
# "Restart Kernel -> Run All".
try:
    print(automaton.get('dog'))
except KeyError as exc:
    print('KeyError:', exc)

(0, 'he')
(3, 'she')
not exists


KeyError: 

In [66]:
# Convert the trie to an Aho-Corasick automaton to enable Aho-Corasick search.
# This must run after the keys have been added and before the search cells below.
automaton.make_automaton()


# Sample text that the search cells below scan for the stored keys.
haystack = 'hh he her hers she whos'

In [67]:
# Match all strings: report every occurrence of every stored key.
# iter() yields (index of the match's last character, stored value).
for last, hit in automaton.iter(haystack):
    order, word = hit
    # Derive the start offset from the end offset and the key length.
    first = last - (len(word) - 1)
    print((first, last, (order, word)))

(3, 4, (0, 'he'))
(6, 7, (0, 'he'))
(6, 8, (1, 'her'))
(10, 11, (0, 'he'))
(10, 12, (1, 'her'))
(10, 13, (2, 'hers'))
(15, 17, (3, 'she'))
(16, 17, (0, 'he'))


In [68]:
# Match the longest strings: iter_long() is consumed the same way as iter(),
# yielding (index of the match's last character, stored value).
for last, hit in automaton.iter_long(haystack):
    order, word = hit
    # Derive the start offset from the end offset and the key length.
    first = last - (len(word) - 1)
    print((first, last, (order, word)))



(3, 4, (0, 'he'))
(6, 8, (1, 'her'))
(10, 13, (2, 'hers'))
(15, 17, (3, 'she'))
