Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 18 additions & 9 deletions metaclip/substr_matching.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
# Copyright (c) Meta Platforms, Inc. and affiliates
import ahocorasick


# Module-level cache for the ahocorasick automaton; stays None until
# substr_matching builds it on first use.
automaton = None
# Module-level cache of the metadata entries, each wrapped as f" {entry} ";
# stays None until substr_matching fills it in on its first call.
spaced_metadata = None

def initialize_automaton(metadata):
    """Build an Aho-Corasick automaton over the space-padded metadata entries.

    Args:
        metadata: iterable of metadata entries. Each entry is indexed as
            f" {entry} ", the same padding ``substr_matching`` uses for its
            ``spaced_metadata`` cache.

    Returns:
        An ``ahocorasick.Automaton`` on which ``make_automaton()`` has been
        called. The value stored for each key is ``(entry_id, padded_entry)``,
        where ``entry_id`` is the entry's position in ``metadata``.
    """
    # Derive the keys from the argument instead of reading the module-level
    # ``spaced_metadata``: the function then no longer depends on
    # substr_matching having populated that global first, while producing the
    # same keys for the existing caller.
    padded_entries = [f" {entry} " for entry in metadata]

    # Use a distinct local name so the module-level ``automaton`` cache is not
    # shadowed inside this function.
    trie = ahocorasick.Automaton()
    for entry_id, padded_entry in enumerate(padded_entries):
        trie.add_word(padded_entry, (entry_id, padded_entry))
    trie.make_automaton()
    return trie

def spacing(text):
puncts_to_wrap = [",", ".", ";", ":", "?", "!", "`"]
chars_to_space = ["\t", "\n", "\r"]
Expand All @@ -18,14 +27,14 @@ def spacing(text):


def substr_matching(text, metadata):
    """Return the ids of all metadata entries that occur in ``text``.

    On the first call the padded entries and the automaton are built from
    ``metadata`` and cached in the module-level ``spaced_metadata`` and
    ``automaton`` globals; later calls reuse those caches and do not rebuild
    them from the ``metadata`` argument.

    Args:
        text: the string to search; it is normalized with ``spacing`` first.
        metadata: iterable of metadata entries; an entry's id is its position
            in this iterable.

    Returns:
        A list of the distinct matching entry ids in ascending order.
    """
    global spaced_metadata, automaton
    if spaced_metadata is None:
        # Pad each entry with spaces so it is matched as f" {entry} " inside
        # the spaced text rather than as a bare substring.
        spaced_metadata = [f" {entry} " for entry in metadata]
    if not spaced_metadata:
        # No entries means nothing can match; returning early also avoids
        # building and searching an automaton that has no keys.
        return []
    text = spacing(text)
    if automaton is None:
        automaton = initialize_automaton(metadata)
    # An entry can be hit at several positions, so collect ids into a set.
    matched_entry_ids = {
        entry_id for _end_index, (entry_id, _entry) in automaton.iter(text)
    }
    # Sort so the result is deterministic and ascending, matching the order
    # the earlier linear scan over the entries produced.
    return sorted(matched_entry_ids)

1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ torchvision
regex
ftfy
tqdm
pyahocorasick