From 0fc8bd11e793721b094439c2e62b4abf3838116e Mon Sep 17 00:00:00 2001 From: DeanEby Date: Fri, 14 Feb 2025 10:02:15 -0500 Subject: [PATCH] ngram tokenizer split on whitespace --- analyzers/ngrams/main.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/analyzers/ngrams/main.py b/analyzers/ngrams/main.py index a0c81ad0..01525717 100644 --- a/analyzers/ngrams/main.py +++ b/analyzers/ngrams/main.py @@ -102,10 +102,7 @@ def get_ngram_rows(ngrams_by_id: dict[str, int]): def tokenize(input: str) -> list[str]: """Generate words from input string.""" - - output = re.split(r"\W+", input.lower()) - output = [value for value in output if "http" not in value] - return output + return re.split(r"\s+", input.lower()) def ngrams(tokens: list[str], min: int, max: int):