Skip to content

Commit

Permalink
Merge pull request #2312 from dobbersc/GH-2311_segtok_sentence_splitter_fix
Browse files Browse the repository at this point in the history

GH-2311: Fix SegtokSentenceSplitter Incorrect Sentence Position Attributes
  • Loading branch information
alanakbik committed Jun 29, 2021
2 parents 0924997 + 32555da commit 96ac7cd
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 12 deletions.
31 changes: 19 additions & 12 deletions flair/tokenization.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import logging

from abc import ABC, abstractmethod
from typing import List, Callable, Tuple
from typing import List, Callable, Optional

from segtok.segmenter import split_single, split_multi
from segtok.tokenizer import split_contractions, word_tokenizer

from more_itertools import stagger

from flair.data import Sentence, Tokenizer, Token

log = logging.getLogger("flair")
Expand Down Expand Up @@ -417,26 +419,31 @@ def __init__(self, tokenizer: Tokenizer = SegtokTokenizer()):
self._tokenizer = tokenizer

def split(self, text: str) -> List[Sentence]:
    """Split *text* into :class:`Sentence` objects with correct start positions.

    Uses segtok's ``split_multi`` to obtain the plain sentence strings, then
    locates each sentence in the original text so that ``start_position``
    reflects the true character offset (fixes GH-2311, where offsets drifted
    when segtok normalized whitespace between sentences).

    :param text: the raw text to split.
    :return: list of tokenized ``Sentence`` objects, one per detected sentence.
    :raises AssertionError: if a detected sentence cannot be located in *text*
        (should not happen for well-behaved segmenter output).
    """
    plain_sentences: List[str] = list(split_multi(text))

    # Guard: segtok yields nothing for empty/whitespace-only input; without
    # this check, plain_sentences[0] below would raise IndexError.
    if not plain_sentences:
        return []

    try:
        sentence_offset: Optional[int] = text.index(plain_sentences[0])
    except ValueError as error:
        raise AssertionError(f"Can't find the sentence offset for sentence {repr(plain_sentences[0])} "
                             f"from the text's starting position") from error

    sentences: List[Sentence] = []
    # stagger(..., offsets=(0, 1), longest=True) pairs each sentence with its
    # successor (None for the last one), so we can pre-compute the next offset.
    for sentence, next_sentence in stagger(plain_sentences, offsets=(0, 1), longest=True):

        sentences.append(
            Sentence(
                text=sentence,
                use_tokenizer=self._tokenizer,
                start_position=sentence_offset
            )
        )

        # Search for the next sentence only past the end of the current one,
        # so repeated sentences resolve to distinct positions.
        offset: int = sentence_offset + len(sentence)
        try:
            sentence_offset = text.index(next_sentence, offset) if next_sentence is not None else None
        except ValueError as error:
            raise AssertionError(f"Can't find the sentence offset for sentence {repr(sentence)} "
                                 f"starting from position {repr(offset)}") from error

    return sentences

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,4 @@ konoha<5.0.0,>=4.0.0
janome
gdown==3.12.2
huggingface-hub
more-itertools~=8.8.0

0 comments on commit 96ac7cd

Please sign in to comment.