Skip to content

Commit

Permalink
Merge pull request #2312 from dobbersc/GH-2311_segtok_sentence_splitter_fix
Browse files Browse the repository at this point in the history

GH-2311: Fix SegtokSentenceSplitter Incorrect Sentence Position Attributes
  • Loading branch information
alanakbik committed Jun 29, 2021
2 parents 0924997 + 32555da commit 96ac7cd
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 12 deletions.
31 changes: 19 additions & 12 deletions flair/tokenization.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import logging

from abc import ABC, abstractmethod
from typing import List, Callable, Tuple
from typing import List, Callable, Optional

from segtok.segmenter import split_single, split_multi
from segtok.tokenizer import split_contractions, word_tokenizer

from more_itertools import stagger

from flair.data import Sentence, Tokenizer, Token

log = logging.getLogger("flair")
Expand Down Expand Up @@ -417,26 +419,31 @@ def __init__(self, tokenizer: Tokenizer = SegtokTokenizer()):
self._tokenizer = tokenizer

def split(self, text: str) -> List[Sentence]:
    """Split *text* into :class:`Sentence` objects with correct start positions.

    Uses segtok's ``split_multi`` to obtain the plain sentence strings, then
    locates each sentence in the original text so that ``start_position``
    reflects the true character offset (fixes GH-2311, where offsets drifted
    when segtok normalized whitespace between sentences).

    :param text: the raw text to split.
    :return: list of tokenized ``Sentence`` objects, one per detected sentence.
    :raises AssertionError: if a detected sentence cannot be located in *text*
        (should not happen for well-behaved segmenter output).
    """
    plain_sentences: List[str] = list(split_multi(text))

    # Guard: segtok yields nothing for empty/whitespace-only input; without
    # this check, plain_sentences[0] below would raise IndexError.
    if not plain_sentences:
        return []

    try:
        sentence_offset: Optional[int] = text.index(plain_sentences[0])
    except ValueError as error:
        raise AssertionError(f"Can't find the sentence offset for sentence {repr(plain_sentences[0])} "
                             f"from the text's starting position") from error

    sentences: List[Sentence] = []
    # stagger(..., offsets=(0, 1), longest=True) pairs each sentence with its
    # successor (None for the last one), so we can pre-compute the next offset.
    for sentence, next_sentence in stagger(plain_sentences, offsets=(0, 1), longest=True):

        sentences.append(
            Sentence(
                text=sentence,
                use_tokenizer=self._tokenizer,
                start_position=sentence_offset
            )
        )

        # Search for the next sentence only past the end of the current one,
        # so repeated sentences resolve to distinct positions.
        offset: int = sentence_offset + len(sentence)
        try:
            sentence_offset = text.index(next_sentence, offset) if next_sentence is not None else None
        except ValueError as error:
            raise AssertionError(f"Can't find the sentence offset for sentence {repr(sentence)} "
                                 f"starting from position {repr(offset)}") from error

    return sentences

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,4 @@ konoha<5.0.0,>=4.0.0
janome
gdown==3.12.2
huggingface-hub
more-itertools~=8.8.0

0 comments on commit 96ac7cd

Please sign in to comment.