Add support for custom trained PunktTokenizer in PreProcessor (#2783)
* Add support for model folder into BasePreProcessor

* First draft of custom model on PreProcessor

* Update Documentation & Code Style

* Update tests to support custom models

* Update Documentation & Code Style

* Test for wrong models in custom folder

* Default to ISO names on custom model folder

Use long names only when needed

* Update Documentation & Code Style

* Refactoring language names usage

* Update fallback logic

* Check unpickling error

* Updated tests using parametrize

Co-authored-by:  Sara Zan <sara.zanzottera@deepset.ai>

* Refactored common logic

* Add format control to NLTK load

* Tests improvements

Add a sample for specialized model

* Update Documentation & Code Style

* Minor log text update

* Log model format exception details

* Change pickle protocol version to 4 for 3.7 compat

* Removed unnecessary model folder parameter

Changed logic comparisons

Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai>

* Update Documentation & Code Style

* Removed unused import

* Change errors with warnings

* Change to absolute path

* Rename sentence tokenizer method

Co-authored-by: tstadel

* Check document content is a string before process

* Change to log errors and not warnings

* Update Documentation & Code Style

* Improve split sentences method

Co-authored-by:  Sara Zan  <sara.zanzottera@deepset.ai>

* Update Documentation & Code Style

* Empty commit - trigger workflow

* Remove superfluous parameters

Co-authored-by: tstadel

* Explicit None checking

Co-authored-by: tstadel

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai>
3 people committed Jul 21, 2022
1 parent f51587b commit 3948b99
Showing 7 changed files with 168 additions and 18 deletions.
3 changes: 2 additions & 1 deletion docs/_src/api/api/preprocessor.md
@@ -39,7 +39,7 @@ class PreProcessor(BasePreProcessor)
#### PreProcessor.\_\_init\_\_

```python
def __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, remove_substrings: List[str] = [], split_by: str = "word", split_length: int = 200, split_overlap: int = 0, split_respect_sentence_boundary: bool = True, language: str = "en", id_hash_keys: Optional[List[str]] = None)
def __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, remove_substrings: List[str] = [], split_by: str = "word", split_length: int = 200, split_overlap: int = 0, split_respect_sentence_boundary: bool = True, tokenizer_model_folder: Optional[Union[str, Path]] = None, language: str = "en", id_hash_keys: Optional[List[str]] = None)
```

**Arguments**:
@@ -64,6 +64,7 @@ Set the value to 0 to ensure there is no overlap among the documents after split
to True, the individual split will always have complete sentences &
the number of words will be <= split_length.
- `language`: The language used by "nltk.tokenize.sent_tokenize" in iso639 format. Available options: "en", "es", "de", "fr" & many more.
- `tokenizer_model_folder`: Path to the folder containing the NLTK PunktSentenceTokenizer models, if loading a model from a local path. Leave empty otherwise.
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
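For reference, a minimal usage sketch of the new parameter (the folder path below is a placeholder; the loader only requires that it contains a Punkt pickle named `<language>.pickle`, e.g. `pt.pickle`):

```python
from haystack import Document
from haystack.nodes import PreProcessor

preprocessor = PreProcessor(
    split_by="sentence",
    split_length=1,
    split_overlap=0,
    split_respect_sentence_boundary=False,
    language="pt",  # ISO 639 code; also determines the expected pickle file name
    tokenizer_model_folder="models/nltk_tokenizers",  # placeholder folder expected to contain pt.pickle
)

docs = preprocessor.process([Document(content="Primeira frase. Segunda frase.")])
```

If the pickle is missing or cannot be unpickled, the PreProcessor logs the error and falls back to the default NLTK model for that language, or to English as a last resort.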
12 changes: 12 additions & 0 deletions haystack/json-schemas/haystack-pipeline-master.schema.json
@@ -3572,6 +3572,18 @@
"default": true,
"type": "boolean"
},
"tokenizer_model_folder": {
"title": "Tokenizer Model Folder",
"anyOf": [
{
"type": "string"
},
{
"type": "string",
"format": "path"
}
]
},
"language": {
"title": "Language",
"default": "en",
1 change: 1 addition & 0 deletions haystack/nodes/preprocessor/base.py
@@ -1,6 +1,7 @@
from typing import List, Optional, Union

from abc import abstractmethod

from haystack.nodes.base import BaseComponent
from haystack.schema import Document

68 changes: 65 additions & 3 deletions haystack/nodes/preprocessor/preprocessor.py
@@ -5,6 +5,8 @@
from itertools import chain
from typing import List, Optional, Generator, Set, Union
import warnings
from pathlib import Path
from pickle import UnpicklingError

import nltk
from more_itertools import windowed
@@ -51,6 +53,7 @@ def __init__(
split_length: int = 200,
split_overlap: int = 0,
split_respect_sentence_boundary: bool = True,
tokenizer_model_folder: Optional[Union[str, Path]] = None,
language: str = "en",
id_hash_keys: Optional[List[str]] = None,
):
@@ -75,6 +78,7 @@
to True, the individual split will always have complete sentences &
the number of words will be <= split_length.
:param language: The language used by "nltk.tokenize.sent_tokenize" in iso639 format. Available options: "en", "es", "de", "fr" & many more.
:param tokenizer_model_folder: Path to the folder containing the NLTK PunktSentenceTokenizer models, if loading a model from a local path. Leave empty otherwise.
:param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
@@ -95,7 +99,8 @@
self.split_length = split_length
self.split_overlap = split_overlap
self.split_respect_sentence_boundary = split_respect_sentence_boundary
self.language = iso639_to_nltk.get(language, language)
self.language = language
self.tokenizer_model_folder = tokenizer_model_folder
self.print_log: Set[str] = set()
self.id_hash_keys = id_hash_keys

@@ -229,6 +234,11 @@ def clean(
# Mainly needed for type checking
if not isinstance(document, Document):
raise HaystackError("Document must not be of type 'dict' but of type 'Document'.")

if type(document.content) is not str:
logger.error("Document content is not of type str. Nothing to clean.")
return document

text = document.content
if clean_header_footer:
text = self._find_and_remove_header_footer(
@@ -286,11 +296,16 @@ def split(
if split_respect_sentence_boundary and split_by != "word":
raise NotImplementedError("'split_respect_sentence_boundary=True' is only compatible with split_by='word'.")

if type(document.content) is not str:
logger.error("Document content is not of type str. Nothing to split.")
return [document]

text = document.content

if split_respect_sentence_boundary and split_by == "word":
# split by words ensuring no sub sentence splits
sentences = nltk.tokenize.sent_tokenize(text, language=self.language)
sentences = self._split_sentences(text)

word_count = 0
list_splits = []
current_slice: List[str] = []
@@ -334,7 +349,7 @@ def split(
if split_by == "passage":
elements = text.split("\n\n")
elif split_by == "sentence":
elements = nltk.tokenize.sent_tokenize(text, language=self.language)
elements = self._split_sentences(text)
elif split_by == "word":
elements = text.split(" ")
else:
@@ -444,3 +459,50 @@ def _find_longest_common_ngram(
        # no common sequence found
        longest = ""
        return longest if longest.strip() else None

    def _split_sentences(self, text: str) -> List[str]:
        """
        Tokenize text into sentences.
        :param text: str, text to tokenize
        :return: list[str], list of sentences
        """
        sentences = []

        language_name = iso639_to_nltk.get(self.language)

        # Try to load a custom model from 'tokenizer_model_path'
        if self.tokenizer_model_folder is not None:
            tokenizer_model_path = Path(self.tokenizer_model_folder).absolute() / f"{self.language}.pickle"
            try:
                sentence_tokenizer = nltk.data.load(f"file:{str(tokenizer_model_path)}", format="pickle")
                sentences = sentence_tokenizer.tokenize(text)
            except LookupError:
                logger.exception(f"PreProcessor couldn't load sentence tokenizer from {str(tokenizer_model_path)}")
            except (UnpicklingError, ValueError) as e:
                logger.exception(
                    f"PreProcessor couldn't determine model format of sentence tokenizer at {str(tokenizer_model_path)}."
                )
            if sentences:
                return sentences

            # NLTK failed to split, fallback to the default model or to English
            if language_name is not None:
                logger.error(
                    f"PreProcessor couldn't find custom sentence tokenizer model for {self.language}. Using default {self.language} model."
                )
                return nltk.tokenize.sent_tokenize(text, language=language_name)

            logger.error(
                f"PreProcessor couldn't find default or custom sentence tokenizer model for {self.language}. Using English instead."
            )
            return nltk.tokenize.sent_tokenize(text, language="english")

        # Use a default NLTK model
        if language_name is not None:
            return nltk.tokenize.sent_tokenize(text, language=language_name)

        logger.error(
            f"PreProcessor couldn't find default sentence tokenizer model for {self.language}. Using English instead. "
            "You may train your own model and use the 'tokenizer_model_folder' parameter."
        )
        return nltk.tokenize.sent_tokenize(text, language="english")
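
For completeness, a hedged sketch of how a custom Punkt model could be trained and pickled so this loader finds it (the corpus file and output folder are placeholders; the code above only requires the result to be saved as `<tokenizer_model_folder>/<language>.pickle`):

```python
import pickle
from pathlib import Path

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

# Placeholder corpus in the target language/domain
training_text = Path("corpus_pt.txt").read_text(encoding="utf-8")

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True  # learn collocations/abbreviations more aggressively
trainer.train(training_text, finalize=True)

tokenizer = PunktSentenceTokenizer(trainer.get_params())

# The file name must match the ISO 639 code passed as `language` to PreProcessor
model_folder = Path("models/nltk_tokenizers")
model_folder.mkdir(parents=True, exist_ok=True)
with open(model_folder / "pt.pickle", "wb") as f:
    pickle.dump(tokenizer, f, protocol=4)  # protocol 4 for Python 3.7 compatibility, as used for the test sample
```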
100 changes: 86 additions & 14 deletions test/nodes/test_preprocessor.py
@@ -1,5 +1,6 @@
import sys
from pathlib import Path
import os

import pytest

@@ -9,6 +10,10 @@

from ..conftest import SAMPLES_PATH


NLTK_TEST_MODELS = SAMPLES_PATH.absolute() / "preprocessor" / "nltk_models"


TEXT = """
This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in
paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1.
@@ -21,20 +26,90 @@
in the sentence.
"""

LEGAL_TEXT_PT = """
A Lei nº 9.514/1997, que instituiu a alienação fiduciária de
bens imóveis, é norma especial e posterior ao Código de Defesa do
Consumidor – CDC. Em tais circunstâncias, o inadimplemento do
devedor fiduciante enseja a aplicação da regra prevista nos arts. 26 e 27
da lei especial” (REsp 1.871.911/SP, rel. Min. Nancy Andrighi, DJe
25/8/2020).
A Emenda Constitucional n. 35 alterou substancialmente esse mecanismo,
ao determinar, na nova redação conferida ao art. 53: “§ 3º Recebida a
denúncia contra o Senador ou Deputado, por crime ocorrido após a
diplomação, o Supremo Tribunal Federal dará ciência à Casa respectiva, que,
por iniciativa de partido político nela representado e pelo voto da maioria de
seus membros, poderá, até a decisão final, sustar o andamento da ação”.
Vale ressaltar, contudo, que existem, antes do encaminhamento ao
Presidente da República, os chamados autógrafos. Os autógrafos ocorrem já
com o texto definitivamente aprovado pelo Plenário ou pelas comissões,
quando for o caso. Os autógrafos devem reproduzir com absoluta fidelidade a
redação final aprovada. O projeto aprovado será encaminhado em autógrafos
ao Presidente da República. O tema encontra-se regulamentado pelo art. 200
do RICD e arts. 328 a 331 do RISF.
"""


@pytest.mark.parametrize("split_length_and_results", [(1, 15), (10, 2)])
def test_preprocess_sentence_split(split_length_and_results):
split_length, expected_documents_count = split_length_and_results

def test_preprocess_sentence_split():
document = Document(content=TEXT)
preprocessor = PreProcessor(
split_length=1, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False
split_length=split_length, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False
)
documents = preprocessor.process(document)
assert len(documents) == 15
assert len(documents) == expected_documents_count


@pytest.mark.parametrize("split_length_and_results", [(1, 15), (10, 2)])
def test_preprocess_sentence_split_custom_models_wrong_file_format(split_length_and_results):
split_length, expected_documents_count = split_length_and_results

document = Document(content=TEXT)
preprocessor = PreProcessor(
split_length=split_length,
split_overlap=0,
split_by="sentence",
split_respect_sentence_boundary=False,
tokenizer_model_folder=NLTK_TEST_MODELS / "wrong",
language="en",
)
documents = preprocessor.process(document)
assert len(documents) == expected_documents_count


@pytest.mark.parametrize("split_length_and_results", [(1, 15), (10, 2)])
def test_preprocess_sentence_split_custom_models_non_default_language(split_length_and_results):
split_length, expected_documents_count = split_length_and_results

document = Document(content=TEXT)
preprocessor = PreProcessor(
split_length=split_length,
split_overlap=0,
split_by="sentence",
split_respect_sentence_boundary=False,
language="ca",
)
documents = preprocessor.process(document)
assert len(documents) == expected_documents_count


@pytest.mark.parametrize("split_length_and_results", [(1, 8), (8, 1)])
def test_preprocess_sentence_split_custom_models(split_length_and_results):
split_length, expected_documents_count = split_length_and_results

document = Document(content=LEGAL_TEXT_PT)
preprocessor = PreProcessor(
split_length=10, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False
split_length=split_length,
split_overlap=0,
split_by="sentence",
split_respect_sentence_boundary=False,
language="pt",
tokenizer_model_folder=NLTK_TEST_MODELS,
)
documents = preprocessor.process(document)
assert len(documents) == 2
assert len(documents) == expected_documents_count


def test_preprocess_word_split():
@@ -64,19 +139,16 @@ def test_preprocess_word_split():
assert len(documents) == 15


def test_preprocess_passage_split():
document = Document(content=TEXT)
preprocessor = PreProcessor(
split_length=1, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False
)
documents = preprocessor.process(document)
assert len(documents) == 3
@pytest.mark.parametrize("split_length_and_results", [(1, 3), (2, 2)])
def test_preprocess_passage_split(split_length_and_results):
split_length, expected_documents_count = split_length_and_results

document = Document(content=TEXT)
preprocessor = PreProcessor(
split_length=2, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False
split_length=split_length, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False
)
documents = preprocessor.process(document)
assert len(documents) == 2
assert len(documents) == expected_documents_count


@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="FIXME Footer not detected correctly on Windows")
Binary file not shown.
2 changes: 2 additions & 0 deletions test/samples/preprocessor/nltk_models/wrong/en.pickle
@@ -0,0 +1,2 @@
This is a text file, not a real PunktSentenceTokenizer model.
Loading it should not work on sentence tokenizer.
