Skip to content
Permalink
Browse files

Add parameter for punctuation used by Latin punkt sentence tokenizer (#944)

* Update Latin punc params

* Add strict punctuation to sentence tokenizer

* Clean up module

* Fix & clean up test

* Update tests

* Remove main function

* Update docs

* Add strict to generic sentence tokenizer
  • Loading branch information...
diyclassics authored and kylepjohnson committed Oct 7, 2019
1 parent a58c2e4 commit f5386c5895c56370eb02bf7f243e38daec6d2e7d
@@ -50,10 +50,16 @@ def test_sentence_tokenizer_latin_punkt(self):
'Hos ego video consul et de re publica sententiam rogo et, quos ferro trucidari oportebat, eos nondum voce volnero!',
'Fuisti igitur apud Laecam illa nocte, Catilina, distribuisti partes Italiae, statuisti, quo quemque proficisci placeret, delegisti, quos Romae relinqueres, quos tecum educeres, discripsisti urbis partes ad incendia, confirmasti te ipsum iam esse exiturum, dixisti paulum tibi esse etiam nunc morae, quod ego viverem.'] # pylint: disable=line-too-long
tokenizer = LatinPunktSentenceTokenizer()
print(tokenizer.models_path)
tokenized_sentences = tokenizer.tokenize(self.latin_text)
self.assertEqual(tokenized_sentences, target)

def test_sentence_tokenizer_latin_punkt_strict(self):
    """Strict mode should also break sentences at semicolons."""
    expected = ['in principio creavit Deus caelum et terram;', 'terra autem erat inanis et vacua et tenebrae super faciem abyssi et spiritus Dei ferebatur super aquas;', 'dixitque Deus fiat lux et facta est lux;', 'et vidit Deus lucem quod esset bona et divisit lucem ac tenebras.']  # pylint: disable=line-too-long
    strict_tokenizer = LatinPunktSentenceTokenizer(strict=True)
    # The untokenized input is exactly the expected sentences joined by a
    # single space, so rebuild it instead of repeating the literal.
    actual = strict_tokenizer.tokenize(' '.join(expected))
    self.assertEqual(actual, expected)

# Deprecated use cltk.tokenize.latin.sentence
def test_sentence_tokenizer_latin(self):
"""Test tokenizing Latin sentences."""
@@ -154,3 +154,6 @@

class LatinLanguageVars(PunktLanguageVars):
    """Punkt language variables tuned for Latin.

    Drops the apostrophe from the non-word character class so that elided
    forms containing an apostrophe are treated as single word tokens.
    """
    _re_non_word_chars = PunktLanguageVars._re_non_word_chars.replace("'", "")

# Default sentence-final punctuation for the Latin Punkt tokenizer.
PUNCTUATION = ('.', '?', '!')
# ``strict`` mode additionally splits on hyphen, colon, and semicolon.
STRICT_PUNCTUATION = (*PUNCTUATION, '-', ':', ';')
@@ -5,32 +5,41 @@
__license__ = 'MIT License.'

import os.path

import nltk
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.tokenize.sentence import BaseSentenceTokenizer, BasePunktSentenceTokenizer
from cltk.tokenize.latin.params import LatinLanguageVars
from cltk.tokenize.latin.params import LatinLanguageVars, PUNCTUATION, STRICT_PUNCTUATION
from cltk.utils.file_operations import open_pickle

def SentenceTokenizer(tokenizer: str = 'punkt', strict: bool = False):
    """Factory for a Latin sentence tokenizer.

    The pasted diff residue kept both the pre- and post-change ``def`` and
    ``return`` lines; this is the clean post-change version.

    :param tokenizer: backend to use; only 'punkt' is currently supported
    :param strict: forwarded to the tokenizer; adds hyphen, colon, and
        semicolon to the sentence-final punctuation set
    :return: a ``LatinPunktSentenceTokenizer`` when ``tokenizer == 'punkt'``;
        implicitly ``None`` for any other value (original behavior preserved)
    """
    if tokenizer == 'punkt':
        return LatinPunktSentenceTokenizer(strict=strict)


class LatinPunktSentenceTokenizer(BasePunktSentenceTokenizer):
    """PunktSentenceTokenizer trained on Latin.

    Loads a pre-trained Punkt model from the ``latin_models_cltk`` corpus
    and optionally widens the set of sentence-ending characters.
    """
    # Diff residue removed: the old BackoffLatinLemmatizer-flavored message
    # and the old one-argument __init__ signature were both still present.
    models_path = os.path.normpath(get_cltk_data_dir() + '/latin/model/latin_models_cltk/tokenizers/sentence')
    missing_models_message = "LatinPunktSentenceTokenizer requires the ```latin_models_cltk``` to be in cltk_data. Please load this corpus."

    def __init__(self: object, language: str = 'latin', strict: bool = False):
        """
        :param language: language for sentence tokenization
        :type language: str
        :param strict: allow for stricter punctuation for sentence tokenization
        :type strict: bool
        """
        self.lang_vars = LatinLanguageVars()
        self.strict = strict
        super().__init__(language='latin', lang_vars=self.lang_vars)
        self.models_path = LatinPunktSentenceTokenizer.models_path

        try:
            self.model = open_pickle(os.path.join(self.models_path, 'latin_punkt.pickle'))
        except FileNotFoundError as err:
            # Re-raise as the same exception type with a corpus-install hint.
            raise type(err)(LatinPunktSentenceTokenizer.missing_models_message)

        # NOTE(review): this mutates the *class* attribute, so it changes the
        # sentence-end characters for every PunktLanguageVars user in the
        # process, not just this instance, and the last-constructed tokenizer
        # wins — confirm this global side effect is intended.
        if self.strict:
            PunktLanguageVars.sent_end_chars = STRICT_PUNCTUATION
        else:
            PunktLanguageVars.sent_end_chars = PUNCTUATION
@@ -164,11 +164,3 @@ def tokenize(self, untokenized_string: str, model=None):
:param untokenized_string: A string containing one of more sentences.
"""
return self.tokenize_sentences(untokenized_string)

if __name__ == "__main__":
    # Quick manual demo: tokenize two Sanskrit verses and print each
    # sentence with a 1-based index.
    sample = """श्री भगवानुवाच भूय एव महाबाहो श्रृणु मे परमं वचः। यत्तेऽहं प्रीयमाणाय वक्ष्यामि हितकाम्यया।।
न मे विदुः सुरगणाः प्रभवं न महर्षयः। अहमादिर्हि देवानां महर्षीणां च सर्वशः।।"""
    sentence_tokenizer = TokenizeSentence('sanskrit')
    for number, sentence in enumerate(sentence_tokenizer.tokenize(sample), 1):
        print(f'{number}: {sentence}')
@@ -753,6 +753,17 @@ Sentence tokenization for Latin is available using a [Punkt](https://www.nltk.or
Note that the Latin sentence tokenizer takes account of abbreviations like 'Kal.' and 'C.' and does not split sentences at these points.

By default, the Latin Punkt Sentence Tokenizer splits on period, question mark, and exclamation point. There is a ```strict``` parameter that adds colon, semicolon, and hyphen to this.

.. code-block:: python

   In [5]: sent_tokenizer = SentenceTokenizer(strict=True)

In [6]: untokenized_text = 'In principio creavit Deus caelum et terram; terra autem erat inanis et vacua et tenebrae super faciem abyssi et spiritus Dei ferebatur super aquas; dixitque Deus fiat lux et facta est lux; et vidit Deus lucem quod esset bona et divisit lucem ac tenebras.'

In [7]: sent_tokenizer.tokenize(untokenized_text)
Out[7]: ['In principio creavit Deus caelum et terram;', 'terra autem erat inanis et vacua et tenebrae super faciem abyssi et spiritus Dei ferebatur super aquas;', 'dixitque Deus fiat lux et facta est lux;', 'et vidit Deus lucem quod esset bona et divisit lucem ac tenebras.']

NB: The old method for sentence tokenizer, i.e. TokenizeSentence, is still available, but now calls the tokenizer described above.

.. code-block:: python

0 comments on commit f5386c5

Please sign in to comment.
You can’t perform that action at this time.