Skip to content
Permalink
Browse files

(Latin Tokenizer) Add flexibility and finetuning power over enclitics…

… handling (#972)

Co-authored-by: Kyle P. Johnson <kyle@kyle-p-johnson.com>
  • Loading branch information
PonteIneptique and kylepjohnson committed Mar 11, 2020
1 parent 3ffd04c commit 9b3e34313dd0ab6a7fe29889be62ab72023b2cdf
Showing with 32 additions and 7 deletions.
  1. +32 −7 cltk/tokenize/latin/word.py
@@ -5,7 +5,7 @@
__license__ = 'MIT License.'

import re
from typing import List
from typing import List, Tuple

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

@@ -30,11 +30,20 @@ def __init__(self):
self.sent_tokenizer = PunktSentenceTokenizer(self.punkt_param)
self.word_tokenizer = LatinLanguageVars()

def tokenize(self, text:str) ->List[str]:
def tokenize(self, text: str,
replacements: List[Tuple[str, str]] = REPLACEMENTS,
enclitics_exceptions: List[str] = EXCEPTIONS,
enclitics: List[str] = ENCLITICS

) ->List[str]:
"""
Tokenizer divides the text into a list of substrings
:param text: This accepts the string value that needs to be tokenized
:param replacements: List of replacements to apply to tokens such as "mecum" -> ["cum", "me"]
:param enclitics_exceptions: List of words that look like they end with an enclitic but do not.
:param enclitics: List of enclitics to check for in tokenization
:returns: A list of substrings extracted from the text
>>> toker = WordTokenizer()
@@ -51,6 +60,22 @@ def tokenize(self, text:str) ->List[str]:
>>> toker.tokenize('Dic si audes mihi, bellan videtur specie mulier?')
['Dic', 'si', 'audes', 'mihi', ',', 'bella', '-ne', 'videtur', 'specie', 'mulier', '?']
>>> toker.tokenize("mecum")
['cum', 'me']
You can specify how replacements are made using the ``replacements`` argument:
>>> toker.tokenize("mecum", replacements=[(r"mecum", "me cum")])
['me', 'cum']
Or change the enclitics and the enclitic exceptions:
>>> toker.tokenize("atque haec abuterque puerve paterne nihil", enclitics=["que"])
['atque', 'haec', 'abuter', '-que', 'puerve', 'paterne', 'nihil']
>>> toker.tokenize("atque haec abuterque puerve paterne nihil", enclitics=["que", "ve", "ne"],
... enclitics_exceptions=('paterne', 'atque'))
['atque', 'haec', 'abuter', '-que', 'puer', '-ve', 'paterne', 'nihil']
"""

def matchcase(word):
@@ -68,19 +93,19 @@ def replace(matching):

return replace

for replacement in REPLACEMENTS:
for replacement in replacements:
text = re.sub(replacement[0], matchcase(replacement[1]), text, flags=re.IGNORECASE)

sents = self.sent_tokenizer.tokenize(text)
tokens = [] # type: List[str]
tokens = [] # type: List[str]

for sent in sents:
temp_tokens = self.word_tokenizer.word_tokenize(sent)
# Need to check that tokens exist before handling them;
# needed to make stream.readlines work in PlaintextCorpusReader
if temp_tokens:
if temp_tokens[0].endswith('ne'):
if temp_tokens[0].lower() not in WordTokenizer.EXCEPTIONS:
if temp_tokens[0].lower() not in enclitics_exceptions:
temp = [temp_tokens[0][:-2], '-ne']
temp_tokens = temp + temp_tokens[1:]
if temp_tokens[-1].endswith('.'):
@@ -96,8 +121,8 @@ def replace(matching):

for token in tokens:
is_enclitic = False
if token.lower() not in WordTokenizer.EXCEPTIONS:
for enclitic in WordTokenizer.ENCLITICS:
if token.lower() not in enclitics_exceptions:
for enclitic in enclitics:
if token.endswith(enclitic):
if enclitic == 'n':
specific_tokens += [token[:-len(enclitic)]] + ['-ne']

0 comments on commit 9b3e343

Please sign in to comment.
You can’t perform that action at this time.