(Latin Tokenizer) Add flexibility and finetuning power over enclitics handling #972

Merged: 5 commits, Mar 11, 2020
39 changes: 32 additions & 7 deletions cltk/tokenize/latin/word.py
@@ -5,7 +5,7 @@
__license__ = 'MIT License.'

import re
from typing import List
from typing import List, Tuple

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

@@ -30,11 +30,20 @@ def __init__(self):
self.sent_tokenizer = PunktSentenceTokenizer(self.punkt_param)
self.word_tokenizer = LatinLanguageVars()

def tokenize(self, text:str) ->List[str]:
def tokenize(self, text: str,
replacements: List[Tuple[str, str]] = REPLACEMENTS,
enclitics_exceptions: List[str] = EXCEPTIONS,
enclitics: List[str] = ENCLITICS
) -> List[str]:
"""
Tokenizer divides the text into a list of substrings

:param text: The string to be tokenized
:param replacements: List of replacements to apply to tokens such as "mecum" -> ["cum", "me"]
:param enclitics_exceptions: List of words that look like they end with an enclitic but do not
:param enclitics: List of enclitics to check for in tokenization

:returns: A list of substrings extracted from the text

>>> toker = WordTokenizer()
@@ -51,6 +60,22 @@ def tokenize(self, text:str) ->List[str]:
>>> toker.tokenize('Dic si audes mihi, bellan videtur specie mulier?')
['Dic', 'si', 'audes', 'mihi', ',', 'bella', '-ne', 'videtur', 'specie', 'mulier', '?']

>>> toker.tokenize("mecum")
['cum', 'me']

You can control how replacements are applied by passing the replacements parameter:

>>> toker.tokenize("mecum", replacements=[(r"mecum", "me cum")])
['me', 'cum']

Or change the enclitics and the enclitics exceptions:
>>> toker.tokenize("atque haec abuterque puerve paterne nihil", enclitics=["que"])
['atque', 'haec', 'abuter', '-que', 'puerve', 'paterne', 'nihil']

>>> toker.tokenize("atque haec abuterque puerve paterne nihil", enclitics=["que", "ve", "ne"],
... enclitics_exceptions=('paterne', 'atque'))
['atque', 'haec', 'abuter', '-que', 'puer', '-ve', 'paterne', 'nihil']

"""

def matchcase(word):
@@ -68,19 +93,19 @@ def replace(matching):

return replace

for replacement in REPLACEMENTS:
for replacement in replacements:
text = re.sub(replacement[0], matchcase(replacement[1]), text, flags=re.IGNORECASE)

sents = self.sent_tokenizer.tokenize(text)
tokens = [] # type: List[str]
tokens = [] # type: List[str]

for sent in sents:
temp_tokens = self.word_tokenizer.word_tokenize(sent)
# Need to check that tokens exist before handling them;
# needed to make stream.readlines work in PlaintextCorpusReader
if temp_tokens:
if temp_tokens[0].endswith('ne'):
if temp_tokens[0].lower() not in WordTokenizer.EXCEPTIONS:
if temp_tokens[0].lower() not in enclitics_exceptions:
temp = [temp_tokens[0][:-2], '-ne']
temp_tokens = temp + temp_tokens[1:]
if temp_tokens[-1].endswith('.'):
@@ -96,8 +121,8 @@ def replace(matching):

for token in tokens:
is_enclitic = False
if token.lower() not in WordTokenizer.EXCEPTIONS:
for enclitic in WordTokenizer.ENCLITICS:
if token.lower() not in enclitics_exceptions:
for enclitic in enclitics:
if token.endswith(enclitic):
if enclitic == 'n':
specific_tokens += [token[:-len(enclitic)]] + ['-ne']
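Taken together, the new keyword arguments make replacement and enclitic handling configurable per call. The short standalone sketch below shows how the tokenizer could be driven with custom lists; the import path is only an assumption based on the file modified in this diff (cltk/tokenize/latin/word.py), and the expected outputs are copied from the doctests above.

# Minimal usage sketch of the keyword arguments added in this PR.
# Assumption: the import path mirrors the modified file; adjust it if the
# installed cltk version exposes WordTokenizer elsewhere.
from cltk.tokenize.latin.word import WordTokenizer

toker = WordTokenizer()

# Default behaviour: the built-in REPLACEMENTS table splits "mecum" into ["cum", "me"].
print(toker.tokenize("mecum"))
# ['cum', 'me']

# A custom replacements list changes how the split is made.
print(toker.tokenize("mecum", replacements=[(r"mecum", "me cum")]))
# ['me', 'cum']

# Restricting enclitics to "que" leaves "puerve" and "paterne" untouched.
print(toker.tokenize("atque haec abuterque puerve paterne nihil",
                     enclitics=["que"]))
# ['atque', 'haec', 'abuter', '-que', 'puerve', 'paterne', 'nihil']

# Adding "paterne" to the exceptions keeps it whole even with "ne" enabled.
print(toker.tokenize("atque haec abuterque puerve paterne nihil",
                     enclitics=["que", "ve", "ne"],
                     enclitics_exceptions=('paterne', 'atque')))
# ['atque', 'haec', 'abuter', '-que', 'puer', '-ve', 'paterne', 'nihil']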