Merge pull request #143 from diyclassics/master

Update Latin tokenizer to handle -cum compounds

kylepjohnson committed Feb 24, 2016
2 parents d8f4d44 + 5c94443 commit edcc16a
Showing 2 changed files with 39 additions and 21 deletions.
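
In effect, the tokenizer now splits a small closed set of pronoun + cum compounds (mecum, tecum, secum, and so on) into enclitic and pronoun, while leaving ordinary words that merely end in -cum alone. A minimal usage sketch of the new behavior, assuming this commit's layout (WordTokenizer in cltk/tokenize/word.py); the expected output below is taken from the updated test targets:

    from cltk.tokenize.word import WordTokenizer

    tokenizer = WordTokenizer('latin')

    # Pronominal compounds are split into enclitic + pronoun:
    print(tokenizer.tokenize('Tolle tuos tecum, pauper amator, avos!'))
    # ['Tolle', 'tuos', 'cum', 'te', ',', 'pauper', 'amator', ',', 'avos', '!']

    # Ordinary words ending in -cum are left whole:
    print(tokenizer.tokenize('Nec te decipiant veteres circum atria cerae.'))
    # ['Nec', 'te', 'decipiant', 'veteres', 'circum', 'atria', 'cerae.']
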
27 changes: 22 additions & 5 deletions cltk/tests/test_tokenize.py
@@ -51,14 +51,31 @@ def test_sentence_tokenizer_greek(self):
        tokenized_sentences = tokenizer.tokenize_sentences(sentences)
        self.assertEqual(len(tokenized_sentences), len(good_tokenized_sentences))
        '''

    def test_latin_word_tokenizer(self):
        """Test Latin-specific word tokenizer."""
        word_tokenizer = WordTokenizer('latin')
-        text = 'Arma virumque cano, Troiae qui primus ab oris. Hoc verumst, tota te ferri, Cynthia, Roma, et non ignota vivere nequitia?'
-        tokens = word_tokenizer.tokenize(text)
-        target = ['Arma', 'que', 'virum', 'cano', ',', 'Troiae', 'qui', 'primus', 'ab', 'oris.', 'Hoc', 'verum', 'est', ',', 'tota', 'te', 'ferri', ',', 'Cynthia', ',', 'Roma', ',', 'et', 'non', 'ignota', 'vivere', 'nequitia', '?']
-        self.assertEqual(tokens, target)
+
+        # Test sources:
+        # - V. Aen. 1.1
+        # - Prop. 2.5.1-2
+        # - Ov. Am. 1.8.65-66
+        tests = ['Arma virumque cano, Troiae qui primus ab oris.',
+                 'Hoc verumst, tota te ferri, Cynthia, Roma, et non ignota vivere nequitia?',
+                 'Nec te decipiant veteres circum atria cerae. Tolle tuos tecum, pauper amator, avos!']
+
+        results = []
+
+        for test in tests:
+            result = word_tokenizer.tokenize(test)
+            results.append(result)
+
+        target = [['Arma', 'que', 'virum', 'cano', ',', 'Troiae', 'qui', 'primus', 'ab', 'oris.'],
+                  ['Hoc', 'verum', 'est', ',', 'tota', 'te', 'ferri', ',', 'Cynthia', ',', 'Roma', ',', 'et', 'non', 'ignota', 'vivere', 'nequitia', '?'],
+                  ['Nec', 'te', 'decipiant', 'veteres', 'circum', 'atria', 'cerae.', 'Tolle', 'tuos', 'cum', 'te', ',', 'pauper', 'amator', ',', 'avos', '!']]
+
+        self.assertEqual(results, target)

    def test_nltk_tokenize_words(self):
        """Test wrapper for NLTK's PunktLanguageVars()"""
33 changes: 17 additions & 16 deletions cltk/tokenize/word.py
@@ -28,6 +28,12 @@ def __init__(self, language):

        if self.language == 'latin':
            self.enclitics = ['que', 'ne', 'ue', 've', 'cum', 'mst']
+            # self.enclitics = ['que', 'mst'] #, 'ne', 'ue', 've', 'cum', 'mst']
+
+            self.inclusions = []
+
+            cum_inclusions = ['mecum', 'tecum', 'secum', 'nobiscum', 'vobiscum', 'quocum', 'quicum', 'quibuscum']
+
            self.exceptions = self.enclitics

            que_exceptions = []
@@ -145,24 +151,14 @@ def __init__(self, language):
                              'nave', 'neve', 'nive', 'praegrave', 'prospicve', 'proterve', 'remove',
                              'resolve', 'saeve', 'salve', 'sive', 'solve', 'summove', 'vive', 'vove']

-            # checked against lucan, propertius, tibullus, ovid (elegy), virgil (aeneid)
-            cum_exceptions += ['actiacum', 'amicum', 'amycum', 'anticum', 'arcum', 'argolicum', 'cacum',
-                               'caecum', 'cappadocum', 'cilicum', 'circum', 'cornicum', 'coruscum',
-                               'crocum', 'cupencum', 'ducum', 'fatidicum', 'focum', 'glaucum',
-                               'horrificum', 'inicum', 'inimicum', 'iocum', 'iuuencum', 'iuvencum',
-                               'lacum', 'laticum', 'libycum', 'locum', 'lucum', 'magnificum',
-                               'meretricum', 'metiscum', 'modicum', 'nutricum', 'oblicum', 'opacum',
-                               'phaeacum', 'phoenicum', 'priscum', 'propincum', 'pudicum', 'quercum',
-                               'quicum', 'raucum', 'saetacum', 'salaminiacum', 'scythicum', 'siccum',
-                               'silicum', 'tabificum', 'thessalicum', 'truncum', 'uiscum', 'uncum',
-                               'uocum', 'viscum', 'vocum']
-
            self.exceptions = list(set(self.exceptions
                                       + que_exceptions
                                       + ne_exceptions
                                       + ue_exceptions
-                                      + ve_exceptions
-                                      + cum_exceptions))
+                                      + ve_exceptions))
+
+            self.inclusions = list(set(self.inclusions
+                                       + cum_inclusions))

    def tokenize(self, string):
        """Tokenize incoming string."""
@@ -175,7 +171,12 @@ def tokenize(self, string):
        for enclitic in self.enclitics:
            if generic_token.endswith(enclitic):
                if enclitic == 'mst':
                    specific_tokens += [generic_token[:-len(enclitic)+1]] + ['e' + generic_token[-len(enclitic)+1:]]
+                elif enclitic == 'cum':
+                    if generic_token in self.inclusions:
+                        specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
+                    else:
+                        specific_tokens += [generic_token]
                else:
                    specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
                is_enclitic = True
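
A note on the 'mst' branch above: it resolves prodelided est, as in verumst = verum est. With len('mst') == 3, token[:-2] keeps the stem and 'e' + token[-2:] rebuilds the verb. The slicing checked in plain Python:

    token, enclitic = 'verumst', 'mst'
    stem = token[:-len(enclitic) + 1]        # 'verumst'[:-2] -> 'verum'
    verb = 'e' + token[-len(enclitic) + 1:]  # 'e' + 'st' -> 'est'
    assert [stem, verb] == ['verum', 'est']
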
@@ -211,4 +212,4 @@ def nltk_tokenize_words(string, attached_period=False):
            new_tokens.append('.')
        else:
            new_tokens.append(word)
-    return new_tokens
\ No newline at end of file
+    return new_tokens
