Merge pull request #143 from diyclassics/master

Update Latin tokenizer to handle -cum compounds

kylepjohnson committed Feb 24, 2016
2 parents d8f4d44 + 5c94443 commit edcc16a
Showing 2 changed files with 39 additions and 21 deletions.
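
In effect, the tokenizer now splits a small closed set of pronoun + cum compounds (mecum, tecum, secum, and so on) into enclitic and pronoun, while leaving ordinary words that merely end in -cum alone. A minimal usage sketch of the new behavior, assuming this commit's layout (WordTokenizer in cltk/tokenize/word.py); the expected output below is taken from the updated test targets:

    from cltk.tokenize.word import WordTokenizer

    tokenizer = WordTokenizer('latin')

    # Pronominal compounds are split into enclitic + pronoun:
    print(tokenizer.tokenize('Tolle tuos tecum, pauper amator, avos!'))
    # ['Tolle', 'tuos', 'cum', 'te', ',', 'pauper', 'amator', ',', 'avos', '!']

    # Ordinary words ending in -cum are left whole:
    print(tokenizer.tokenize('Nec te decipiant veteres circum atria cerae.'))
    # ['Nec', 'te', 'decipiant', 'veteres', 'circum', 'atria', 'cerae.']
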
27 changes: 22 additions & 5 deletions cltk/tests/test_tokenize.py
@@ -51,14 +51,31 @@ def test_sentence_tokenizer_greek(self):
        tokenized_sentences = tokenizer.tokenize_sentences(sentences)
        self.assertEqual(len(tokenized_sentences), len(good_tokenized_sentences))
        '''

    def test_latin_word_tokenizer(self):
        """Test Latin-specific word tokenizer."""
        word_tokenizer = WordTokenizer('latin')
-        text = 'Arma virumque cano, Troiae qui primus ab oris. Hoc verumst, tota te ferri, Cynthia, Roma, et non ignota vivere nequitia?'
-        tokens = word_tokenizer.tokenize(text)
-        target = ['Arma', 'que', 'virum', 'cano', ',', 'Troiae', 'qui', 'primus', 'ab', 'oris.', 'Hoc', 'verum', 'est', ',', 'tota', 'te', 'ferri', ',', 'Cynthia', ',', 'Roma', ',', 'et', 'non', 'ignota', 'vivere', 'nequitia', '?']
-        self.assertEqual(tokens, target)
+
+        # Test sources:
+        # - V. Aen. 1.1
+        # - Prop. 2.5.1-2
+        # - Ov. Am. 1.8.65-66
+        tests = ['Arma virumque cano, Troiae qui primus ab oris.',
+                 'Hoc verumst, tota te ferri, Cynthia, Roma, et non ignota vivere nequitia?',
+                 'Nec te decipiant veteres circum atria cerae. Tolle tuos tecum, pauper amator, avos!']
+
+        results = []
+
+        for test in tests:
+            result = word_tokenizer.tokenize(test)
+            results.append(result)
+
+        target = [['Arma', 'que', 'virum', 'cano', ',', 'Troiae', 'qui', 'primus', 'ab', 'oris.'],
+                  ['Hoc', 'verum', 'est', ',', 'tota', 'te', 'ferri', ',', 'Cynthia', ',', 'Roma', ',', 'et', 'non', 'ignota', 'vivere', 'nequitia', '?'],
+                  ['Nec', 'te', 'decipiant', 'veteres', 'circum', 'atria', 'cerae.', 'Tolle', 'tuos', 'cum', 'te', ',', 'pauper', 'amator', ',', 'avos', '!']]
+
+        self.assertEqual(results, target)

    def test_nltk_tokenize_words(self):
        """Test wrapper for NLTK's PunktLanguageVars()"""
33 changes: 17 additions & 16 deletions cltk/tokenize/word.py
@@ -28,6 +28,12 @@ def __init__(self, language):

        if self.language == 'latin':
            self.enclitics = ['que', 'ne', 'ue', 've', 'cum', 'mst']
+            # self.enclitics = ['que', 'mst'] #, 'ne', 'ue', 've', 'cum', 'mst']
+
+            self.inclusions = []
+
+            cum_inclusions = ['mecum', 'tecum', 'secum', 'nobiscum', 'vobiscum', 'quocum', 'quicum', 'quibuscum']
+
            self.exceptions = self.enclitics

            que_exceptions = []
@@ -145,24 +151,14 @@ def __init__(self, language):
                              'nave', 'neve', 'nive', 'praegrave', 'prospicve', 'proterve', 'remove',
                              'resolve', 'saeve', 'salve', 'sive', 'solve', 'summove', 'vive', 'vove']

-            # checked against lucan, propertius, tibullus, ovid (elegy), virgil (aeneid)
-            cum_exceptions += ['actiacum', 'amicum', 'amycum', 'anticum', 'arcum', 'argolicum', 'cacum',
-                               'caecum', 'cappadocum', 'cilicum', 'circum', 'cornicum', 'coruscum',
-                               'crocum', 'cupencum', 'ducum', 'fatidicum', 'focum', 'glaucum',
-                               'horrificum', 'inicum', 'inimicum', 'iocum', 'iuuencum', 'iuvencum',
-                               'lacum', 'laticum', 'libycum', 'locum', 'lucum', 'magnificum',
-                               'meretricum', 'metiscum', 'modicum', 'nutricum', 'oblicum', 'opacum',
-                               'phaeacum', 'phoenicum', 'priscum', 'propincum', 'pudicum', 'quercum',
-                               'quicum', 'raucum', 'saetacum', 'salaminiacum', 'scythicum', 'siccum',
-                               'silicum', 'tabificum', 'thessalicum', 'truncum', 'uiscum', 'uncum',
-                               'uocum', 'viscum', 'vocum']
-
            self.exceptions = list(set(self.exceptions
                                       + que_exceptions
                                       + ne_exceptions
                                       + ue_exceptions
-                                      + ve_exceptions
-                                      + cum_exceptions))
+                                      + ve_exceptions))
+
+            self.inclusions = list(set(self.inclusions
+                                       + cum_inclusions))

    def tokenize(self, string):
        """Tokenize incoming string."""
@@ -175,7 +171,12 @@ def tokenize(self, string):
        for enclitic in self.enclitics:
            if generic_token.endswith(enclitic):
                if enclitic == 'mst':
                    specific_tokens += [generic_token[:-len(enclitic)+1]] + ['e' + generic_token[-len(enclitic)+1:]]
+                elif enclitic == 'cum':
+                    if generic_token in self.inclusions:
+                        specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
+                    else:
+                        specific_tokens += [generic_token]
                else:
                    specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
                is_enclitic = True
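
A note on the 'mst' branch above: it resolves prodelided est, as in verumst = verum est. With len('mst') == 3, token[:-2] keeps the stem and 'e' + token[-2:] rebuilds the verb. The slicing checked in plain Python:

    token, enclitic = 'verumst', 'mst'
    stem = token[:-len(enclitic) + 1]        # 'verumst'[:-2] -> 'verum'
    verb = 'e' + token[-len(enclitic) + 1:]  # 'e' + 'st' -> 'est'
    assert [stem, verb] == ['verum', 'est']
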
@@ -211,4 +212,4 @@ def nltk_tokenize_words(string, attached_period=False):
            new_tokens.append('.')
        else:
            new_tokens.append(word)
-    return new_tokens
\ No newline at end of file
+    return new_tokens
