Merge pull request #132 from diyclassics/master

Update enclitic handling in Latin word tokenizer. I'll try to make a new version soon.
cltk · Feb 20, 2016 · 3ceb058
2 parents b6e0c1b + 3180c3a
commit 3ceb058
Show file tree

Hide file tree

Showing 2 changed files with 4 additions and 4 deletions.
diff --git a/cltk/tests/test_tokenize.py b/cltk/tests/test_tokenize.py
@@ -55,9 +55,9 @@ def test_sentence_tokenizer_greek(self):
     def test_latin_word_tokenizer(self):
         """Test Latin-specific word tokenizer."""
         word_tokenizer = WordTokenizer('latin')
-        text = 'atque haec abuterque nihil'
+        text = 'Arma virumque cano, Troiae qui primus ab oris'
         tokens = word_tokenizer.tokenize(text)
-        target = ['atque', 'haec', 'abuter', '-que', 'nihil']
+        target = ['Arma', 'que', 'virum', 'cano', ',', 'Troiae', 'qui', 'primus', 'ab', 'oris']
         self.assertEqual(tokens, target)
 
     def test_nltk_tokenize_words(self):

diff --git a/cltk/tokenize/word.py b/cltk/tokenize/word.py
@@ -174,7 +174,7 @@ def tokenize(self, string):
             if generic_token not in self.exceptions:
                 for enclitic in self.enclitics:
                     if generic_token.endswith(enclitic):
-                        new_tokens = [generic_token[:-len(enclitic)]] + ['-' + enclitic]
+                        new_tokens = [enclitic] + [generic_token[:-len(enclitic)]]
                         specific_tokens += new_tokens
                         is_enclitic = True
                         break
@@ -211,4 +211,4 @@ def nltk_tokenize_words(string, attached_period=False):
             new_tokens.append('.')
         else:
             new_tokens.append(word)
-    return new_tokens
+    return new_tokens