Skip to content

Commit

Permalink
Merge branch 'master' of github.com:cltk/cltk
Browse files Browse the repository at this point in the history
  • Loading branch information
kylepjohnson committed Feb 24, 2016
2 parents c9e92e0 + ba42633 commit c0e5044
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 8 deletions.
4 changes: 2 additions & 2 deletions cltk/tests/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,9 @@ def test_sentence_tokenizer_greek(self):
def test_latin_word_tokenizer(self):
    """Test the Latin-specific word tokenizer.

    The sample sentence mixes several cases the tokenizer must handle:
    the '-que' enclitic ('virumque' -> 'que' + 'virum') and the
    contracted '-mst' ending ('verumst' -> 'verum' + 'est'), alongside
    ordinary punctuation splitting.
    """
    word_tokenizer = WordTokenizer('latin')
    # 'verumst' exercises the -mst contraction (est attached to the
    # preceding word); 'virumque' exercises the -que enclitic.
    text = 'Arma virumque cano, Troiae qui primus ab oris. Hoc verumst, tota te ferri, Cynthia, Roma, et non ignota vivere nequitia?'
    tokens = word_tokenizer.tokenize(text)
    # Expected output: enclitics are split off and fronted ('que' before
    # 'virum'), '-mst' becomes the two words 'verum' and 'est', and
    # sentence-final punctuation is its own token.
    target = ['Arma', 'que', 'virum', 'cano', ',', 'Troiae', 'qui', 'primus', 'ab', 'oris.', 'Hoc', 'verum', 'est', ',', 'tota', 'te', 'ferri', ',', 'Cynthia', ',', 'Roma', ',', 'et', 'non', 'ignota', 'vivere', 'nequitia', '?']
    self.assertEqual(tokens, target)

def test_nltk_tokenize_words(self):
Expand Down
12 changes: 6 additions & 6 deletions cltk/tokenize/word.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def __init__(self, language):
self.available_languages) # pylint: disable=line-too-long

if self.language == 'latin':
self.enclitics = ['que', 'ne', 'ue', 've', 'cum']
self.enclitics = ['que', 'ne', 'ue', 've', 'cum','mst']
self.exceptions = self.enclitics

que_exceptions = []
Expand Down Expand Up @@ -174,16 +174,16 @@ def tokenize(self, string):
if generic_token not in self.exceptions:
for enclitic in self.enclitics:
if generic_token.endswith(enclitic):
new_tokens = [enclitic] + [generic_token[:-len(enclitic)]]
specific_tokens += new_tokens
if enclitic == 'mst':
specific_tokens += [generic_token[:-len(enclitic)+1]] + ['e'+ generic_token[-len(enclitic)+1:]]
else:
specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
is_enclitic = True
break
if not is_enclitic:
specific_tokens.append(generic_token)

return specific_tokens


def nltk_tokenize_words(string, attached_period=False):
"""Wrap NLTK's tokenizer PunktLanguageVars(), but make final period
its own token.
Expand Down Expand Up @@ -211,4 +211,4 @@ def nltk_tokenize_words(string, attached_period=False):
new_tokens.append('.')
else:
new_tokens.append(word)
return new_tokens
return new_tokens

0 comments on commit c0e5044

Please sign in to comment.