Skip to content

Commit

Permalink
Merge pull request #132 from diyclassics/master
Browse files: browse the repository at this point in the history.
Update enclitic handling in Latin word tokenizer. I'll try to make a new version soon.
  • Loading branch information
kylepjohnson committed Feb 20, 2016
2 parents b6e0c1b + 3180c3a commit 3ceb058
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 4 deletions.
4 changes: 2 additions & 2 deletions cltk/tests/test_tokenize.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -55,9 +55,9 @@ def test_sentence_tokenizer_greek(self):
def test_latin_word_tokenizer(self):
"""Test Latin-specific word tokenizer."""
word_tokenizer = WordTokenizer('latin')
text = 'atque haec abuterque nihil'
text = 'Arma virumque cano, Troiae qui primus ab oris'
tokens = word_tokenizer.tokenize(text)
target = ['atque', 'haec', 'abuter', '-que', 'nihil']
target = ['Arma', 'que', 'virum', 'cano', ',', 'Troiae', 'qui', 'primus', 'ab', 'oris']
self.assertEqual(tokens, target)

def test_nltk_tokenize_words(self):
Expand Down
4 changes: 2 additions & 2 deletions cltk/tokenize/word.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -174,7 +174,7 @@ def tokenize(self, string):
if generic_token not in self.exceptions:
for enclitic in self.enclitics:
if generic_token.endswith(enclitic):
new_tokens = [generic_token[:-len(enclitic)]] + ['-' + enclitic]
new_tokens = [enclitic] + [generic_token[:-len(enclitic)]]
specific_tokens += new_tokens
is_enclitic = True
break
Expand Down Expand Up @@ -211,4 +211,4 @@ def nltk_tokenize_words(string, attached_period=False):
new_tokens.append('.')
else:
new_tokens.append(word)
return new_tokens
return new_tokens

0 comments on commit 3ceb058

Please sign in to comment.