Skip to content

Commit

Permalink
Merge branch 'master' of github.com:cltk/cltk
Browse files Browse the repository at this point in the history
  • Loading branch information
kylepjohnson committed Feb 24, 2016
2 parents c9e92e0 + ba42633 commit c0e5044
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 8 deletions.
4 changes: 2 additions & 2 deletions cltk/tests/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,9 @@ def test_sentence_tokenizer_greek(self):
def test_latin_word_tokenizer(self):
    """Test the Latin-specific word tokenizer.

    The sample sentence mixes several cases the tokenizer must handle:
    the '-que' enclitic ('virumque' -> 'que' + 'virum') and the
    contracted '-mst' ending ('verumst' -> 'verum' + 'est'), alongside
    ordinary punctuation splitting.
    """
    word_tokenizer = WordTokenizer('latin')
    # 'verumst' exercises the -mst contraction (est attached to the
    # preceding word); 'virumque' exercises the -que enclitic.
    text = 'Arma virumque cano, Troiae qui primus ab oris. Hoc verumst, tota te ferri, Cynthia, Roma, et non ignota vivere nequitia?'
    tokens = word_tokenizer.tokenize(text)
    # Expected output: enclitics are split off and fronted ('que' before
    # 'virum'), '-mst' becomes the two words 'verum' and 'est', and
    # sentence-final punctuation is its own token.
    target = ['Arma', 'que', 'virum', 'cano', ',', 'Troiae', 'qui', 'primus', 'ab', 'oris.', 'Hoc', 'verum', 'est', ',', 'tota', 'te', 'ferri', ',', 'Cynthia', ',', 'Roma', ',', 'et', 'non', 'ignota', 'vivere', 'nequitia', '?']
    self.assertEqual(tokens, target)

def test_nltk_tokenize_words(self):
Expand Down
12 changes: 6 additions & 6 deletions cltk/tokenize/word.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def __init__(self, language):
self.available_languages) # pylint: disable=line-too-long

if self.language == 'latin':
self.enclitics = ['que', 'ne', 'ue', 've', 'cum']
self.enclitics = ['que', 'ne', 'ue', 've', 'cum','mst']
self.exceptions = self.enclitics

que_exceptions = []
Expand Down Expand Up @@ -174,16 +174,16 @@ def tokenize(self, string):
if generic_token not in self.exceptions:
for enclitic in self.enclitics:
if generic_token.endswith(enclitic):
new_tokens = [enclitic] + [generic_token[:-len(enclitic)]]
specific_tokens += new_tokens
if enclitic == 'mst':
specific_tokens += [generic_token[:-len(enclitic)+1]] + ['e'+ generic_token[-len(enclitic)+1:]]
else:
specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
is_enclitic = True
break
if not is_enclitic:
specific_tokens.append(generic_token)

return specific_tokens


def nltk_tokenize_words(string, attached_period=False):
"""Wrap NLTK's tokenizer PunktLanguageVars(), but make final period
its own token.
Expand Down Expand Up @@ -211,4 +211,4 @@ def nltk_tokenize_words(string, attached_period=False):
new_tokens.append('.')
else:
new_tokens.append(word)
return new_tokens
return new_tokens

0 comments on commit c0e5044

Please sign in to comment.