Skip to content

Commit

Permalink
Merge pull request #146 from diyclassics/master
Browse files (browse the repository at this point in the history)
Update Latin word tokenizer to handle 'nec'
  • Loading branch information
kylepjohnson committed Feb 26, 2016
2 parents 1586286 + 4326a1f commit dbc2aae
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 1 deletion.
2 changes: 1 addition & 1 deletion cltk/tests/test_tokenize.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -76,7 +76,7 @@ def test_latin_word_tokenizer(self):
target = [['Arma', 'que', 'virum', 'cano', ',', 'Troiae', 'qui', 'primus', 'ab', 'oris.'],
['Hoc', 'verum', 'est', ',', 'tota', 'te', 'ferri', ',', 'Cynthia', ',', 'Roma', ',', 'et', 'non', 'ignota', 'vivere', 'nequitia', '?'],
['Nec', 'te', 'decipiant', 'veteres', 'circum', 'atria', 'cerae.', 'Tolle', 'tuos', 'cum', 'te', ',', 'pauper', 'amator', ',', 'avos', '!'],
['que', 'Ne', 'enim', ',', 'quod', 'quisque', 'potest', ',', 'id', 'ei', 'licet', ',', 'nec', ',', 'si', 'non', 'obstatur', ',', 'propterea', 'etiam', 'permittitur.']]
['que', 'Ne', 'enim', ',', 'quod', 'quisque', 'potest', ',', 'id', 'ei', 'licet', ',', 'c', 'ne', ',', 'si', 'non', 'obstatur', ',', 'propterea', 'etiam', 'permittitur.']]

self.assertEqual(results, target)

Expand Down
1 change: 1 addition & 0 deletions cltk/tokenize/word.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -164,6 +164,7 @@ def tokenize(self, string):
"""Tokenize incoming string."""
punkt = PunktLanguageVars()
generic_tokens = punkt.word_tokenize(string)
generic_tokens = [x for item in generic_tokens for x in ([item] if item != 'nec' else ['c', 'ne'])] # Handle 'nec' as a special case.
specific_tokens = []
for generic_token in generic_tokens:
is_enclitic = False
Expand Down

0 comments on commit dbc2aae

Please sign in to comment.