Skip to content

Commit

Permalink
Merge pull request #189 from diyclassics/master
Browse files Browse the repository at this point in the history
Update Latin word tokenizer to handle more contractions
  • Loading branch information
kylepjohnson committed Mar 11, 2016
2 parents 1fd697d + eac1a34 commit 3af35d6
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 2 deletions.
7 changes: 5 additions & 2 deletions cltk/tests/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,15 @@ def test_latin_word_tokenizer(self):
# - Cic. Phillip. 13.14
# - Plaut. Capt. 937
# - Lucr. DRN. 5.1351-53
# - Plaut. Bacch. 837-38

tests = ['Arma virumque cano, Troiae qui primus ab oris.',
'Hoc verumst, tota te ferri, Cynthia, Roma, et non ignota vivere nequitia?',
'Nec te decipiant veteres circum atria cerae. Tolle tuos tecum, pauper amator, avos!',
'Neque enim, quod quisque potest, id ei licet, nec, si non obstatur, propterea etiam permittitur.',
'Quid opust verbis? lingua nullast qua negem quidquid roges.',
-'Textile post ferrumst, quia ferro tela paratur, nec ratione alia possunt tam levia gigni insilia ac fusi, radii, scapique sonantes.'
+'Textile post ferrumst, quia ferro tela paratur, nec ratione alia possunt tam levia gigni insilia ac fusi, radii, scapique sonantes.',
+'Dic sodes mihi, bellan videtur specie mulier?'
]

results = []
Expand All @@ -83,7 +85,8 @@ def test_latin_word_tokenizer(self):
['Nec', 'te', 'decipiant', 'veteres', 'circum', 'atria', 'cerae.', 'Tolle', 'tuos', 'cum', 'te', ',', 'pauper', 'amator', ',', 'avos', '!'],
['que', 'Ne', 'enim', ',', 'quod', 'quisque', 'potest', ',', 'id', 'ei', 'licet', ',', 'c', 'ne', ',', 'si', 'non', 'obstatur', ',', 'propterea', 'etiam', 'permittitur.'],
['Quid', 'opus', 'est', 'verbis', '?', 'lingua', 'nulla', 'est', 'qua', 'negem', 'quidquid', 'roges.'],
-['Textile', 'post', 'ferrum', 'est', ',', 'quia', 'ferro', 'tela', 'paratur', ',', 'c', 'ne', 'ratione', 'alia', 'possunt', 'tam', 'levia', 'gigni', 'insilia', 'ac', 'fusi', ',', 'radii', ',', 'que', 'scapi', 'sonantes.']
+['Textile', 'post', 'ferrum', 'est', ',', 'quia', 'ferro', 'tela', 'paratur', ',', 'c', 'ne', 'ratione', 'alia', 'possunt', 'tam', 'levia', 'gigni', 'insilia', 'ac', 'fusi', ',', 'radii', ',', 'que', 'scapi', 'sonantes.'],
+['Dic', 'si', 'audes', 'mihi', ',', 'bellan', 'videtur', 'specie', 'mulier', '?']
]

self.assertEqual(results, target)
Expand Down
3 changes: 3 additions & 0 deletions cltk/tokenize/word.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,10 @@ def tokenize(self, string):
"""Tokenize incoming string."""
punkt = PunktLanguageVars()
generic_tokens = punkt.word_tokenize(string)
# Rewrite as an if-else block for exceptions rather than separate list comprehensions
generic_tokens = [x for item in generic_tokens for x in ([item] if item != 'nec' else ['c', 'ne'])] # Handle 'nec' as a special case.
generic_tokens = [x for item in generic_tokens for x in ([item] if item != 'sodes' else ['si', 'audes'])] # Handle 'sodes' as a special case.
generic_tokens = [x for item in generic_tokens for x in ([item] if item != 'sultis' else ['si', 'vultis'])] # Handle 'sultis' as a special case.
specific_tokens = []
for generic_token in generic_tokens:
is_enclitic = False
Expand Down

0 comments on commit 3af35d6

Please sign in to comment.