Merge pull request #192 from diyclassics/master
Uppercase handling in Latin tokenizer
kylepjohnson committed Mar 11, 2016
2 parents 3af35d6 + a929254 commit 2ceeddd
Showing 2 changed files with 5 additions and 5 deletions.
2 changes: 1 addition & 1 deletion cltk/tests/test_tokenize.py
@@ -82,7 +82,7 @@ def test_latin_word_tokenizer(self):

         target = [['Arma', 'que', 'virum', 'cano', ',', 'Troiae', 'qui', 'primus', 'ab', 'oris.'],
                   ['Hoc', 'verum', 'est', ',', 'tota', 'te', 'ferri', ',', 'Cynthia', ',', 'Roma', ',', 'et', 'non', 'ignota', 'vivere', 'nequitia', '?'],
-                  ['Nec', 'te', 'decipiant', 'veteres', 'circum', 'atria', 'cerae.', 'Tolle', 'tuos', 'cum', 'te', ',', 'pauper', 'amator', ',', 'avos', '!'],
+                  ['c', 'Ne', 'te', 'decipiant', 'veteres', 'circum', 'atria', 'cerae.', 'Tolle', 'tuos', 'cum', 'te', ',', 'pauper', 'amator', ',', 'avos', '!'],
                   ['que', 'Ne', 'enim', ',', 'quod', 'quisque', 'potest', ',', 'id', 'ei', 'licet', ',', 'c', 'ne', ',', 'si', 'non', 'obstatur', ',', 'propterea', 'etiam', 'permittitur.'],
                   ['Quid', 'opus', 'est', 'verbis', '?', 'lingua', 'nulla', 'est', 'qua', 'negem', 'quidquid', 'roges.'],
                   ['Textile', 'post', 'ferrum', 'est', ',', 'quia', 'ferro', 'tela', 'paratur', ',', 'c', 'ne', 'ratione', 'alia', 'possunt', 'tam', 'levia', 'gigni', 'insilia', 'ac', 'fusi', ',', 'radii', ',', 'que', 'scapi', 'sonantes.'],
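The updated target reflects the new case-preserving behavior: a capitalized 'Nec' now splits into 'c' and 'Ne' instead of passing through untouched. A minimal sketch of what the test now asserts, assuming the WordTokenizer API used elsewhere in this test module:

    from cltk.tokenize.word import WordTokenizer

    word_tokenizer = WordTokenizer('latin')
    tokens = word_tokenizer.tokenize('Nec te decipiant veteres circum atria cerae.')
    # Before this change, 'Nec' failed the lowercase-only comparison and was
    # left intact; with the fix it is split case-preservingly:
    # ['c', 'Ne', 'te', 'decipiant', 'veteres', 'circum', 'atria', 'cerae.']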
8 changes: 4 additions & 4 deletions cltk/tokenize/word.py
@@ -169,17 +169,17 @@ def tokenize(self, string):
         punkt = PunktLanguageVars()
         generic_tokens = punkt.word_tokenize(string)
         # Rewrite as an if-else block for exceptions rather than separate list comprehensions
-        generic_tokens = [x for item in generic_tokens for x in ([item] if item != 'nec' else ['c', 'ne'])]  # Handle 'nec' as a special case.
-        generic_tokens = [x for item in generic_tokens for x in ([item] if item != 'sodes' else ['si', 'audes'])]  # Handle 'sodes' as a special case.
-        generic_tokens = [x for item in generic_tokens for x in ([item] if item != 'sultis' else ['si', 'vultis'])]  # Handle 'sultis' as a special case.
+        generic_tokens = [x for item in generic_tokens for x in ([item] if item.lower() != 'nec' else ['c', item[:-1]])]  # Handle 'nec' as a special case.
+        generic_tokens = [x for item in generic_tokens for x in ([item] if item.lower() != 'sodes' else [item[0]+'i', 'audes'])]  # Handle 'sodes' as a special case.
+        generic_tokens = [x for item in generic_tokens for x in ([item] if item.lower() != 'sultis' else [item[0]+'i', 'vultis'])]  # Handle 'sultis' as a special case.
         specific_tokens = []
         for generic_token in generic_tokens:
             is_enclitic = False
             if generic_token not in self.exceptions:
                 for enclitic in self.enclitics:
                     if generic_token.endswith(enclitic):
                         if enclitic == 'cum':
-                            if generic_token in self.inclusions:
+                            if generic_token.lower() in self.inclusions:
                                 specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
                             else:
                                 specific_tokens += [generic_token]
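The fix compares lowercased tokens but builds the replacement from the original token, so capitalization survives the split: item[:-1] keeps 'Ne' from 'Nec', and item[0]+'i' keeps 'Si' from 'Sodes'. A standalone sketch of the pattern (hypothetical helper name, not part of cltk), which also folds the three passes into the single if/else block the inline comment above calls for:

    def split_special_cases(tokens):
        """Case-insensitive match, case-preserving split for 'nec', 'sodes', 'sultis'."""
        out = []
        for item in tokens:
            low = item.lower()
            if low == 'nec':
                out += ['c', item[:-1]]           # 'Nec' -> ['c', 'Ne']
            elif low == 'sodes':
                out += [item[0] + 'i', 'audes']   # 'Sodes' -> ['Si', 'audes']
            elif low == 'sultis':
                out += [item[0] + 'i', 'vultis']  # 'sultis' -> ['si', 'vultis']
            else:
                out.append(item)
        return out

    print(split_special_cases(['Nec', 'Sodes', 'sultis', 'arma']))
    # ['c', 'Ne', 'Si', 'audes', 'si', 'vultis', 'arma']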
