Merge pull request #200 from diyclassics/master
Update Latin tokenizer to handle '-n'
kylepjohnson committed Mar 15, 2016
2 parents 7109dcb + 25da02f, commit d287fe7
Showing 2 changed files with 16 additions and 5 deletions.
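
In short: a word-final '-n' (the apocopated interrogative enclitic '-ne', as in 'bellan' for 'bella-ne') is now split off and expanded to 'ne' during tokenization. A before/after illustration drawn from the test expectations changed below:

    # Old expected tokenization of 'Dic sodes mihi, bellan videtur specie mulier?':
    #   ['Dic', 'si', 'audes', 'mihi', ',', 'bellan', 'videtur', 'specie', 'mulier', '?']
    # New expected tokenization, with '-n' expanded:
    #   ['Dic', 'si', 'audes', 'mihi', ',', 'bella', 'ne', 'videtur', 'specie', 'mulier', '?']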
cltk/tests/test_tokenize.py (7 changes: 5 additions and 2 deletions)
@@ -64,14 +64,16 @@ def test_latin_word_tokenizer(self):
         # - Plaut. Capt. 937
         # - Lucr. DRN. 5.1351-53
         # - Plaut. Bacch. 837-38
+        # - Plaut. Amph. 823

         tests = ['Arma virumque cano, Troiae qui primus ab oris.',
                  'Hoc verumst, tota te ferri, Cynthia, Roma, et non ignota vivere nequitia?',
                  'Nec te decipiant veteres circum atria cerae. Tolle tuos tecum, pauper amator, avos!',
                  'Neque enim, quod quisque potest, id ei licet, nec, si non obstatur, propterea etiam permittitur.',
                  'Quid opust verbis? lingua nullast qua negem quidquid roges.',
                  'Textile post ferrumst, quia ferro tela paratur, nec ratione alia possunt tam levia gigni insilia ac fusi, radii, scapique sonantes.',
-                 'Dic sodes mihi, bellan videtur specie mulier?'
+                 'Dic sodes mihi, bellan videtur specie mulier?',
+                 'Cenavin ego heri in navi in portu Persico?'
                  ]

         results = []
@@ -86,7 +88,8 @@ def test_latin_word_tokenizer(self):
                   ['que', 'Ne', 'enim', ',', 'quod', 'quisque', 'potest', ',', 'id', 'ei', 'licet', ',', 'c', 'ne', ',', 'si', 'non', 'obstatur', ',', 'propterea', 'etiam', 'permittitur.'],
                   ['Quid', 'opus', 'est', 'verbis', '?', 'lingua', 'nulla', 'est', 'qua', 'negem', 'quidquid', 'roges.'],
                   ['Textile', 'post', 'ferrum', 'est', ',', 'quia', 'ferro', 'tela', 'paratur', ',', 'c', 'ne', 'ratione', 'alia', 'possunt', 'tam', 'levia', 'gigni', 'insilia', 'ac', 'fusi', ',', 'radii', ',', 'que', 'scapi', 'sonantes.'],
-                  ['Dic', 'si', 'audes', 'mihi', ',', 'bellan', 'videtur', 'specie', 'mulier', '?']
+                  ['Dic', 'si', 'audes', 'mihi', ',', 'bella', 'ne', 'videtur', 'specie', 'mulier', '?'],
+                  ['Cenavi', 'ne', 'ego', 'heri', 'in', 'navi', 'in', 'portu', 'Persico', '?']
                   ]

         self.assertEqual(results, target)
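
To exercise the new behavior end to end, here is a minimal usage sketch based on the test case above. It assumes CLTK at this commit is installed, and that WordTokenizer is the class defined in cltk/tokenize/word.py (the file changed below):

    from cltk.tokenize.word import WordTokenizer

    tokenizer = WordTokenizer('latin')
    print(tokenizer.tokenize('Cenavin ego heri in navi in portu Persico?'))
    # Expected, per the new test target above:
    # ['Cenavi', 'ne', 'ego', 'heri', 'in', 'navi', 'in', 'portu', 'Persico', '?']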
cltk/tokenize/word.py (14 changes: 11 additions and 3 deletions)
@@ -27,7 +27,7 @@ def __init__(self, language):
                                           self.available_languages)  # pylint: disable=line-too-long

         if self.language == 'latin':
-            self.enclitics = ['que', 'ne', 'ue', 've', 'cum','st']
+            self.enclitics = ['que', 'n', 'ne', 'ue', 've', 'cum','st']

             self.inclusions = []

@@ -36,6 +36,7 @@ def __init__(self, language):
             self.exceptions = self.enclitics

             que_exceptions = []
+            n_exceptions = []
             ne_exceptions = []
             ue_exceptions = []
             ve_exceptions = []
@@ -136,6 +137,8 @@ def __init__(self, language):
                               'tisiphone', 'torone', 'transitione', 'troiane', 'turbine', 'turne',
                               'tyrrhene', 'uane', 'uelamine', 'uertigine', 'uesane', 'uimine', 'uirgine',
                               'umbone', 'unguine', 'uolumine', 'uoragine', 'urbane', 'uulcane', 'zone']
+
+            n_exceptions += ['aenean', 'agmen', 'alioquin', 'an', 'attamen', 'carmen', 'certamen', 'cognomen', 'crimen', 'dein', 'discrimen', 'en', 'epitheton', 'exin', 'flumen', 'forsan', 'forsitan', 'fulmen', 'iason', 'in', 'limen', 'liquamen', 'lumen', 'nomen', 'non', 'numen', 'omen', 'orion', 'quin', 'semen', 'specimen', 'tamen', 'titan']

             ue_exceptions += ['agaue', 'ambigue', 'assidue', 'aue', 'boue', 'breue', 'calue', 'caue',
                               'ciue', 'congrue', 'contigue', 'continue', 'curue', 'exigue', 'exue',
@@ -156,6 +159,7 @@ def __init__(self, language):
             self.exceptions = list(set(self.exceptions
                                        + que_exceptions
                                        + ne_exceptions
+                                       + n_exceptions
                                        + ue_exceptions
                                        + ve_exceptions
                                        + st_exceptions
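
A note on n_exceptions: it keeps the splitter away from words whose final '-n' belongs to the stem ('tamen', 'carmen', 'non', and so on) rather than being an apocopated '-ne'. A schematic of the check, simplified from the tokenize() logic that is only partially visible in this diff (split_n is a hypothetical name, not a function in this commit):

    def split_n(token, n_exceptions):
        # Split a final '-n' into stem + 'ne' unless the word is a known
        # non-enclitic form such as 'tamen' or 'non'.
        if token.lower().endswith('n') and token.lower() not in n_exceptions:
            return [token[:-1], 'ne']
        return [token]

    print(split_n('Cenavin', {'tamen', 'non'}))  # ['Cenavi', 'ne']
    print(split_n('tamen', {'tamen', 'non'}))    # ['tamen']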
@@ -171,7 +175,9 @@ def tokenize(self, string):
         # Rewrite as an if-else block for exceptions rather than separate list comprehensions
         generic_tokens = [x for item in generic_tokens for x in ([item] if item.lower() != 'nec' else ['c', item[:-1]])]  # Handle 'nec' as a special case.
         generic_tokens = [x for item in generic_tokens for x in ([item] if item.lower() != 'sodes' else [item[0]+'i', 'audes'])]  # Handle 'sodes' as a special case.
-        generic_tokens = [x for item in generic_tokens for x in ([item] if item.lower() != 'sultis' else [item[0]+'i', 'vultis'])]  # Handle 'sultis' as a special case.
+        generic_tokens = [x for item in generic_tokens for x in ([item] if item.lower() != 'sultis' else [item[0]+'i', 'vultis'])]  # Handle 'sultis' as a special case.
+        generic_tokens = [x for item in generic_tokens for x in ([item] if item.lower() != 'satin' else [item[:-1] + 's', 'ne'])]  # Handle 'satin' as a special case.
+        generic_tokens = [x for item in generic_tokens for x in ([item] if item.lower() != 'scin' else [item[:-1] + 's', 'ne'])]  # Handle 'scin' as a special case.
         specific_tokens = []
         for generic_token in generic_tokens:
             is_enclitic = False
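
The two new comprehensions handle 'satin' (satisne) and 'scin' (scisne), whose stems regain a final 's' before the enclitic is expanded, so the generic strip-the-'n' rule alone would yield the wrong stems ('sati', 'sci'). The comment at the top of the block already proposes folding these chained comprehensions into a single pass; one possible shape for that refactor (illustrative only, not code from this commit):

    # Hypothetical refactor: one dispatch table instead of chained comprehensions.
    SPECIAL_CASES = {
        'nec': lambda t: ['c', t[:-1]],
        'sodes': lambda t: [t[0] + 'i', 'audes'],
        'sultis': lambda t: [t[0] + 'i', 'vultis'],
        'satin': lambda t: [t[:-1] + 's', 'ne'],  # 'satin' -> ['satis', 'ne']
        'scin': lambda t: [t[:-1] + 's', 'ne'],   # 'scin'  -> ['scis', 'ne']
    }

    def expand_special_cases(tokens):
        out = []
        for token in tokens:
            handler = SPECIAL_CASES.get(token.lower())
            out.extend(handler(token) if handler else [token])
        return out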
@@ -182,7 +188,9 @@ def tokenize(self, string):
                     if generic_token.lower() in self.inclusions:
                         specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
                     else:
-                        specific_tokens += [generic_token]
+                        specific_tokens += [generic_token]
+                elif enclitic == 'n':
+                    specific_tokens += [generic_token[:-len(enclitic)]] + ['ne']
                 elif enclitic == 'st':
                     if generic_token.endswith('ust'):
                         specific_tokens += [generic_token[:-len(enclitic)+1]] + ['est']
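
A worked trace of the two branches shown above, with values taken from this PR's tests:

    # '-n' branch: strip the enclitic, then emit the full form 'ne'.
    token = 'Cenavin'
    print([token[:-len('n')], 'ne'])        # ['Cenavi', 'ne']

    # 'st' branch with an '-ust' ending: keep the 'u', expand 'st' to 'est'.
    token = 'opust'
    print([token[:-len('st') + 1], 'est'])  # ['opus', 'est']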
