Merge pull request #188 from diyclassics/master
Update Latin tokenizer to handle more 'est' contractions
kylepjohnson committed Mar 10, 2016
2 parents bf71a6e + 1a12b3d commit 1fd697d
Showing 2 changed files with 25 additions and 9 deletions.
12 changes: 10 additions & 2 deletions cltk/tests/test_tokenize.py
@@ -61,11 +61,16 @@ def test_latin_word_tokenizer(self):
         # - Prop. 2.5.1-2
         # - Ov. Am. 1.8.65-66
         # - Cic. Phillip. 13.14
+        # - Plaut. Capt. 937
+        # - Lucr. DRN. 5.1351-53

         tests = ['Arma virumque cano, Troiae qui primus ab oris.',
                  'Hoc verumst, tota te ferri, Cynthia, Roma, et non ignota vivere nequitia?',
                  'Nec te decipiant veteres circum atria cerae. Tolle tuos tecum, pauper amator, avos!',
-                 'Neque enim, quod quisque potest, id ei licet, nec, si non obstatur, propterea etiam permittitur.']
+                 'Neque enim, quod quisque potest, id ei licet, nec, si non obstatur, propterea etiam permittitur.',
+                 'Quid opust verbis? lingua nullast qua negem quidquid roges.',
+                 'Textile post ferrumst, quia ferro tela paratur, nec ratione alia possunt tam levia gigni insilia ac fusi, radii, scapique sonantes.'
+                 ]

         results = []

@@ -76,7 +81,10 @@ def test_latin_word_tokenizer(self):
         target = [['Arma', 'que', 'virum', 'cano', ',', 'Troiae', 'qui', 'primus', 'ab', 'oris.'],
                   ['Hoc', 'verum', 'est', ',', 'tota', 'te', 'ferri', ',', 'Cynthia', ',', 'Roma', ',', 'et', 'non', 'ignota', 'vivere', 'nequitia', '?'],
                   ['Nec', 'te', 'decipiant', 'veteres', 'circum', 'atria', 'cerae.', 'Tolle', 'tuos', 'cum', 'te', ',', 'pauper', 'amator', ',', 'avos', '!'],
-                  ['que', 'Ne', 'enim', ',', 'quod', 'quisque', 'potest', ',', 'id', 'ei', 'licet', ',', 'c', 'ne', ',', 'si', 'non', 'obstatur', ',', 'propterea', 'etiam', 'permittitur.']]
+                  ['que', 'Ne', 'enim', ',', 'quod', 'quisque', 'potest', ',', 'id', 'ei', 'licet', ',', 'c', 'ne', ',', 'si', 'non', 'obstatur', ',', 'propterea', 'etiam', 'permittitur.'],
+                  ['Quid', 'opus', 'est', 'verbis', '?', 'lingua', 'nulla', 'est', 'qua', 'negem', 'quidquid', 'roges.'],
+                  ['Textile', 'post', 'ferrum', 'est', ',', 'quia', 'ferro', 'tela', 'paratur', ',', 'c', 'ne', 'ratione', 'alia', 'possunt', 'tam', 'levia', 'gigni', 'insilia', 'ac', 'fusi', ',', 'radii', ',', 'que', 'scapi', 'sonantes.']
+                  ]

         self.assertEqual(results, target)

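As a quick usage sketch of the behavior these tests pin down (the WordTokenizer class name and import path are assumed from the CLTK layout of this era; the expected output is copied from the target list above):

    from cltk.tokenize.word import WordTokenizer  # class name assumed

    tokenizer = WordTokenizer('latin')

    # Prodelided 'est' ('opust' = 'opus est', 'nullast' = 'nulla est')
    # now comes back as a separate 'est' token.
    print(tokenizer.tokenize('Quid opust verbis? lingua nullast qua negem quidquid roges.'))
    # ['Quid', 'opus', 'est', 'verbis', '?', 'lingua', 'nulla', 'est',
    #  'qua', 'negem', 'quidquid', 'roges.']
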
22 changes: 15 additions & 7 deletions cltk/tokenize/word.py
@@ -27,8 +27,7 @@ def __init__(self, language):
                                  self.available_languages)  # pylint: disable=line-too-long

         if self.language == 'latin':
-            self.enclitics = ['que', 'ne', 'ue', 've', 'cum','mst']
-            # self.enclitics = ['que', 'mst'] #, 'ne', 'ue', 've', 'cum','mst']
+            self.enclitics = ['que', 'ne', 'ue', 've', 'cum','st']

             self.inclusions = []

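For orientation, this enclitics list drives all suffix splitting in tokenize(): by default a recognized suffix is detached and emitted before its host word, which is why the first target above reads ['Arma', 'que', 'virum', ...]. A minimal sketch of that default rule (split_generic is a hypothetical name; the logic mirrors the final else branch in the tokenize() hunk below):

    def split_generic(token, enclitic):
        """Emit the enclitic first, then the host word minus the suffix."""
        return [enclitic] + [token[:-len(enclitic)]]

    assert split_generic('virumque', 'que') == ['que', 'virum']
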
@@ -41,6 +40,7 @@ def __init__(self, language):
             ue_exceptions = []
             ve_exceptions = []
             cum_exceptions = []
+            st_exceptions = []

             # quisque
             que_exceptions += ['quisque', 'quidque', 'quicque', 'quodque', 'cuiusque', 'cuique',

@@ -151,11 +151,15 @@ def __init__(self, language):
                               'nave', 'neve', 'nive', 'praegrave', 'prospicve', 'proterve', 'remove',
                               'resolve', 'saeve', 'salve', 'sive', 'solve', 'summove', 'vive', 'vove']

+            st_exceptions += ['abest', 'adest', 'ast', 'deest', 'est', 'inest', 'interest', 'post', 'potest', 'prodest', 'subest', 'superest']
+
             self.exceptions = list(set(self.exceptions
                                        + que_exceptions
                                        + ne_exceptions
                                        + ue_exceptions
-                                       + ve_exceptions))
+                                       + ve_exceptions
+                                       + st_exceptions
+                                       ))

             self.inclusions = list(set(self.inclusions
                                        + cum_inclusions))

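One reason the new st_exceptions list is needed: many ordinary Latin words end in -st, mostly forms and compounds of 'esse'. A minimal sketch of the guard, with the exception list copied from the diff and splits_on_st a hypothetical helper for illustration:

    # Words ending in -st that are NOT prodelided 'est' contractions.
    st_exceptions = ['abest', 'adest', 'ast', 'deest', 'est', 'inest',
                     'interest', 'post', 'potest', 'prodest', 'subest', 'superest']

    def splits_on_st(token):
        """Hypothetical helper: should this token be split on -st?"""
        return token.endswith('st') and token not in st_exceptions

    assert splits_on_st('ferrumst')       # 'ferrum est' contraction: split
    assert not splits_on_st('post')       # ordinary word: leave intact
    assert not splits_on_st('potest')     # form of 'posse', not a contraction
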
@@ -171,13 +175,17 @@ def tokenize(self, string):
                 if generic_token not in self.exceptions:
                     for enclitic in self.enclitics:
                         if generic_token.endswith(enclitic):
-                            if enclitic == 'mst':
-                                specific_tokens += [generic_token[:-len(enclitic)+1]] + ['e'+ generic_token[-len(enclitic)+1:]]
-                            elif enclitic == 'cum':
+                            if enclitic == 'cum':
                                 if generic_token in self.inclusions:
                                     specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
                                 else:
                                     specific_tokens += [generic_token]
+                            elif enclitic == 'st':
+                                if generic_token.endswith('ust'):
+                                    specific_tokens += [generic_token[:-len(enclitic)+1]] + ['est']
+                                else:
+                                    # Does not handle 'similist', 'qualist', etc. correctly
+                                    specific_tokens += [generic_token[:-len(enclitic)]] + ['est']
                             else:
                                 specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
                             is_enclitic = True

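Pulled out of tokenize() into a standalone function, the two -st cases are easier to see (split_st is a hypothetical name; the logic mirrors the new branch above):

    def split_st(token):
        """Sketch of the new -st branch: restore prodelided 'est'."""
        if token.endswith('ust'):
            # In '-ust' forms like 'opust' ('opus est'), the host word's
            # final 's' doubles as the contraction's 's': drop only the 't'.
            return [token[:-1], 'est']
        # Otherwise drop the whole '-st', as in 'ferrumst' ('ferrum est').
        # As the in-diff comment notes, hosts ending in '-is' ('similist',
        # 'qualist') come out wrong: 'simili' + 'est'.
        return [token[:-2], 'est']

    assert split_st('opust') == ['opus', 'est']
    assert split_st('ferrumst') == ['ferrum', 'est']
    assert split_st('nullast') == ['nulla', 'est']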
