In [1]:
# # Install treetaggerwrapper

# !pipenv install treetaggerwrapper

# # See docs for more information:
# # http://treetaggerwrapper.readthedocs.io/en/latest/#installation
#
# # Some installation help for TreeTagger is at the bottom of this notebook

## Working with *treetaggerwrapper*

In [2]:
# Imports

import treetaggerwrapper

from pprint import pprint

In [3]:
# Create Latin tagger
# TAGLANG='la' is the language code passed to treetaggerwrapper; all other
# tagger options are left at the library defaults here.

tagger = treetaggerwrapper.TreeTagger(TAGLANG='la')

In [4]:
# Sample passage used by every tagging cell below.
# Source: Sallust, Bellum Catilinae 1.
# Written as one sentence per source line (adjacent literals are joined by
# the parser); the value is a single line of text ending in one newline.
text = (
    "Omnis homines, qui sese student praestare ceteris animalibus, summa ope niti decet, ne vitam silentio transeant veluti pecora, quae natura prona atque ventri oboedientia finxit. "
    "Sed nostra omnis vis in animo et corpore sita est: animi imperio, corporis servitio magis utimur; alterum nobis cum dis, alterum cum beluis commune est. "
    "Quo mihi rectius videtur ingeni quam virium opibus gloriam quaerere et, quoniam vita ipsa, qua fruimur, brevis est, memoriam nostri quam maxume longam efficere. "
    "Nam divitiarum et formae gloria fluxa atque fragilis est, virtus clara aeternaque habetur. "
    "Sed diu magnum inter mortalis certamen fuit, vine corporis an virtute animi res militaris magis procederet. "
    "Nam et, prius quam incipias, consulto et, ubi consulueris, mature facto opus est. "
    "Ita utrumque per se indigens alterum alterius auxilio eget.\n"
)

In [5]:
%%time

# Tag with treetagger
# NOTE: the figure printed below counts whitespace-separated words in `text`.
# The tagger emits punctuation as separate entries, so `tags` ends up with
# more items than the number reported here.

print(f'Tagging {len(text.split())} tokens...')
tags = tagger.tag_text(text)

Tagging 125 tokens...
CPU times: user 12.8 ms, sys: 9.63 ms, total: 22.4 ms
Wall time: 2.62 s


In [6]:
# View output from tagger
# Only the first 10 entries are shown; each entry is one raw string holding
# token, POS tag and lemma separated by tab characters.

pprint(tags[:10])

['Omnis\tPRON\tomnis',
 'homines\tN:nom\thomo',
 ',\tPUN\t,',
 'qui\tREL\tqui',
 'sese\tPRON\tsui',
 'student\tV:IND\tstudeo',
 'praestare\tV:INF\tpraesto',
 'ceteris\tADJ:abl\tceterus',
 'animalibus\tN:abl\tanimal|animalis',
 ',\tPUN\t,']


In [7]:
# Break each of the first ten raw tagger lines into its tab-separated fields

for line in tags[:10]:
    fields = line.split('\t')
    print(fields)

['Omnis', 'PRON', 'omnis']
['homines', 'N:nom', 'homo']
[',', 'PUN', ',']
['qui', 'REL', 'qui']
['sese', 'PRON', 'sui']
['student', 'V:IND', 'studeo']
['praestare', 'V:INF', 'praesto']
['ceteris', 'ADJ:abl', 'ceterus']
['animalibus', 'N:abl', 'animal|animalis']
[',', 'PUN', ',']


In [8]:
# Format output from tagger as tuples
# make_tags turns the raw tab-delimited strings into Tag(word, pos, lemma)
# named tuples; only the first 10 are displayed.

tags_tuples = treetaggerwrapper.make_tags(tags)
pprint(tags_tuples[:10])

[Tag(word='Omnis', pos='PRON', lemma='omnis'),
 Tag(word='homines', pos='N:nom', lemma='homo'),
 Tag(word=',', pos='PUN', lemma=','),
 Tag(word='qui', pos='REL', lemma='qui'),
 Tag(word='sese', pos='PRON', lemma='sui'),
 Tag(word='student', pos='V:IND', lemma='studeo'),
 Tag(word='praestare', pos='V:INF', lemma='praesto'),
 Tag(word='ceteris', pos='ADJ:abl', lemma='ceterus'),
 Tag(word='animalibus', pos='N:abl', lemma='animal|animalis'),
 Tag(word=',', pos='PUN', lemma=',')]


In [9]:
# View the complete list of Tag tuples (long output: one line per tagger entry)
pprint(tags_tuples)

[Tag(word='Omnis', pos='PRON', lemma='omnis'),
 Tag(word='homines', pos='N:nom', lemma='homo'),
 Tag(word=',', pos='PUN', lemma=','),
 Tag(word='qui', pos='REL', lemma='qui'),
 Tag(word='sese', pos='PRON', lemma='sui'),
 Tag(word='student', pos='V:IND', lemma='studeo'),
 Tag(word='praestare', pos='V:INF', lemma='praesto'),
 Tag(word='ceteris', pos='ADJ:abl', lemma='ceterus'),
 Tag(word='animalibus', pos='N:abl', lemma='animal|animalis'),
 Tag(word=',', pos='PUN', lemma=','),
 Tag(word='summa', pos='ADJ:abl', lemma='summus'),
 Tag(word='ope', pos='N:abl', lemma='ops'),
 Tag(word='niti', pos='V:INF', lemma='nitor'),
 Tag(word='decet', pos='V:IND', lemma='decet'),
 Tag(word=',', pos='PUN', lemma=','),
 Tag(word='ne', pos='CS', lemma='ne'),
 Tag(word='vitam', pos='N:acc', lemma='vita'),
 Tag(word='silentio', pos='N:abl', lemma='silentium'),
 Tag(word='transeant', pos='V:SUB', lemma='transeo'),
 Tag(word='veluti', pos='ADV', lemma='veluti'),
 Tag(word='pecora', pos='N:acc', lemma='pecus'),


In [10]:
# Reduce every Tag to a (token, lemma) pair; the POS field is discarded

lemma_pairs = [
    (word, lemma)
    for (word, _pos, lemma) in tags_tuples
]
pprint(lemma_pairs[:10])

[('Omnis', 'omnis'),
 ('homines', 'homo'),
 (',', ','),
 ('qui', 'qui'),
 ('sese', 'sui'),
 ('student', 'studeo'),
 ('praestare', 'praesto'),
 ('ceteris', 'ceterus'),
 ('animalibus', 'animal|animalis'),
 (',', ',')]


## Working with *treetagger-python*

In [11]:
# # Install treetagger-python


# # Working off a fork of treetagger-python since the main package does not yet support 'latin'

# !pipenv install git+https://github.com/diyclassics/treetagger-python.git@latin#egg=treetagger-python

# # Also, add to .bash_profile (or the equivalent shell startup file):
# # export TREETAGGER_HOME='/path/to/your/TreeTagger/cmd/'

# # See docs for more information:
# # https://github.com/miotto/treetagger-python

# # Some installation help for treetagger at the bottom of this notebook

In [12]:
# Imports

from treetagger import TreeTagger

In [13]:
# Create Latin tagger
# NOTE: this rebinds `tagger`, which previously held the treetaggerwrapper
# instance; from here on `tagger` is the treetagger-python TreeTagger.

tagger = TreeTagger(language='latin')

In [14]:
%%time

# Tag with treetagger-python
# The printed figure counts whitespace-separated words in `text`, not the
# number of entries the tagger returns.

print(f'Tagging {len(text.split())} tokens...')
tags_list = tagger.tag(text)

Tagging 125 tokens...
CPU times: user 5.31 ms, sys: 8.64 ms, total: 14 ms
Wall time: 2.6 s


In [15]:
# View the first 10 entries returned by treetagger-python
pprint(tags_list[:10])

[['Omnis', 'PRON', 'omnis'],
 ['homines', 'N:nom', 'homo'],
 [',', 'PUN', ','],
 ['qui', 'REL', 'qui'],
 ['sese', 'PRON', 'sui'],
 ['student', 'V:IND', 'studeo'],
 ['praestare', 'V:INF', 'praesto'],
 ['ceteris', 'ADJ:abl', 'ceterus'],
 ['animalibus', 'N:abl', 'animal|animalis'],
 [',', 'PUN', ',']]


In [16]:
# Build (token, lemma) pairs from the treetagger-python result, dropping the POS field

lemma_pairs_2 = [
    (word, lemma)
    for (word, _pos, lemma) in tags_list
]

In [17]:
# Compare output
# Print every position where the two taggers disagree, as
# (treetaggerwrapper pair) (treetagger-python pair).
#
# The lists are walked in lockstep with zip() rather than by index, so a
# shorter `lemma_pairs_2` no longer raises IndexError. Because zip() stops at
# the shorter list, a difference in length is reported explicitly instead of
# being silently ignored.

if len(lemma_pairs) != len(lemma_pairs_2):
    print(f'Length mismatch: {len(lemma_pairs)} vs {len(lemma_pairs_2)} pairs')

for pair, pair_2 in zip(lemma_pairs, lemma_pairs_2):
    if pair != pair_2:
        print(pair, pair_2)

('aeternaque', 'aeternaque') ('aeternaque', '<unknown>')


In [18]:
# Rebuild the treetaggerwrapper tagger with an explicit TAGOPT so that it
# returns '<unknown>' for lemmas it cannot resolve, then check that its
# (token, lemma) pairs now equal the treetagger-python ones.
# (Presumably the library's default options suppress '<unknown>' -- confirm
# against the treetaggerwrapper documentation.)
# NOTE: this cell rebinds `tagger`, `tags`, `tags_tuples` and `lemma_pairs`.

tagger = treetaggerwrapper.TreeTagger(
    TAGLANG='la',
    TAGOPT='-token -lemma -sgml -quiet',
)
tags = tagger.tag_text(text)
tags_tuples = treetaggerwrapper.make_tags(tags)
lemma_pairs = [(word, lemma) for (word, _pos, lemma) in tags_tuples]
lemma_pairs == lemma_pairs_2

True