In [1]:
# Imports

from nltk.tag.sequential import SequentialBackoffTagger

from cltk.tokenize.word import WordTokenizer

from treetagger import TreeTagger

from pprint import pprint

In [2]:
tokenizer = WordTokenizer('latin')

In [27]:
class TreeTaggerLemmatizer(SequentialBackoffTagger):
    """"""
    def __init__(self, backoff=None):
        """Setup for TreeTaggerLemmatizer()."""
        
        SequentialBackoffTagger.__init__(self, backoff)
        self.tagger = TreeTagger(language='latin') # Error trap to see if module is installed!
        
        
    def choose_tag(self, tokens, index, history):
        """Returns the lemma tagged in lemmatize by TreeTagger.
        :param tokens: List of tokens to be lemmatized
        :param index: Int with current token
        :param history: List with tokens that have already been lemmatized
        :return: String, spec. the token found at the current index.
        """
        print(f'Choosing {tokens[index]}')
        if tokens[index]:
            return tokens[index]    
    
    
    def tag(self, tokens):
        """
        Backoff Lemmatizer wrapper for TreeTagger
        # Note: only takes the first match (for now!) in returned lemma, 
        # i.e. 'vir|virum|virus|vis' is truncated to 'vir'. Not ideal.
        :param tokens: List of tokens to be lemmatized
        :return: Tuple of the form (TOKEN, LEMMA)
        """
        lemmas = []
        text = " ".join(tokens)
        lemmas = []
        for _, _, lemma in self.tagger.tag(text):
            if lemma == '<unknown>':
                lemmas.append(None)
            else:
                lemmas.append(lemma.split('|')[0])
        print(list(zip(tokens,lemmas)))
        return list(zip(tokens, lemmas))
    
#     def lemmatize(self, tokens):
#         """
#         Backoff Lemmatizer wrapper for TreeTagger
#         # Note: only takes the first match (for now!) in returned lemma, 
#         # i.e. 'vir|virum|virus|vis' is truncated to 'vir'. Not ideal.
#         :param tokens: List of tokens to be lemmatized
#         :return: Tuple of the form (TOKEN, LEMMA)
#         """
#         lemmas = []
#         text = " ".join(tokens)
#         lemmas = []
#         for _, _, lemma in self.tagger.tag(text):
#             if lemma == '<unknown>':
#                 lemmas.append(None)
#             else:
#                 lemmas.append(lemma.split('|')[0])
#         print(list(zip(tokens,lemmas)))
#         return list(zip(tokens, lemmas))

    def lemmatize(self, tokens):
        return self.tag(tokens)

In [4]:
t = TreeTaggerLemmatizer()

In [5]:
text = """Omnis homines qui sese student praestare ceteris animalibus, summa ope niti decet, ne vitam silentio transeant veluti pecora, quae natura prona atque ventri oboedientia finxit. Sed nostra omnis vis in animo et corpore sita est: animi imperio, corporis servitio magis utimur; alterum nobis cum dis, alterum cum beluis commune est. Quo mihi rectius videtur ingeni quam virium opibus gloriam quaerere et, quoniam vita ipsa, qua fruimur, brevis est, memoriam nostri quam maxume longam efficere. Nam divitiarum et formae gloria fluxa atque fragilis est, virtus clara aeternaque habetur. Sed diu magnum inter mortalis certamen fuit, vine corporis an virtute animi res militaris magis procederet. Nam et, prius quam incipias, consulto et, ubi consulueris, mature facto opus est. Ita utrumque per se indigens alterum alterius auxilio eget.
"""

In [6]:
%%time
lemma_pairs = t.lemmatize(tokenizer.tokenize(text))

[('Omnis', 'omnis'), ('homines', 'homo'), ('qui', 'qui'), ('sese', 'sui'), ('student', 'studeo'), ('praestare', 'praesto'), ('ceteris', 'ceterus'), ('animalibus', 'animal'), (',', ','), ('summa', 'summus'), ('ope', 'ops'), ('niti', 'nitor'), ('decet', 'decet'), (',', ','), ('ne', 'ne'), ('vitam', 'vita'), ('silentio', 'silentium'), ('transeant', 'transeo'), ('veluti', 'veluti'), ('pecora', 'pecus'), (',', ','), ('quae', 'qui'), ('natura', 'natura'), ('prona', 'pronus'), ('atque', 'atque'), ('ventri', 'venter'), ('oboedientia', 'oboedio'), ('finxit', 'fingo'), ('.', '.'), ('Sed', 'sed'), ('nostra', 'noster'), ('omnis', 'omnis'), ('vis', 'vis'), ('in', 'in'), ('animo', 'animus'), ('et', 'et'), ('corpore', 'corpus'), ('sita', 'sino'), ('est', 'sum'), (':', ':'), ('animi', 'animus'), ('imperio', 'imperium'), (',', ','), ('corporis', 'corpus'), ('servitio', 'servitium'), ('magis', 'magis'), ('utimur', 'utor'), (';', ';'), ('alterum', 'alter'), ('nobis', 'nos'), ('cum', 'cum'), ('dis', 'deus

In [7]:
pprint(lemma_pairs[:10])

[('Omnis', 'omnis'),
 ('homines', 'homo'),
 ('qui', 'qui'),
 ('sese', 'sui'),
 ('student', 'studeo'),
 ('praestare', 'praesto'),
 ('ceteris', 'ceterus'),
 ('animalibus', 'animal'),
 (',', ','),
 ('summa', 'summus')]


In [8]:
# text = """ Est brilgum: tovi slimici
# In vabo tererotitant
# Brogovi sunt macresculi
# Momi rasti strugitant.

# "Fuge Gabrobocchia, fili mi,
# Qui fero lacerat morsu:
# Diffide Iubiubae avi
# Es procul ab Unguimanu."""

text = """macresculi
Momi rasti strugitant.

"Fuge Gabrobocchia, fili mi,
Qui fero lacerat morsu:
Diffide Iubiubae avi
Es procul ab Unguimanu."""


In [9]:
lemma_pairs = t.lemmatize(tokenizer.tokenize(text))
pprint(lemma_pairs[:10])

[('macresculi', None), ('Momi', None), ('rasti', None), ('strugitant', None), ('.', '.'), ('"', '"'), ('Fuge', 'fugio'), ('Gabrobocchia', None), (',', ','), ('fili', 'filius'), ('mi', 'meus'), (',', ','), ('Qui', None), ('fero', 'ferus'), ('lacerat', 'lacero'), ('morsu', 'morsus'), (':', ':'), ('Diffide', 'diffido'), ('Iubiubae', None), ('avi', 'avis'), ('Es', 'edo'), ('procul', 'procul'), ('ab', 'ab'), ('Unguimanu', None), ('.', '.')]
[('macresculi', None),
 ('Momi', None),
 ('rasti', None),
 ('strugitant', None),
 ('.', '.'),
 ('"', '"'),
 ('Fuge', 'fugio'),
 ('Gabrobocchia', None),
 (',', ','),
 ('fili', 'filius')]


In [28]:
from cltk.lemmatize.backoff import UnigramLemmatizer, RegexpLemmatizer
from cltk.lemmatize.latin.latin import latin_sub_patterns

# u = UnigramLemmatizer(model={'rastus': 'rasti'})
backoff = RegexpLemmatizer(latin_sub_patterns, backoff=None)
lemmatizer = TreeTaggerLemmatizer(backoff=backoff)

In [29]:
lemma_pairs = lemmatizer.lemmatize(tokenizer.tokenize(text))
pprint(lemma_pairs)

[('macresculi', None), ('Momi', None), ('rasti', None), ('strugitant', None), ('.', '.'), ('"', '"'), ('Fuge', 'fugio'), ('Gabrobocchia', None), (',', ','), ('fili', 'filius'), ('mi', 'meus'), (',', ','), ('Qui', None), ('fero', 'ferus'), ('lacerat', 'lacero'), ('morsu', 'morsus'), (':', ':'), ('Diffide', 'diffido'), ('Iubiubae', None), ('avi', 'avis'), ('Es', 'edo'), ('procul', 'procul'), ('ab', 'ab'), ('Unguimanu', None), ('.', '.')]
[('macresculi', None),
 ('Momi', None),
 ('rasti', None),
 ('strugitant', None),
 ('.', '.'),
 ('"', '"'),
 ('Fuge', 'fugio'),
 ('Gabrobocchia', None),
 (',', ','),
 ('fili', 'filius'),
 ('mi', 'meus'),
 (',', ','),
 ('Qui', None),
 ('fero', 'ferus'),
 ('lacerat', 'lacero'),
 ('morsu', 'morsus'),
 (':', ':'),
 ('Diffide', 'diffido'),
 ('Iubiubae', None),
 ('avi', 'avis'),
 ('Es', 'edo'),
 ('procul', 'procul'),
 ('ab', 'ab'),
 ('Unguimanu', None),
 ('.', '.')]


In [30]:
tagger1 = RegexpLemmatizer(latin_sub_patterns, backoff=None)
tagger2 = TreeTaggerLemmatizer(backoff=tagger1)
print(tagger1._taggers == [tagger1])

True


In [31]:
print(tagger2._taggers == [tagger2, tagger1])

True


In [32]:
print(tagger1._taggers)
print(tagger2._taggers)

[<Regexp Tagger: size=18>]
[<__main__.TreeTaggerLemmatizer object at 0x10a210390>, <Regexp Tagger: size=18>]


In [33]:
print(tagger1)
print(tagger2)

<Regexp Tagger: size=18>
<__main__.TreeTaggerLemmatizer object at 0x10a210390>
