## Backoff Tagging

In [1]:
# Default tagger: assigns one fixed tag ('NN' here) to every token it is given.

from nltk.tag import DefaultTagger
tagger = DefaultTagger('NN')

# Both tokens of the toy sentence come back tagged 'NN'.
print(tagger.tag('Hello World'.split()))

[('Hello', 'NN'), ('World', 'NN')]


In [2]:
# Set up train/test sents

from nltk.corpus import treebank
from nltk.tag import untag

# Fetch the tagged corpus once and carve every split out of that one view.
treebank_tagged = treebank.tagged_sents()
print(f'There are {len(treebank_tagged)} tagged sentences in treebank.')

# Sentences 0-2999 train; sentence 3000 is held out (untagged) as the running
# example; everything after it is used for evaluation.
train_sents = treebank_tagged[:3000]
example_sent = treebank.sents()[3000]
test_sents = treebank_tagged[3001:]

# Show two training sentences: first with their tags, then as plain text.
sent_slice = slice(10,12)
preview_sents = train_sents[sent_slice]
print(preview_sents)
print('\n')

for i, sent in enumerate(preview_sents):
    print(f'Sentence {i+1}:')
    print(f'{" ".join(untag(sent))}\n')

There are 3914 tagged sentences in treebank.
[[('Neither', 'DT'), ('Lorillard', 'NNP'), ('nor', 'CC'), ('the', 'DT'), ('researchers', 'NNS'), ('who', 'WP'), ('*T*-3', '-NONE-'), ('studied', 'VBD'), ('the', 'DT'), ('workers', 'NNS'), ('were', 'VBD'), ('aware', 'JJ'), ('of', 'IN'), ('any', 'DT'), ('research', 'NN'), ('on', 'IN'), ('smokers', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Kent', 'NNP'), ('cigarettes', 'NNS'), ('.', '.')], [('``', '``'), ('We', 'PRP'), ('have', 'VBP'), ('no', 'DT'), ('useful', 'JJ'), ('information', 'NN'), ('on', 'IN'), ('whether', 'IN'), ('users', 'NNS'), ('are', 'VBP'), ('at', 'IN'), ('risk', 'NN'), (',', ','), ("''", "''"), ('said', 'VBD'), ('*T*-1', '-NONE-'), ('James', 'NNP'), ('A.', 'NNP'), ('Talcott', 'NNP'), ('of', 'IN'), ('Boston', 'NNP'), ("'s", 'POS'), ('Dana-Farber', 'NNP'), ('Cancer', 'NNP'), ('Institute', 'NNP'), ('.', '.')]]


Sentence 1:
Neither Lorillard nor the researchers who *T*-3 studied the workers were aware of any research on smokers of the

In [4]:
# Unigram tagger w. training data
# Note that there are also bigram, trigram, etc. taggers, but they will not prove to be useful for lemmatization

from nltk.tag import UnigramTagger
tagger = UnigramTagger(train_sents)

# A token the trained tagger has no entry for comes back with tag None
# (see 'Hello' in the output).
print(tagger.tag('Hello World'.split()))

[('Hello', None), ('World', 'NNP')]


In [5]:
print(tagger.tag(example_sent))

[('At', 'IN'), ('Tokyo', 'NNP'), (',', ','), ('the', 'DT'), ('Nikkei', None), ('index', 'NN'), ('of', 'IN'), ('225', 'CD'), ('selected', None), ('issues', 'NNS'), (',', ','), ('which', 'WDT'), ('*T*-1', '-NONE-'), ('gained', 'VBD'), ('132', None), ('points', 'NNS'), ('Tuesday', 'NNP'), (',', ','), ('added', 'VBD'), ('14.99', None), ('points', 'NNS'), ('to', 'TO'), ('35564.43', None), ('.', '.')]


In [6]:
print(f'Tagger accuracy: {tagger.evaluate(test_sents):.2%}')

Tagger accuracy: 85.72%


In [7]:
# Unigram Tagger w. dictionary
# Instead of training data the tagger is handed a ready-made word -> tag mapping.

tagger = UnigramTagger(model={'Nikkei': 'NNP', 'selected': 'VBN'})

# Only the two words in the mapping receive a tag; every other token is None,
# hence the near-zero accuracy.
print(tagger.tag(example_sent))
print('\n')
print(f'Tagger accuracy: {tagger.evaluate(test_sents):.2%}')

[('At', None), ('Tokyo', None), (',', None), ('the', None), ('Nikkei', 'NNP'), ('index', None), ('of', None), ('225', None), ('selected', 'VBN'), ('issues', None), (',', None), ('which', None), ('*T*-1', None), ('gained', None), ('132', None), ('points', None), ('Tuesday', None), (',', None), ('added', None), ('14.99', None), ('points', None), ('to', None), ('35564.43', None), ('.', None)]


Tagger accuracy: 0.01%


In [8]:
# Backoff tagging
# Tokens the trained unigram tagger cannot tag are handed to the default
# tagger, so the former None results now read 'NN'.

backoff_tagger = DefaultTagger('NN')
tagger = UnigramTagger(train_sents, backoff=backoff_tagger)

print(tagger.tag(example_sent))
print('\n')
print(f'Tagger accuracy: {tagger.evaluate(test_sents):.2%}')

[('At', 'IN'), ('Tokyo', 'NNP'), (',', ','), ('the', 'DT'), ('Nikkei', 'NN'), ('index', 'NN'), ('of', 'IN'), ('225', 'CD'), ('selected', 'NN'), ('issues', 'NNS'), (',', ','), ('which', 'WDT'), ('*T*-1', '-NONE-'), ('gained', 'VBD'), ('132', 'NN'), ('points', 'NNS'), ('Tuesday', 'NNP'), (',', ','), ('added', 'VBD'), ('14.99', 'NN'), ('points', 'NNS'), ('to', 'TO'), ('35564.43', 'NN'), ('.', '.')]


Tagger accuracy: 87.42%


In [9]:
# Regex tagging
# Tags are assigned from (pattern, tag) pairs rather than from a vocabulary.

from nltk.tag import RegexpTagger

# Numbers -> CD, words containing an "-ed" ending -> VBD.
# NOTE(review): in the recorded output '14.99' and '35564.43' are tagged CD as
# well, so a pattern evidently only has to match the start of a token, not the
# whole token -- keep that in mind when adding patterns.
patterns = [
    (r'\b\d+\b', 'CD'),
    (r'\b.+ed\b', 'VBD')
]

tagger = RegexpTagger(patterns)

print(tagger.tag(example_sent))
print('\n')
print(f'Tagger accuracy: {tagger.evaluate(test_sents):.2%}')

[('At', None), ('Tokyo', None), (',', None), ('the', None), ('Nikkei', None), ('index', None), ('of', None), ('225', 'CD'), ('selected', 'VBD'), ('issues', None), (',', None), ('which', None), ('*T*-1', None), ('gained', 'VBD'), ('132', 'CD'), ('points', None), ('Tuesday', None), (',', None), ('added', 'VBD'), ('14.99', 'CD'), ('points', None), ('to', None), ('35564.43', 'CD'), ('.', None)]


Tagger accuracy: 5.36%


In [10]:
# Another backoff chain
# Order of consultation: hand-written dictionary -> trained unigram tagger -> (nothing).

# The chain deliberately ends without a default tagger, so tokens unknown to
# both unigram taggers stay None. Uncomment the next line (and drop the
# `None` assignment) to label them 'NN' instead.
# default_tagger = DefaultTagger('NN')
default_tagger = None
train_tagger = UnigramTagger(train_sents, backoff=default_tagger)
dict_tagger = UnigramTagger(model={'Nikkei': 'NNP', 'selected': 'VBN'}, backoff=train_tagger)
tagger = dict_tagger

from pprint import pprint

pprint(tagger.tag(example_sent))
print('\n')
print(f'Tagger accuracy: {tagger.evaluate(test_sents):.2%}')

[('At', 'IN'),
 ('Tokyo', 'NNP'),
 (',', ','),
 ('the', 'DT'),
 ('Nikkei', 'NNP'),
 ('index', 'NN'),
 ('of', 'IN'),
 ('225', 'CD'),
 ('selected', 'VBN'),
 ('issues', 'NNS'),
 (',', ','),
 ('which', 'WDT'),
 ('*T*-1', '-NONE-'),
 ('gained', 'VBD'),
 ('132', None),
 ('points', 'NNS'),
 ('Tuesday', 'NNP'),
 (',', ','),
 ('added', 'VBD'),
 ('14.99', None),
 ('points', 'NNS'),
 ('to', 'TO'),
 ('35564.43', None),
 ('.', '.')]


Tagger accuracy: 85.73%


## Lemmatization as a backoff task

In [11]:
test = """περὶ πολλοῦ ἂν ποιησαίμην ὦ ἄνδρες τὸ τοιούτους ὑμᾶς ἐμοὶ δικαστὰς περὶ τούτου τοῦ πράγματος γενέσθαι οἷοίπερ ἂν ὑμῖν αὐτοῖς εἴητε τοιαῦτα πεπονθότες"""

In [12]:
# Default lemmatizer: the lemmatizing counterpart of the default tagger --
# every token receives the same placeholder lemma ('Unk').

from cltk.lemmatize.greek.backoff import DefaultLemmatizer
lemmatizer = DefaultLemmatizer('Unk')

print(lemmatizer.lemmatize(test.split()))

[('περὶ', 'Unk'), ('πολλοῦ', 'Unk'), ('ἂν', 'Unk'), ('ποιησαίμην', 'Unk'), ('ὦ', 'Unk'), ('ἄνδρες', 'Unk'), ('τὸ', 'Unk'), ('τοιούτους', 'Unk'), ('ὑμᾶς', 'Unk'), ('ἐμοὶ', 'Unk'), ('δικαστὰς', 'Unk'), ('περὶ', 'Unk'), ('τούτου', 'Unk'), ('τοῦ', 'Unk'), ('πράγματος', 'Unk'), ('γενέσθαι', 'Unk'), ('οἷοίπερ', 'Unk'), ('ἂν', 'Unk'), ('ὑμῖν', 'Unk'), ('αὐτοῖς', 'Unk'), ('εἴητε', 'Unk'), ('τοιαῦτα', 'Unk'), ('πεπονθότες', 'Unk')]


In [13]:
# Set up train/test sents

import pickle
from random import Random

# NOTE: unpickling can execute arbitrary code -- only load this file if it
# comes from a trusted source.
# The context manager closes the file handle as soon as the data is read
# (the bare open() used previously left it open).
with open("../data/tagged_sents.p", "rb") as tagged_file:
    tagged_sents = pickle.load(tagged_file)

# NOTE(review): the message says "treebank" although the data is the pickled
# (token, lemma) corpus; wording kept so the recorded output still matches.
print(f'There are {len(tagged_sents)} tagged sentences in treebank.')

# Shuffle in place with a fixed seed so the split is the same on every run.
Random(4).shuffle(tagged_sents)

# Sentence 0 is the (untagged) running example, 1-29999 train, the rest test.
train_sents = tagged_sents[1:30000]
example_sent = untag(tagged_sents[0])
test_sents = tagged_sents[30000:]

# Show two training sentences: first as (token, lemma) pairs, then as plain text.
sent_slice = slice(10,12)
print(train_sents[sent_slice])
print('\n')

for i, sent in enumerate(train_sents[sent_slice]):
    print(f'Sentence {i+1}:')
    print(f'{" ".join(untag(sent))}\n')

There are 33555 tagged sentences in treebank.
[[('ἔνθα', 'ἔνθα'), ('δὲ', 'δέ'), ('πολλαὶ', 'πολύς'), ('ψυχαὶ', 'ψυχή'), ('ἐλεύσονται', 'ἔρχομαι'), ('νεκύων', 'νέκυς'), ('κατατεθνηώτων', 'καταθνήσκω'), ('.', '.')], [('Ἀριστογείτων', 'Ἀριστογείτων'), ('δὲ', 'δέ'), ('ἐν', 'ἐν'), ('τῷ', 'ὁ'), ('κατὰ', 'κατά'), ('Φρύνης', 'φρύνη'), ('τὸ', 'ὁ'), ('κύριόν', 'κύριος'), ('φησιν', 'φημί'), ('αὐτῆς', 'αὐτός'), ('εἶναι', 'εἰμί'), ('ὄνομα', 'ὄνομα'), ('Μνησαρέτην', 'Μνησαρέτην'), ('.', '.')]]


Sentence 1:
ἔνθα δὲ πολλαὶ ψυχαὶ ἐλεύσονται νεκύων κατατεθνηώτων .

Sentence 2:
Ἀριστογείτων δὲ ἐν τῷ κατὰ Φρύνης τὸ κύριόν φησιν αὐτῆς εἶναι ὄνομα Μνησαρέτην .



In [14]:
# Unigram lemmatizer w. training data
# Note that there are also bigram, trigram, etc. taggers, but they will not prove to be useful for lemmatization

from cltk.lemmatize.greek.backoff import UnigramLemmatizer
lemmatizer = UnigramLemmatizer(train_sents)

# Forms the lemmatizer has no entry for come back with lemma None.
print(lemmatizer.lemmatize(example_sent))
print('\n')
print(f'Lemmatizer accuracy: {lemmatizer.evaluate(test_sents):.2%}')

[('τὸ', 'ὁ'), ('μὲν', 'μέν'), ('αὖτις', 'αὖθις'), ('ἐὺς', 'ἐύς'), ('πάις', 'παῖς'), ('Ἰαπετοῖο', 'Ἰαπέτος'), ('ἔκλεψ̓', None), ('ἀνθρώποισι', 'ἄνθρωπος'), ('Διὸς', 'Ζεύς'), ('πάρα', 'παρά'), ('μητιόεντος', None), ('ἐν', 'ἐν'), ('κοῒλῳ', None), ('νάρθηκι', None), ('λαθὼν', 'λανθάνω'), ('Δία', 'Ζεύς'), ('τερπικέραυνον', 'τερπικέραυνος'), ('.', '.')]


Lemmatizer accuracy: 88.20%


In [16]:
# Unigram lemmatizer w. dictionary
# A hand-written form -> lemma mapping replaces the training data; only the
# two listed forms are lemmatized, everything else is None.

lemmatizer = UnigramLemmatizer(model={'μητιόεντος': 'μητιόεις', 'νάρθηκι': 'νάρθηξ'})

print(lemmatizer.lemmatize(example_sent))
print('\n')
print(f'Lemmatizer accuracy: {lemmatizer.evaluate(test_sents):.2%}')

[('τὸ', None), ('μὲν', None), ('αὖτις', None), ('ἐὺς', None), ('πάις', None), ('Ἰαπετοῖο', None), ('ἔκλεψ̓', None), ('ἀνθρώποισι', None), ('Διὸς', None), ('πάρα', None), ('μητιόεντος', 'μητιόεις'), ('ἐν', None), ('κοῒλῳ', None), ('νάρθηκι', 'νάρθηξ'), ('λαθὼν', None), ('Δία', None), ('τερπικέραυνον', None), ('.', None)]


Lemmatizer accuracy: 0.00%


In [51]:
# Regex lemmatizing
# Lemmas are derived by substitution: each entry is (pattern, replacement).

from cltk.lemmatize.greek.backoff import RegexpLemmatizer

# Rewrites the listed endings after the accented omicron to "-εις"
# (in the output: μητιόεντος -> μητιόεις).
greek_sub_patterns = [
('(ό)(εις|εντος|εντι|εντα)$', r'\1εις'),
]

lemmatizer = RegexpLemmatizer(greek_sub_patterns)

print(lemmatizer.lemmatize(example_sent))
print('\n')
print(f'Lemmatizer accuracy: {lemmatizer.evaluate(test_sents):.2%}')

[('τὸ', None), ('μὲν', None), ('αὖτις', None), ('ἐὺς', None), ('πάις', None), ('Ἰαπετοῖο', None), ('ἔκλεψ̓', None), ('ἀνθρώποισι', None), ('Διὸς', None), ('πάρα', None), ('μητιόεντος', 'μητιόεις'), ('ἐν', None), ('κοῒλῳ', None), ('νάρθηκι', None), ('λαθὼν', None), ('Δία', None), ('τερπικέραυνον', None), ('.', None)]


Lemmatizer accuracy: 0.06%


In [18]:
# Backoff lemmatizing
# The chain is assembled back to front; at lemmatizing time it is consulted as
#   GREEK_MODEL dictionary -> training data -> custom dictionary
#   -> regex substitutions -> default 'Unknown'.

from cltk.lemmatize.greek.greek_model import GREEK_MODEL

lemmatizer_5 = DefaultLemmatizer('Unknown')
lemmatizer_4 = RegexpLemmatizer(greek_sub_patterns, backoff=lemmatizer_5)
lemmatizer_3 = UnigramLemmatizer(model={'μητιόεντος': 'μητιόεις', 'νάρθηκι': 'νάρθηξ'}, backoff=lemmatizer_4)
lemmatizer_2 = UnigramLemmatizer(train_sents, backoff=lemmatizer_3)
lemmatizer = UnigramLemmatizer(model=GREEK_MODEL, backoff=lemmatizer_2)

# In the recorded output '.' is lemmatized as 'punc': that value comes from
# GREEK_MODEL, which sits at the front of the chain.
print(lemmatizer.lemmatize(example_sent))
print('\n')
print(f'Lemmatizer accuracy: {lemmatizer.evaluate(test_sents):.2%}')

[('τὸ', 'ὁ'), ('μὲν', 'μέν'), ('αὖτις', 'αὖθις'), ('ἐὺς', 'ἐύς'), ('πάις', 'παῖς'), ('Ἰαπετοῖο', 'Ἰαπέτος'), ('ἔκλεψ̓', 'Unknown'), ('ἀνθρώποισι', 'ἄνθρωπος'), ('Διὸς', 'Ζεύς'), ('πάρα', 'παρά'), ('μητιόεντος', 'μητιόεις'), ('ἐν', 'ἐν'), ('κοῒλῳ', 'Unknown'), ('νάρθηκι', 'νάρθηξ'), ('λαθὼν', 'λανθάνω'), ('Δία', 'Ζεύς'), ('τερπικέραυνον', 'τερπικέραυνος'), ('.', 'punc')]


Lemmatizer accuracy: 76.92%


In [19]:
# Replace the placeholder lemma 'punc' with the form itself, in place.
# The affected forms are collected first, then rewritten.
punc_forms = [form for form, lemma in GREEK_MODEL.items() if lemma == 'punc']
for form in punc_forms:
    GREEK_MODEL[form] = form

In [20]:
# Backoff lemmatizing; importance of model definition
# Same chain as before, but with GREEK_MODEL's 'punc' placeholders replaced by
# the punctuation mark itself, so '.' is now lemmatized as '.'.

from cltk.lemmatize.greek.greek_model import GREEK_MODEL

# Mutates the imported dictionary in place; only values of existing keys are
# reassigned, so iterating while assigning is safe here.
for key, value in GREEK_MODEL.items():
    if value=='punc':
        GREEK_MODEL[key] = key

# Consulted as: GREEK_MODEL -> training data -> custom dictionary
# -> regex substitutions -> default 'Unknown'.
lemmatizer_5 = DefaultLemmatizer('Unknown')
lemmatizer_4 = RegexpLemmatizer(greek_sub_patterns, backoff=lemmatizer_5)
lemmatizer_3 = UnigramLemmatizer(model={'μητιόεντος': 'μητιόεις', 'νάρθηκι': 'νάρθηξ'}, backoff=lemmatizer_4)
lemmatizer_2 = UnigramLemmatizer(train_sents, backoff=lemmatizer_3)
lemmatizer = UnigramLemmatizer(model=GREEK_MODEL, backoff=lemmatizer_2)

print(lemmatizer.lemmatize(example_sent))
print('\n')
print(f'Lemmatizer accuracy: {lemmatizer.evaluate(test_sents):.2%}')

[('τὸ', 'ὁ'), ('μὲν', 'μέν'), ('αὖτις', 'αὖθις'), ('ἐὺς', 'ἐύς'), ('πάις', 'παῖς'), ('Ἰαπετοῖο', 'Ἰαπέτος'), ('ἔκλεψ̓', 'Unknown'), ('ἀνθρώποισι', 'ἄνθρωπος'), ('Διὸς', 'Ζεύς'), ('πάρα', 'παρά'), ('μητιόεντος', 'μητιόεις'), ('ἐν', 'ἐν'), ('κοῒλῳ', 'Unknown'), ('νάρθηκι', 'νάρθηξ'), ('λαθὼν', 'λανθάνω'), ('Δία', 'Ζεύς'), ('τερπικέραυνον', 'τερπικέραυνος'), ('.', '.')]


Lemmatizer accuracy: 87.71%


## Adding Wrappers to Backoff Lemmatizer

In [21]:
# Create MorpheusWebserviceLemmatizer as subclass of NLTK's Sequential Backoff Tagger

from lxml import etree as ET
from urllib.request import urlopen

import string

import betacode.conv

from nltk.tag.sequential import SequentialBackoffTagger, UnigramTagger
# from cltk.lemmatize.backoff import UnigramLemmatizer, RegexpLemmatizer
from cltk.lemmatize.backoff import RegexpLemmatizer
from nltk.probability import ConditionalFreqDist

class MorpheusWebserviceLemmatizer(SequentialBackoffTagger):
    """Backoff lemmatizer that asks the Perseus Morpheus web service for a lemma.

    Each token is converted to Beta Code, stripped of punctuation and digits,
    and looked up over HTTP.  The text of the first ``<lemma>`` element in the
    XML reply is returned; when the reply contains none, ``None`` is returned.
    Answers are memoized per instance, so a repeated word form costs a single
    request.  Network and XML parsing errors are not caught and propagate to
    the caller.
    """

    # Lookup endpoint; ``{}`` receives the URL-quoted, prepared token.
    _LOOKUP_URL = 'http://www.perseus.tufts.edu/hopper/xmlmorph?lang=greek&lookup={}'

    # Translation table that deletes punctuation and digits.  Built once here
    # instead of on every call to ``_prep_token``.
    _STRIP_TABLE = str.maketrans(dict.fromkeys(string.punctuation + '	̓”“‘᾽（）' + '0123456789'))

    def __init__(self, backoff=None):
        """Set up the lemmatizer.

        :param backoff: Tagger/lemmatizer passed on to ``SequentialBackoffTagger``
        """
        SequentialBackoffTagger.__init__(self, backoff)
        # prepared token -> lemma (or None) for lookups already answered
        self._lemma_cache = {}

    def choose_tag(self, tokens, index, history):
        """Returns a lemma for the token at a given index
        :param tokens: List of tokens to be lemmatized
        :param index: Int with current token
        :param history: List with tokens that have already been lemmatized
            (not used by this lemmatizer)
        :return: String, spec. the lemma found at the current index, or None
            when the service reports no lemma.
        """
        token = self._prep_token(tokens[index])
        return self._lemmatize(token)

    def _lemmatize(self, token):
        """Look up a prepared token and return its first lemma.

        :param token: Token as returned by ``_prep_token``
        :return: Text of the first ``<lemma>`` element in the reply, or None
            when the token is empty or the reply contains no lemma
        """
        # Nothing left after stripping (e.g. bare punctuation): skip the request.
        if not token:
            return None
        if token in self._lemma_cache:
            return self._lemma_cache[token]

        # Quote the token so that characters outside plain ASCII cannot
        # produce an invalid URL; plain ASCII letters pass through unchanged.
        from urllib.parse import quote
        with urlopen(self._LOOKUP_URL.format(quote(token))) as response:
            root = ET.parse(response).getroot()

        first_lemma = root.find('.//lemma')
        lemma = first_lemma.text if first_lemma is not None else None
        self._lemma_cache[token] = lemma
        return lemma

    def _prep_token(self, token):
        """Return the lookup form of ``token``.

        The token is passed through ``betacode.conv.uni_to_beta`` and every
        character in ``_STRIP_TABLE`` (punctuation and digits) is removed.
        """
        return betacode.conv.uni_to_beta(token).translate(self._STRIP_TABLE)

    def lemmatize(self, tokens, return_all=True):
        """Lemmatize a list of tokens.

        :param tokens: List of tokens to be lemmatized
        :param return_all: Accepted but ignored
        :return: Whatever ``self.tag(tokens)`` returns
        """
        return self.tag(tokens)

In [22]:
# Split UnigramLemmatizer into two more clearly named taggers

class DictionaryLemmatizer(UnigramTagger):
    """Unigram lemmatizer driven purely by a ready-made form -> lemma mapping."""

    def __init__(self, train=None, model=None, backoff=None, cutoff=0):
        """Build the lemmatizer from ``model``.

        ``train`` is accepted so the signature mirrors ``UnigramTagger`` but is
        never forwarded: the parent always receives ``train=None``.
        """
        super().__init__(train=None, model=model, backoff=backoff, cutoff=cutoff)

    def lemmatize(self, tokens):
        """Return ``self.tag(tokens)`` for the given list of tokens."""
        return self.tag(tokens)
    
    
class TrainLemmatizer(UnigramTagger):
    """Unigram lemmatizer whose form -> lemma table is learned from tagged sentences."""

    def __init__(self, train=None, model=None, backoff=None, cutoff=0):
        """Build the lemmatizer from training sentences.

        ``model`` is accepted so the signature mirrors ``UnigramTagger`` but is
        never forwarded: the parent always receives ``model=None``.
        """
        UnigramTagger.__init__(self, train=train, model=None, backoff=backoff, cutoff=cutoff)

    def _train(self, tagged_corpus, cutoff=0, verbose=False):
        """
        Fill this tagger's ``_context_to_tag`` table from the given training
        data.  For each context ``c`` in the training data,
        ``_context_to_tag[c]`` is set to the most frequent tag for that
        context, provided that tag was seen more than ``cutoff`` times.

        Every context is a candidate: the backoff tagger(s) are never
        consulted during training.
        NOTE(review): the implementation this was adapted from skipped
        contexts the backoff already tags correctly, and that check was
        removed here -- presumably so that training does not have to query
        slow backoffs such as a web-service lemmatizer; confirm.

        Entries are written into the existing ``_context_to_tag`` mapping; it
        is not cleared first.

        :param tagged_corpus: A tagged corpus.  Each item should be
            a list of (word, tag) tuples.  Empty items are skipped.
        :param cutoff: If the most likely tag for a context occurs
            ``cutoff`` times or fewer, the context is left out of the
            context-to-tag table.
        :param verbose: Accepted for signature compatibility; ignored.
        """
        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            # zip(*sentence) cannot be unpacked for an empty sentence, and
            # there is nothing to learn from one anyway.
            if not sentence:
                continue
            tokens_, tags = zip(*sentence)
            for index, tag in enumerate(tags):
                context = self.context(tokens_, index, tags[:index])
                if context is None:
                    continue
                fd[context][tag] += 1

        # Build the context_to_tag table -- for each context, figure
        # out what the most likely tag is.  Only include contexts whose
        # best tag was seen more than `cutoff` times.
        for context in fd.conditions():
            best_tag = fd[context].max()
            if fd[context][best_tag] > cutoff:
                self._context_to_tag[context] = best_tag

    def lemmatize(self, tokens):
        """Return ``self.tag(tokens)`` for the given list of tokens."""
        return self.tag(tokens)

In [24]:
# Backoff lemmatizing; importance of model definition
# Same idea as the earlier chain, with the Morpheus web service added as the
# last resort before the default. Consulted as:
#   GREEK_MODEL -> training data -> custom dictionary -> regex substitutions
#   -> Morpheus web service -> default 'Unknown'.

from cltk.lemmatize.greek.greek_model import GREEK_MODEL

lemmatizer_6 = DefaultLemmatizer('Unknown')
lemmatizer_5 = MorpheusWebserviceLemmatizer(backoff=lemmatizer_6)
lemmatizer_4 = RegexpLemmatizer(greek_sub_patterns, backoff=lemmatizer_5)
lemmatizer_3 = DictionaryLemmatizer(model={'μητιόεντος': 'μητιόεις', 'νάρθηκι': 'νάρθηξ'}, backoff=lemmatizer_4)
lemmatizer_2 = TrainLemmatizer(train_sents, backoff=lemmatizer_3)
lemmatizer = DictionaryLemmatizer(model=GREEK_MODEL, backoff=lemmatizer_2)

print(lemmatizer.lemmatize(example_sent))
# Evaluation is left disabled here.
# NOTE(review): presumably because every form that reaches the web service
# costs an HTTP request -- confirm.
# print('\n')
# print(f'Lemmatizer accuracy: {lemmatizer.evaluate(test_sents[:5]):.2%}')

[('τὸ', 'ὁ'), ('μὲν', 'μέν'), ('αὖτις', 'αὖθις'), ('ἐὺς', 'ἐύς'), ('πάις', 'παῖς'), ('Ἰαπετοῖο', 'Ἰαπέτος'), ('ἔκλεψ̓', 'ἐκλέπω'), ('ἀνθρώποισι', 'ἄνθρωπος'), ('Διὸς', 'Ζεύς'), ('πάρα', 'παρά'), ('μητιόεντος', 'μητιόεις'), ('ἐν', 'ἐν'), ('κοῒλῳ', 'κοῖλος'), ('νάρθηκι', 'νάρθηξ'), ('λαθὼν', 'λανθάνω'), ('Δία', 'Ζεύς'), ('τερπικέραυνον', 'τερπικέραυνος'), ('.', '.')]


In [25]:
# Backoff lemmatizing; importance of model definition
# NOTE(review): this cell rebuilds exactly the same chain as the cell before it
# (GREEK_MODEL -> training data -> custom dictionary -> regex substitutions
# -> Morpheus web service -> default 'Unknown'); consider dropping one of them.

from cltk.lemmatize.greek.greek_model import GREEK_MODEL

lemmatizer_6 = DefaultLemmatizer('Unknown')
lemmatizer_5 = MorpheusWebserviceLemmatizer(backoff=lemmatizer_6)
lemmatizer_4 = RegexpLemmatizer(greek_sub_patterns, backoff=lemmatizer_5)
lemmatizer_3 = DictionaryLemmatizer(model={'μητιόεντος': 'μητιόεις', 'νάρθηκι': 'νάρθηξ'}, backoff=lemmatizer_4)
lemmatizer_2 = TrainLemmatizer(train_sents, backoff=lemmatizer_3)
lemmatizer = DictionaryLemmatizer(model=GREEK_MODEL, backoff=lemmatizer_2)

# print(example_sent)

print(lemmatizer.lemmatize(example_sent))
# print('\n')
# print(f'Lemmatizer accuracy: {lemmatizer.evaluate(test_sents[:5]):.2%}')

[('τὸ', 'ὁ'), ('μὲν', 'μέν'), ('αὖτις', 'αὖθις'), ('ἐὺς', 'ἐύς'), ('πάις', 'παῖς'), ('Ἰαπετοῖο', 'Ἰαπέτος'), ('ἔκλεψ̓', 'ἐκλέπω'), ('ἀνθρώποισι', 'ἄνθρωπος'), ('Διὸς', 'Ζεύς'), ('πάρα', 'παρά'), ('μητιόεντος', 'μητιόεις'), ('ἐν', 'ἐν'), ('κοῒλῳ', 'κοῖλος'), ('νάρθηκι', 'νάρθηξ'), ('λαθὼν', 'λανθάνω'), ('Δία', 'Ζεύς'), ('τερπικέραυνον', 'τερπικέραυνος'), ('.', '.')]


In [32]:
iliad24 = """λῦτο δ᾽ ἀγών λαοὶ δὲ θοὰς ἐπὶ νῆας ἕκαστοι
ἐσκίδναντ᾽ ἰέναι"""

In [33]:
pprint(lemmatizer.lemmatize(iliad24.split()))

[('λῦτο', 'λύω'),
 ('δ᾽', 'δέ'),
 ('ἀγών', 'ἀγών'),
 ('λαοὶ', 'λαός'),
 ('δὲ', 'δέ'),
 ('θοὰς', 'θοός'),
 ('ἐπὶ', 'ἐπί'),
 ('νῆας', 'ναῦς'),
 ('ἕκαστοι', 'ἕκαστος'),
 ('ἐσκίδναντ᾽', 'σκίδνημι'),
 ('ἰέναι', 'εἶμι')]


In [34]:
republic1 = """κατέβην χθὲς εἰς Πειραιᾶ μετὰ Γλαύκωνος τοῦ Ἀρίστωνος προσευξόμενός τε τῇ θεῷ καὶ ἅμα τὴν ἑορτὴν βουλόμενος θεάσασθαι τίνα τρόπον ποιήσουσιν ἅτε νῦν πρῶτον ἄγοντες"""

In [35]:
pprint(lemmatizer.lemmatize(republic1.split()))

[('κατέβην', 'καταβαίνω'),
 ('χθὲς', 'χθές'),
 ('εἰς', 'εἰμί'),
 ('Πειραιᾶ', 'Πειραιεύς'),
 ('μετὰ', 'μετά'),
 ('Γλαύκωνος', 'Unknown'),
 ('τοῦ', 'ὁ'),
 ('Ἀρίστωνος', 'Ἀρίστων'),
 ('προσευξόμενός', 'προσεύχομαι'),
 ('τε', 'τε'),
 ('τῇ', 'ὁ'),
 ('θεῷ', 'θεός'),
 ('καὶ', 'καί'),
 ('ἅμα', 'ἅμα'),
 ('τὴν', 'ὁ'),
 ('ἑορτὴν', 'ἑορτή'),
 ('βουλόμενος', 'βούλομαι'),
 ('θεάσασθαι', 'θεάομαι'),
 ('τίνα', 'τις'),
 ('τρόπον', 'τρόπος'),
 ('ποιήσουσιν', 'ποιέω'),
 ('ἅτε', 'ἅτε'),
 ('νῦν', 'νῦν'),
 ('πρῶτον', 'πρῶτος'),
 ('ἄγοντες', 'ἄγω')]


In [36]:
mark_1_14 = """Καὶ μετὰ τὸ παραδοθῆναι τὸν Ἰωάνην ἦλθεν ὁ Ἰησοῦς εἰς τὴν Γαλιλαίαν κηρύσσων τὸ εὐαγγέλιον τοῦ θεοῦ καὶ λέγων ὅτι Πεπλήρωται ὁ καιρὸς καὶ ἤγγικεν ἡ βασιλεία τοῦ θεοῦ μετανοεῖτε καὶ πιστεύετε ἐν τῷ εὐαγγελίῳ"""

In [37]:
pprint(lemmatizer.lemmatize(mark_1_14.split()))

[('Καὶ', 'καί'),
 ('μετὰ', 'μετά'),
 ('τὸ', 'ὁ'),
 ('παραδοθῆναι', 'παραδίδωμι'),
 ('τὸν', 'ὁ'),
 ('Ἰωάνην', 'Unknown'),
 ('ἦλθεν', 'ἔρχομαι'),
 ('ὁ', 'ὁ'),
 ('Ἰησοῦς', 'Ἰησοῦς'),
 ('εἰς', 'εἰμί'),
 ('τὴν', 'ὁ'),
 ('Γαλιλαίαν', 'Unknown'),
 ('κηρύσσων', 'κηρύσσω'),
 ('τὸ', 'ὁ'),
 ('εὐαγγέλιον', 'εὐαγγέλιον'),
 ('τοῦ', 'ὁ'),
 ('θεοῦ', 'θεός'),
 ('καὶ', 'καί'),
 ('λέγων', 'λέγω'),
 ('ὅτι', 'ὅτι'),
 ('Πεπλήρωται', 'πληρόω'),
 ('ὁ', 'ὁ'),
 ('καιρὸς', 'καιρός'),
 ('καὶ', 'καί'),
 ('ἤγγικεν', 'Unknown'),
 ('ἡ', 'ὁ'),
 ('βασιλεία', 'βασιλεία'),
 ('τοῦ', 'ὁ'),
 ('θεοῦ', 'θεός'),
 ('μετανοεῖτε', 'μετανοέω'),
 ('καὶ', 'καί'),
 ('πιστεύετε', 'πιστεύω'),
 ('ἐν', 'ἐν'),
 ('τῷ', 'ὁ'),
 ('εὐαγγελίῳ', 'εὐαγγέλιον')]


In [58]:
# Backoff lemmatizing; customize chain for Plato

from cltk.lemmatize.greek.greek_model import GREEK_MODEL
custom_dict = {'μητιόεντος': 'μητιόεις', 'νάρθηκι': 'νάρθηξ'}

# Extra substitution: drop the endings ος/ι/α after "ων"
# (e.g. Γλαύκωνος -> Γλαύκων).
# The shared pattern list is extended in place, as before, so later cells see
# the new pattern; the membership guard keeps a re-run of this cell from
# appending the same pattern a second time.
plato_pattern = ('(ων)(ος|ι|α)$', r'\1')
if plato_pattern not in greek_sub_patterns:
    greek_sub_patterns.append(plato_pattern)

# Consulted as: GREEK_MODEL -> training data -> custom dictionary
# -> regex substitutions -> Morpheus web service -> default 'Unknown'.
lemmatizer_6 = DefaultLemmatizer('Unknown')
lemmatizer_5 = MorpheusWebserviceLemmatizer(backoff=lemmatizer_6)
lemmatizer_4 = RegexpLemmatizer(greek_sub_patterns, backoff=lemmatizer_5)
lemmatizer_3 = DictionaryLemmatizer(model=custom_dict, backoff=lemmatizer_4)
lemmatizer_2 = TrainLemmatizer(train_sents, backoff=lemmatizer_3)
lemmatizer = DictionaryLemmatizer(model=GREEK_MODEL, backoff=lemmatizer_2)

In [55]:
pprint(lemmatizer.lemmatize(republic1.split()))

[('κατέβην', 'καταβαίνω'),
 ('χθὲς', 'χθές'),
 ('εἰς', 'εἰμί'),
 ('Πειραιᾶ', 'Πειραιεύς'),
 ('μετὰ', 'μετά'),
 ('Γλαύκωνος', 'Γλαύκων'),
 ('τοῦ', 'ὁ'),
 ('Ἀρίστωνος', 'Ἀρίστων'),
 ('προσευξόμενός', 'προσεύχομαι'),
 ('τε', 'τε'),
 ('τῇ', 'ὁ'),
 ('θεῷ', 'θεός'),
 ('καὶ', 'καί'),
 ('ἅμα', 'ἅμα'),
 ('τὴν', 'ὁ'),
 ('ἑορτὴν', 'ἑορτή'),
 ('βουλόμενος', 'βούλομαι'),
 ('θεάσασθαι', 'θεάομαι'),
 ('τίνα', 'τις'),
 ('τρόπον', 'τρόπος'),
 ('ποιήσουσιν', 'ποιέω'),
 ('ἅτε', 'ἅτε'),
 ('νῦν', 'νῦν'),
 ('πρῶτον', 'πρῶτος'),
 ('ἄγοντες', 'ἄγω')]


In [62]:
# Backoff lemmatizing; customize chain for Mark
# (the added dictionary entry covers a form from mark_1_14 that the previous
# chain left as 'Unknown')

from cltk.lemmatize.greek.greek_model import GREEK_MODEL
custom_dict = {'μητιόεντος': 'μητιόεις', 'νάρθηκι': 'νάρθηξ'}
# NOTE(review): the lemma below is written without an accent; the usual
# dictionary form is ἐγγίζω -- confirm and correct if the evaluation data
# uses the accented form.
custom_dict.update({'ἤγγικεν': 'ἐγγιζω'})

# Consulted as: GREEK_MODEL -> training data -> custom dictionary
# -> regex substitutions -> Morpheus web service -> default 'Unknown'.
lemmatizer_6 = DefaultLemmatizer('Unknown')
lemmatizer_5 = MorpheusWebserviceLemmatizer(backoff=lemmatizer_6)
lemmatizer_4 = RegexpLemmatizer(greek_sub_patterns, backoff=lemmatizer_5)
lemmatizer_3 = DictionaryLemmatizer(model=custom_dict, backoff=lemmatizer_4)
lemmatizer_2 = TrainLemmatizer(train_sents, backoff=lemmatizer_3)
lemmatizer = DictionaryLemmatizer(model=GREEK_MODEL, backoff=lemmatizer_2)

In [63]:
pprint(lemmatizer.lemmatize(mark_1_14.split()))

[('Καὶ', 'καί'),
 ('μετὰ', 'μετά'),
 ('τὸ', 'ὁ'),
 ('παραδοθῆναι', 'παραδίδωμι'),
 ('τὸν', 'ὁ'),
 ('Ἰωάνην', 'Unknown'),
 ('ἦλθεν', 'ἔρχομαι'),
 ('ὁ', 'ὁ'),
 ('Ἰησοῦς', 'Ἰησοῦς'),
 ('εἰς', 'εἰμί'),
 ('τὴν', 'ὁ'),
 ('Γαλιλαίαν', 'Unknown'),
 ('κηρύσσων', 'κηρύσσω'),
 ('τὸ', 'ὁ'),
 ('εὐαγγέλιον', 'εὐαγγέλιον'),
 ('τοῦ', 'ὁ'),
 ('θεοῦ', 'θεός'),
 ('καὶ', 'καί'),
 ('λέγων', 'λέγω'),
 ('ὅτι', 'ὅτι'),
 ('Πεπλήρωται', 'πληρόω'),
 ('ὁ', 'ὁ'),
 ('καιρὸς', 'καιρός'),
 ('καὶ', 'καί'),
 ('ἤγγικεν', 'ἐγγιζω'),
 ('ἡ', 'ὁ'),
 ('βασιλεία', 'βασιλεία'),
 ('τοῦ', 'ὁ'),
 ('θεοῦ', 'θεός'),
 ('μετανοεῖτε', 'μετανοέω'),
 ('καὶ', 'καί'),
 ('πιστεύετε', 'πιστεύω'),
 ('ἐν', 'ἐν'),
 ('τῷ', 'ὁ'),
 ('εὐαγγελίῳ', 'εὐαγγέλιον')]


In [68]:
print(f'Lemmatizer accuracy: {lemmatizer.evaluate(test_sents[:250]):.2%}')

Lemmatizer accuracy: 94.80%
