# Backoff Lemmatization for Ancient Greek with the Classical Language Toolkit
### Patrick J. Burns, Institute for the Study of the Ancient World

[diyclassics.github.io](https://diyclassics.github.io) | [@diyclassics](https://twitter.com/diyclassics)

An IPython (Jupyter) notebook written to introduce seminar participants to the code behind backoff tagging, backoff lemmatization, and related matters.

Presented at the Institute for Classical Studies as part of the Digital Classics London Summer Seminar on 7.27.18.

Main repo at: [https://github.com/diyclassics/dcl-2018-talk](https://github.com/diyclassics/dcl-2018-talk)

Last updated 7.27.18.

## Preliminaries

Necessary Python packages (not necessary for working in Binder):

- pip install jupyter
- pip install -e git+https://github.com/diyclassics/cltk.git@lemma-update#egg=cltk (see https://pip.readthedocs.io/en/1.1/usage.html#version-control-systems on installing packages from forks/branches)

Necessary NLTK corpora:
- treebank (installed below; provides the tagged sentences used in the tagging examples)

## Installation

In [None]:
# Fetch NLTK's 'treebank' corpus, which the tagging cells below read from
import nltk
nltk.download('treebank')

## Backoff Tagging

In [None]:
# Default tagger: constructed with a single tag ('NN') and no training data

from nltk.tag import DefaultTagger
tagger = DefaultTagger('NN')

# Tag a whitespace-tokenized two-word sentence
print(tagger.tag('Hello World'.split()))

In [None]:
# Set up train/test sents

from nltk.corpus import treebank
from nltk.tag import untag


print(f'There are {len(treebank.tagged_sents())} tagged sentences in treebank.')

# Split: sentences 0-2999 train, sentence 3000 is the (untagged) worked
# example, sentences 3001 onward are held out for evaluation
train_sents = treebank.tagged_sents()[:3000]
example_sent = treebank.sents()[3000]
test_sents = treebank.tagged_sents()[3001:]

# Peek at two training sentences, first as (word, tag) pairs...
sent_slice = slice(10,12)
print(train_sents[sent_slice])
print('\n')

# ...then as plain text with the tags stripped by untag()
for i, sent in enumerate(train_sents[sent_slice]):
    print(f'Sentence {i+1}:')
#     print(f'{" ".join([word for word, _ in sent])}\n')
    print(f'{" ".join(untag(sent))}\n')

In [None]:
# Unigram tagger w. training data
# Note that there are also bigram, trigram, etc. taggers, but they will not prove to be useful for lemmatization

from nltk.tag import UnigramTagger
# Train on the tagged training sentences prepared in the previous cell
tagger = UnigramTagger(train_sents)

print(tagger.tag('Hello World'.split()))

In [None]:
# Tag the worked example sentence with whichever `tagger` is currently defined
print(tagger.tag(example_sent))

In [None]:
# Score the current `tagger` against the held-out test sentences, shown as a percentage
print(f'Tagger accuracy: {tagger.evaluate(test_sents):.2%}')

In [None]:
# Unigram Tagger w. dictionary

# Here the tagger's only knowledge is a hand-written two-entry word -> tag mapping;
# no training data is supplied
tagger = UnigramTagger(model={'Nikkei': 'NNP', 'selected': 'VBN'})

print(tagger.tag(example_sent))
print('\n')
print(f'Tagger accuracy: {tagger.evaluate(test_sents):.2%}')

In [None]:
# Backoff tagging

# The trained unigram tagger is given a DefaultTagger('NN') as its backoff,
# i.e. the tagger to defer to for tokens it cannot tag itself
backoff_tagger = DefaultTagger('NN')
tagger = UnigramTagger(train_sents, backoff=backoff_tagger)

print(tagger.tag(example_sent))
print('\n')
print(f'Tagger accuracy: {tagger.evaluate(test_sents):.2%}')

In [None]:
# Regex tagging

from nltk.tag import RegexpTagger

# (pattern, tag) pairs: runs of digits are tagged 'CD',
# tokens ending in "ed" are tagged 'VBD'
patterns = [
    (r'\b\d+\b', 'CD'),
    (r'\b.+ed\b', 'VBD')
]

tagger = RegexpTagger(patterns)

print(tagger.tag(example_sent))
print('\n')
print(f'Tagger accuracy: {tagger.evaluate(test_sents):.2%}')

In [None]:
# Another backoff chain
# Order of consultation: hand-written dictionary first, then the tagger
# trained on train_sents. The chain ends there because default_tagger is
# None; swap in the commented-out DefaultTagger line to add a final fallback.

# default_tagger = DefaultTagger('NN')
default_tagger = None
train_tagger = UnigramTagger(train_sents, backoff=default_tagger)
dict_tagger = UnigramTagger(model={'Nikkei': 'NNP', 'selected': 'VBN'}, backoff=train_tagger)
tagger = dict_tagger

# pprint is also used by later cells in this notebook
from pprint import pprint

pprint(tagger.tag(example_sent))
print('\n')
print(f'Tagger accuracy: {tagger.evaluate(test_sents):.2%}')

## Lemmatization as a backoff task

In [None]:
# Test selection; Lysias 1.1
# Plain text without punctuation; split on whitespace below to get tokens

test = """περὶ πολλοῦ ἂν ποιησαίμην ὦ ἄνδρες τὸ τοιούτους ὑμᾶς ἐμοὶ δικαστὰς περὶ τούτου τοῦ πράγματος γενέσθαι οἷοίπερ ἂν ὑμῖν αὐτοῖς εἴητε τοιαῦτα πεπονθότες"""

In [None]:
# Default lemmatizer
# Lemmatizer counterpart of DefaultTagger above: built from one fixed
# string ('Unk') — presumably returned as the lemma of every token

from cltk.lemmatize.greek.backoff import DefaultLemmatizer
lemmatizer = DefaultLemmatizer('Unk')

print(lemmatizer.lemmatize(test.split()))

In [None]:
# Set up train/test sents

import pickle

# Load the pre-tagged Greek sentences. The file is opened in a `with` block
# so the handle is closed as soon as loading finishes.
# NOTE(review): unpickling can execute arbitrary code; only load a
# tagged_sents.p file that comes from a trusted source.
with open("../data/tagged_sents.p", "rb") as pickle_file:
    tagged_sents = pickle.load(pickle_file)

print(f'There are {len(tagged_sents)} tagged sentences in treebank.')

# Shuffle in place with a fixed seed so the split below is reproducible
from random import Random
Random(4).shuffle(tagged_sents)

# Split: sentence 0 is the (untagged) worked example, sentences 1-29999
# train, sentences 30000 onward are held out for evaluation
train_sents = tagged_sents[1:30000]
example_sent = untag(tagged_sents[0])
test_sents = tagged_sents[30000:]

# Peek at two training sentences, first as (token, lemma) pairs...
sent_slice = slice(10,12)
print(train_sents[sent_slice])
print('\n')

# ...then as plain text with the annotations stripped by untag()
for i, sent in enumerate(train_sents[sent_slice]):
    print(f'Sentence {i+1}:')
    print(f'{" ".join(untag(sent))}\n')

In [None]:
# Unigram lemmatizer w. training data
# Note that there are also bigram, trigram, etc. taggers, but they will not prove to be useful for lemmatization

from cltk.lemmatize.greek.backoff import UnigramLemmatizer
# Train on the Greek training sentences prepared in the previous cell
lemmatizer = UnigramLemmatizer(train_sents)

print(lemmatizer.lemmatize(example_sent))
print('\n')
print(f'Lemmatizer accuracy: {lemmatizer.evaluate(test_sents):.2%}')

In [None]:
# Unigram lemmatizer w. dictionary

# Here the lemmatizer's only knowledge is a hand-written two-entry
# form -> lemma mapping; no training data is supplied
lemmatizer = UnigramLemmatizer(model={'μητιόεντος': 'μητιόεις', 'νάρθηκι': 'νάρθηξ'})

print(lemmatizer.lemmatize(example_sent))
print('\n')
print(f'Lemmatizer accuracy: {lemmatizer.evaluate(test_sents):.2%}')

In [None]:
# Regex lemmatizing

from cltk.lemmatize.greek.backoff import RegexpLemmatizer

# (pattern, replacement) pairs. This one matches a token-final ό followed by
# one of the endings εις/εντος/εντι/εντα and rewrites that tail to όεις.
# Later cells reuse (and extend) this list.
greek_sub_patterns = [
    ('(ό)(εις|εντος|εντι|εντα)$', r'\1εις'),
]
    
lemmatizer = RegexpLemmatizer(greek_sub_patterns)

print(lemmatizer.lemmatize(example_sent))
print('\n')
print(f'Lemmatizer accuracy: {lemmatizer.evaluate(test_sents):.2%}')

In [None]:
# Backoff lemmatizing
# Each lemmatizer names the next one as its backoff, so the order of
# consultation is: GREEK_MODEL dictionary -> training data -> custom
# dictionary -> regex patterns -> the fixed default 'Unknown'.

from cltk.lemmatize.greek.greek_model import GREEK_MODEL

lemmatizer_5 = DefaultLemmatizer('Unknown')
lemmatizer_4 = RegexpLemmatizer(greek_sub_patterns, backoff=lemmatizer_5)
lemmatizer_3 = UnigramLemmatizer(model={'μητιόεντος': 'μητιόεις', 'νάρθηκι': 'νάρθηξ'}, backoff=lemmatizer_4)
lemmatizer_2 = UnigramLemmatizer(train_sents, backoff=lemmatizer_3)
lemmatizer   = UnigramLemmatizer(model=GREEK_MODEL, backoff=lemmatizer_2)

print(lemmatizer.lemmatize(example_sent))
print('\n')
print(f'Lemmatizer accuracy: {lemmatizer.evaluate(test_sents):.2%}')

In [None]:
# Normalize punctuation
# Every GREEK_MODEL entry whose lemma is the placeholder 'punc' is
# rewritten so that the entry maps to itself instead.

GREEK_MODEL.update(
    {form: form for form, lemma in GREEK_MODEL.items() if lemma == 'punc'}
)

In [None]:
# Backoff lemmatizing; importance of model definition
# Same chain as before, rebuilt after the 'punc' placeholders in
# GREEK_MODEL have been replaced, so the two accuracy figures can be compared.

from cltk.lemmatize.greek.greek_model import GREEK_MODEL

# Repeats the normalization from the previous cell: entries whose lemma is
# the placeholder 'punc' are made to map to themselves
for key, value in GREEK_MODEL.items():
    if value=='punc':
        GREEK_MODEL[key] = key

lemmatizer_5 = DefaultLemmatizer('Unknown')
lemmatizer_4 = RegexpLemmatizer(greek_sub_patterns, backoff=lemmatizer_5)
lemmatizer_3 = UnigramLemmatizer(model={'μητιόεντος': 'μητιόεις', 'νάρθηκι': 'νάρθηξ'}, backoff=lemmatizer_4)
lemmatizer_2 = UnigramLemmatizer(train_sents, backoff=lemmatizer_3)
lemmatizer = UnigramLemmatizer(model=GREEK_MODEL, backoff=lemmatizer_2)

print(lemmatizer.lemmatize(example_sent))
print('\n')
print(f'Lemmatizer accuracy: {lemmatizer.evaluate(test_sents):.2%}')

## Adding Wrappers to Backoff Lemmatizer

In [None]:
# Create MorpheusWebserviceLemmatizer as subclass of NLTK's Sequential Backoff Tagger

import string
from urllib.parse import quote
from urllib.request import urlopen

import betacode.conv
from lxml import etree as ET
from nltk.probability import ConditionalFreqDist
from nltk.tag.sequential import SequentialBackoffTagger, UnigramTagger

# from cltk.lemmatize.backoff import UnigramLemmatizer, RegexpLemmatizer
from cltk.lemmatize.backoff import RegexpLemmatizer

class MorpheusWebserviceLemmatizer(SequentialBackoffTagger):
    """Backoff lemmatizer that queries the Perseus Morpheus web service.

    Each token is converted to Beta Code, stripped of punctuation and
    digits, and looked up at the Perseus ``xmlmorph`` endpoint. The text of
    the first ``<lemma>`` element in the XML response is used as the lemma.
    When no lemma is found ``None`` is returned, which is how an NLTK
    sequential backoff tagger signals that the next tagger in the chain
    should be tried.

    Every lookup is a blocking HTTP request, so this lemmatizer is best
    placed late in a backoff chain.
    """

    # Endpoint queried for every token; the prepared token is appended.
    _SERVICE_URL = 'http://www.perseus.tufts.edu/hopper/xmlmorph?lang=greek&lookup='

    # Characters deleted from the Beta Code form before lookup. Because
    # string.punctuation is included, the ASCII symbols Beta Code uses for
    # diacritics are removed as well. Built once here rather than per token.
    _PUNCTUATION = string.punctuation + '	̓”“‘᾽（）'
    _NUMBERS = '0123456789'
    _STRIP_TABLE = str.maketrans(dict.fromkeys(_PUNCTUATION + _NUMBERS))

    def __init__(self, backoff=None):
        """Set up the lemmatizer.

        :param backoff: Tagger/lemmatizer to consult when the web service
            yields no lemma, or None to end the chain here.
        """
        super().__init__(backoff)

    def choose_tag(self, tokens, index, history):
        """Returns a lemma for the token at a given index
        :param tokens: List of tokens to be lemmatized
        :param index: Int with current token
        :param history: List with tokens that have already been lemmatized
            (unused here; each token is looked up on its own)
        :return: String, spec. the lemma found at the current index, or
            None when the service returns no lemma.
        """
        token = self._prep_token(tokens[index])
        return self._lemmatize(token)

    def _lemmatize(self, token):
        """Look up an already-prepared token and return its first lemma.

        :param token: Beta Code string as produced by ``_prep_token``
        :return: Text of the first ``<lemma>`` element in the response, or
            None if the token is empty or the response has no lemma.
        """
        if not token:
            # Nothing is left after stripping (e.g. a bare punctuation
            # mark): there is nothing to look up, so skip the request.
            return None
        # Percent-encode the token so unexpected characters cannot corrupt
        # the query string; plain ASCII letters pass through unchanged.
        url = self._SERVICE_URL + quote(token, safe='')
        with urlopen(url) as response:
            root = ET.parse(response).getroot()
        # find() returns the first match in document order, or None.
        lemma = root.find('.//lemma')
        if lemma is None:
            return None
        return lemma.text

    def _prep_token(self, token):
        """Convert a Unicode Greek token to the form sent to the service.

        :param token: Token in Unicode Greek
        :return: Beta Code transliteration with punctuation and digits removed
        """
        return betacode.conv.uni_to_beta(token).translate(self._STRIP_TABLE)

    def lemmatize(self, tokens, return_all=True):
        """Lemmatize a list of tokens via the backoff chain.

        :param tokens: List of tokens to be lemmatized
        :param return_all: Accepted for signature compatibility; not used.
        :return: Whatever ``self.tag(tokens)`` returns for the tokens.
        """
        return self.tag(tokens)

In [None]:
# Split UnigramLemmatizer into two more clearly named taggers

class DictionaryLemmatizer(UnigramTagger):
    """Unigram lemmatizer that works only from a supplied form -> lemma model.

    This is the dictionary half of the UnigramLemmatizer split announced
    above: training data is never used, even if it is passed in.
    """

    def __init__(self, train=None, model=None, backoff=None, cutoff=0):
        """Set up the lemmatizer from a lookup model.

        :param train: Accepted so the signature mirrors UnigramTagger, but
            always discarded (None is passed to the parent instead).
        :param model: Mapping handed to UnigramTagger as its model.
        :param backoff: Next tagger/lemmatizer in the backoff chain, if any.
        :param cutoff: Forwarded unchanged to UnigramTagger.
        """
        super().__init__(train=None, model=model, backoff=backoff, cutoff=cutoff)

    def lemmatize(self, tokens):
        """Lemmatize ``tokens`` by delegating to the inherited ``tag``."""
        lemmatized = self.tag(tokens)
        return lemmatized
    
    
class TrainLemmatizer(UnigramTagger):
    """Unigram lemmatizer that learns only from annotated training sentences.

    This is the training-data half of the UnigramLemmatizer split announced
    above: a ``model`` is never used, even if one is passed in.
    """

    def __init__(self, train=None, model=None, backoff=None, cutoff=0):
        """Set up the lemmatizer from training data.

        :param train: Annotated sentences handed to UnigramTagger for training.
        :param model: Accepted so the signature mirrors UnigramTagger, but
            always discarded (None is passed to the parent instead).
        :param backoff: Next tagger/lemmatizer in the backoff chain, if any.
        :param cutoff: Forwarded unchanged to UnigramTagger.
        """
        UnigramTagger.__init__(self, train=train, model=None, backoff=backoff, cutoff=cutoff)

    def _train(self, tagged_corpus, cutoff=0, verbose=False):
        """
        Fill ``self._context_to_tag`` from the given training data: for
        each context ``c`` seen in the data, ``_context_to_tag[c]`` is set
        to the tag that occurs most often with that context.

        This is an adaptation of NLTK's ``ContextTagger._train``. Per the
        original author's note, the check that skipped contexts already
        handled correctly by the backoff tagger had to be removed, so every
        context seen in the training data is a candidate here and the
        backoff chain is never consulted during training.

        Entries are added to the existing ``self._context_to_tag`` table;
        this method does not clear it first.

        :param tagged_corpus: A tagged corpus.  Each item should be
            a list of (word, tag) tuples.  Empty sentences are skipped.
        :param cutoff: A context is kept only if its most frequent tag
            occurs strictly more than ``cutoff`` times.
        :param verbose: Accepted for signature compatibility; not used.
        """
        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            if not sentence:
                # zip(*[]) yields nothing to unpack; an empty sentence
                # contributes no counts, so skip it instead of crashing.
                continue
            tokens_, tags = zip(*sentence)
            for index, (_, tag) in enumerate(sentence):
                context = self.context(tokens_, index, tags[:index])
                if context is None:
                    continue
                fd[context][tag] += 1

        # Build the context_to_tag table -- for each context, figure
        # out what the most likely tag is, and keep it only if that tag
        # was seen more than `cutoff` times.
        for context, tag_counts in fd.items():
            best_tag = tag_counts.max()
            if tag_counts[best_tag] > cutoff:
                self._context_to_tag[context] = best_tag

    def lemmatize(self, tokens):
        """Lemmatize ``tokens`` by delegating to the inherited ``tag``."""
        return self.tag(tokens)

In [None]:
# Backoff lemmatizing; importance of model definition
# Order of consultation: GREEK_MODEL dictionary -> training data -> custom
# dictionary -> regex patterns -> Morpheus web service -> default 'Unknown'.

from cltk.lemmatize.greek.greek_model import GREEK_MODEL

lemmatizer_6 = DefaultLemmatizer('Unknown')
lemmatizer_5 = MorpheusWebserviceLemmatizer(backoff=lemmatizer_6)
lemmatizer_4 = RegexpLemmatizer(greek_sub_patterns, backoff=lemmatizer_5)
lemmatizer_3 = DictionaryLemmatizer(model={'μητιόεντος': 'μητιόεις', 'νάρθηκι': 'νάρθηξ'}, backoff=lemmatizer_4)
lemmatizer_2 = TrainLemmatizer(train_sents, backoff=lemmatizer_3)
lemmatizer   = DictionaryLemmatizer(model=GREEK_MODEL, backoff=lemmatizer_2)

print(lemmatizer.lemmatize(example_sent))
# Evaluation is left disabled here — presumably because tokens that reach
# the web-service lemmatizer each cost an HTTP request (TODO confirm)
# print('\n')
# print(f'Lemmatizer accuracy: {lemmatizer.evaluate(test_sents[:5]):.2%}')

In [None]:
# Backoff lemmatizing; importance of model definition
# Rebuilds the same six-step chain as the previous cell: GREEK_MODEL
# dictionary -> training data -> custom dictionary -> regex patterns ->
# Morpheus web service -> default 'Unknown'.

from cltk.lemmatize.greek.greek_model import GREEK_MODEL

lemmatizer_6 = DefaultLemmatizer('Unknown')
lemmatizer_5 = MorpheusWebserviceLemmatizer(backoff=lemmatizer_6)
lemmatizer_4 = RegexpLemmatizer(greek_sub_patterns, backoff=lemmatizer_5)
lemmatizer_3 = DictionaryLemmatizer(model={'μητιόεντος': 'μητιόεις', 'νάρθηκι': 'νάρθηξ'}, backoff=lemmatizer_4)
lemmatizer_2 = TrainLemmatizer(train_sents, backoff=lemmatizer_3)
lemmatizer   = DictionaryLemmatizer(model=GREEK_MODEL, backoff=lemmatizer_2)

# print(example_sent)

print(lemmatizer.lemmatize(example_sent))
# Evaluation is left disabled in this cell as well
# print('\n')
# print(f'Lemmatizer accuracy: {lemmatizer.evaluate(test_sents[:5]):.2%}')

In [None]:
# Sample text for the chain above; presumably from Iliad 24, going by the
# variable name (TODO confirm citation). Note the embedded line break.
iliad24 = """λῦτο δ᾽ ἀγών λαοὶ δὲ θοὰς ἐπὶ νῆας ἕκαστοι
ἐσκίδναντ᾽ ἰέναι"""

In [None]:
# Lemmatize the whitespace-split passage with the current chain
pprint(lemmatizer.lemmatize(iliad24.split()))

In [None]:
# Sample text; presumably from Plato, Republic 1, going by the variable
# name and the later "customize chain for Plato" cell (TODO confirm citation)
republic1 = """κατέβην χθὲς εἰς Πειραιᾶ μετὰ Γλαύκωνος τοῦ Ἀρίστωνος προσευξόμενός τε τῇ θεῷ καὶ ἅμα τὴν ἑορτὴν βουλόμενος θεάσασθαι τίνα τρόπον ποιήσουσιν ἅτε νῦν πρῶτον ἄγοντες"""

In [None]:
# Lemmatize the whitespace-split passage with the current chain
pprint(lemmatizer.lemmatize(republic1.split()))

In [None]:
# Sample text; presumably Gospel of Mark 1:14 and following, going by the
# variable name (TODO confirm citation)
mark_1_14 = """Καὶ μετὰ τὸ παραδοθῆναι τὸν Ἰωάνην ἦλθεν ὁ Ἰησοῦς εἰς τὴν Γαλιλαίαν κηρύσσων τὸ εὐαγγέλιον τοῦ θεοῦ καὶ λέγων ὅτι Πεπλήρωται ὁ καιρὸς καὶ ἤγγικεν ἡ βασιλεία τοῦ θεοῦ μετανοεῖτε καὶ πιστεύετε ἐν τῷ εὐαγγελίῳ"""

In [None]:
# Lemmatize the whitespace-split passage with the current chain
pprint(lemmatizer.lemmatize(mark_1_14.split()))

In [None]:
# Backoff lemmatizing; customize chain for Plato
# Order of consultation: GREEK_MODEL dictionary -> training data -> custom
# dictionary -> regex patterns -> Morpheus web service -> default 'Unknown'.

from cltk.lemmatize.greek.greek_model import GREEK_MODEL
custom_dict = {'μητιόεντος': 'μητιόεις', 'νάρθηκι': 'νάρθηξ'}

# Extra rule: drop a final -ος/-ι/-α after -ων (e.g. Γλαύκωνος -> Γλαύκων).
# greek_sub_patterns is shared, mutable notebook state, so the rule is added
# only if it is not already there; otherwise every re-run of this cell
# would append another copy of the same pattern.
omega_nu_pattern = ('(ων)(ος|ι|α)$', r'\1')
if omega_nu_pattern not in greek_sub_patterns:
    greek_sub_patterns.append(omega_nu_pattern)

lemmatizer_6 = DefaultLemmatizer('Unknown')
lemmatizer_5 = MorpheusWebserviceLemmatizer(backoff=lemmatizer_6)
lemmatizer_4 = RegexpLemmatizer(greek_sub_patterns, backoff=lemmatizer_5)
lemmatizer_3 = DictionaryLemmatizer(model=custom_dict, backoff=lemmatizer_4)
lemmatizer_2 = TrainLemmatizer(train_sents, backoff=lemmatizer_3)
lemmatizer = DictionaryLemmatizer(model=GREEK_MODEL, backoff=lemmatizer_2)

In [None]:
# Re-run the Republic passage through the chain with the added regex rule
pprint(lemmatizer.lemmatize(republic1.split()))

In [None]:
# Backoff lemmatizing; customize chain for Mark
# Order of consultation: GREEK_MODEL dictionary -> training data -> custom
# dictionary -> regex patterns -> Morpheus web service -> default 'Unknown'.

from cltk.lemmatize.greek.greek_model import GREEK_MODEL

# The custom dictionary gains an entry for ἤγγικεν, a form that occurs in
# the mark_1_14 passage; its lemma is written with the accent (ἐγγίζω).
custom_dict = {
    'μητιόεντος': 'μητιόεις',
    'νάρθηκι': 'νάρθηξ',
    'ἤγγικεν': 'ἐγγίζω',
}

lemmatizer_6 = DefaultLemmatizer('Unknown')
lemmatizer_5 = MorpheusWebserviceLemmatizer(backoff=lemmatizer_6)
lemmatizer_4 = RegexpLemmatizer(greek_sub_patterns, backoff=lemmatizer_5)
lemmatizer_3 = DictionaryLemmatizer(model=custom_dict, backoff=lemmatizer_4)
lemmatizer_2 = TrainLemmatizer(train_sents, backoff=lemmatizer_3)
lemmatizer = DictionaryLemmatizer(model=GREEK_MODEL, backoff=lemmatizer_2)

In [None]:
# Re-run the Mark passage through the chain with the extended custom dictionary
pprint(lemmatizer.lemmatize(mark_1_14.split()))

In [None]:
# Score the current chain on only the first 250 held-out sentences
print(f'Lemmatizer accuracy: {lemmatizer.evaluate(test_sents[:250]):.2%}')