Middle High German lemmatizer (#947)
Merging because the outstanding review requests appear to have been fulfilled. We'll submit a patch PR if anything was missed.

* Initiated lemmatization backoff for Middle High German

* Added corpus

* Added corpus and new lemmatizer

* Added lemmatizer test

* Removed useless import

* Removed useless imports and comments

* Removed useless import

* Added corpus import in setUp

* Updated docs for Middle High German lemmatizer

* Removed unused training part from the sentences

* Removed useless statements

* Fixed docs

Co-authored-by: Todd Cook <665389+todd-cook@users.noreply.github.com>
clemsciences and todd-cook committed Jun 7, 2020
1 parent cd920c5 commit e7ba658
Showing 6 changed files with 95 additions and 7 deletions.
15 changes: 9 additions & 6 deletions cltk/corpus/middle_high_german/corpora.py
@@ -1,6 +1,9 @@
MIDDLE_HIGH_GERMAN_CORPORA = [{
    'name': 'middle_high_german_models_cltk',
    'origin': 'https://github.com/cltk/middle_high_german_models_cltk.git',
    'location': 'remote',
    'type': 'model'
}]

MIDDLE_HIGH_GERMAN_CORPORA = [
    {
        'name': 'middle_high_german_models_cltk',
        'origin': 'https://github.com/cltk/middle_high_german_models_cltk.git',
        'location': 'remote',
        'type': 'model'
    },
]
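
A minimal sketch of how an entry in this registry is consumed: CorpusImporter lists it for the language and can clone the remote origin into the local cltk_data directory. The 'middle_high_german' language key is assumed here, since the importer's language mapping is not shown in this diff.

# Sketch only: how the registry entry above surfaces through the corpus importer.
# The "middle_high_german" language key is assumed, not shown in this diff.
from cltk.corpus.utils.importer import CorpusImporter

corpus_importer = CorpusImporter("middle_high_german")
print(corpus_importer.list_corpora)  # should include 'middle_high_german_models_cltk'
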
1 change: 1 addition & 0 deletions cltk/corpus/utils/importer.py
@@ -2,6 +2,7 @@
TODO: Fix so ``import_corpora()`` can take relative path.
TODO: Add https://github.com/cltk/pos_latin
"""

from cltk.corpus.akkadian.corpora import AKKADIAN_CORPORA
from cltk.corpus.arabic.corpora import ARABIC_CORPORA
from cltk.corpus.chinese.corpora import CHINESE_CORPORA
Empty file.
53 changes: 53 additions & 0 deletions cltk/lemmatize/middle_high_german/backoff.py
@@ -0,0 +1,53 @@
"""Module for lemmatizing Middle High German
"""

__author__ = ['Clément Besnier <clemsciences@aol.com>', ]
__license__ = 'MIT License. See LICENSE.'

import os
from typing import List

from cltk.lemmatize.backoff import IdentityLemmatizer, DictLemmatizer

from cltk.utils.file_operations import open_pickle


class BackoffMHGLemmatizer:
"""Suggested backoff chain; includes at least on of each
type of major sequential backoff class from backoff.py
"""

models_path = os.path.normpath(get_cltk_data_dir() +
'/middle_high_german/model/middle_high_german_models_cltk/lemmata/backoff')

def __init__(self, seed: int = 3, verbose: bool = False):
self.models_path = BackoffMHGLemmatizer.models_path

missing_models_message = "BackoffMHGLemmatizer requires the ```middle_high_german_models_cltk``` " \
"to be in cltk_data. Please load this corpus."
self.seed = seed
self.verbose = verbose

self.token_to_lemmata = []
self.lemma_to_tokens = []

try:
self.token_to_lemmata = open_pickle(os.path.join(self.models_path, "token_to_lemma.pickle"))
except FileNotFoundError as err:
raise type(err)(missing_models_message)

self._define_lemmatizer()

def _define_lemmatizer(self):
self.backoff0 = None
self.backoff1 = IdentityLemmatizer(verbose=self.verbose)
self.backoff2 = DictLemmatizer(lemmas=self.token_to_lemmata, source='ReferenzKorpus Mittelhochdeutsch Lemmata',
backoff=self.backoff1, verbose=self.verbose)
self.lemmatizer = self.backoff2

def lemmatize(self, tokens: List[str]):
lemmas = self.lemmatizer.lemmatize(tokens)
return lemmas

def __repr__(self):
return f'<BackoffMHGLemmatizer v0.1>'
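
A minimal sketch of how the chain assembled in _define_lemmatizer() behaves, with a toy dictionary standing in for the pickled token_to_lemma model (the keyword arguments mirror those used above; the toy entries and the unknown token are invented for illustration):

# Toy illustration of the DictLemmatizer -> IdentityLemmatizer backoff chain.
# A token found in the dictionary gets its candidate lemmata; an unknown token
# falls through to the IdentityLemmatizer, which returns the token itself.
from cltk.lemmatize.backoff import DictLemmatizer, IdentityLemmatizer

identity = IdentityLemmatizer()
toy_lemmatizer = DictLemmatizer(
    lemmas={"mæren": {"mære", "mæren"}},  # stand-in for token_to_lemma.pickle
    source="toy example",
    backoff=identity,
)
print(toy_lemmatizer.lemmatize(["mæren", "wort"]))
# e.g. [('mæren', {'mære', 'mæren'}), ('wort', 'wort')]
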
12 changes: 11 additions & 1 deletion cltk/tests/test_languages/test_middle_high_german.py
@@ -3,9 +3,11 @@
import os
import unittest
import unicodedata
import os

from cltk.corpus.middle_high_german.alphabet import normalize_middle_high_german
from cltk.corpus.utils.importer import CorpusImporter
from cltk.corpus.middle_high_german.alphabet import normalize_middle_high_german
from cltk.lemmatize.middle_high_german.backoff import BackoffMHGLemmatizer
from cltk.stem.middle_high_german.stem import stemmer_middle_high_german as middle_high_german_stemmer
from cltk.stop.middle_high_german.stops import STOPS_LIST as MIDDLE_HIGH_GERMAN_STOPS
from cltk.phonology.middle_high_german import transcription as mhg
@@ -144,6 +146,14 @@ def test_middle_high_german_syllabification(self):

        self.assertEqual(syllabified, target)

    def test_lemmatizer(self):
        mhg_lemmatizer = BackoffMHGLemmatizer()
        lemmatized_sentence = mhg_lemmatizer.lemmatize("uns ist in alten mæren".split(" "))
        res = [lemmata[1] for lemmata in lemmatized_sentence]
        target = ["wir", "sîn", "in", "alt", "mære"]
        for lemma_target, lemma_estimated in zip(target, res):
            self.assertIn(lemma_target, lemma_estimated)

    def test_middle_high_german_tnt_pos_tagger(self):
        target = [('uns', 'PPER'), ('ist', 'VAFIN'), ('in', 'APPR'), ('alten', 'ADJA'), ('mæren', 'ADJA'),
                  ('wunders', 'NA'), ('vil', 'AVD'), ('geseit', 'VVPP')]
21 changes: 21 additions & 0 deletions docs/middle_high_german.rst
@@ -200,10 +200,31 @@ The ``WordTokenizer`` class takes a string as input and returns a list of tokens
Out[4]: ['Mīn', 'ougen', 'wurden', 'liebes', 'alsō', 'vol', ',', '', 'ich', 'die', 'minneclīchen', 'ērst', 'gesach', ',', 'daȥ', '', 'mir', 'hiute', 'und', 'iemer', '', 'tuot', 'wol', '.']
Lemmatization
=============

The CLTK offers a series of lemmatizers that can be combined in a backoff chain: if one lemmatizer is unable to return a headword for a token, the token is passed on to the next lemmatizer until either a headword is returned or the sequence ends.
A generic version of the backoff Middle High German lemmatizer requires the `CLTK Middle High German models <https://github.com/cltk/middle_high_german_models_cltk/tree/master/lemmata/backoff>`_. The lemmatizer expects this model to be stored in a folder called ``cltk_data`` in the user's home directory.
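
The model can be downloaded with the corpus importer (a minimal sketch; the ``middle_high_german`` language key for ``CorpusImporter`` is assumed here):

.. code-block:: python
In [1]: from cltk.corpus.utils.importer import CorpusImporter
In [2]: corpus_importer = CorpusImporter("middle_high_german")
In [3]: corpus_importer.import_corpus("middle_high_german_models_cltk")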

To use the generic version of the backoff Middle High German Lemmatizer:

.. code-block:: python
In [1]: from cltk.lemmatize.middle_high_german.backoff import BackoffMHGLemmatizer
In [2]: lemmatizer = BackoffMHGLemmatizer()
In [3]: tokens = "uns ist in alten mæren".split(" ")
In [4]: lemmatizer.lemmatize(tokens)
Out[4]: [('uns', {'uns', 'unser', 'unz', 'wir'}), ('ist', {'sîn/wider(e)+', 'ist', 'sîn/inne+', 'sîn/mit(e)<+', 'sîn/vür(e)+', 'sîn/abe+', 'sîn/obe+', 'sîn/vor(e)+', 'sîn/vür(e)>+', 'sîn/ûze+', 'sîn/ûz+', 'sîn/bî<.+', 'sîn/vür(e)<+', 'sîn/innen+', 'sîn/âne+', 'sîn/bî+', 'sîn/ûz<+', 'sîn', 'sîn/ûf<.+'}), ('in', {'ër', 'in/hin(e)+', 'in/>+gân', 'in/+gân', 'în/+gân', 'in/+lâzen', 'în', 'in/<.+wintel(e)n', 'in/>+rinnen', 'in/dar(e)+', 'in/.>+slîzen', 'în/hin(e)+', 'în/+lèiten', 'în/+var(e)n', 'in', 'in/>+tragen', 'in/+tropfen', 'în/+lègen', 'in/>+winten', 'în/+brèngen', 'in/>+büègen', 'ërr', 'în/+zièhen', 'in/<.+gân', 'in/+zièhen', 'in/>+tûchen', 'dër', 'în/dâr+', 'in/war(e).+', 'in/<.+lâzen', 'in/>+rîten', 'în/+lâzen', 'in/>+lâzen', 'in/+stapfen', 'în/+sènten', 'in/>.+lâzen', 'in/>+stân', 'in/+drücken', 'in/>+ligen', 'in/dâr+ ', 'in/+var(e)n', 'in/+vüèren', 'in/<.+vallen', 'in/>+vlièzen', 'in/<.+rîten', 'in/hër(e).+', 'ne', 'in/>+wonen', 'in/<.+sigel(e)n', 'in/+lègen', 'în/+dringen', 'in/>+ge-trîben', 'in/+diènen', 'in/>+ge-stëchen', 'in/>+stècken', 'in/hër(e)+', 'in/>+stëchen', 'in/dâr+', 'in/+blâsen', 'în/dâr.+', 'in/>+wîsen', 'în/+îlen', 'in/>+laden', 'în/+komen', 'în/+ge-lèiten', 'in/<.+vloèzen', 'ër ', 'in/>+sètzen', 'in/hièr+', 'in/>+bûwen', 'in/>+lèiten', 'în/+ge-binten', '[!]', 'în/+trîben', 'in/<.+blâsen', 'in/+komen', 'în/+krièchen', 'in/+trîben', 'in/<.+ligen', 'in/+stëchen', 'in/<+gân', 'in/dâr.+', 'în/hër(e)+', 'in/+kêren', 'in/<.+var(e)n', 'in/+rîten', 'in/>+vallen', 'in/<.+vüèren'}), ('alten', {'alt', 'alter', 'alten'}), ('mæren', {'mæren', 'mære'})]
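
Each token is paired with a set of candidate lemmata rather than a single headword, so callers typically check whether an expected lemma is among the candidates, much as the new unit test does (a minimal continuation of the session above):

.. code-block:: python
In [5]: lemmas = dict(lemmatizer.lemmatize(tokens))
In [6]: "mære" in lemmas["mæren"]
Out[6]: True
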
POS tagging
===========

.. code-block:: python
In [1]: from cltk.tag.pos import POSTag
In [2]: mhg_pos_tagger = POSTag("middle_high_german")
