Middle High German POS taggers (#950)
* Added middle_high_german for POS tagging

* Added POS taggers for Middle High German

* Fixed indents

Co-authored-by: Todd Cook <665389+todd-cook@users.noreply.github.com>
clemsciences and todd-cook committed Jun 7, 2020
1 parent fe31ab7 commit cd920c5
Showing 5 changed files with 119 additions and 43 deletions.
6 changes: 6 additions & 0 deletions cltk/corpus/middle_high_german/corpora.py
@@ -0,0 +1,6 @@
MIDDLE_HIGH_GERMAN_CORPORA = [{
'name': 'middle_high_german_models_cltk',
'origin': 'https://github.com/cltk/middle_high_german_models_cltk.git',
'location': 'remote',
'type': 'model'
}]
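
The new MIDDLE_HIGH_GERMAN_CORPORA entry is what lets the corpus importer resolve the Middle High German tagger models by name. A minimal usage sketch, mirroring the setUp of the new unit tests below (network access is needed to clone the models repository):

    from cltk.corpus.utils.importer import CorpusImporter

    # Downloads middle_high_german_models_cltk into the local cltk_data directory.
    corpus_importer = CorpusImporter("middle_high_german")
    corpus_importer.import_corpus("middle_high_german_models_cltk")
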
5 changes: 3 additions & 2 deletions cltk/corpus/utils/importer.py
@@ -28,6 +28,7 @@
from cltk.corpus.marathi.corpora import MARATHI_CORPORA
from cltk.corpus.gujarati.corpora import GUJARATI_CORPORA
from cltk.corpus.middle_low_german.corpora import MIDDLE_LOW_GERMAN_CORPORA
from cltk.corpus.middle_high_german.corpora import MIDDLE_HIGH_GERMAN_CORPORA

from cltk.utils.cltk_logger import logger

@@ -78,8 +79,8 @@
'classical_hindi':CLASSICAL_HINDI_CORPORA,
'french':FRENCH_CORPORA,
'gujarati': GUJARATI_CORPORA,
'middle_low_german': MIDDLE_LOW_GERMAN_CORPORA

'middle_low_german': MIDDLE_LOW_GERMAN_CORPORA,
'middle_high_german': MIDDLE_HIGH_GERMAN_CORPORA,
}


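
With 'middle_high_german' now registered in the importer's corpora map, the language's corpora become discoverable. A small sketch, assuming CorpusImporter exposes its corpus names through the list_corpora property as it does for the other languages:

    from cltk.corpus.utils.importer import CorpusImporter

    corpus_importer = CorpusImporter("middle_high_german")
    # Expected to include 'middle_high_german_models_cltk' (assumed output, not shown in this commit).
    print(corpus_importer.list_corpora)
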
82 changes: 47 additions & 35 deletions cltk/tag/pos.py
@@ -7,41 +7,53 @@

from cltk.utils.file_operations import open_pickle


__author__ = ['Kyle P. Johnson <kyle@kyle-p-johnson.com>']
__license__ = 'MIT License. See LICENSE.'


TAGGERS = {'greek':
{'unigram': 'unigram.pickle',
'bigram': 'bigram.pickle',
'trigram': 'trigram.pickle',
'ngram_123_backoff': '123grambackoff.pickle',
'tnt': 'tnt.pickle',
'crf': 'crf.pickle',
},
'latin':
{'unigram': 'unigram.pickle',
'bigram': 'bigram.pickle',
'trigram': 'trigram.pickle',
'ngram_123_backoff': '123grambackoff.pickle',
'tnt': 'tnt.pickle',
'crf': 'crf.pickle',
},
'old_norse':
{'tnt': 'tnt.pickle'
},
'middle_low_german':
{'ngram_12_backoff': 'backoff_tagger.pickle'
},
'old_english':
{'unigram': 'unigram.pickle',
'bigram': 'bigram.pickle',
'trigram': 'trigram.pickle',
'ngram_123_backoff': 'backoff.pickle',
'crf': 'crf.pickle',
'perceptron' : 'perceptron.pickle'
}}
TAGGERS = {
'greek':
{
'unigram': 'unigram.pickle',
'bigram': 'bigram.pickle',
'trigram': 'trigram.pickle',
'ngram_123_backoff': '123grambackoff.pickle',
'tnt': 'tnt.pickle',
'crf': 'crf.pickle',
},
'latin':
{
'unigram': 'unigram.pickle',
'bigram': 'bigram.pickle',
'trigram': 'trigram.pickle',
'ngram_123_backoff': '123grambackoff.pickle',
'tnt': 'tnt.pickle',
'crf': 'crf.pickle',
},
'old_norse':
{
'tnt': 'tnt.pickle'
},
'middle_low_german':
{
'ngram_12_backoff': 'backoff_tagger.pickle'
},
'old_english':
{
'unigram': 'unigram.pickle',
'bigram': 'bigram.pickle',
'trigram': 'trigram.pickle',
'ngram_123_backoff': 'backoff.pickle',
'crf': 'crf.pickle',
'perceptron': 'perceptron.pickle'
},
'middle_high_german':
{
'unigram': 'unigram.pickle',
'bigram': 'bigram.pickle',
'trigram': 'trigram.pickle',
'tnt': 'tnt.pickle',
},
}


class POSTag:
@@ -119,7 +131,7 @@ def tag_ngram_123_backoff(self, untagged_string: str):
tagger = open_pickle(pickle_path)
tagged_text = tagger.tag(untagged_tokens)
return tagged_text

def tag_ngram_12_backoff(self, untagged_string: str):
"""Tag POS with 1-, 2-gram tagger.
:type untagged_string: str
@@ -130,8 +142,8 @@ def tag_ngram_12_backoff(self, untagged_string: str):
pickle_path = self.available_taggers['ngram_12_backoff']
tagger = open_pickle(pickle_path)
tagged_text = tagger.tag(untagged_tokens)
return tagged_text
return tagged_text

def tag_tnt(self, untagged_string: str):
"""Tag POS with TnT tagger.
:type untagged_string: str
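
The pickle files named in TAGGERS are serialized NLTK taggers that POSTag loads with open_pickle and applies to the tokenized input. As a rough sketch of how such models could be produced — an assumption about how the middle_high_german_models_cltk pickles were trained, not something shown in this commit — an NLTK n-gram backoff chain plus a TnT tagger would look like this:

    import pickle

    from nltk.tag import BigramTagger, TrigramTagger, UnigramTagger
    from nltk.tag.tnt import TnT

    # Hypothetical training data: sentences given as lists of (token, POS) pairs.
    train_sents = [
        [("uns", "PPER"), ("ist", "VAFIN"), ("in", "APPR"), ("alten", "ADJA"),
         ("mæren", "ADJA"), ("wunders", "NA"), ("vil", "AVD"), ("geseit", "VVPP")],
    ]

    # n-gram taggers chained so that each order backs off to the one below it.
    unigram = UnigramTagger(train_sents)
    bigram = BigramTagger(train_sents, backoff=unigram)
    trigram = TrigramTagger(train_sents, backoff=bigram)

    # TnT is trained separately on the same material.
    tnt_tagger = TnT()
    tnt_tagger.train(train_sents)

    # Each trained tagger would then be pickled under the file name registered above.
    with open("trigram.pickle", "wb") as f:
        pickle.dump(trigram, f)
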
55 changes: 49 additions & 6 deletions cltk/tests/test_languages/test_middle_high_german.py
@@ -1,21 +1,34 @@
"""Test Middle High German"""

import os
import unittest
import unicodedata

from cltk.corpus.middle_high_german.alphabet import normalize_middle_high_german
from cltk.corpus.utils.importer import CorpusImporter
from cltk.stem.middle_high_german.stem import stemmer_middle_high_german as middle_high_german_stemmer
from cltk.stop.middle_high_german.stops import STOPS_LIST as MIDDLE_HIGH_GERMAN_STOPS
from cltk.phonology.middle_high_german import transcription as mhg
from cltk.phonology.syllabify import Syllabifier
from cltk.tag.pos import POSTag
from cltk.tokenize.word import WordTokenizer

__author__ = ['Eleftheria Chatziargyriou <ele.hatzy@gmail.com>']
__author__ = ['Eleftheria Chatziargyriou <ele.hatzy@gmail.com>', 'Clément Besnier <clemsciences@aol.com>']
__license__ = 'MIT License. See LICENSE.'


class TestMiddleHighGerman(unittest.TestCase):
""" Middle High German unit tests"""

def setUp(self):
corpus_importer = CorpusImporter("middle_high_german")
corpus_importer.import_corpus("middle_high_german_models_cltk")
file_rel = os.path.join(get_cltk_data_dir() +
'/middle_high_german/model/middle_high_german_models_cltk/README.md')
file = os.path.expanduser(file_rel)
file_exists = os.path.isfile(file)
self.assertTrue(file_exists)

def test_middle_high_german_tokenize(self):
"""
Test tokenizing Middle High German
@@ -24,7 +37,8 @@ def test_middle_high_german_tokenize(self):
text = "Mīn ougen wurden liebes alsō vol, \n\n\ndō ich die minneclīchen ērst gesach,\ndaȥ eȥ mir hiute und iemer mē tuot wol."

tokenized = word_tokenizer.tokenize(text)
target = ['Mīn', 'ougen', 'wurden', 'liebes', 'alsō', 'vol', ',', 'dō', 'ich', 'die', 'minneclīchen', 'ērst', 'gesach', ',', 'daȥ', 'eȥ', 'mir', 'hiute', 'und', 'iemer', 'mē', 'tuot', 'wol', '.']
target = ['Mīn', 'ougen', 'wurden', 'liebes', 'alsō', 'vol', ',', 'dō', 'ich', 'die', 'minneclīchen', 'ērst',
'gesach', ',', 'daȥ', 'eȥ', 'mir', 'hiute', 'und', 'iemer', 'mē', 'tuot', 'wol', '.']

self.assertEqual(tokenized, target)

@@ -77,7 +91,7 @@ def test_middle_high_german_normalizer_spelling(self):
"""
Test Middle High German spelling normalizer
"""
normalized = normalize_middle_high_german("Mit ūf erbürten schilden in was ze strīte nōt", alpha_conv = True)
normalized = normalize_middle_high_german("Mit ūf erbürten schilden in was ze strīte nōt", alpha_conv=True)
target = 'mit ûf erbürten schilden in was ze strîte nôt'

self.assertEqual(normalized, target)
@@ -86,8 +100,8 @@ def test_middle_high_german_normalizer(self):
"""
Test Middle High German punctuation normalizer
"""
normalized = normalize_middle_high_german("Si sprach: ‘herre Sigemunt, ir sult iȥ lāȥen stān", punct = True)
target = 'si sprach herre sigemunt ir sult iȥ lâȥen stân'
normalized = normalize_middle_high_german("Si sprach: ‘herre Sigemunt, ir sult iȥ lāȥen stān", punct=True)
target = 'si sprach herre sigemunt ir sult iȥ lâȥen stân'

self.assertEqual(normalized, target)

@@ -113,7 +127,7 @@ def test_middle_high_german_stemmer_dictionary(self):
"""
Test Middle High German stemmer's user-defined dictionary function
"""
exception_dic = {"biuget" : "biegen"}
exception_dic = {"biuget": "biegen"}
stemmed = middle_high_german_stemmer("swaȥ kriuchet unde fliuget und bein zer erden biuget", rem_umlauts=False,
exceptions=exception_dic)
target = ['swaȥ', 'kriuchet', 'unde', 'fliuget', 'und', 'bein', 'zer', 'erden', 'biegen']
@@ -130,5 +144,34 @@ def test_middle_high_german_syllabification(self):

self.assertEqual(syllabified, target)

def test_middle_high_german_tnt_pos_tagger(self):
target = [('uns', 'PPER'), ('ist', 'VAFIN'), ('in', 'APPR'), ('alten', 'ADJA'), ('mæren', 'ADJA'),
('wunders', 'NA'), ('vil', 'AVD'), ('geseit', 'VVPP')]
mhg_pos_tagger = POSTag("middle_high_german")
res = mhg_pos_tagger.tag_tnt("uns ist in alten mæren wunders vil geseit")
self.assertEqual(target, res)

def test_middle_high_german_unigram_pos_tagger(self):
target = [('uns', 'PPER'), ('ist', 'VAFIN'), ('in', 'APPR'), ('alten', 'ADJA'), ('mæren', 'ADJA'),
('wunders', 'NA'), ('vil', 'ADJA'), ('geseit', 'VVPP')]
mhg_pos_tagger = POSTag("middle_high_german")
res = mhg_pos_tagger.tag_unigram("uns ist in alten mæren wunders vil geseit")
self.assertEqual(target, res)

def test_middle_high_german_bigram_pos_tagger(self):
target = [('uns', 'PPER'), ('ist', 'VAFIN'), ('in', 'APPR'), ('alten', 'ADJA'), ('mæren', 'NA'),
('wunders', 'NA'), ('vil', None), ('geseit', None)]
mhg_pos_tagger = POSTag("middle_high_german")
res = mhg_pos_tagger.tag_bigram("uns ist in alten mæren wunders vil geseit")
self.assertEqual(target, res)

def test_middle_high_german_trigram_pos_tagger(self):
target = [('uns', 'PPER'), ('ist', 'VAFIN'), ('in', 'APPR'), ('alten', 'ADJA'), ('mæren', 'NA'),
('wunders', 'NA'), ('vil', None), ('geseit', None)]
mhg_pos_tagger = POSTag("middle_high_german")
res = mhg_pos_tagger.tag_trigram("uns ist in alten mæren wunders vil geseit")
self.assertEqual(target, res)


if __name__ == '__main__':
unittest.main()
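
To run only this test class locally, a small runner sketch — assuming the repository root is on the import path and the models corpus has already been downloaded (the setUp above fetches it on first run):

    import unittest

    # Module and class names as introduced in this commit.
    suite = unittest.defaultTestLoader.loadTestsFromName(
        "cltk.tests.test_languages.test_middle_high_german.TestMiddleHighGerman")
    unittest.TextTestRunner(verbosity=2).run(suite)
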
14 changes: 14 additions & 0 deletions docs/middle_high_german.rst
@@ -198,3 +198,17 @@ The ``WordTokenizer`` class takes a string as input and returns a list of tokens
In [4]: word_tokenizer.tokenize(text)
Out[4]: ['Mīn', 'ougen', 'wurden', 'liebes', 'alsō', 'vol', ',', 'dō', 'ich', 'die', 'minneclīchen', 'ērst', 'gesach', ',', 'daȥ', 'eȥ', 'mir', 'hiute', 'und', 'iemer', 'mē', 'tuot', 'wol', '.']
POS tagging
===========

.. code-block:: python
In [1]: from cltk.tag.pos import POSTag
In [2]: mhg_pos_tagger = POSTag("middle_high_german")
In [3]: mhg_pos_tagger.tag_tnt("uns ist in alten mæren wunders vil geseit")
Out[3]: [('uns', 'PPER'), ('ist', 'VAFIN'), ('in', 'APPR'), ('alten', 'ADJA'), ('mæren', 'ADJA'),
('wunders', 'NA'), ('vil', 'AVD'), ('geseit', 'VVPP')]
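
The new documentation section only demonstrates tag_tnt, but the commit also registers unigram, bigram and trigram models for Middle High German. By analogy, and with the expected output taken from the new unit tests above:

    from cltk.tag.pos import POSTag

    mhg_pos_tagger = POSTag("middle_high_german")
    mhg_pos_tagger.tag_unigram("uns ist in alten mæren wunders vil geseit")
    # [('uns', 'PPER'), ('ist', 'VAFIN'), ('in', 'APPR'), ('alten', 'ADJA'),
    #  ('mæren', 'ADJA'), ('wunders', 'NA'), ('vil', 'ADJA'), ('geseit', 'VVPP')]
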
