# Nibelungenlied in Referenzkorpus  Mittelhochdeutsch

In [27]:
import os
import collections
from lxml import etree

In [3]:
link = "https://www.linguistics.rub.de/rem/access/index.html"

In [4]:
filename = "M321-G1.xml"

In [5]:
stringify = etree.XPath("string()")
parser = etree.XMLParser(load_dtd=True, no_network=False)
tree = etree.parse(os.path.join(filename), parser=parser)

In [6]:
root = tree.getroot()
decoded_root = etree.tostring(root).decode("utf-8")
decoded_root[:100]

'<text id="M321">\n  <header>\n    <text>Nibelungenlied</text>\n    <abbr_ddd>Nib</abbr_ddd>\n    <abbr_m'

In [8]:
def extract_annotations(entry):
    return {child.tag: child.get("tag") for child in entry.getchildren()}

In [9]:
tokens = [extract_annotations(entry) for entry in root.findall(".//tok_anno")]

In [11]:
def extract_by_tag(tag, tokens):
    return [token[tag] for token in tokens if tag in token]

In [9]:
normalized_text = extract_by_tag("norm", tokens)

## Lemmata

In [12]:
lemmata = extract_by_tag("lemma", tokens)

In [29]:
counter_nibelungenlied = collections.Counter(lemmata)

In [32]:
counter_nibelungenlied.most_common(20)

[('dër', 1536),
 ('ër', 976),
 ('ich', 282),
 ('dô', 274),
 ('sîn', 256),
 ('vil(e)', 248),
 ('unte', 247),
 ('von', 197),
 ('haben', 191),
 ('wësen', 189),
 ('ir', 184),
 ('ze', 175),
 ('mit', 168),
 ('dazz', 166),
 ('ir(e)', 139),
 ('sô', 136),
 ('in', 131),
 ('dâr', 127),
 ('sprëchen', 117),
 ('küni(n)g', 113)]

In [13]:
lemmata_set = set(lemmata)

In [14]:
len(lemmata), len(lemmata_set)

(15073, 1374)

In [16]:
normalized_to_lemma = {token["norm"]: token["lemma"] for token in tokens if "lemma" in token}

In [25]:
lemma_to_normalized = {lemma: {token["norm"] for token in tokens 
                               if "norm" in token and "lemma" in token and token["lemma"] == lemma}
                       for lemma in lemmata_set}

In [26]:
lemma_to_normalized["Nibelung"]

{'Nibelunc', 'Nibelunge', 'Nibelungen', 'Nibelunges'}

## *Pars oratori*

In [33]:
pos_tags = extract_by_tag("pos", tokens)

In [34]:
pos_set = set(pos_tags)

In [35]:
len(pos_tags), len(pos_set)

(18499, 58)

In [76]:
pos_to_lemmata = {pos: {token["lemma"] for token in tokens 
                               if "lemma" in token and "pos" in token and token["pos"] == pos}
                       for pos in pos_set}

In [40]:
sorted(list(pos_set))[0] # originale Interpunktion

'$_'

In [77]:
pos_to_lemmata["$_"]

set()

In [49]:
[pos for pos in pos_set if pos.startswith("ADJ")] # Adjectives

['ADJA', 'ADJS', 'ADJD', 'ADJN']

In [80]:
pos_to_lemmata["ADJA"]

{'aht-zëhent',
 'aller-bèzzest',
 'aller-groèzest',
 'aller-mèist',
 'alt',
 'angest-lich',
 'arg',
 'arm',
 'bluotig',
 'bè(zzi)st',
 'bèzzer',
 'dritt',
 'ganz',
 'ge-mèine',
 'ge-nuog(e)',
 'ge-triuwe',
 'ge-triuwe-lich',
 'grimm(e)',
 'grimmig',
 'grôz',
 'grôz-lich',
 'guot',
 'guot-lich',
 'gæhe',
 'hol',
 'hold',
 'hundert',
 'hèiden',
 'hèiz',
 'hèrte',
 'hêr-lich',
 'hôh',
 'hôh-ge-muot',
 'ite-niuwe',
 'jung',
 'jâmer-haft',
 'klingen',
 'klèine',
 'kristen',
 'kristen-lich',
 'krèftig',
 'krèftig-lich',
 'kund',
 'kurz',
 'küène',
 'lièb',
 'lièht',
 'lobe-lich',
 'lèid',
 'lèid-lich',
 'lèzzest',
 'michel',
 'milte',
 'minnig-lich',
 'mitte',
 'mânig',
 'mêr(e)',
 'nagel(e)n',
 'niuwe',
 'nâh',
 'rëht',
 'rîche',
 'rîtære-lich',
 'rôt',
 'scharpf',
 'schoène',
 'sibent',
 'slahen',
 'snëll',
 'stark',
 'stolz',
 'strît-müède',
 'sturm-küène',
 'stæte',
 'swach',
 'swiè-ge-tân',
 'sèmfte',
 'sëhst',
 'sëlt-sæne',
 'süèze',
 'tièf',
 'tumb',
 'tuon',
 'tôd-wunt',
 'tûsent',
 

In [81]:
pos_to_lemmata["ADJS"]

{'hundert', 'lëben', 'tûsent', 'wërben'}

In [82]:
pos_to_lemmata["ADJD"]

{'all-wâre',
 'arm',
 'be-kant',
 'be-rèit(e)',
 'be-rèitet',
 'blôz',
 'brèit',
 'er-kant',
 'ge-hazz',
 'ge-muot',
 'ge-mèit',
 'ge-nædig-lich',
 'ge-rëht',
 'ge-sunt',
 'ge-sëzzen',
 'ge-triuwe',
 'ge-tân',
 'ge-waltig',
 'ge-wiss',
 'grimm(e)',
 'grème-lich',
 'grôz',
 'guot',
 'gërn(e)',
 'hold',
 'hêr-lich',
 'hövisch',
 'kund',
 'künde',
 'küène',
 'lang',
 'lièb',
 'lièht',
 'lobe-lich',
 'lære',
 'lèid',
 'michel',
 'milte',
 'nazz',
 'niuwe',
 'offen',
 'rëht',
 'rîche',
 'rôt',
 'scharpf',
 'schoène',
 'schuldig',
 'schîn',
 'sippe',
 'stark',
 'swære',
 'sælig',
 'tiur(e)',
 'tièf',
 'trûrig',
 'trüèbe',
 'tôt',
 'un-be-kant',
 'un-ge-bunten',
 'un-ge-logen',
 'un-ge-schèiden',
 'un-ge-tèilet',
 'un-künde',
 'un-müge-lich',
 'un-müèzig',
 'un-ver-daget',
 'un-ver-diènet',
 'un-ver-zaget',
 'unter-tân',
 'vroè-lich',
 'vrum',
 'vrèm(e)de',
 'vrô',
 'wille-komen',
 'wît',
 'zornig',
 'èdel(e)',
 'èl-lènte',
 'übel',
 'über-müète'}

In [83]:
pos_to_lemmata["ADJN"]

{'bald',
 'bi-dèrbe',
 'brèit',
 'ge-mèit',
 'ge-tân',
 'grôz',
 'guot',
 'hêr(e)',
 'hôh',
 'jung',
 'küène',
 'lang',
 'lièht-ge-varw',
 'lobe-bære',
 'lobe-lich',
 'lobe-sam',
 'mære',
 'niuwe',
 'rîche',
 'rôt',
 'schoène',
 'snëll',
 'stark',
 'swind(e)',
 'tièf',
 'vrèm(e)de',
 'vèste',
 'wilde',
 'wît',
 'èdel(e)',
 'ûz-er-kor(e)n'}

In [51]:
[pos for pos in pos_set if pos.startswith("AP")] # Prepositions

['APPR']

In [85]:
pos_to_lemmata["APPR"]

{'abe',
 'after',
 'ane',
 'bî',
 'durh',
 'gègen',
 'hinter',
 'in',
 'mit',
 'mit-same(n)t',
 'nâh',
 'umbe',
 'unter',
 'von',
 'vor(e)',
 'vür(e)',
 'wider',
 'ze',
 'zuo-ze',
 'zwischen',
 'âne',
 'ûf',
 'ûz',
 'ûzer',
 'über'}

In [52]:
[pos for pos in pos_set if pos.startswith("AV")] # Adverbs

['AVD', 'AVW', 'AVG']

In [86]:
[pos_to_lemmata[pos] for pos in [pos for pos in pos_set if pos.startswith("AV")]]

[{'aber',
  'al',
  'all-dâr',
  'all-sam(e)',
  'all-sô',
  'all-umbe',
  'all-ze-hant',
  'all-èin(e)',
  'allen-halb(en)',
  'aller-êr(e)ste',
  'alter(e)s-èine',
  'ander(e)s',
  'ander(e)s-wâr',
  'ane',
  'angest-lîche',
  'balde',
  'bazz',
  'be-gègene',
  'bil-lîche',
  'bitter-lîche',
  'bè(zzi)ste',
  'dan-noh',
  'danne',
  'dannen',
  'dar(e)',
  'dicke',
  'doh',
  'dort',
  'dâr',
  'dëgen-lîche',
  'dëste',
  'dô',
  'en-gègen(e)',
  'er-bolgen-lîche',
  'gar(e)',
  'ge-lîche',
  'ge-nuog(e)',
  'ge-waltig-lîche',
  'grimme',
  'grème-lîche',
  'grôz-lîche',
  'grôze',
  'guot-lîche',
  'gërne',
  'halt',
  'harte',
  'hin(e)',
  'hinne',
  'hinnen',
  'hiute',
  'hièr',
  'hurtig-lîche',
  'hèim',
  'hèime-lîche',
  'hèize',
  'hêr-lîche',
  'hêrren-lîche',
  'hër(e)',
  'hërze(n)-lîche',
  'hôhe',
  'iht',
  'innig-lîche',
  'iè',
  'iè-doh',
  'ièmer',
  'joh',
  'jungest(e)',
  'jâ',
  'jâmer-lîche',
  'krèftig-lîche',
  'kûme',
  'lange',
  'laster-lîche',
  'lièbe

In [87]:
# Numbers
[pos_to_lemmata[pos] for pos in pos_set if pos.startswith("CAR")]

[{'vièr', 'zwèl(i)v', 'zwêne', 'èin'},
 {'drî',
  'siben',
  'siben-zëhen',
  'vièr',
  'zwèin-zig',
  'zwèl(i)v',
  'zwêne',
  'èin'}]

In [55]:
# definite determinants
[pos for pos in pos_set if pos.startswith("DD")]

['DDA', 'DDS', 'DDN', 'DDART']

In [89]:
[ pos_to_lemmata[pos] for pos in [pos for pos in [pos for pos in pos_set if pos.startswith("DD")]]]

[{'dise', 'jèner', 'so-l(i)ch', 'sëlb'}, {'dër', 'sëlb'}, {'sëlb'}, {'dër'}]

In [56]:
# general determinants
[pos for pos in pos_set if pos.startswith("DG")]

['DGA']

In [91]:
[pos_to_lemmata[pos] for pos in [pos for pos in pos_set if pos.startswith("DG")]]

[{'swèl(i)ch', 'swëder'}]

In [57]:
# Indefinite determinants
[pos for pos in pos_set if pos.startswith("DI")]

['DIS', 'DIN', 'DIART', 'DIA', 'DID']

In [92]:
[pos_to_lemmata[pos] for pos in [pos for pos in pos_set if pos.startswith("DI")]]

[{'all', 'ander', 'bèide', 'deh-èin', 'iètes-lich', 'manig'},
 {'all', 'bèide'},
 {'èin'},
 {'all',
  'all-so-l(i)ch',
  'bèide',
  'de-wëder',
  'deh-èin',
  'iè-mann',
  'ièd-wëder',
  'iètes-lich',
  'manig',
  'niè-mann',
  'èin',
  'èin-ander',
  'ëte(s)-lich'},
 {'bèide'}]

In [58]:
# possessive determinants
[pos for pos in pos_set if pos.startswith("DP")]

['DPOSN', 'DPOSS', 'DPOSA']

In [94]:
[ pos_to_lemmata[pos] for pos in [pos for pos in pos_set if pos.startswith("DP")]]

[{'dîn', 'mîn', 'sîn'},
 {'mîn', 'sîn'},
 {'dîn', 'ir(e)', 'iuwer', 'mîn', 'sîn', 'unser'}]

In [61]:
# relative determinants
[pos for pos in pos_set if pos.startswith("DR")]

['DRELS']

In [96]:
pos_to_lemmata["DRELS"]

{'dër'}

In [63]:
# interrogative determinants
[pos for pos in pos_set if pos.startswith("DW")]

[]

In [64]:
# foreign words
[pos for pos in pos_set if pos.startswith("FM")]

[]

In [66]:
# Interjection
[pos for pos in pos_set if pos.startswith("ITJ")]

['ITJ']

In [97]:
pos_to_lemmata["ITJ"]

{'ach', 'hèi', 'wâfen', 'wê', 'ô'}

In [67]:
# Conjunctions
[pos for pos in pos_set if pos.startswith("KO")]

['KOUS', 'KON', 'KOKOM', 'KO*']

In [98]:
[pos_to_lemmata[pos] for pos in [pos for pos in pos_set if pos.startswith("KO")]]

[{'danne',
  'dazz',
  'diè~wîle',
  'durh~dazz',
  'dô',
  'nû',
  'obe',
  'sîd',
  'unz(e)',
  'unz(e)~dazz',
  'wan',
  'wan~dazz',
  'êr'},
 {'bèide', 'dës', 'noh', 'oder', 'unte'},
 {'danne', 'wan'},
 {'all-sam(e)',
  'all-sô',
  'sam(e)',
  'swanne~dazz',
  'sît~dazz',
  'sô',
  'wante',
  'wëder',
  'ê~dazz'}]

In [68]:
# Nouns
[pos for pos in pos_set if pos.startswith("N")]

['NA', 'NE']

In [99]:
[pos_to_lemmata[pos] for pos in [pos for pos in pos_set if pos.startswith("DI")]]

[{'all', 'ander', 'bèide', 'deh-èin', 'iètes-lich', 'manig'},
 {'all', 'bèide'},
 {'èin'},
 {'all',
  'all-so-l(i)ch',
  'bèide',
  'de-wëder',
  'deh-èin',
  'iè-mann',
  'ièd-wëder',
  'iètes-lich',
  'manig',
  'niè-mann',
  'èin',
  'èin-ander',
  'ëte(s)-lich'},
 {'bèide'}]

In [69]:
# pronominal adverbs
[pos for pos in pos_set if pos.startswith("PA")]

['PAVG', 'PAVW', 'PAVD', 'PAVAP']

In [100]:
[pos_to_lemmata[pos] for pos in [pos for pos in pos_set if pos.startswith("PA")]]

[{'swâr/+nâh'},
 {'wâr/+mit(e)', 'wâr/+umbe', 'wâr/+von(e)'},
 {'dâr/+ane',
  'dâr/+bî',
  'dâr/+hèime',
  'dâr/+inne',
  'dâr/+mit(e)',
  'dâr/+nâh',
  'dâr/+umbe',
  'dâr/+unter',
  'dâr/+von(e)',
  'dâr/+vor(e)',
  'dâr/+wider(e)',
  'dâr/+zuo',
  'dâr/+în',
  'dâr/+über(e)',
  'dâr/.+ane',
  'dâr/.+unter.+',
  'hièr/+inne'},
 {'ane/dâr+',
  'ane/dâr.+',
  'bî/dâr+',
  'hèime/dâr+',
  'inne/dâr+',
  'inne/hièr+',
  'mit(e)/dâr+',
  'mit(e)/wâr+',
  'nâh/dâr+',
  'nâh/swâr+',
  'umbe/dâr+',
  'umbe/wâr+',
  'unter/dâr+',
  'unter/dâr.+',
  'von(e)/dâr+',
  'von(e)/wâr+',
  'vor(e)/dâr+',
  'wider(e)/dâr+',
  'zuo/dâr+',
  'în/dâr+',
  'über(e)/dâr+'}]

In [70]:
# Pronouns
[pos for pos in pos_set 
 if pos in ["PG", "PI", "PPER", "PRF", "PW"]]

['PPER', 'PW', 'PI', 'PG', 'PRF']

In [101]:
[pos_to_lemmata[pos] for pos in ["PG", "PI", "PPER", "PRF", "PW"]]

[{'swër'},
 {'man'},
 {'dû', 'ich', 'ir', 'wir', 'ër'},
 {'dû', 'ich', 'ir', 'sich', 'ër'},
 {'wër'}]

In [71]:
# Particles
[pos for pos in pos_set if pos.startswith("PT")]

['PTKANT', 'PTKA', 'PTKNEG', 'PTKVZ', 'PTK']

In [102]:
[pos_to_lemmata[pos] for pos in 
 [pos for pos in pos_set if pos.startswith("PT")]]

[{'jâ'},
 {'ze'},
 {'ne', 'niht'},
 {'abe/+lâzen',
  'abe/+slahen',
  'ane/+bièten',
  'ane/+er-dwingen',
  'ane/+ge-vâhen',
  'ane/+ge-winnen',
  'ane/+hèben',
  'ane/+kapfen',
  'ane/+loufen',
  'ane/+ruofen',
  'ane/+schiffen',
  'bî/+wësen',
  'innen/+bringen',
  'mit(e)/+volgen',
  'nider(e)/+gân',
  'nider(e)/+rîten',
  'nider(e)/+vallen',
  'vür(e)/+sènten',
  'wider(e)/+ge-winnen',
  'wider(e)/+gëben',
  'wider(e)/+slahen',
  'ûf/+blicken',
  'ûf/+rihten',
  'ûf/+spannen',
  'ûf/+tuon',
  'ûf/+îlen',
  'ûz/+rîten/+gân'},
 {'sô'}]

In [72]:
# Auxiliary verbs
[pos for pos in pos_set if pos.startswith("VA")]

['VAFIN', 'VAIMP', 'VAPP', 'VAINF']

In [103]:
[pos_to_lemmata[pos] for pos in
 [pos for pos in pos_set if pos.startswith("VA")]]

[{'haben', 'sîn', 'wërden', 'wësen', 'wësen/bî+'},
 {'sîn'},
 {'wërden', 'wësen'},
 {'haben', 'sîn', 'wërden', 'wësen'}]

In [74]:
# modal verbs
[pos for pos in pos_set if pos.startswith("VM")]

['VMFIN', 'VMINF']

In [104]:
[pos_to_lemmata[pos] for pos in 
 [pos for pos in pos_set if pos.startswith("VM")]]

[{'durfen',
  'ge-turren',
  'kunnen',
  'mügen',
  'müèzen',
  'sol(e)n',
  'turren',
  'wèllen'},
 {'wèllen'}]

In [73]:
# Complete verbs
[pos for pos in pos_set if pos.startswith("VV")]

['VVFIN', 'VVINF', 'VVIMP', 'VVPS', 'VVPP']

In [105]:
[pos_to_lemmata[pos] for pos in 
 [pos for pos in pos_set if pos.startswith("VV")]]

[{'ant-würten',
  'baden',
  'be-durfen',
  'be-dwingen',
  'be-ginnen',
  'be-graben',
  'be-gân',
  'be-hagen',
  'be-kènnen',
  'be-kêren',
  'be-lèiten',
  'be-rèiten',
  'be-schèiden',
  'be-sitzen',
  'be-slièzen',
  'be-swæren',
  'be-trüèben',
  'be-vinden',
  'be-war(e)n',
  'be-wëgen',
  'binten',
  'biten',
  'bièten',
  'blicken/ûf+',
  'bringen',
  'bringen/innen+',
  'brinnen',
  'briuten',
  'brëchen',
  'bèiten',
  'bîten',
  'dagen',
  'diènen',
  'dièzen',
  'dringen',
  'dunken',
  'dwingen',
  'dènken',
  'ent-bièten',
  'ent-rihten',
  'ent-rüsten',
  'ent-trinnen',
  'ent-vallen',
  'ent-vinden',
  'ent-vâhen',
  'ent-wîchen',
  'er-biten',
  'er-bièten',
  'er-bîten',
  'er-dièzen',
  'er-dwingen',
  'er-gâhen',
  'er-gân',
  'er-gètzen',
  'er-gëben',
  'er-hoèren',
  'er-kièsen',
  'er-klingen',
  'er-krimmen',
  'er-kènnen',
  'er-schrècken',
  'er-sëhen',
  'er-vinden',
  'er-wërben',
  'ge-biten',
  'ge-bièten',
  'ge-brësten',
  'ge-bâren',
  'ge-dènken',
 

## Morphology

In [15]:
inflections = extract_by_tag("infl", tokens)

In [16]:
len(inflections), len(set(inflections))

(15073, 283)

In [17]:
inflection_class = extract_by_tag("inflClass", tokens)

In [18]:
len(inflection_class), len(set(inflection_class))

(15073, 40)