# Nibelungenlied in Referenzkorpus  Mittelhochdeutsch

In [1]:
import os
import collections
from lxml import etree

In [2]:
link = "https://www.linguistics.rub.de/rem/access/index.html"

In [3]:
filename = "M321-G1.xml"

In [4]:
stringify = etree.XPath("string()")
parser = etree.XMLParser(load_dtd=True, no_network=False)
tree = etree.parse(os.path.join(filename), parser=parser)

In [5]:
root = tree.getroot()
decoded_root = etree.tostring(root).decode("utf-8")
decoded_root[:100]

'<text id="M321">\n  <header>\n    <text>Nibelungenlied</text>\n    <abbr_ddd>Nib</abbr_ddd>\n    <abbr_m'

In [6]:
def extract_annotations(entry):
    return {child.tag: child.get("tag") for child in entry.getchildren()}

In [7]:
tokens = [extract_annotations(entry) for entry in root.findall(".//tok_anno")]

In [8]:
def extract_by_tag(tag, tokens):
    return [token[tag] for token in tokens if tag in token]

In [9]:
normalized_text = extract_by_tag("norm", tokens)

## Lemmata

In [10]:
lemmata = extract_by_tag("lemma", tokens)

In [11]:
counter_nibelungenlied = collections.Counter(lemmata)

In [12]:
counter_nibelungenlied.most_common(20)

[('dër', 1536),
 ('ër', 976),
 ('ich', 282),
 ('dô', 274),
 ('sîn', 256),
 ('vil(e)', 248),
 ('unte', 247),
 ('von', 197),
 ('haben', 191),
 ('wësen', 189),
 ('ir', 184),
 ('ze', 175),
 ('mit', 168),
 ('dazz', 166),
 ('ir(e)', 139),
 ('sô', 136),
 ('in', 131),
 ('dâr', 127),
 ('sprëchen', 117),
 ('küni(n)g', 113)]

In [13]:
lemmata_set = set(lemmata)

In [14]:
len(lemmata), len(lemmata_set)

(15073, 1374)

In [15]:
normalized_to_lemma = {token["norm"]: token["lemma"] for token in tokens if "lemma" in token}

In [16]:
lemma_to_normalized = {lemma: {token["norm"] for token in tokens 
                               if "norm" in token and "lemma" in token and token["lemma"] == lemma}
                       for lemma in lemmata_set}

In [17]:
lemma_to_normalized["Nibelung"]

{'Nibelunc', 'Nibelunge', 'Nibelungen', 'Nibelunges'}

## *Pars oratori*

In [18]:
pos_tags = extract_by_tag("pos", tokens)

In [19]:
pos_set = set(pos_tags)

In [20]:
len(pos_tags), len(pos_set)

(18499, 58)

In [21]:
pos_to_lemmata = {pos: {token["lemma"] for token in tokens 
                               if "lemma" in token and "pos" in token and token["pos"] == pos}
                       for pos in pos_set}

In [22]:
sorted(list(pos_set))[0] # originale Interpunktion

'$_'

In [23]:
pos_to_lemmata["$_"]

set()

In [24]:
[pos for pos in pos_set if pos.startswith("ADJ")] # Adjectives

['ADJA', 'ADJD', 'ADJS', 'ADJN']

In [25]:
# pos_to_lemmata["ADJA"]

In [26]:
pos_to_lemmata["ADJS"]

{'hundert', 'lëben', 'tûsent', 'wërben'}

In [27]:
pos_to_lemmata["ADJD"]

{'all-wâre',
 'arm',
 'be-kant',
 'be-rèit(e)',
 'be-rèitet',
 'blôz',
 'brèit',
 'er-kant',
 'ge-hazz',
 'ge-muot',
 'ge-mèit',
 'ge-nædig-lich',
 'ge-rëht',
 'ge-sunt',
 'ge-sëzzen',
 'ge-triuwe',
 'ge-tân',
 'ge-waltig',
 'ge-wiss',
 'grimm(e)',
 'grème-lich',
 'grôz',
 'guot',
 'gërn(e)',
 'hold',
 'hêr-lich',
 'hövisch',
 'kund',
 'künde',
 'küène',
 'lang',
 'lièb',
 'lièht',
 'lobe-lich',
 'lære',
 'lèid',
 'michel',
 'milte',
 'nazz',
 'niuwe',
 'offen',
 'rëht',
 'rîche',
 'rôt',
 'scharpf',
 'schoène',
 'schuldig',
 'schîn',
 'sippe',
 'stark',
 'swære',
 'sælig',
 'tiur(e)',
 'tièf',
 'trûrig',
 'trüèbe',
 'tôt',
 'un-be-kant',
 'un-ge-bunten',
 'un-ge-logen',
 'un-ge-schèiden',
 'un-ge-tèilet',
 'un-künde',
 'un-müge-lich',
 'un-müèzig',
 'un-ver-daget',
 'un-ver-diènet',
 'un-ver-zaget',
 'unter-tân',
 'vroè-lich',
 'vrum',
 'vrèm(e)de',
 'vrô',
 'wille-komen',
 'wît',
 'zornig',
 'èdel(e)',
 'èl-lènte',
 'übel',
 'über-müète'}

In [28]:
pos_to_lemmata["ADJN"]

{'bald',
 'bi-dèrbe',
 'brèit',
 'ge-mèit',
 'ge-tân',
 'grôz',
 'guot',
 'hêr(e)',
 'hôh',
 'jung',
 'küène',
 'lang',
 'lièht-ge-varw',
 'lobe-bære',
 'lobe-lich',
 'lobe-sam',
 'mære',
 'niuwe',
 'rîche',
 'rôt',
 'schoène',
 'snëll',
 'stark',
 'swind(e)',
 'tièf',
 'vrèm(e)de',
 'vèste',
 'wilde',
 'wît',
 'èdel(e)',
 'ûz-er-kor(e)n'}

In [29]:
[pos for pos in pos_set if pos.startswith("AP")] # Prepositions

['APPR']

In [30]:
pos_to_lemmata["APPR"]

{'abe',
 'after',
 'ane',
 'bî',
 'durh',
 'gègen',
 'hinter',
 'in',
 'mit',
 'mit-same(n)t',
 'nâh',
 'umbe',
 'unter',
 'von',
 'vor(e)',
 'vür(e)',
 'wider',
 'ze',
 'zuo-ze',
 'zwischen',
 'âne',
 'ûf',
 'ûz',
 'ûzer',
 'über'}

In [31]:
[pos for pos in pos_set if pos.startswith("AV")] # Adverbs

['AVW', 'AVG', 'AVD']

In [32]:
# [pos_to_lemmata[pos] for pos in [pos for pos in pos_set if pos.startswith("AV")]]

In [33]:
# Numbers
[pos_to_lemmata[pos] for pos in pos_set if pos.startswith("CAR")]

[{'drî',
  'siben',
  'siben-zëhen',
  'vièr',
  'zwèin-zig',
  'zwèl(i)v',
  'zwêne',
  'èin'},
 {'vièr', 'zwèl(i)v', 'zwêne', 'èin'}]

In [34]:
# definite determinants
[pos for pos in pos_set if pos.startswith("DD")]

['DDS', 'DDA', 'DDN', 'DDART']

In [35]:
[ pos_to_lemmata[pos] for pos in [pos for pos in [pos for pos in pos_set if pos.startswith("DD")]]]

[{'dër', 'sëlb'}, {'dise', 'jèner', 'so-l(i)ch', 'sëlb'}, {'sëlb'}, {'dër'}]

In [36]:
# general determinants
[pos for pos in pos_set if pos.startswith("DG")]

['DGA']

In [37]:
[pos_to_lemmata[pos] for pos in [pos for pos in pos_set if pos.startswith("DG")]]

[{'swèl(i)ch', 'swëder'}]

In [38]:
# Indefinite determinants
[pos for pos in pos_set if pos.startswith("DI")]

['DIN', 'DIA', 'DID', 'DIART', 'DIS']

In [39]:
[pos_to_lemmata[pos] for pos in [pos for pos in pos_set if pos.startswith("DI")]]

[{'all', 'bèide'},
 {'all',
  'all-so-l(i)ch',
  'bèide',
  'de-wëder',
  'deh-èin',
  'iè-mann',
  'ièd-wëder',
  'iètes-lich',
  'manig',
  'niè-mann',
  'èin',
  'èin-ander',
  'ëte(s)-lich'},
 {'bèide'},
 {'èin'},
 {'all', 'ander', 'bèide', 'deh-èin', 'iètes-lich', 'manig'}]

In [40]:
# possessive determinants
[pos for pos in pos_set if pos.startswith("DP")]

['DPOSN', 'DPOSA', 'DPOSS']

In [41]:
[ pos_to_lemmata[pos] for pos in [pos for pos in pos_set if pos.startswith("DP")]]

[{'dîn', 'mîn', 'sîn'},
 {'dîn', 'ir(e)', 'iuwer', 'mîn', 'sîn', 'unser'},
 {'mîn', 'sîn'}]

In [42]:
# relative determinants
[pos for pos in pos_set if pos.startswith("DR")]

['DRELS']

In [43]:
pos_to_lemmata["DRELS"]

{'dër'}

In [44]:
# interrogative determinants
[pos for pos in pos_set if pos.startswith("DW")]

[]

In [45]:
# foreign words
[pos for pos in pos_set if pos.startswith("FM")]

[]

In [46]:
# Interjection
[pos for pos in pos_set if pos.startswith("ITJ")]

['ITJ']

In [47]:
pos_to_lemmata["ITJ"]

{'ach', 'hèi', 'wâfen', 'wê', 'ô'}

In [48]:
# Conjunctions
[pos for pos in pos_set if pos.startswith("KO")]

['KO*', 'KON', 'KOUS', 'KOKOM']

In [49]:
[pos_to_lemmata[pos] for pos in [pos for pos in pos_set if pos.startswith("KO")]]

[{'all-sam(e)',
  'all-sô',
  'sam(e)',
  'swanne~dazz',
  'sît~dazz',
  'sô',
  'wante',
  'wëder',
  'ê~dazz'},
 {'bèide', 'dës', 'noh', 'oder', 'unte'},
 {'danne',
  'dazz',
  'diè~wîle',
  'durh~dazz',
  'dô',
  'nû',
  'obe',
  'sîd',
  'unz(e)',
  'unz(e)~dazz',
  'wan',
  'wan~dazz',
  'êr'},
 {'danne', 'wan'}]

In [50]:
# Nouns
[pos for pos in pos_set if pos.startswith("N")]

['NE', 'NA']

In [51]:
[pos_to_lemmata[pos] for pos in [pos for pos in pos_set if pos.startswith("DI")]]

[{'all', 'bèide'},
 {'all',
  'all-so-l(i)ch',
  'bèide',
  'de-wëder',
  'deh-èin',
  'iè-mann',
  'ièd-wëder',
  'iètes-lich',
  'manig',
  'niè-mann',
  'èin',
  'èin-ander',
  'ëte(s)-lich'},
 {'bèide'},
 {'èin'},
 {'all', 'ander', 'bèide', 'deh-èin', 'iètes-lich', 'manig'}]

In [52]:
# pronominal adverbs
[pos for pos in pos_set if pos.startswith("PA")]

['PAVG', 'PAVAP', 'PAVD', 'PAVW']

In [53]:
[pos_to_lemmata[pos] for pos in [pos for pos in pos_set if pos.startswith("PA")]]

[{'swâr/+nâh'},
 {'ane/dâr+',
  'ane/dâr.+',
  'bî/dâr+',
  'hèime/dâr+',
  'inne/dâr+',
  'inne/hièr+',
  'mit(e)/dâr+',
  'mit(e)/wâr+',
  'nâh/dâr+',
  'nâh/swâr+',
  'umbe/dâr+',
  'umbe/wâr+',
  'unter/dâr+',
  'unter/dâr.+',
  'von(e)/dâr+',
  'von(e)/wâr+',
  'vor(e)/dâr+',
  'wider(e)/dâr+',
  'zuo/dâr+',
  'în/dâr+',
  'über(e)/dâr+'},
 {'dâr/+ane',
  'dâr/+bî',
  'dâr/+hèime',
  'dâr/+inne',
  'dâr/+mit(e)',
  'dâr/+nâh',
  'dâr/+umbe',
  'dâr/+unter',
  'dâr/+von(e)',
  'dâr/+vor(e)',
  'dâr/+wider(e)',
  'dâr/+zuo',
  'dâr/+în',
  'dâr/+über(e)',
  'dâr/.+ane',
  'dâr/.+unter.+',
  'hièr/+inne'},
 {'wâr/+mit(e)', 'wâr/+umbe', 'wâr/+von(e)'}]

In [54]:
# Pronouns
[pos for pos in pos_set 
 if pos in ["PG", "PI", "PPER", "PRF", "PW"]]

['PI', 'PW', 'PRF', 'PPER', 'PG']

In [55]:
[pos_to_lemmata[pos] for pos in ["PG", "PI", "PPER", "PRF", "PW"]]

[{'swër'},
 {'man'},
 {'dû', 'ich', 'ir', 'wir', 'ër'},
 {'dû', 'ich', 'ir', 'sich', 'ër'},
 {'wër'}]

In [56]:
# Particles
[pos for pos in pos_set if pos.startswith("PT")]

['PTKVZ', 'PTKNEG', 'PTK', 'PTKANT', 'PTKA']

In [57]:
[pos_to_lemmata[pos] for pos in 
 [pos for pos in pos_set if pos.startswith("PT")]]

[{'abe/+lâzen',
  'abe/+slahen',
  'ane/+bièten',
  'ane/+er-dwingen',
  'ane/+ge-vâhen',
  'ane/+ge-winnen',
  'ane/+hèben',
  'ane/+kapfen',
  'ane/+loufen',
  'ane/+ruofen',
  'ane/+schiffen',
  'bî/+wësen',
  'innen/+bringen',
  'mit(e)/+volgen',
  'nider(e)/+gân',
  'nider(e)/+rîten',
  'nider(e)/+vallen',
  'vür(e)/+sènten',
  'wider(e)/+ge-winnen',
  'wider(e)/+gëben',
  'wider(e)/+slahen',
  'ûf/+blicken',
  'ûf/+rihten',
  'ûf/+spannen',
  'ûf/+tuon',
  'ûf/+îlen',
  'ûz/+rîten/+gân'},
 {'ne', 'niht'},
 {'sô'},
 {'jâ'},
 {'ze'}]

In [58]:
# Auxiliary verbs
[pos for pos in pos_set if pos.startswith("VA")]

['VAIMP', 'VAFIN', 'VAPP', 'VAINF']

In [59]:
[pos_to_lemmata[pos] for pos in
 [pos for pos in pos_set if pos.startswith("VA")]]

[{'sîn'},
 {'haben', 'sîn', 'wërden', 'wësen', 'wësen/bî+'},
 {'wërden', 'wësen'},
 {'haben', 'sîn', 'wërden', 'wësen'}]

In [60]:
# modal verbs
[pos for pos in pos_set if pos.startswith("VM")]

['VMINF', 'VMFIN']

In [61]:
[pos_to_lemmata[pos] for pos in 
 [pos for pos in pos_set if pos.startswith("VM")]]

[{'wèllen'},
 {'durfen',
  'ge-turren',
  'kunnen',
  'mügen',
  'müèzen',
  'sol(e)n',
  'turren',
  'wèllen'}]

In [62]:
# Complete verbs
[pos for pos in pos_set if pos.startswith("VV")]

['VVFIN', 'VVPP', 'VVPS', 'VVINF', 'VVIMP']

In [63]:
# [pos_to_lemmata[pos] for pos in 
#  [pos for pos in pos_set if pos.startswith("VV")]]

## Morphology

In [64]:
inflections = extract_by_tag("infl", tokens)

In [65]:
len(inflections), len(set(inflections))

(15073, 283)

In [66]:
inflection_class = extract_by_tag("inflClass", tokens)

In [67]:
len(inflection_class), len(set(inflection_class))

(15073, 40)