# What kind of language is Middle High German?

In [1]:
import sigurd.nib_marburg.mhg_lemmatizer as mhgl
import sigurd.nib_marburg.mhg_pos_tagger as mhgpt

In [2]:
import numpy

## Data retrieval

In [3]:
# Lookup table keyed by inflected word form; each value is a list of lemma strings
# (see the 'bluotiger' lookup further down).
forms_to_lemmata = mhgl.read_lemmatizer()

In [4]:
# Reverse table keyed by lemma; each value is a list of attested word forms
# (see the 'bluotig' lookup further down).
lemmata_to_forms = mhgl.read_lemma_to_forms()

In [5]:
# Peek at an arbitrary lemma (position 10000 in key order) to use as a running example.
list(lemmata_to_forms)[10000]

'bluotig'

In [6]:
# All word forms recorded for the example lemma.
# NOTE(review): the last entry in the output carries a trailing '\r' — presumably the
# reader does not strip CRLF line endings; confirm in mhg_lemmatizer.
lemmata_to_forms['bluotig']

['bluotiger', 'bluotigen', 'bluotige', 'bluotic', 'bluotigez', 'bluotigem\r']

In [7]:
# Peek at an arbitrary word form (position 10000 in key order).
list(forms_to_lemmata)[10000]

'ambahtent'

In [8]:
# Lemmata recorded for one inflected form of the example lemma.
forms_to_lemmata['bluotiger']

['bluotig\r']

In [9]:
# Table keyed by lemma; each value is a list of POS tag strings
# (see the "bluotig" lookup in the next cell).
lemmata_to_pos = mhgpt.read_lemmata_to_pos()

In [10]:
# POS tags recorded for the example lemma.
lemmata_to_pos["bluotig"]

['ADJN', 'ADJA', 'ADJD\r']

In [11]:
# Table keyed by word form (presumably the normalised spelling — confirm in
# mhg_pos_tagger); each value is a list of POS tag strings.
norm_to_pos_tagger = mhgpt.read_norm_to_pos_tagger()

In [12]:
# POS tags recorded for one word form of the example lemma.
norm_to_pos_tagger["bluotic"]

['ADJD', 'ADJA\r']

In [13]:
# Table keyed by POS tag; each value is a list of strings carrying that tag
# (see the "ADJD" sample printed in the next cell).
pos_tagger = mhgpt.read_pos_tagger()

In [14]:
# Show only the first 20 entries tagged ADJD; the full list is too long to display.
print(pos_tagger["ADJD"][:20])

['stèinëht', 'durh-vèrtig', 'ober', 'în-ge-tân', 'jung', 'ver-gihtig', 'èin-jærig', 'ge-hülfig', 'ge-stalt', 'hol', 'un-ge-sprochen', 'tèil-haft', 'wètte', 'bürtig', 'un-zal(e)-haft', 'wankel-müète', 'wunsch-lich', 'un-vèrtig', 'un-be-rèitet', 'sælig-lich']


## How ambiguous are MHG words?

One way to quantify ambiguity is to compute the mean number of POS tags per entry: first per word form, then per lemma.

In [15]:
# Mean number of POS tags recorded per word form.
numpy.mean([len(tags) for tags in norm_to_pos_tagger.values()])

1.1998466858037578

In [16]:
# Mean number of POS tags recorded per lemma.
numpy.mean([len(tags) for tags in lemmata_to_pos.values()])

1.3157481312555428

Another way to quantify ambiguity is to compute the mean number of lemmata per word form.

In [17]:
# Mean number of candidate lemmata per word form.
numpy.mean([len(lemmata) for lemmata in forms_to_lemmata.values()])

1.349849947807933

## How about the diversity of vocabulary?