# Re-annotation of Zoëga's dictionary

### Configuration

Install the required Python modules:
```bash
$ pip3 install -r requirements.txt
```

Install the IPython **kernel** associated with **python3.6**, following the instructions at [https://ipython.readthedocs.io/en/stable/install/kernel_install.html](https://ipython.readthedocs.io/en/stable/install/kernel_install.html).

In [1]:
import zoegas
# Print the package's built-in help (name, package contents, file location).
help(zoegas)

Help on package zoegas:

NAME
    zoegas

PACKAGE CONTENTS
    constants
    reader
    tests
    utils

FILE
    /home/clementbesnier/.virtualenvs/old_norse_notebook/src/zoegas/zoegas/__init__.py




In [2]:
from zoegas import reader
from zoegas.constants import abbreviations, pos_verbose, postags

In [3]:
# Open the dictionary under the name exported by zoegas.reader.
dictionary = reader.Dictionary(reader.dictionary_name)
# NOTE(review): get_entries() presumably loads the entries that the lookups
# below rely on — confirm in zoegas.reader.
dictionary.get_entries()

In [4]:
# Display the mapping from dictionary abbreviations (e.g. 'm.') to POS tags.
postags

defaultdict(str,
            {'a.': 'lkensf',
             'acc.': 'o',
             'adv.': 'a',
             'card. numb.': 'ta',
             'compar.': 'm',
             'conj.': 'c',
             'dat.': 'þ',
             'def. art.': 'gken',
             'dem. pron.': 'demonstrative pronoun',
             'f.': 'nven',
             'for.': 'e',
             'fem.': 'v',
             'gen.': 'e',
             'imperat.': 'sb',
             'impers.': 'impersonal',
             'indecl.': 'indeclinable',
             'indef. pron.': 'fo',
             'infin.': 'sn',
             'int. pron.': 'fs',
             'interj.': 'interjection',
             'm.': 'nken',
             'masc.': 'k',
             'n.': 'nhen',
             'neut.': 'h',
             'nom.': 'n',
             'ord. numb.': 'to',
             'pers. pron.': 'fp',
             'pl.': 'f',
             'poss. pron.': 'fe',
             'pp.': 'sþ',
             'pr. p.': 'se',
             'prep.': 'a[oþe}',
  

In [5]:
# Look up a single entry and print each field the reader exposes for it.
word = dictionary.find("heimr")
print(word.pos)  # list of verbose part-of-speech labels
print(word.word)  # the headword
print(word.description)  # plain-text body of the entry
print(word.translations)  # list of English glosses
print(word.references)
print(word.raw.decode("utf-8"))  # raw XML of the entry, stored as bytes

['masculine noun']
heimr


(-s, -ar), m.

1) a place of abode, a region or world (níu man ek heima); spyrja e-n í hvern heim, to ask one freely;

2) this world (segðu mér ór heimi, ek man ór helju); koma í heiminn, to be born; fara af heiminum, to depart this life; liggja milli heims ok heljar, to lie between life and death;

3) the earth; kringla heimsins, the globe.


['a place of abode, a region or world', 'to ask one freely', 'this world', 'to be born', 'to depart this life', 'to lie between life and death', 'the earth', 'the globe']
[]
<entry word="heimr">

	<m1>(-s, -ar), <p>m.</p></m1>

	<m2>1) <i><trn>a place of abode, a region or world</trn></i> (n&#237;u man ek heima); spyrja <p>e-n</p> &#237; hvern heim, <i><trn>to ask one freely</trn></i>;</m2>

	<m2>2) <i><trn>this world</trn></i> (seg&#240;u m&#233;r &#243;r heimi, ek man &#243;r helju); koma &#237; heiminn, <i><trn>to be born</trn></i>; fara af heiminum, <i><trn>to depart this life</trn></i>; liggja milli heims ok heljar

In [6]:
from lxml import etree
# XML parser with DTD loading enabled (load_dtd=True); no_network=False lifts
# lxml's default ban on network access when fetching related files.
# NOTE(review): only parse trusted XML with this configuration.
parser = etree.XMLParser(load_dtd=True, no_network=False)
# help(parser)

In [7]:
from zoegas.constants import postags, pos_verbose

In [8]:
# Reverse lookup: verbose part-of-speech label -> key of pos_verbose.
pos_verbose_inverted = {verbose: key for key, verbose in pos_verbose.items()}

In [9]:
import re
# def extract_parts(text):
    
def extract_category(text):
    """Return the category abbreviation that follows a parenthesised group.

    Searches *text* for ``(<something>), <category>.`` (for example
    ``(-s, -ar), m.``) and returns the word characters of ``<category>``
    without the trailing dot. When the pattern does not occur, *text* is
    returned unchanged.
    """
    found = re.search(r"\(.+?\), (?P<category>\w+)\.", text)
    return text if found is None else found.group("category")

def extract_subentries(text):
    """Split *text* on Roman-numeral markers such as ``I)`` or ``IV)``.

    Pieces that are empty or whitespace-only are dropped; the remaining
    pieces are returned as they are (not stripped).
    """
    pieces = re.split(r'[IVX]+\)', text)
    return list(filter(lambda piece: piece.strip(), pieces))

def extract_group_in_parentheses(text):
    """Return the content of the first non-empty ``(...)`` group in *text*.

    The match is non-greedy and does not span line breaks. When no such
    group exists, *text* itself is returned.
    """
    found = re.search(r'\((?P<first>.+?)\)', text)
    if found is None:
        return text
    return found.group("first")

def strip_ending(text: str, word_category: str) -> str:
    """Remove the category-specific ending from *text*, if present.

    Args:
        text: The word form to shorten.
        word_category: Category abbreviation without the trailing dot.
            ``"m"``, ``"f"`` and ``"n"`` strip a final ``"i"``;
            ``"v. refl."`` strips a final ``"st"``.

    Returns:
        *text* without the ending, or *text* unchanged when the category is
        not handled or the ending is absent.
    """
    # The previous version called ``text.endswith(text, "i")`` and
    # ``text.endswith()``; both raise TypeError, so no handled category
    # could ever be processed.
    if word_category in ("m", "f", "n"):
        if text.endswith("i"):
            return text[:-1]
    elif word_category == "v. refl.":
        if text.endswith("st"):
            return text[:-2]
    return text

def extract_weak_verb_forms(text):
    """Extract the three forms listed as ``(form1, form2, form3)`` in *text*.

    Args:
        text: Text expected to contain a parenthesised group of exactly three
            comma-separated words.

    Returns:
        A dict with the keys ``"sfg3en"``, ``"sfg3et"`` and ``"stken"`` mapped
        to the first, second and third word. An empty dict is returned when
        *text* contains no such group.
    """
    p = re.compile(r'\((?P<sfg3en>\w+), (?P<sfg3et>\w+), (?P<stken>\w+)\)')
    # Search the argument: the previous version searched a global named
    # ``description`` and ignored ``text`` entirely.
    m = p.search(text)
    if m is None:
        return {}
    # Keys follow the regex group names; the previous version misspelled the
    # second key as "sgg3et".
    return {name: value.strip() for name, value in m.groupdict().items()}



def extract_sub_element(text):
    """Parse one element of a parenthesised form list.

    If *text* contains ``<subcategory>. <form>``, return the tuple
    ``(subcategory, analyse_given_form(form))``. Otherwise return a
    one-item list holding the first run of word characters, hyphens and
    spaces found in *text* (leading/trailing spaces are kept).
    """
    labelled = re.search(r'(?P<subcategory>\w+)\. (?P<form>[\w\- ]+)', text)
    if labelled is None:
        # No "abbr. form" shape: fall back to the bare form.
        bare = re.search(r'(?P<form>[\w\- ]+)', text)
        return [bare.group("form")]
    return (labelled.group("subcategory"), analyse_given_form(labelled.group("form")))
        
        
def extract_list_from_parentheses(text):
    """Turn the content of a parenthesised group into parsed elements.

    Three shapes are handled:

    * *text* contains ``;``: every comma-separated piece of every
      ``;``-separated part is parsed (blank pieces included) and a flat list
      is returned.
    * the word ``later`` occurs as a space-separated token: *text* is split
      on ``later`` and a dict ``{"before": [...], "after": [...]}`` is
      returned, each list holding the parsed non-blank pieces.
    * otherwise: a list of the parsed non-blank comma-separated pieces.

    Each piece is parsed with ``extract_sub_element``.
    """
    def parse_non_blank(chunk):
        return [extract_sub_element(piece) for piece in chunk.split(",") if piece.strip()]

    if ";" in text:
        return [
            extract_sub_element(piece)
            for part in text.split(";")
            for piece in part.split(",")
        ]

    if "later" in text.replace(",", "").split(" "):
        # Unpacking requires exactly one occurrence of "later" in the text.
        before, after = text.split("later")
        return {"before": parse_non_blank(before), "after": parse_non_blank(after)}

    return parse_non_blank(text)


def analyse_given_form(text):
    """Classify a form string taken from a dictionary entry.

    Commas are removed first. Then:

    * if ``and`` occurs as a space-separated word, the text is split at each
      such word and one ``{"complete": <part>}`` dict is produced per part
      (parts are stripped);
    * if the text starts with ``-``, the result is ``[{"ending": text}]``
      (the hyphen is kept);
    * otherwise the result is ``[{"complete": text}]``.

    Args:
        text: The form description, e.g. ``"-s"`` or ``"heims"``.

    Returns:
        A list of one-key dicts as described above.
    """
    text = text.replace(",", "")
    words = text.split(" ")

    if "and" in words:
        # Split on the *word* "and" only. Splitting on the substring (as the
        # previous version did) cut words such as "landa" into pieces.
        alternatives = []
        current = []
        for token in words:
            if token == "and":
                alternatives.append(" ".join(current))
                current = []
            else:
                current.append(token)
        alternatives.append(" ".join(current))
        return [{"complete": alternative.strip()} for alternative in alternatives]

    # The previous version also tested for the token "later," here; that
    # could never match because commas are removed above, so such input
    # falls through to the cases below exactly as it did before.
    if text.startswith("-"):
        return [{"ending": text}]
    return [{"complete": text}]

def display_entry_processing(entry):
    """Print the description of *entry* and the forms extracted from it.

    The stripped description is printed between two separator lines. If the
    description contains a parenthesised group, the group is parsed with
    ``extract_list_from_parentheses`` and the result printed. Otherwise the
    description is split into sub-entries and the text before the first
    ``;`` of each sub-entry is printed as a list.

    Args:
        entry: An object with a ``description`` string attribute, or ``None``
            for a failed lookup.

    Returns:
        None. All output goes to stdout.
    """
    if entry is None:
        # The previous version built this message from a global ``word``
        # unrelated to the argument; the looked-up word is not known here.
        print("Entry is not in the dictionary")
        return

    description = entry.description.strip()
    print("-------------------------------------------------")
    print(description)
    print("-------------------------------------------------")
    first_group = extract_group_in_parentheses(description)
    if first_group == description:
        # No parenthesised group: keep the head of each sub-entry.
        l = []
        for subentry in extract_subentries(description):
            l.append(subentry.split(";")[0])
            print()
        print(l)
    else:
        print(extract_list_from_parentheses(first_group))

In [10]:
def apply_suffix(suffixes, word, pos):
    """Build inflected forms of *word* from a comma-separated suffix list.

    Each item of *suffixes* that starts with ``-`` and has at least one
    character after it is appended to a stem. For ``pos == "s"`` the stem is
    *word* without its last character; for any other *pos* it is *word*
    itself. When the stem already ends with the first character of the
    suffix, that character is not repeated.

    Args:
        suffixes: Comma-separated suffixes, e.g. ``"-tta, -ttr"``.
        word: The headword the suffixes apply to.
        pos: POS tag of the headword; only ``"s"`` is treated specially.

    Returns:
        The list of built forms, in the order of *suffixes*. Items that are
        empty, a bare ``-``, or do not start with ``-`` are skipped.
    """
    stem = word[:-1] if pos == "s" else word
    forms = []
    for suffix in suffixes.split(","):
        ssuffix = suffix.strip()
        # Skipping short items avoids the IndexError the previous version
        # raised on an empty item (e.g. a trailing comma) or a bare "-".
        if len(ssuffix) < 2 or not ssuffix.startswith("-"):
            continue
        ending = ssuffix[1:]
        if stem.endswith(ending[0]):
            ending = ending[1:]
        forms.append(stem + ending)
    return forms

# Example with pos "s": the last letter of the word is dropped before each suffix is attached.
apply_suffix("-tta, -ttr", "umbreyta", "s")
        

['umbreytta', 'umbreyttr']

In [11]:
# Scratch check: slicing with [:-1] drops the last character.
a = "abcd"
a[:-1]

'abc'

In [12]:
def from_dictionary_view_to_good_annotation(parser, entry: reader.Entry):
    """Parse the raw XML of *entry*, annotate it and print the result.

    The element is printed together with its ``word`` attribute. When the
    entry has exactly one part of speech, the matching tag is stored in a
    ``pos`` attribute and printed. Two ``<form>`` children with fixed values
    (``heims`` tagged ``nkee``, ``heimar`` tagged ``nkfn``) are then appended
    and the whole tree is pretty-printed.

    Args:
        parser: Accepted but not used by the current implementation.
        entry: Entry providing ``raw`` (UTF-8 encoded XML bytes) and ``pos``
            (list of verbose part-of-speech labels).
    """
    tree = etree.fromstring(entry.raw.decode("utf-8"))
    print(tree)
    print(tree.get("word"))
    if len(entry.pos) == 1:
        verbose_pos = entry.pos[0]
        # verbose label -> abbreviation -> POS tag
        tree.set("pos", postags[pos_verbose_inverted[verbose_pos]])
        print(tree.get("pos"))

    # Hard-coded sample forms, appended in this order.
    for form_pos, form_text in (("nkee", "heims"), ("nkfn", "heimar")):
        form_element = etree.SubElement(tree, "form")
        form_element.set("pos", form_pos)
        form_element.text = form_text

    print(etree.tostring(tree, pretty_print=True).decode("utf-8"))
    
    

In [13]:
# Annotate the "heimr" entry looked up earlier, then show what the category
# and sub-entry extractors return for its description.
from_dictionary_view_to_good_annotation(parser, word)
print("=============================================")
print(extract_category(word.description))
print(extract_subentries(word.description))

<Element entry at 0x7fa3161debc8>
heimr
nken
<entry word="heimr" pos="nken">

	<m1>(-s, -ar), <p>m.</p></m1>

	<m2>1) <i><trn>a place of abode, a region or world</trn></i> (n&#237;u man ek heima); spyrja <p>e-n</p> &#237; hvern heim, <i><trn>to ask one freely</trn></i>;</m2>

	<m2>2) <i><trn>this world</trn></i> (seg&#240;u m&#233;r &#243;r heimi, ek man &#243;r helju); koma &#237; heiminn, <i><trn>to be born</trn></i>; fara af heiminum, <i><trn>to depart this life</trn></i>; liggja milli heims ok heljar, <i><trn>to lie between life and death</trn></i>;</m2>

	<m2>3) <i><trn>the earth</trn></i>; kringla heimsins, <i><trn>the globe</trn></i>.</m2>

<form pos="nkee">heims</form><form pos="nkfn">heimar</form></entry>

m
['\n\n(-s, -ar), m.\n\n1) a place of abode, a region or world (níu man ek heima); spyrja e-n í hvern heim, to ask one freely;\n\n2) this world (segðu mér ór heimi, ek man ór helju); koma í heiminn, to be born; fara af heiminum, to depart this life; liggja milli heims ok he

In [14]:
# Show the description of the "heimr" entry and the forms parsed from it.
display_entry_processing(word)

-------------------------------------------------
(-s, -ar), m.

1) a place of abode, a region or world (níu man ek heima); spyrja e-n í hvern heim, to ask one freely;

2) this world (segðu mér ór heimi, ek man ór helju); koma í heiminn, to be born; fara af heiminum, to depart this life; liggja milli heims ok heljar, to lie between life and death;

3) the earth; kringla heimsins, the globe.
-------------------------------------------------
[['-s'], [' -ar']]


In [15]:
def count_words_by_letter(letter):
    """Return how many entries ``dictionary.find_beginning_with(letter)`` yields."""
    matching_words = []
    for entry in dictionary.find_beginning_with(letter):
        matching_words.append(entry.word)
    return len(matching_words)

In [16]:
# Number of entries returned for the letter "u".
count_words_by_letter("u")

349

In [17]:
# Build one row per "u" entry whose part of speech can be determined.
#   - Description contains "(...), <cat>.": row is
#     [tag, word, first line, parenthesised group, generated forms].
#   - Otherwise, if the first space-separated token is a known abbreviation:
#     row is [tag, word, description, generated forms].
l = []
for entry in dictionary.find_beginning_with("u"):
    description = entry.description.strip()
    if not description:
        continue
    first_line = description.split("\n")[0]
    category_extracted_1 = extract_category(description)
    if category_extracted_1 != description:
        l.append([postags[category_extracted_1+"."], entry.word, first_line, extract_group_in_parentheses(first_line),
                  apply_suffix(extract_group_in_parentheses(first_line), entry.word, postags[category_extracted_1+"."])])
        continue
    pos = description.split(" ")[0]
    if pos in postags:
        # Use the tag of the abbreviation just matched. The previous version
        # indexed postags with the whole description + ".", which passed the
        # wrong tag and added a junk key to the defaultdict on every entry.
        l.append([postags[pos], entry.word, description,
                  apply_suffix(extract_group_in_parentheses(first_line), entry.word, postags[pos])])

In [18]:
# Print every row; rows with more than four fields are flagged with an arrow.
for i in l:
    if len(i) <= 4:
        print(i)
    else:
        print("===>>>>", i)

['nven', 'umskipting', 'f. change.', []]
['nhen', 'umkeypi', 'n. exchange, barter.', []]
['lkensf', 'undrligr', 'a. wonderful, strange.', []]
['lkensf', 'uppburðarlítill', 'a. shy, timid.', []]
['nven', 'uppgreizla', 'f. payment, discharge.', []]
['nven', 'umsniðning', 'f. circumcision.', []]
['nven', 'umhugsan', 'f. reflection.', []]
['nven', 'upptendran', 'f. kindling.', []]
['nken', 'undirbiskup', 'm. suffragan bishop.', []]
['lkensf', 'undirhyggjusamr', 'a. guileful.', []]
['nhen', 'uxahöfuð', 'n. head of an ox.', []]
['nken', 'umlesmaðr', 'm. slanderer.', []]
['nken', 'upphlaupsmaðr', 'm. rioter.', []]
['lkensf', 'undranarverðr', 'a. wonderful.', []]
===>>>> ['s', 'ugga', '(-ða, -at), v.', '-ða, -at', ['uggða', 'uggat']]
['lkensf', 'ugglauss', 'a. fearless, unconcerned.', []]
['a', 'ugglaust', 'adv. no doubt, undoubtedly.', []]
['nken', 'uggr', 'm. fear, apprehension; ~ er mér á, at = mik uggir, at.', []]
['lkensf', 'uggsamligr', 'a. to be feared.', []]
['lkensf', 'uggviss', 'a. d