In [1]:
import csv


import latechclfl2020.dlh.data as dlh_data
import latechclfl2020.volsunga.data as vol_data
import latechclfl2020.nibelungenlied.data as nib_data

from latechclfl2020.models.texts import Work

from enum import Enum

class NERCategory(Enum):
    """Named-entity categories used to label rows of the NER dataset.

    Each member's value is the upper-case string that ends up in the
    ``category`` column of the generated CSV.
    """
    person = "PERSON"
    place = "PLACE"
    group = "GROUP"

In [2]:
# Person names for each work; these objects are later handed to writerow()
# as lemma -> tokens mappings.
dlh_person_names = dlh_data.get_lemmatised_dlh_person_names()
vol_person_names = vol_data.get_volsunga_annotated_names()
nib_person_names = nib_data.get_nibelungenlied_annotated_names()


# Place names for each work.
dlh_place_names = dlh_data.get_lemmatised_dlh_place_names()
vol_place_names = vol_data.get_volsunga_annotated_places()
nib_place_names = nib_data.get_nibelungenlied_annotated_places()


# Group names; note that none are loaded for DLH in this notebook.
vol_group_names = vol_data.get_volsunga_annotated_groups()
nib_group_names = nib_data.get_nibelungenlied_annotated_groups()

In [3]:
def writerow(dict_writer, work: Work, category: NERCategory, lemma_to_tokens):
    """Emit one CSV row per lemma for a single work/category pair.

    Despite the singular name, this writes as many rows as there are lemmas.

    Args:
        dict_writer: object whose ``writerow`` accepts a dict keyed by ``fields``.
        work: source text; its ``value`` and the name of its language are recorded.
        category: entity category; its ``value`` is recorded.
        lemma_to_tokens: mapping from a lemma to an iterable of token strings.
    """
    for lemma, tokens in lemma_to_tokens.items():
        record = {
            "text": work.value,
            "category": category.value,
            "language": work.get_language().name,
            "lemma": lemma,
            # All tokens of a lemma share one cell, separated by colons.
            "tokens": ":".join(tokens),
        }
        dict_writer.writerow(record)

# Column order shared by the CSV writer and reader.
fields = [
    "text",
    "category",
    "language",
    "lemma",
    "tokens",
]

In [4]:
def write_dataset(filename):
    """Write all loaded name collections to ``filename`` as a header-less CSV.

    Rows are emitted category by category (person, place, group) and, within
    each category, in the order DLH, VOL, NIB. No group rows are written for
    DLH. Columns follow the module-level ``fields`` list; no header row is
    written, so readers must supply the field names themselves.

    Args:
        filename: path of the CSV file to create or overwrite (UTF-8).
    """
    sources = (
        # person
        (Work.DLH, NERCategory.person, dlh_person_names),
        (Work.VOL, NERCategory.person, vol_person_names),
        (Work.NIB, NERCategory.person, nib_person_names),
        # place
        (Work.DLH, NERCategory.place, dlh_place_names),
        (Work.VOL, NERCategory.place, vol_place_names),
        (Work.NIB, NERCategory.place, nib_place_names),
        # group
        (Work.VOL, NERCategory.group, vol_group_names),
        (Work.NIB, NERCategory.group, nib_group_names),
    )
    # newline="" hands line-ending control to the csv writer; without it the
    # writer's "\r\n" terminator is translated again on Windows ("\r\r\n").
    with open(filename, "w", encoding="utf-8", newline="") as f:
        dw = csv.DictWriter(f, fields, delimiter=",")
        for work, category, lemma_to_tokens in sources:
            writerow(dw, work, category, lemma_to_tokens)

In [5]:
def read_dataset(filename):
    """Lazily read a NER dataset CSV laid out according to ``fields``.

    The file is expected to have no header row; column names are taken from
    the module-level ``fields`` list. The file stays open until the generator
    is exhausted or closed.

    Args:
        filename: path of the CSV file to read (UTF-8).

    Yields:
        One dict per CSV row, keyed by ``fields``. The ``tokens`` entry is
        converted from its colon-joined form back into a list of strings;
        an empty cell becomes an empty list.
    """
    # newline="" lets the csv module do its own line-ending handling, so
    # newlines embedded in quoted fields are read back unchanged.
    with open(filename, "r", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f, fields):
            tokens_cell = row["tokens"]
            # "".split(":") would give [""]; an empty cell means "no tokens".
            row["tokens"] = tokens_cell.split(":") if tokens_cell != "" else []
            yield row

In [6]:
# Output path of the generated dataset, relative to the working directory.
dataset_filename = "ner-dataset.csv"

In [7]:
# Generate the CSV on disk (overwrites any existing file of that name).
write_dataset(dataset_filename)

In [8]:
# read_dataset is a generator function: nothing is read until a row is requested.
rd = read_dataset(dataset_filename)

In [9]:
# Peek at the first row as a sanity check; the builtin next() is the
# idiomatic way to advance a generator (rather than calling __next__ directly).
next(rd)

{'text': 'Decem Libri Historiarum',
 'category': 'PERSON',
 'language': 'latin',
 'lemma': 'Aaron',
 'tokens': ['Aaron']}