In [1]:
import codecs
import json
import os
import random


import spacy
from spacy.tokens import DocBin, Doc
from spacy.training import Example

In [2]:
def read_menota_annotations_for_spacy():
    l = []
    with codecs.open(os.path.join("data-menota-spacy.json"), "r", encoding="utf-8") as f:
        for book in f:
            for line in json.loads(book):
                if all([i == "-" for i in line[0]]):
                    continue
                l.append(line)
    return l

In [3]:
data = read_menota_annotations_for_spacy()

In [4]:
print(f"Number of tokens {sum([len(i[0]) for i in data])}")

Number of tokens 173164


In [5]:
unique_tokens = set()
for i in data:
    unique_tokens.update(i[0])

print(f"Number of unique tokens {len(unique_tokens)}")

Number of unique tokens 16866


In [6]:
unique_tags = set()
for i in data:
    unique_tags.update(i[1]["pos"])
# print(unique_tags)
print(f"Number of unique tags {len(unique_tags)}")

Number of unique tags 1171


In [7]:
print("Example")
print(data[0][0])
print(data[0][1]["pos"])

Example
['í', 'vígskǫrðum', 'verja', ',', 'þá', 'er', 'gott', 'at', 'gera', 'hengivígskǫ', 'af', 'léttum', 'viði']
['xAP', 'xNC gN nP cD sI', 'xVB fI tPS vA', '-', 'xAV rP', 'xVB fF tPS mIN p3 nS vA', 'xAJ rP gN nS cN sI', 'xIM', 'xVB fI tPS vA', 'xNC gN nP cA sI', 'xAP', 'xAJ rP gM nS cD sI', 'xNC gM nS cD sI']


In [8]:
def create_vocab(training_data):
    vocabulary_set = set()
    for text, _ in training_data:
        for lemma in text:
            vocabulary_set.add(lemma)
    return vocabulary_set


def reduce_tags(pos_tag: str) -> str:
    first_part = pos_tag.strip().split(" ")[0]
    if "|" in first_part:
        return reduce_tags(first_part.split("|")[0])
    if first_part.startswith("x"):
        return first_part[1:]
    elif "00000" == first_part:
        return ""
    else:
        return pos_tag

    
def reduce_word(word: str):
    #return word.replace(" ", "").lower()
    if word == "-":
        return word
    elif word == ",–":
        return "-"
    elif word == ",–":
        return ","
    else:
        return word.lower().replace(" ", "").replace("-", "").replace("'", "")


def from_text_annotations_to_spacy_training_data(data):
    l = []
    for sentence, annotation in data:
        try:
            l.append(([reduce_word(token) for token in sentence],
                      [reduce_tags(tag) for tag in annotation["pos"]]))
        except AttributeError:
            print(sentence)
            print(annotation)
    return l

In [9]:
pos_training_data = from_text_annotations_to_spacy_training_data(data)

In [10]:
unique_tags = set()
for i in pos_training_data:
    unique_tags.update(i[1])
# print(unique_tags)
print(f"Number of unique tags {len(unique_tags)}, {unique_tags}")

Number of unique tags 31, {'CS', 'AV', 'IT', 'IM', 'VB', 'RP', 'AJ', 'CU', 'DQ', 'CC', 'EX', 'AQ', 'AP', 'NP', 'NC', 'DP', 'NE', 'FN', 'PE', 'DD', 'PD', 'VP', 'NO', 'NA', 'PQ', '-', 'PR', 'AT', 'PI', 'FW', 'UA'}


In [11]:
print("Example")
print(f"{pos_training_data[0][0]}")
print(f"{pos_training_data[0][1]}")

Example
['í', 'vígskǫrðum', 'verja', ',', 'þá', 'er', 'gott', 'at', 'gera', 'hengivígskǫ', 'af', 'léttum', 'viði']
['AP', 'NC', 'VB', '-', 'AV', 'VB', 'AJ', 'IM', 'VB', 'NC', 'AP', 'AJ', 'NC']


In [12]:
def create_training(training_data):
    nlp = spacy.blank("is")
    db = DocBin()  # create a DocBin object
    text = []
    k = 0
    for j, item in enumerate(training_data):
        words = item[0]
        tags = item[1]
        try:
            example = Example.from_dict(nlp.make_doc(" ".join(words)), dict(words=words, tags=tags))
            db.add(example.reference)
            text.append(" ".join(words))
        except ValueError:
            k += 1
    print(f"{k} problematic cases")
            
    return db, " ".join(text)

In [13]:
def save_training_data(training_data):
    random.shuffle(training_data)
    valid = training_data[:500]
    train = training_data[500:]

    train, train_corpus = create_training(train)
    train.to_disk("./data/train.spacy")
    with codecs.open("./data/train_corpus.txt", "w", encoding="utf-8") as f:
        f.write(train_corpus)

    valid, valid_corpus = create_training(valid)
    valid.to_disk("./data/valid.spacy")
    with codecs.open("./data/valid_corpus.txt", "w", encoding="utf-8") as f:
        f.write(valid_corpus)

In [14]:
save_training_data(pos_training_data)

11 problematic cases
0 problematic cases


In [15]:
print(pos_training_data[0])

(['hann', 'hafði', 'þá', 'sverð', 'sitt', 'at', 'vega', 'með', 'ok', 'sótti', 'fram', 'vel', 'ok', 'hjó', 'til', 'beggja', 'handa', 'ok', 'felldi', 'margan', 'mann'], ['PE', 'VB', 'AV', 'NC', 'DP', 'IM', 'VB', 'VP', 'CC', 'VB', 'AV', 'AV', 'CC', 'VB', 'AP', 'PI', 'NC', 'CC', 'VB', 'AJ', 'NC'])


In [16]:
tags = []
for item in pos_training_data:
    labels = item[1]
    for label in labels:
        tags.append(label)
tags = list(set(tags))
with open ("tag.json", "w", encoding='utf-8') as f:
    print(tags)
    json.dump(tags, f, indent=4)

['CS', 'AV', 'IT', 'IM', 'VB', 'RP', 'CU', 'AJ', 'DQ', 'CC', 'EX', 'AQ', 'AP', 'NP', 'NC', 'DP', 'NE', 'FN', 'PE', 'DD', 'PD', 'VP', 'NO', 'NA', 'PQ', '-', 'PR', 'PI', 'AT', 'FW', 'UA']


In [17]:
!python -m spacy debug data config-v3.2.cfg --paths.train ./data/train.spacy --paths.dev ./data/valid.spacy

^C


In [18]:
!python -m spacy debug config config-v3.2.cfg --paths.train ./data/train.spacy --paths.dev ./data/valid.spacy

[1m
[1m
[1m
corpora
[+] Config is valid


In [19]:
!python -m spacy train config-v3.2.cfg --paths.train ./data/train.spacy --paths.dev ./data/valid.spacy

^C
