In [2]:
import codecs
import json
import os
import random


import spacy
from spacy.tokens import DocBin, Doc
from spacy.training import Example

In [46]:
def read_menota_annotations_for_spacy():
    l = []
    with codecs.open(os.path.join("data-menota-spacy.json"), "r", encoding="utf-8") as f:
        for line in f:
            line = json.loads(line)
            l.append(line)
    return l


def create_vocab(training_data):
    vocabulary_set = set()
    for text, _ in training_data:
        for lemma in text:
            vocabulary_set.add(lemma)
    return vocabulary_set


def create_training(training_data):
    vocab = spacy.Vocab(string=create_vocab(training_data))
    nlp = spacy.blank("is")
    db = DocBin()  # create a DocBin object
    for item in training_data:
        words = item[0]
        pos = item[1]
#         example = Example.from_dict(Doc(vocab, words=words), pos)
        try:
            example = Example.from_dict(nlp.make_doc(" ".join(words)), pos)
            db.add(example.reference)
        except:
            ValueError

    return db


def save_training_data(training_data):
    random.shuffle(training_data)
    valid = training_data[:500]
    train = training_data[500:]

    train = create_training(train)
    train.to_disk("./data/train.spacy")

    valid = create_training(valid)
    valid.to_disk("./data/valid.spacy")


def reduce_tags(pos_tag: str) -> str:
    first_part = pos_tag.strip().split(" ")[0]
    if "|" in first_part:
        return reduce_tags(first_part.split("|")[0])
    if first_part.startswith("x"):
        return first_part[1:]
    elif "00000" == first_part:
        return ""
    else:
        return pos_tag


def from_text_annotations_to_spacy_training_data(data):
    l = []
    for doc in data:
        ll = []
        for sentence, annotation in doc:
            try:
                ll.append(([token.lower() for token in sentence],
                           dict(pos=[reduce_tags(tag) for tag in annotation["pos"]])))
                # lemmas=annotation["lemmas"]
            except AttributeError:
                print(sentence)
                print(annotation)
        l.extend(ll)
    return l

In [47]:
data = read_menota_annotations_for_spacy()

In [48]:
pos_training_data = from_text_annotations_to_spacy_training_data(data)

In [49]:
save_training_data(pos_training_data)

In [None]:
!python -m spacy preprocess data-menota-spacy.json

In [50]:
pos_training_data[0]

(['-', '-', '-', '-', '-', '-'], {'pos': ['CU', 'AV', 'VB', 'AV', 'NC', 'AV']})

In [52]:
!python -m spacy init fill-config base_config.cfg config.cfg

[+] Auto-filled config with all values

2021-09-22 16:01:33.975015: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll



[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [68]:
nlp = spacy.blank("is")
tagger = nlp.add_pipe("tagger")
tags = []
for item in pos_training_data:
    labels = item[1]["pos"]
    for label in labels:
        tagger.add_label(label)
        tags.append(label)
tags = list(set(tags))
with open ("tag.json", "w", encoding='utf-8') as f:
    json.dump(tags, f, indent=4)
nlp.to_disk("output/test")

In [71]:
!python -m spacy train output/test/config.cfg --output ./output --paths.train data/train.spacy --paths.dev data/valid.spacy

^C


2021-09-22 16:20:04.470701: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll
[2021-09-22 16:20:08,523] [INFO] Set up nlp object from config
[2021-09-22 16:20:08,531] [INFO] Pipeline: ['tagger']
[2021-09-22 16:20:08,541] [INFO] Created vocabulary
[2021-09-22 16:20:08,542] [INFO] Finished initializing nlp object
[2021-09-22 16:20:16,609] [INFO] Initialized pipeline components: ['tagger']


[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tagger']
[i] Initial learn rate: 0.001
E    #       LOSS TAGGER  TAG_ACC  SCORE 
---  ------  -----------  -------  ------
  0       0         0.00     0.00    0.00
  0     200         0.00     0.00    0.00
  0     400         0.00     0.00    0.00
  0     600         0.00     0.00    0.00
  0     800         0.00     0.00    0.00
  0    1000         0.00     0.00    0.00
  0    1200         0.00     0.00    0.00
  0    1400         0.00     0.00    0.00
  1    1600         0.00     0.00    0.00
[+] Saved pipeline to output directory
output\model-last
