In [29]:
import spacy, os
import spacy_wrap
import srsly
import re
import copy
from prodigy.components.db import connect
from spacy.tokens import DocBin
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline


# Load gold-full data as doc objects
db = DocBin()
full = db.from_disk("../../data/full/gold/gold-full.spacy")
nlp = spacy.blank("da")
docs = list(full.get_docs(nlp.vocab))

In [30]:

review_docs_idxs = []
for i, doc in enumerate(docs):
    # If doc contains string matching TIME spans
    if re.search("\d{1,2}:\d\d ?[-|\||\/] ?\d", doc.text) or re.search(
        "dag: \d{1,2}", doc.text
    ):
        review_docs_idxs.append(i)
        
    # If doc contains string matching DATE spans:
    if re.search("\d{2,4} ?[-|–] ?\d{2,4}", doc.text):
        review_docs_idxs.append(i)
        
    # If doc contains string matching A/S og ApS
    if re.search("ApS", doc.text) or re.search("A\/S", doc.text):
        review_docs_idxs.append(i)
        
    # If doc contains a number written with letters and it is not included as a ent already
    if re.search(" to | to$|^to| To | To$|^To| TO | TO$|^TO| tre | tre$|^tre| Tre | Tre$|^Tre| TRE | TRE$|^TRE| fire | fire$|^fire| Fire | Fire$|^Fire| FIRE | FIRE$|^FIRE| fem | fem$|^fem| Fem | Fem$|^Fem| FEM | FEM$|^FEM| seks | seks$|^seks| Seks | Seks$|^Seks| SEKS | SEKS$|^SYV| otte | otte$|^otte| Otte | Otte$|^Otte| OTTE | OTTE$|^OTTE| ni | ni$|^ni| Ni | Ni$|^Ni| NI | NI$|^NI| ti | ti$|^ti| Ti | Ti$|^Ti| TI | TI$|^TI", ent.text):
        ents_string = " ".join([str(ent) for ent in list(doc.ents)])
        if re.search(" to | to$|^to| To | To$|^To| TO | TO$|^TO| tre | tre$|^tre| Tre | Tre$|^Tre| TRE | TRE$|^TRE| fire | fire$|^fire| Fire | Fire$|^Fire| FIRE | FIRE$|^FIRE| fem | fem$|^fem| Fem | Fem$|^Fem| FEM | FEM$|^FEM| seks | seks$|^seks| Seks | Seks$|^Seks| SEKS | SEKS$|^SYV| otte | otte$|^otte| Otte | Otte$|^Otte| OTTE | OTTE$|^OTTE| ni | ni$|^ni| Ni | Ni$|^Ni| NI | NI$|^NI| ti | ti$|^ti| Ti | Ti$|^Ti| TI | TI$|^TI", ents_string):
            review_docs_idxs.append(i)
        
    for ent in doc.ents:
        # Dates with "Den" or similar
        if ent.label_ == "DATE" and re.search("^d.{0,2} \d", ent.text):
            review_docs_idxs.append(i)
        
        # Himlen as LOCATION:
        if ent.label_ == "LOCATION" and re.search("[Hh][iI][mM][lL][Ee][Nn]|[Hh][iI][mM][mM][Ee][lL][Ee][Nn]", ent.text):
            review_docs_idxs.append(i)
        
        # Gud as PERSON:
        if ent.label_ == "PERSON" and re.search("[Gg][Uu][Dd]", ent.text):
            review_docs_idxs.append(i)
        
        # Adresses as GPE:
        if ent.label_ == "GPE" and re.search(".*\d ?", ent.text):
            review_docs_idxs.append(i)
            
        # Telephone numbers wrongly tagged as Cardinal
        if ent.label_ == "CARDINAL" and (
            re.search(
                "\d{2} \d{2} \d{2} \d{2}",
                ent.text
                or re.search("\+\d{2} \d{2} ?\d{2} ?\d{2} ?\d{2}$", ent.text)
                or re.search("^\d{4} ?\d{4}$", ent.text)
                or re.search(" \d{4} ?\d{4}$", ent.text)
                or re.search("^\d{4} ?\d{4}$", ent.text),
            )
        ):
            review_docs_idxs.append(i)
            
        # Websites wrongly tagged as ORGANIZATIONS:
        if ent.label_ == "ORGANIZATION" and re.search(".dk$|.com$", ent.text):
            review_docs_idxs.append(i)
            
        # Hotels and resorts wrongly tagged as ORGANIZATION:
        if ent.label_ == "ORGANIZATION" and re.search(
            ".*[h|H]otel.*|.*[R|r]esort.*", ent.text
        ):
            review_docs_idxs.append(i)
            
        # Numbers with / or :, wrongly tagged as CARDINAL:
        if ent.label_ == "CARDINAL" and (
            re.search("\/", ent.text)
            or re.search("\:", ent.text)
            or re.search(" ", ent.text)
            or re.search("-", ent.text)
        ):
            review_docs_idxs.append(i)
            
        # Fortrydelsesret, Ophavsret, Ytringsfrihed, Menneskerettigheder, Copyright, Returret wrongly tagged as LAWS
        if ent.label_ == "LAW" and (
            re.search(
                "[C|c]opyright",
                ent.text
                or re.search("[®|©]", ent.text)
                or re.search("[R|r]eturret", ent.text)
                or re.search("[f|F]ortrydelsesret", ent.text)
                or re.search("[o|O]phavsret$", ent.text)
                or re.search("enneskeret", ent.text),
            )
        ):
            review_docs_idxs.append(i)
            

review_docs_idxs = list(set(review_docs_idxs))
print(len(review_docs_idxs))


449


In [31]:
bad_docs = [docs[idx] for idx in review_docs_idxs]

good_docs = copy.deepcopy(docs)

review_docs_idxs.sort(reverse=True)
for idx in review_docs_idxs:
    del good_docs[idx]

In [32]:
db1 = DocBin(store_user_data=True)
for doc in bad_docs:
    db1.add(doc)
db1.to_disk("../../data/full/gold/gold-bad.spacy")

db2 = DocBin(store_user_data=True)
for doc in good_docs:
    db2.add(doc)
db2.to_disk("../../data/full/gold/gold-good.spacy")

# Extra just for being able to review against something:
db3 = DocBin(store_user_data=True)
for doc in bad_docs:
    doc.ents = ()
    db3.add(doc)
db3.to_disk("../../data/full/gold/gold-bad-no-tags.spacy")
