# Load packages

In [75]:
import dacy
import os
import json
import copy
import re
import glob
import pandas as pd
from collections import Counter
from spacy.tokens import DocBin, Doc, Span
from spacy.training.corpus import Corpus


# Defining functions


In [76]:
# Defining a function for retrieving all ents for a given doc
def retrieve_all_ents(doc, all_docs):
    """Collect the entities of every doc in ``all_docs`` that has the same text as ``doc``.

    Args:
        doc: Object exposing ``.text``; only its text is used for matching.
        all_docs: Iterable of objects exposing ``.text`` and ``.ents``.

    Returns:
        A flat list of the entities of all matching docs, in iteration order.
        Empty when no doc in ``all_docs`` has the same text.
    """
    return [
        ent
        for candidate in all_docs
        if candidate.text == doc.text
        for ent in candidate.ents
    ]


# Defining a function for exploding a doc and exploding its ents
def explode_doc(doc, ents):
    """Put a doc and a list of entities into plain-dictionary form.

    Args:
        doc: Object exposing ``.text``.
        ents: Iterable of entities exposing ``.text`` and ``.label_``.

    Returns:
        A dict with the keys ``"doc.text"``, ``"doc"`` and ``"doc.ents"``.
        ``"doc.ents"`` is a list with one dict per entity holding the entity
        itself (``"ent"``), its text, its label, and the text and label
        concatenated (``"ent.label_and_text"``).
    """
    exploded_ents = []
    for ent in ents:
        text = ent.text
        label = ent.label_
        exploded_ents.append(
            {
                "ent": ent,
                "ent.text": text,
                "ent.label_": label,
                # Text first, then label, with no separator
                "ent.label_and_text": text + label,
            }
        )
    return {"doc.text": doc.text, "doc": doc, "doc.ents": exploded_ents}


# Defining function for retrieving a list with unique ents, and the count of the unique ents
def doc_ents_count(exploded_doc, match):
    """Count how often each distinct entity occurs in an exploded doc.

    Two entities are considered the same when they agree on the comparison
    key selected by ``match``. The first occurrence of each distinct entity is
    kept as its representative; every occurrence (including repeats) adds one
    to its count.

    Args:
        exploded_doc: Dict with the keys ``"doc"`` and ``"doc.ents"``, where
            ``"doc.ents"`` is a list of entity dicts carrying ``"ent.text"``
            and ``"ent.label_and_text"``.
        match: ``'label_and_text'`` to compare on text plus label; any other
            value compares on the text alone.

    Returns:
        A tuple ``(doc, unique_ents, unique_ents_count)`` where ``doc`` is
        ``exploded_doc["doc"]``, ``unique_ents`` lists the representative
        entity dicts in order of first appearance, and ``unique_ents_count``
        holds the matching counts at the same positions.
    """
    compare_key = "ent.label_and_text" if match == 'label_and_text' else "ent.text"

    unique_ents = []
    unique_ents_count = []
    # Maps a comparison value to its position in unique_ents, so each entity
    # is resolved with one dict lookup instead of rescanning the list.
    position_of = {}

    for ent in exploded_doc["doc.ents"]:
        value = ent[compare_key]
        position = position_of.get(value)
        if position is None:
            position_of[value] = len(unique_ents)
            unique_ents.append(ent)
            unique_ents_count.append(1)
        else:
            unique_ents_count[position] += 1

    return exploded_doc["doc"], unique_ents, unique_ents_count


# Define function for getting ratio of docs where ent appears
def get_ratio(doc, unique_ents, unique_ents_count, all_docs, n_raters):
    """Turn entity counts into proportions by dividing each by ``n_raters``.

    Args:
        doc: Passed through unchanged.
        unique_ents: Passed through unchanged.
        unique_ents_count: Iterable of numeric counts, one per unique entity.
        all_docs: Unused; kept so existing callers keep working.
        n_raters: Divisor applied to every count.

    Returns:
        A tuple ``(doc, unique_ents, unique_ents_proportion)`` where the last
        element is a list of ``count / n_raters`` values.

    Raises:
        ZeroDivisionError: If ``n_raters`` is 0 and there is at least one count.
    """
    unique_ents_proportion = [count / n_raters for count in unique_ents_count]
    return doc, unique_ents, unique_ents_proportion


# Define a function for finding frequent or infrequent annotations (relative to a threshold)
def retrieve_freq_or_infreq_ents(
    doc, unique_ents, unique_ents_ratio, threshold=0.5, find_freq=True
):
    """Select the entities whose ratio lies strictly above or below ``threshold``.

    Args:
        doc: Passed through unchanged.
        unique_ents: Sequence of entity dicts, each with an ``"ent"`` key.
        unique_ents_ratio: Ratios aligned position-by-position with
            ``unique_ents``.
        threshold: Cut-off ratio. The comparison is strict in both
            directions, so an entity whose ratio equals the threshold is
            selected in neither mode.
        find_freq: Truthy selects entities with ratio ``> threshold``; falsy
            selects entities with ratio ``< threshold``.

    Returns:
        A tuple ``(doc, selected_ents)`` where ``selected_ents`` holds the
        ``"ent"`` values of the selected entity dicts, in input order.
    """
    # Branch on truthiness so that every call returns a 2-tuple; comparing
    # against True/False explicitly let other values fall through to None.
    selected_ents = [
        unique_ent["ent"]
        for unique_ent, ratio in zip(unique_ents, unique_ents_ratio)
        if (ratio > threshold if find_freq else ratio < threshold)
    ]
    return doc, selected_ents


# Define a function for deleting any ents in a doc that share a boundary with a given ent
def del_ents_from_freq(doc, frequent_ent_for_doc):
    """Drop every entity of ``doc`` that starts or ends where the given entity does.

    An entity is removed when its ``start_char`` equals the reference entity's
    ``start_char`` OR its ``end_char`` equals the reference entity's
    ``end_char``. Entities that merely overlap the reference without sharing
    one of those two offsets are kept.

    Args:
        doc: Object with an assignable ``.ents`` sequence; modified in place.
        frequent_ent_for_doc: Reference entity exposing ``.start_char`` and
            ``.end_char``.

    Returns:
        The same ``doc`` object, with ``doc.ents`` replaced by a tuple of the
        surviving entities in their original order.
    """
    ref_start = frequent_ent_for_doc.start_char
    ref_end = frequent_ent_for_doc.end_char
    doc.ents = tuple(
        ent
        for ent in doc.ents
        if ent.start_char != ref_start and ent.end_char != ref_end
    )
    return doc


# Define a function for deleting any ents in a doc that share a boundary with any ent in a list
def del_ents_from_freq_multiple(doc, frequent_ents_for_doc):
    """Apply ``del_ents_from_freq`` to ``doc`` once for each reference entity.

    Args:
        doc: The doc to prune; handed to ``del_ents_from_freq`` for every
            reference entity in turn.
        frequent_ents_for_doc: Iterable of reference entities.

    Returns:
        The result of the last ``del_ents_from_freq`` call, or ``doc`` itself
        when ``frequent_ents_for_doc`` is empty.
    """
    pruned = doc
    for reference_ent in frequent_ents_for_doc:
        pruned = del_ents_from_freq(pruned, reference_ent)
    return pruned


# Define a function for adding a frequent entity to a doc
def add_freq_ent_to_doc(doc, frequent_ent_for_a_doc):
    """Append one entity to the end of ``doc.ents``.

    Args:
        doc: Object whose ``.ents`` is a tuple and can be reassigned; modified
            in place.
        frequent_ent_for_a_doc: The entity to append.

    Returns:
        The same ``doc`` object.
    """
    doc.ents += (frequent_ent_for_a_doc,)
    return doc


# Define a function for adding frequent ents in a list of ents to a doc
def add_freq_ents_to_doc(doc, frequent_ents_for_a_doc):
    """Add every entity of a list to ``doc``, one ``add_freq_ent_to_doc`` call each.

    Args:
        doc: The doc to extend.
        frequent_ents_for_a_doc: Iterable of entities to add, in order.

    Returns:
        The result of the last ``add_freq_ent_to_doc`` call, or ``doc`` itself
        when ``frequent_ents_for_a_doc`` is empty.
    """
    extended = doc
    for new_ent in frequent_ents_for_a_doc:
        extended = add_freq_ent_to_doc(extended, new_ent)
    return extended


# Define a function for finding the index of a list, in which the doc matches another doc
def get_same_doc_index(doc, list_of_docs):
    """Find the position of the first doc in ``list_of_docs`` with the same text as ``doc``.

    Args:
        doc: Object exposing ``.text``.
        list_of_docs: Iterable of objects exposing ``.text``.

    Returns:
        The zero-based index of the first match, or ``None`` when no doc in
        ``list_of_docs`` has the same text.
    """
    matching_positions = (
        position
        for position, candidate in enumerate(list_of_docs)
        if candidate.text == doc.text
    )
    return next(matching_positions, None)


# Define a function for streamlining a doc in accordance with frequent_ents_for_doc and infrequent_ents_for_doc
def streamline(rater_doc, infrequent_ents_for_doc, frequent_ents_for_doc):
    """Return a copy of ``rater_doc`` aligned with the frequent/infrequent entities.

    Works on a deep copy, so ``rater_doc`` itself is left untouched. On the
    copy, in this order:

    1. entities sharing a boundary with any infrequent entity are removed,
    2. entities sharing a boundary with any frequent entity are removed,
    3. all frequent entities are added.

    Args:
        rater_doc: The rater's doc to streamline.
        infrequent_ents_for_doc: Entities whose annotations should be removed.
        frequent_ents_for_doc: Entities that should end up in the doc.

    Returns:
        The streamlined copy.
    """
    streamlined = copy.deepcopy(rater_doc)
    # Clear the slots of the infrequent ents first, then those of the frequent
    # ents, so the frequent ents can be inserted afterwards
    for ents_to_clear in (infrequent_ents_for_doc, frequent_ents_for_doc):
        streamlined = del_ents_from_freq_multiple(streamlined, ents_to_clear)
    return add_freq_ents_to_doc(streamlined, frequent_ents_for_doc)


# Defining a function for the entire streamlining

In [77]:
def streamline_doc_for_rater(doc, flat_list, rater_docs, thresholds, n_raters):
    """Streamline one rater's version of ``doc`` against all raters' annotations.

    Frequent entities are determined by matching on text plus label and
    infrequent entities by matching on text only. A diagnostic summary is
    printed on every call.

    Args:
        doc: The doc (identified by its text) to process.
        flat_list: All docs of all raters; source of the pooled annotations.
        rater_docs: The docs of the rater whose version is streamlined.
        thresholds: Dict with the keys ``'find_freq'`` and ``'find_infreq'``.
        n_raters: Divisor used to turn entity counts into ratios.

    Returns:
        The streamlined copy of the rater's doc, or ``None`` when no doc with
        the same text exists in ``rater_docs``.
    """
    # Pool the annotations of every rater for this text and put them in dictionary form
    exploded_doc = explode_doc(doc, retrieve_all_ents(doc, flat_list))

    # Frequent ents: unique by text + label, ratio above thresholds['find_freq']
    _, unique_ents_full, unique_ents_count_full = doc_ents_count(
        exploded_doc, match='label_and_text'
    )
    _, unique_ents_full, unique_ents_ratio_full = get_ratio(
        doc, unique_ents_full, unique_ents_count_full, flat_list, n_raters
    )
    _, frequent_ents_for_doc = retrieve_freq_or_infreq_ents(
        doc,
        unique_ents_full,
        unique_ents_ratio_full,
        threshold=thresholds['find_freq'],
        find_freq=True,
    )

    # Infrequent ents: unique by text only, ratio below thresholds['find_infreq']
    _, unique_ents_partial, unique_ents_count_partial = doc_ents_count(
        exploded_doc, match='text'
    )
    _, unique_ents_partial, unique_ents_ratio_partial = get_ratio(
        doc, unique_ents_partial, unique_ents_count_partial, flat_list, n_raters
    )
    _, infrequent_ents_for_doc = retrieve_freq_or_infreq_ents(
        doc,
        unique_ents_partial,
        unique_ents_ratio_partial,
        threshold=thresholds['find_infreq'],
        find_freq=False,
    )

    # Plain texts, only used for the diagnostic print below
    unique_ents_full_texts = [ent['ent.text'] for ent in unique_ents_full]
    unique_ents_partial_texts = [ent['ent.text'] for ent in unique_ents_partial]

    # Position of this doc among the rater's docs (None if the rater never saw it)
    idx = get_same_doc_index(doc, rater_docs)

    print(f'Doc index for rater: {idx} \nDoc: {doc.text} \nunique_ents_full: {unique_ents_full_texts} \nunique_ents_ratio_full: {unique_ents_ratio_full} \nfrequent_ents: {frequent_ents_for_doc} \nunique_ents_partial: {unique_ents_partial_texts} \nunique_ents_ratio_partial: {unique_ents_ratio_partial} \ninfrequent_ents: {infrequent_ents_for_doc}')

    # Nothing to streamline when the rater has no doc with this text
    if idx is None:
        print("\n\n\n\ndoc not in rater_docs")
        return None

    # Work on a copy so the rater's original doc stays untouched
    rater_doc = copy.deepcopy(rater_docs[idx])
    print(f'doc.ents BEFORE streamlining: {rater_doc.ents}')
    streamlined_rater_doc = streamline(rater_doc, infrequent_ents_for_doc, frequent_ents_for_doc)
    print(f'doc.ents AFTER streamlining: {streamlined_rater_doc.ents} \n\n\n\n')
    return streamlined_rater_doc

# Loading data

In [78]:
# Change cwd
# NOTE(review): machine-specific absolute path; this cell only runs where this directory exists.
os.chdir("/Users/emiltrencknerjessen/Desktop/priv/DANSK-gold-NER")

# Load language object
nlp = dacy.load("medium")


def _rater_number(path):
    """Return the first run of digits in ``path`` as an int (the rater number)."""
    return int(re.search(r"\d+", path).group())


# List relevant data and sort numerically by rater number, so that e.g.
# rater_10 comes after rater_9 without special-casing any single path.
# The path itself is the tie-breaker to keep the order deterministic.
data_paths = glob.glob("./data/DANSK-multi/unprocessed/rater*/data.spacy")
data_paths.sort(key=lambda path: (_rater_number(path), path))

# Load in data and get rater indices
data = []
raters_idx = []
for path in data_paths:
    # Get rater indices (zero-based: rater number minus one)
    raters_idx.append(_rater_number(path) - 1)

    # Load data
    doc_bin = DocBin().from_disk(path)
    # NOTE(review): only the first 20 docs per rater are kept - confirm this
    # truncation is intended and not a debugging leftover.
    docs = list(doc_bin.get_docs(nlp.vocab))[:20]
    data.append(docs)


### Excluding rater 2, 8 and 10

In [79]:
# Excluding rater 2, 8 and 10 (zero-based positions 1, 7 and 9 in `data`)
indexes = [1, 7, 9]
# Delete from the back so that the earlier positions stay valid
for index in sorted(indexes, reverse=True):
    del data[index]

# Rater numbers (one-based) of the raters that remain, in the order of `data`
kept_rater_numbers = [
    rater + 1 for position, rater in enumerate(raters_idx) if position not in indexes
]

# Map each position in the filtered `data` to the number of the rater it belongs to
raters_lookup = dict(enumerate(kept_rater_numbers))

# From here on, raters_idx holds positions into the filtered `data`
raters_idx = list(range(len(data)))


# Retrieve unique documents


In [80]:
# Get a list of all unique docs (first occurrence of each text, in the order of flat_list)
flat_list = [item for sublist in data for item in sublist]
unique_docs = []
seen_texts = set()  # texts already represented in unique_docs
for doc in flat_list:
    text = doc.text
    if text in seen_texts:
        continue
    seen_texts.add(text)
    unique_doc = copy.deepcopy(doc)
    # Ensure that unique_docs don't already have entities
    unique_doc.ents = ()
    unique_docs.append(unique_doc)

# Streamlining docs

In [81]:
# Have a doc and a list of all docs (with duplicates from each rater)
# doc = unique_docs[0]
# flat_list = flat_list
# rater_docs = copy.deepcopy(data[0])
# thresholds = {'find_freq': .5, 'find_infreq': .4}

# A single doc for a single rater
#streamlined_rater_doc = streamline_doc_for_rater(doc, flat_list, rater_docs, thresholds)

# All docs for a single rater
# streamlined_docs = []
# for doc in unique_docs:
#     streamlined_docs.append(streamline_doc_for_rater(doc, flat_list, rater_docs, thresholds))

In [73]:
# All docs for all raters
# Note: with equal thresholds, an entity whose ratio is exactly the threshold
# is treated as neither frequent nor infrequent (both comparisons are strict).
thresholds = {'find_freq': .3, 'find_infreq': .3}
# NOTE(review): the divisor is one larger than the number of raters being
# processed, so an entity annotated by every rater gets a ratio below 1.
# Confirm the +1 is intended and not an off-by-one.
n_raters = len(raters_idx)+1

# One list of streamlined docs per rater, in the order of raters_idx
streamlined_data = []
for rater_idx in raters_idx:

    streamlined_docs = []
    rater_docs = copy.deepcopy(data[rater_idx])
    for doc in unique_docs:
        print(f'Current rater idx: {rater_idx}')
        print(f'Current rater: {raters_lookup[rater_idx]}')
        streamlined_doc = streamline_doc_for_rater(doc, flat_list, rater_docs, thresholds, n_raters)
        # None means the rater has no doc with this text; skip it
        if streamlined_doc is not None:
            streamlined_docs.append(streamlined_doc)
    streamlined_data.append(streamlined_docs)

Current rater idx: 0
Current rater: 1
Doc index for rater: 0 
Doc: Hvordan kan statsministeren kalde børn,der er født på danske hospitaler for indvandrerdrenge! 
unique_ents_full: ['danske', 'statsministeren', 'hospitaler', 'indvandrerdrenge'] 
unique_ents_ratio_full: [0.875, 0.25, 0.25, 0.25] 
frequent_ents: [danske] 
unique_ents_partial: ['danske', 'statsministeren', 'hospitaler', 'indvandrerdrenge'] 
unique_ents_ratio_partial: [0.875, 0.25, 0.25, 0.25] 
infrequent_ents: [statsministeren, hospitaler, indvandrerdrenge]
doc.ents BEFORE streamlining: (danske,)
doc.ents AFTER streamlining: (danske,) 




Current rater idx: 0
Current rater: 1
Doc index for rater: 1 
Doc: Det her er jo håbløst #dkpol  https://t.co/e7hAg155Gw 
unique_ents_full: ['dkpol', '#dkpol'] 
unique_ents_ratio_full: [0.125, 0.125] 
frequent_ents: [] 
unique_ents_partial: ['dkpol', '#dkpol'] 
unique_ents_ratio_partial: [0.125, 0.125] 
infrequent_ents: [dkpol, #dkpol]
doc.ents BEFORE streamlining: ()
doc.ents AFTER stre

In [74]:
# Save all streamlined docs as jsonl (one {"text", "spans"} object per line)
for rater_idx in raters_idx:
    savepath = f"./data/DANSK-multi/streamlined/rater_{raters_lookup[rater_idx]}/data.jsonl"
    # Create the rater's output folder on first use instead of failing in open()
    os.makedirs(os.path.dirname(savepath), exist_ok=True)

    # Read text and entity offsets straight from the streamlined docs; packing
    # them into a DocBin and unpacking them again is not needed for this.
    examples = []
    for doc in streamlined_data[rater_idx]:
        spans = [{"start": ent.start_char, "end": ent.end_char, "label": ent.label_} for ent in doc.ents]
        examples.append({"text": doc.text, "spans": spans})

    with open(savepath, 'w', encoding="utf-8") as outfile:
        for entry in examples:
            json.dump(entry, outfile)
            outfile.write('\n')
            
# # Save all streamlined docs as DocBins
# for rater_idx in raters_idx:
#     db = DocBin()
#     savepath = f"./data/DANSK-multi/streamlined/rater_{raters_lookup[rater_idx]}/data.spacy"
#     for doc in streamlined_data[rater_idx]:
#         db.add(doc)
#     db.to_disk(savepath)