# Load packages

In [1]:
import copy
import glob
import json
import os
import re
from collections import Counter
from itertools import combinations

import dacy
import pandas as pd
import spacy
from spacy.tokens import DocBin, Doc, Span
from spacy.training.corpus import Corpus

from utils import *


# Load data

In [2]:
# Change cwd so that the relative data paths below resolve against the project root.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
os.chdir("/Users/emiltrencknerjessen/Desktop/priv/DANSK-gold-NER")

# Load language object
nlp = spacy.blank("da")


def _rater_number(path):
    """Return the rater number encoded in the ``rater_<N>`` directory of ``path``.

    Raises:
        ValueError: if ``path`` contains no ``rater_<N>`` / ``rater<N>`` component.
    """
    match = re.search(r"rater_?(\d+)", path)
    if match is None:
        raise ValueError(f"No rater number found in path: {path}")
    return int(match.group(1))


# List relevant data and sort numerically by rater number, so that rater_10
# (and any higher-numbered rater) comes after rater_9 instead of after rater_1.
data_paths = sorted(
    glob.glob("./data/multi/unprocessed/rater*/data.spacy"),
    key=_rater_number,
)

# Load in data and get rater indices.
# data[i] holds the docs of the rater whose zero-based index is raters_idx[i].
data = []
raters_idx = []
for path in data_paths:
    # Get rater index (zero-based: rater N -> index N - 1)
    raters_idx.append(_rater_number(path) - 1)

    # Load data
    doc_bin = DocBin().from_disk(path)
    # NOTE(review): only the first 20 docs of each rater are kept — confirm this
    # cap is intended and not a leftover from debugging.
    docs = list(doc_bin.get_docs(nlp.vocab))[:20]
    data.append(docs)



# Exclude raters 2, 8 and 10


In [3]:
# Positions (in `data` / `raters_idx`) of the raters to exclude.
# With all ten raters loaded in order these are raters 2, 8 and 10.
indexes = [1, 7, 9]

# Fail loudly if the cell is re-run on already-pruned data, or if fewer raters
# were loaded than expected, instead of silently dropping the wrong raters.
if len(data) != len(raters_idx) or max(indexes) >= len(data):
    raise IndexError(
        "Cannot exclude raters: `data` does not contain all expected raters "
        "(has this cell already been run?)"
    )

# Positions of the raters that are kept, in their original order
kept_positions = [pos for pos in range(len(data)) if pos not in indexes]

# Lookup table for the new index vs. the original rater number.
# Keys = index into the pruned `data`
# Value = rater number (original zero-based index + 1)
raters_lookup = {
    new_idx: raters_idx[pos] + 1 for new_idx, pos in enumerate(kept_positions)
}

# Drop the excluded raters from the data
data = [data[pos] for pos in kept_positions]

# Update raters indexes so they index into the pruned `data`
raters_idx = list(range(len(data)))

# Retrieve unique documents and flat list with all docs from all raters

In [4]:
# Get a flat list with all docs from all raters (including duplicate docs)
all_docs = [doc for rater_docs in data for doc in rater_docs]

# Get a list of all unique docs (by text), keeping the first occurrence of each.
# A set of already-seen texts avoids re-scanning `unique_docs` for every doc.
unique_docs = []
seen_texts = set()
for doc in all_docs:
    if doc.text not in seen_texts:
        seen_texts.add(doc.text)
        # Deep copy so that clearing the entities below leaves the rater's doc untouched
        unique_docs.append(copy.deepcopy(doc))

# Ensure that unique_docs don't already have entities
for unique_doc in unique_docs:
    unique_doc.ents = ()

# Streamline all docs for all raters

In [5]:
# Thresholds passed to retrieve_freq_and_infreq_ents_from_doc
threshold_freq = .1
threshold_infreq = .075
n_raters = len(raters_idx)

# streamlined_data[i] collects the streamlined docs of the rater at index i
streamlined_data = []

for rater_idx in raters_idx:
    streamlined_rater_docs = []
    # Work on a copy of this rater's docs rather than the originals in `data`
    rater_docs = copy.deepcopy(data[rater_idx])
    for doc in unique_docs:
        # Look the doc up once; assumes get_same_doc_index is a pure lookup — TODO confirm
        doc_index = get_same_doc_index(doc, rater_docs)
        if doc_index is None:
            # Skip this doc entirely. Falling through would append the
            # streamlined doc left over from the previous iteration.
            print('Doc does not exist for rater')
            print('\n\n\n')
            continue

        (
            unique_ents_full_match,
            unique_ents_partial_match,
            freq_unique_ents_full_match,
            infreq_unique_ents_partial_match,
            unique_ents_full_match_ratio,
            unique_ents_partial_match_ratio,
        ) = retrieve_freq_and_infreq_ents_from_doc(doc, all_docs, threshold_freq, threshold_infreq)
        streamlined_doc = streamline_doc(doc, rater_docs, freq_unique_ents_full_match, infreq_unique_ents_partial_match)

        print(f'Current rater: {raters_lookup[rater_idx]}')
        print(f'Current rater_idx: {rater_idx}')

        print(f'Current doc index in rater: {doc_index}')
        print(f'Current doc: {doc}')

        print(f'Unique_ents_full: {unique_ents_full_match}')
        print(f'Unique_ents_full ratio: {unique_ents_full_match_ratio}')
        print(f'Freq ents (no duplicates): {freq_unique_ents_full_match}')

        print(f'Unique_ents_partial: {unique_ents_partial_match}')
        print(f'Unique_ents_partial ratio: {unique_ents_partial_match_ratio}')
        print(f'Infreq ents (no overlaps): {infreq_unique_ents_partial_match}')

        print(f'Current doc ents PRIOR to streamlining: {rater_docs[doc_index].ents}')
        print(f'Current doc ents AFTER streamlining: {streamlined_doc.ents}')
        print('\n\n\n')

        if streamlined_doc is not None:
            streamlined_rater_docs.append(streamlined_doc)
    streamlined_data.append(streamlined_rater_docs)


Current rater: 1
Current rater_idx: 0
Current doc index in rater: 0
Current doc: Hvordan kan statsministeren kalde børn,der er født på danske hospitaler for indvandrerdrenge!
Unique_ents_full: [danske, statsministeren, hospitaler, indvandrerdrenge]
Unique_ents_full ratio: [1.0, 0.2857142857142857, 0.2857142857142857, 0.2857142857142857]
Freq ents (no duplicates): [danske, statsministeren, hospitaler, indvandrerdrenge]
Unique_ents_partial: [danske, statsministeren, hospitaler, indvandrerdrenge]
Unique_ents_partial ratio: [1.0, 0.2857142857142857, 0.2857142857142857, 0.2857142857142857]
Infreq ents (no overlaps): []
Current doc ents PRIOR to streamlining: (danske,)
Current doc ents AFTER streamlining: (statsministeren, danske, hospitaler, indvandrerdrenge)




Current rater: 1
Current rater_idx: 0
Current doc index in rater: 1
Current doc: Det her er jo håbløst #dkpol  https://t.co/e7hAg155Gw
Unique_ents_full: [dkpol, #dkpol]
Unique_ents_full ratio: [0.14285714285714285, 0.14285714285714

# Save all streamlined docs as DocBins (TODO: decide whether to also export JSONL)

In [6]:
# Save each rater's streamlined docs as a spaCy DocBin (train.spacy).
# NOTE: a JSONL export (text plus character-offset spans) was sketched here
# earlier as an alternative; only the DocBin is written.
for rater_idx in raters_idx:
    savepath = f"./data/multi/streamlined/rater_{raters_lookup[rater_idx]}/train.spacy"

    # Make sure the rater's output directory exists before writing to it
    os.makedirs(os.path.dirname(savepath), exist_ok=True)

    db = DocBin()
    for doc in streamlined_data[rater_idx]:
        db.add(doc)
    db.to_disk(savepath)