### Check label counts per domain
Counts how often each entity label occurs in each domain and data split, as a basis for deciding on unique entities.

In [1]:
import json
from collections import Counter

In [2]:
# Count how often each entity label occurs in every (domain, split) file
# under `data_path`.
# Result shape: counts_per_domain[domain][split] -> Counter {label: occurrences}
types = ["train", "dev", "test"]
domains = ['ai', 'literature', 'music', 'news', 'politics', 'science']
counts_per_domain = {}
data_path = "../data/crossre_data/"
for d in domains:
    counts_per_domain[d] = {}
    for t in types:
        label_counts = counts_per_domain[d][t] = Counter()
        # Explicit encoding: without it the files are decoded with the
        # platform's locale default, which differs between machines.
        with open(f"{data_path}{d}-{t}.json", encoding="utf-8") as f:
            # Every line of the file is parsed as its own JSON document.
            for json_elem in f:
                document = json.loads(json_elem)
                for ner in document["ner"]:
                    # Index 2 of each "ner" entry is counted as the label
                    # (presumably entries are [start, end, label] -- confirm
                    # against the data files).
                    label_counts[ner[2]] += 1

print(json.dumps(counts_per_domain, indent=4))

{
    "ai": {
        "train": {
            "product": 57,
            "field": 39,
            "task": 59,
            "researcher": 54,
            "university": 27,
            "programlang": 25,
            "algorithm": 80,
            "misc": 45,
            "metrics": 27,
            "organisation": 50,
            "conference": 24,
            "country": 29,
            "location": 10,
            "person": 6
        },
        "dev": {
            "metrics": 146,
            "algorithm": 178,
            "misc": 186,
            "person": 41,
            "product": 174,
            "organisation": 95,
            "task": 146,
            "university": 59,
            "conference": 83,
            "researcher": 142,
            "field": 177,
            "programlang": 34,
            "country": 43,
            "location": 42,
            "academicjournal": 2
        },
        "test": {
            "algorithm": 179,
            "organisation": 148,
            "conference": 86,

In [3]:
# Pivot the per-domain counts so that the entity label becomes the outermost
# key: counts_per_label[label][domain][data_type] -> count.
# Only (domain, data_type) pairs in which the label actually occurs are stored.
counts_per_label = {}
for domain in domains:
    splits_for_domain = counts_per_domain[domain]
    for data_type in types:
        for label, count in splits_for_domain[data_type].items():
            # setdefault creates the missing label / domain level the first
            # time it is seen and returns the existing dict afterwards.
            per_domain = counts_per_label.setdefault(label, {})
            per_domain.setdefault(domain, {})[data_type] = count

print(json.dumps(counts_per_label, indent=4))

{
    "product": {
        "ai": {
            "train": 57,
            "dev": 174,
            "test": 198
        }
    },
    "field": {
        "ai": {
            "train": 39,
            "dev": 177,
            "test": 205
        }
    },
    "task": {
        "ai": {
            "train": 59,
            "dev": 146,
            "test": 219
        }
    },
    "researcher": {
        "ai": {
            "train": 54,
            "dev": 142,
            "test": 158
        }
    },
    "university": {
        "ai": {
            "train": 27,
            "dev": 59,
            "test": 28
        },
        "literature": {
            "test": 1
        },
        "politics": {
            "test": 51
        },
        "science": {
            "train": 31,
            "dev": 110,
            "test": 87
        }
    },
    "programlang": {
        "ai": {
            "train": 25,
            "dev": 34,
            "test": 60
        },
        "literature": {
            "dev": 4
   