## Creating final annotated corpus

### Imports

In [375]:
import pandas as pd
import numpy as np
import json
from collections import Counter

### Load the final AMT file

In [376]:
# Relative path to the final AMT batch results.
amt_results_path = '../annotation/batch_result_final.csv'
df = pd.read_csv(amt_results_path)

In [377]:
# Sanity check: (number of rows, number of columns) of the loaded table.
df.shape

(2370, 38)

### Pick the majority among the three annotations
- If there is no majority (i.e. all three labels are different), we fall back to the system label from spaCy.

In [378]:
# Keep only the identifying columns plus the final label of each annotation row.
df_4 = df.loc[:, ["Input.talk_id", "Input.para_id", "Input.entity_id", "Final_label"]]

In [379]:
# Collect the final labels in row order, then cut them into consecutive runs
# of three, one run per annotated entity.
# NOTE(review): this assumes the three annotations of an entity sit on
# adjacent rows of the CSV -- confirm against the batch file.
annotator_labels = [df["Final_label"][row] for row in range(len(df_4))]
labels = [
    annotator_labels[start:start + 3]
    for start in range(0, len(annotator_labels), 3)
]

In [380]:
# Every row must belong to exactly one group of three annotations.
assert df.shape[0] == 3 * len(labels)

In [381]:
# Peek at the first ten groups of annotator labels.
labels[:10]

[[nan, 'PERSON', nan],
 ['ORG', 'ORG', 'ORG'],
 ['LOC', 'LOC', 'LOC'],
 ['LOC', 'LOC', 'LOC'],
 ['ORG', 'ORG', nan],
 ['LOC', 'LOC', 'LOC'],
 [nan, nan, 'ORG'],
 ['PERSON', 'ORG', 'ORG'],
 ['PERSON', 'PERSON', 'PERSON'],
 ['PERSON', 'PERSON', 'PERSON']]

In [382]:
# Majority vote per entity: keep the label chosen by at least two annotators.
# When no label reaches two votes, the raw list of votes is stored instead so
# that the entry can be recognised and resolved later.
# NOTE(review): missing annotations (NaN) are counted like any other value, so
# NaN may end up as the "majority" label -- confirm this is intended.
best_labels = []
for annotator_votes in labels:
    ranked_votes = Counter(annotator_votes).most_common()
    top_label, top_count = ranked_votes[0]
    if top_count > 1:
        best_labels.append(top_label)
    else:
        best_labels.append(annotator_votes)

In [383]:
# One row per annotated entity: select the identifying columns together with
# the entity text and the system label, then drop the repeated annotation rows
# (the first row of each talk/paragraph/entity key is kept).
entity_key = ["Input.talk_id", "Input.para_id", "Input.entity_id"]
best_annotations = (
    df[entity_key + ["Input.entities", "Input.label"]]
    .drop_duplicates(entity_key)
)

In [384]:
# Attach the chosen label to each unique entity row. The array carries no
# index, so the assignment is positional: it relies on `best_labels` being in
# the same order as the de-duplicated rows. dtype="object" is passed because
# entries without a majority are still lists of votes at this point.
best_annotations["best_label"] = np.asarray(best_labels, dtype="object")

For entities without a majority label (all three annotators disagreed), we use the system label (`Input.label`) instead.

In [419]:
# Entities without a majority still hold the raw list of annotator votes in
# `best_label`; replace those with the system label from `Input.label`.
#
# The rows are selected with a boolean mask rather than a hard-coded index
# range, so this works for any number of rows and any index. The write goes
# through a single `.loc` call because chained assignment
# (`frame[col][i] = value`) may modify a temporary copy and is silently
# ignored when pandas Copy-on-Write is enabled.
no_majority = (
    best_annotations["best_label"]
    .map(lambda value: isinstance(value, list))
    .astype(bool)
)
best_annotations.loc[no_majority, "best_label"] = best_annotations.loc[no_majority, "Input.label"]

In [421]:
# Spot-check two rows (index labels 63 and 2223) that are expected to end up as LOC.
for row_label in (63, 2223):
    assert best_annotations.loc[row_label, "best_label"] == "LOC"

### Correct the entities in the English corpus with the best labels chosen

In [422]:
# Display the chosen labels ordered by talk id. The sorted frame is not
# assigned to anything, so this cell only displays it.
best_annotations.sort_values("Input.talk_id")

Unnamed: 0,Input.talk_id,Input.para_id,Input.entity_id,Input.entities,Input.label,best_label
24,0,0,0,Beethoven,PERSON,PERSON
36,0,0,1,Justin Bieber,PERSON,PERSON
72,1,0,13,Chaconne,PERSON,WORK_OF_ART
114,1,0,25,Middle Eastern,LOC,LOC
102,1,0,23,Northern Romanian,LOC,LOC
...,...,...,...,...,...,...
2352,176,6,8,Svalbard,ORG,LOC
2358,176,8,1,USA,LOC,LOC
2364,178,18,6,the Galapagos Islands,LOC,LOC
2361,178,17,17,Ed Wilson's,PERSON,PERSON


Read the English corpus as `en_df`

In [467]:
# Load the filtered English corpus from JSON and also expose it as a DataFrame.
en_file_path = "../transcripts/en/filtered/filtered_annotated_ted_talks_en.json"
with open(en_file_path, encoding="utf-8") as corpus_file:
    en_corpus = json.load(corpus_file)
en_df = pd.DataFrame(en_corpus)

First, let's collect all the entity-label combinations in `best_annotations`.

In [489]:
from collections import defaultdict

# Map each entity text to the list of best labels it received, one list entry
# per row of `best_annotations` (duplicates are kept).
ent_labels_dict = defaultdict(list)
entity_label_pairs = zip(best_annotations["Input.entities"], best_annotations["best_label"])
for entity_text, chosen_label in entity_label_pairs:
    ent_labels_dict[entity_text].append(chosen_label)

Second, modify labels using talk_id, para_id, and entity_id

In [500]:
# Write each chosen label back onto the exact entity it was annotated for.
# NOTE(review): the ids are used directly as list positions in `en_corpus`,
# its paragraphs and its entity lists -- confirm they are zero-based positions.
location_and_label = best_annotations[
    ["Input.talk_id", "Input.para_id", "Input.entity_id", "best_label"]
]
for talk_id, para_id, ent_id, label in location_and_label.itertuples(index=False, name=None):
    en_corpus[talk_id]["text"][para_id]["ents"][ent_id]["label"] = label

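Third, change every `FAC` label to `ORG` and every `GPE` label to `LOC`, then propagate the labels in `ent_labels_dict` to every occurrence of those entities in the corpus.
- We skip entities that have more than one entry in `ent_labels_dict` (i.e. entities that were annotated more than once).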

In [501]:
# Walk every entity in the corpus and update its label in place:
#   1. collapse GPE into LOC and FAC into ORG;
#   2. if the entity text has exactly one entry in `ent_labels_dict`,
#      overwrite the label with that entry.
for talk in en_corpus:
    for para in talk["text"]:
        for ent in para["ents"]:
            if ent["label"] == "GPE":
                ent["label"] = "LOC"
            elif ent["label"] == "FAC":
                ent["label"] = "ORG"

            # Membership is tested first so that the defaultdict does not
            # grow a new empty entry for entities that were never annotated.
            if ent["text"] in ent_labels_dict:
                annotated_labels = ent_labels_dict[ent["text"]]
                if len(annotated_labels) == 1:
                    ent["label"] = annotated_labels[0]

Make sure all labels are corrected.

In [502]:
# Spot checks: (talk position, paragraph position, entity position) -> expected label.
expected_labels = [
    ((5, 3, -1), "LOC"),
    ((1, 0, 13), "WORK_OF_ART"),
    ((37, 26, 1), "ORG"),
    ((318, 29, 2), "EVENT"),
]
for (talk_pos, para_pos, ent_pos), expected in expected_labels:
    assert en_corpus[talk_pos]["text"][para_pos]["ents"][ent_pos]["label"] == expected

Lastly, write the corrected corpus to a JSON file.

In [520]:
def write_json(corpus, file_path):
    """Write `corpus` to `file_path` as UTF-8 encoded JSON.

    The output uses a 4-space indent and compact separators (no space after
    ``,`` or ``:``). Non-ASCII characters are written as ``\\uXXXX`` escapes,
    which is the ``json`` module's default.

    The corpus is serialised in memory before the destination is opened, so a
    serialisation error leaves an existing file at `file_path` untouched
    instead of truncating it.

    Args:
        corpus: Any JSON-serialisable object.
        file_path: Path of the file to create or overwrite.

    Raises:
        TypeError: If `corpus` contains a value that ``json`` cannot serialise.
        OSError: If `file_path` cannot be opened for writing.
    """
    # Serialise first: opening the file in 'w' mode truncates it immediately,
    # so a failure during encoding must happen before the file is touched.
    serialised = json.dumps(corpus, indent=4, separators=(',', ':'))
    with open(file_path, 'w', encoding='utf-8') as json_file:
        json_file.write(serialised)

In [521]:
# Destination of the final, corrected English corpus.
file_path = "../transcripts/en/final/final_annotations.json"

In [522]:
# Persist the corrected corpus to disk.
write_json(en_corpus, file_path)