In [1]:
## Import library
from converter import convert2doclevel
import nltk
import pandas as pd
import json

In [2]:
## Download the NLTK 'averaged_perceptron_tagger' package (only reports status when it is already up to date)
## NOTE(review): presumably required by the converter module imported above -- confirm
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
## Read each annotator's span-level labels from the original dataset and convert them to document-level labels
data_1, data_2, data_3 = [
    convert2doclevel.convert2doclevel(annotation_file)
    for annotation_file in (
        r"masked_dataset/dataset_original/masked_env_id_ann1.jsonl",
        r"masked_dataset/dataset_original/masked_env_id_ann2.jsonl",
        r"masked_dataset/dataset_original/masked_env_id_ann3.jsonl",
    )
]

In [4]:
## Load the annotator-1 file from the original dataset to get the 'text', 'id_shortcode', and 'topic' columns
def read_jsonl(filename):
  """Read a JSON Lines file into a list of parsed records.

  Args:
    filename: Path to a UTF-8 encoded file holding one JSON value per line.

  Returns:
    A list with one parsed object per non-blank line, in file order.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  result = []
  with open(filename, "r", encoding="utf8") as f:
    # Iterate the handle directly so the file is streamed line by line
    # instead of being loaded into memory all at once by readlines().
    for line in f:
      # Blank or whitespace-only lines are not records; json.loads would
      # raise on them, so they are skipped.
      if not line.strip():
        continue
      result.append(json.loads(line))
  return result
## Parse the annotator-1 JSONL records and wrap them in a DataFrame
ann1 = pd.DataFrame(
    read_jsonl(r"masked_dataset/dataset_original/masked_env_id_ann1.jsonl")
)

In [5]:
## Place the shared columns from 'ann1' next to each annotator's document-level label (data_1, data_2, data_3)
shared_columns = [ann1["text"], ann1['id_shortcode'], ann1['topic']]
annotator_labels = [data_1["majority"], data_2["majority"], data_3["majority"]]
final_label = pd.concat(shared_columns + annotator_labels, axis=1)
final_label.columns = ['text', 'id_shortcode', 'topic', 'majority_1', 'majority_2', 'majority_3']

## Resolve the three annotators' labels on a concatenated row into a single label
def determine_majority(row):
    """Return the label that strictly wins the vote among the three annotators.

    Args:
        row: Object indexable by 'majority_1', 'majority_2' and 'majority_3'
            (for example a DataFrame row).

    Returns:
        The single most frequent label, or "DELETED" when two or more labels
        tie for the highest count.
    """
    votes = [row[column] for column in ('majority_1', 'majority_2', 'majority_3')]

    # Count how many annotators chose each distinct label.
    tally = {}
    for vote in votes:
        tally[vote] = tally.get(vote, 0) + 1

    top_count = max(tally.values())
    winners = [vote for vote, count in tally.items() if count == top_count]

    # More than one label at the top count means there is no strict winner.
    if len(winners) != 1:
        return "DELETED"
    return winners[0]
## Compute the per-row majority label and store it in the 'final_label' column
majority_votes = final_label.apply(determine_majority, axis=1)
final_label['final_label'] = majority_votes

## Write the aggregated table to an Excel file without the index column
aggregated_path = r"masked_dataset/dataset_aggregated/masked_dataset_aggregated.xlsx"
final_label.to_excel(aggregated_path, index=False)