# Prepare data for ELSA evaluation
After the data and annotations have been downloaded, we load a NER tagger, select the sample documents, and join the information into a dataframe with what we need for further analysis.


In [44]:
# Load my Norec-fine version
import json
import os
import time
from collections import Counter, defaultdict
import numpy as np
import random
import pandas as pd
import torch
from helpers import *
from transformers import  pipeline
from tqdm import tqdm


In [45]:
# Locations of the input data, and the output folder structure (created on demand).
# Folder "data" should already exist

# Output side: one folder for tabular results, one for the per-document printouts.
save_root = "outputs/"
tabular_savefolder, printout_folder = (
    os.path.join(save_root, sub) for sub in ("tabular", "printouts")
)
for out_dir in (tabular_savefolder, printout_folder):
    os.makedirs(out_dir, exist_ok=True)

# Sentence-wise working file that the later cells read and extend.
data_cache = os.path.join(tabular_savefolder, "data_sentencewise.json")

# Input side: TSA annotations (CoNLL file), document metadata, sentence-level
# polarity labels, and the identifier of the NER model.
train_path = "norec_tsa/tsa_conll/train.conll"
meta_path = "norec/data/metadata.json"
sentence_pol_dataset_path = "norec_sentence/3class/train.json"
model_path = 'saattrupdan/nbailab-base-ner-scandi'

# Parse the training split into per-sentence records (helper from `helpers`).
nrf_sents = conll_to_sents(train_path)


In [46]:
# The 50 document ids below are frozen so that every run inspects the same documents.
# They were originally produced with:
#   train_ids = np.random.choice(df[df["folder"]=="train"]["file_id"].unique(),50, replace=False)
#   print(json.dumps(list(train_ids)))
inspect_ids = [
    "300040", "107011", "201849", "301323", "106679", "109778", "004340", "102785", "105949", "109227",
    "101882", "601171", "107972", "103164", "003939", "702913", "201734", "300178", "003717", "600774",
    "500437", "704907", "200937", "109021", "501037", "302181", "702152", "103447", "702956", "100866",
    "202792", "602054", "302663", "201470", "004230", "110613", "702761", "001061", "300746", "102095",
    "304135", "100120", "105165", "501319", "500921", "305169", "200607", "108264", "111035", "107563",
]

# Store the selection next to the tabular outputs.
ids_path = os.path.join(tabular_savefolder, "50docids.json")
with open(ids_path, "w") as wf:
    wf.write(json.dumps(inspect_ids))

In [47]:
def _collect_tsa_targets(token_details, tags):
    """Rebuild sentiment-target spans from per-token BIO tags.

    Parameters
    ----------
    token_details : iterable of dict
        One dict per token with the keys "start", "end" and "token"
        (as returned by `token_data`).
    tags : iterable of str
        One tag per token. A tag starting with "B" opens a target, a tag
        starting with "I" extends the open target, and "O" closes it. The
        polarity is taken from the part after the last "-" in the opening tag.

    Returns
    -------
    list of dict
        One dict per target with the keys "polarity", "start_idx", "end_idx"
        and "text" (the target's tokens joined by single spaces).

    Notes
    -----
    The two inputs are paired with `zip`, so surplus items in the longer one
    are ignored. An "I" tag without an open target is treated as the start of
    a new target instead of raising a KeyError.
    """
    targets = []
    current = None  # the target currently being built, if any
    for t_details, tag in zip(token_details, tags):
        starts_new = tag.startswith("B") or (tag.startswith("I") and current is None)
        if starts_new:
            if current is not None:
                targets.append(current)
            current = {
                "polarity": tag.split("-")[-1],
                "start_idx": t_details["start"],
                "end_idx": t_details["end"],
                "text": t_details["token"],
            }
        elif tag.startswith("I"):
            current["end_idx"] = t_details["end"]
            current["text"] += " " + t_details["token"]
        elif tag == "O":
            if current is not None:
                targets.append(current)
            current = None
    # A target that runs to the end of the sentence is still open here.
    if current is not None:
        targets.append(current)
    return targets


# Build the sentence-wise cache only when it does not exist yet.
# NOTE: an existing cache is reused as-is; delete the file to rebuild it
# (e.g. after changing `inspect_ids`).
if not os.path.exists(data_cache):
    for s in nrf_sents:
        # split header in doc_id and sent_id
        s["sent_id"] = s["header"].split("=")[-1]
        s["doc_id"] = s["sent_id"].split("-")[0]
        s["text"] = " ".join(s["tokens"])
        # Add more details about the tokens and the tsa targets
        s["tsa_details"] = _collect_tsa_targets(token_data(s["text"]), s["tags"])

    # Keep only the sentences that belong to the documents selected for inspection.
    wanted_doc_ids = set(inspect_ids)
    dataset = [s for s in nrf_sents if s["doc_id"] in wanted_doc_ids]
    with open(data_cache, "w", encoding="utf-8") as wf:
        json.dump(dataset, wf, ensure_ascii=False)


## Inspect documents
Now, we will pick some documents and check them:
- How well are PER and ORG sentiment targets identified with NER?
- How well does the sentiment expressed towards the NE as target correspond with the sentiment expressed towards the entity as a whole in the text?
- Would Coreference resolution improve the sentiment aggregation?

## Define the NER model
After experimenting with different aggregation strategies for how a named entity is labeled based on the subword classifications, we chose "first".
Since the NER tagging takes a while, we reuse the data if stored in the json file.

### Get raw NER taggings for each sentence


In [48]:
# The tagger is only instantiated when needed, to save run time.
ner_first = None

# Reload the cached sentences; NER output stored by an earlier run is reused.
with open(data_cache, encoding="utf-8") as rf:
    dataset = json.load(rf)

# At least one sentence without NER output means the model has to be loaded.
if any("ner_firsts" not in sent for sent in dataset):
    ner_first = pipeline(
        task='ner',
        model=model_path,
        aggregation_strategy='first',
    )

for sent in tqdm(dataset):
    if "ner_firsts" in sent:
        # NER data was registered for this sentence before
        continue
    sent["ner_firsts"] = pred_ranges(ner_first(sent["text"]))

# Write the (possibly extended) sentences back to the cache file.
with open(data_cache, "w", encoding="utf-8") as wf:
    json.dump(dataset, wf, ensure_ascii=False)

nrf_sents = None  # Reopen next time

100%|██████████| 1345/1345 [01:54<00:00, 11.79it/s]


## Add document rating and sentence sentiment polarity to each sentence

In [49]:
# Load the document metadata and the sentence-level polarity labels.
# Both files are read as UTF-8 explicitly, so the result does not depend on
# the platform's default encoding.
with open(meta_path, encoding="utf-8") as rf:
    meta = json.load(rf)
with open(sentence_pol_dataset_path, encoding="utf-8") as rf:
    sentence_pol_dataset = json.load(rf)

# Rating and category per inspected document, stored as strings.
doc_ratings = {doc_id: str(meta[doc_id]["rating"]) for doc_id in inspect_ids}
doc_categories = {doc_id: str(meta[doc_id]["category"]) for doc_id in inspect_ids}

# Polarity label per sentence id, restricted to the inspected documents.
inspect_id_set = set(inspect_ids)
sentence_polarities = {
    e['sent_id']: e['label']
    for e in sentence_pol_dataset
    if e["sent_id"].split("-")[0] in inspect_id_set
}

# Add doc and sent polarity to each sentence
for sent in dataset:
    sent["doc_rating"] = doc_ratings[sent["doc_id"]]
    sent["doc_category"] = doc_categories[sent["doc_id"]]
    # Mixed polarity sents are omitted in the dataset, so a missing id is labelled "Mixed"
    sent["sentence_pol"] = sentence_polarities.get(sent["sent_id"], "Mixed")

print("Sentence polarities:", Counter([s["sentence_pol"] for s in dataset]).most_common())
print("Sentence count per Document polarities:", Counter([s["doc_rating"] for s in dataset]).most_common())

# Add filtered named entities and record any TSA overlap
for sent in dataset:
    # Copy each entity dict: "tsa_pol" is written below, and the raw NER
    # output in "ner_firsts" should stay as the tagger produced it.
    sent_nes = [dict(ne) for ne in sent["ner_firsts"] if ne["tag"] in ("PER", "ORG")]

    for ne in sent_nes:
        ne["tsa_pol"] = ""
        for tsa in sent["tsa_details"]:
            # Two half-open character ranges overlap when the later start
            # lies before the earlier end.
            overlaps = max(ne["start"], tsa["start_idx"]) < min(ne["end"], tsa["end_idx"])
            if overlaps:
                # If several targets overlap the entity, the last one wins.
                ne["tsa_pol"] = tsa["polarity"]
    sent["nes"] = sent_nes


with open(data_cache, "w", encoding="utf-8") as wf:
    json.dump(dataset, wf, ensure_ascii=False)
    

Sentence polarities: [('Neutral', 654), ('Positive', 389), ('Negative', 213), ('Mixed', 89)]
Sentence count per Document polarities: [('5', 586), ('4', 399), ('2', 121), ('6', 109), ('3', 98), ('1', 32)]


### Print the documents that are used as a help for inspection and manual annotation of each volitional entity in the dataset.

In [50]:
# data_cache has target conll tags, and raw output for the NER tagging
# The NER tagging may have re-tokenized so that the token indices are not the same anymore
# Therefore we instead use the start and end character indices, as that is what the NER output has

with open(data_cache, encoding="utf-8") as rf:
    dataset = json.load(rf)

# Write one text file per inspected document: first the TSA targets, then the
# PER/ORG named entities, then the document text with one sentence per line.
for doc_id in tqdm(inspect_ids):
    doc_raw = [s for s in dataset if s["doc_id"] == doc_id]
    doc_text = "\n".join([s["text"] for s in doc_raw])
    tsa_spans = [tsa for sent in doc_raw for tsa in sent["tsa_details"]]
    tsa_strings = [str(tsa["polarity"]) + " " + tsa["text"] for tsa in tsa_spans]
    # Collect this document's filtered named entities here, so the cell does
    # not depend on a `nes` variable left over from an earlier kernel state.
    nes = [ne for sent in doc_raw for ne in sent["nes"]]

    print_text = "\n".join(tsa_strings) + "\n" + "\n".join([str(n) for n in nes]) + "\n" + doc_text
    with open(os.path.join(printout_folder, doc_id + "_print_demo.txt"), "w", encoding="utf-8") as wf:
        wf.write(print_text)

# Show the printout of the last document as a sample
print_text

100%|██████████| 50/50 [00:00<00:00, 2854.01it/s]


"Negative filmen\nPositive Samspilet\nPositive Robert De Niro\nPositive Dakota Fanning\nPositive De Niro\n{'start': 7, 'end': 18, 'tag': 'PER', 'text': 'John Polson', 'tsa_pol': ''}\n{'start': 4, 'end': 18, 'tag': 'PER', 'text': 'Robert De Niro', 'tsa_pol': ''}\n{'start': 21, 'end': 35, 'tag': 'PER', 'text': 'Dakota Fanning', 'tsa_pol': ''}\n{'start': 38, 'end': 51, 'tag': 'PER', 'text': 'Famke Janssen', 'tsa_pol': ''}\n{'start': 54, 'end': 68, 'tag': 'PER', 'text': 'Elisabeth Shue', 'tsa_pol': ''}\n{'start': 71, 'end': 81, 'tag': 'PER', 'text': 'Amy Irving', 'tsa_pol': ''}\n{'start': 56, 'end': 70, 'tag': 'PER', 'text': 'Robert De Niro', 'tsa_pol': 'Positive'}\n{'start': 116, 'end': 130, 'tag': 'PER', 'text': 'Dakota Fanning', 'tsa_pol': 'Positive'}\n{'start': 0, 'end': 7, 'tag': 'PER', 'text': 'De Niro', 'tsa_pol': 'Positive'}\nHorribel gjemsel\nRegi : John Polson .\nMed Robert De Niro , Dakota Fanning , Famke Janssen , Elisabeth Shue , Amy Irving .\nAmerikansk .\nThriller - 15 år , 