# Resolve the data to entity-level
The data_cache now holds the information we need at the sentence level. We resolve the entities for each document and record the sentiment scores of the document and of the sentence(s) they occur in. We also resolve any sentiment directed towards them at the target level.

In [57]:
# venv transform
import json
import os
import time
from collections import Counter, defaultdict
import numpy as np
import random
import pandas as pd
import torch
from helpers import *
from transformers import  pipeline
from tqdm import tqdm
from itertools import product


# Output locations; the tabular folder is created if it does not exist yet.
save_root = "outputs/"
tabular_savefolder = os.path.join(save_root, "tabular")
os.makedirs(tabular_savefolder, exist_ok=True)

# Load the sentence-level cache (a list of sentence records).
data_cache = os.path.join(tabular_savefolder, "data_sentencewise.json")
with open(data_cache, encoding="utf-8") as rf:
    dataset = json.load(rf)

# Unique document ids in first-seen order. A plain set() would iterate in a
# hash-dependent order that changes between interpreter runs for strings,
# which made the row order of the exported table non-reproducible.
inspect_ids = tuple(dict.fromkeys(s["doc_id"] for s in dataset))

# Resolve entities to document-level

In [58]:
def resolve_sent_pols(polarities):
    """Resolve a list of polarity labels into a single label.

    Args:
        polarities: Sized collection of polarity labels, e.g. the polarities
            of all sentences (or targets) an entity appears in. The labels
            handled explicitly are "Positive", "Negative", "Neutral" and ""
            (no polarity); any other label only takes part in the tie rule.

    Returns:
        - "" when ``polarities`` is empty (nothing to resolve).
        - The shared label when every item is the same one of "Positive",
          "Negative", "Neutral" or "".
        - Otherwise "Positive" or "Negative", whichever occurs most often.
        - "Mixed" when positive and negative occur equally often.
    """
    total = len(polarities)
    if total == 0:
        # Without this guard the unanimity check below is trivially true
        # (0 == 0) for the first label, so an empty list became "Positive".
        return ""

    strengths = {
        label: sum(1 for p in polarities if p == label)
        for label in ("Positive", "Negative", "Neutral", "")
    }

    # Unanimous input keeps its label, including "Neutral" and "".
    for label, count in strengths.items():
        if count == total:
            return label

    if strengths["Positive"] > strengths["Negative"]:
        return "Positive"
    if strengths["Negative"] > strengths["Positive"]:
        return "Negative"
    return "Mixed"
    

In [73]:
entity_dfs = []  # One entity dataframe per document; concatenated after the loop
entity_dict = {}  # Resolved entity name -> entity record (dumped to JSON later)
# NOTE(review): entity_dict is keyed by the resolved name only, so an entity
# with the same name in two documents keeps only the record of the document
# processed last. Keying by "entity_id" would avoid this but changes the JSON
# layout, so it is left for a deliberate follow-up.

# Group the sentences per document in one pass instead of rescanning the
# whole dataset for every document id.
sents_by_doc = defaultdict(list)
for sent in dataset:
    sents_by_doc[sent["doc_id"]].append(sent)

for doc_id in inspect_ids:
    doc_sents = sents_by_doc[doc_id]

    # Step 1: cluster the entity mentions of this document by substring match.
    ne_groups = []  # Each group lists the surface forms of one entity (one item per mention)
    for sent in doc_sents:
        for named_e in sent["nes"]:
            ne = named_e["text"]
            # Forms with a trailing "s" / "'" (e.g. "Timbalands") are also
            # tried without that ending. An empty stripped form is ignored
            # because "" is a substring of every string.
            stripped = ne.rstrip("s").rstrip("'").rstrip()
            for ne_group in ne_groups:
                if (
                    any(ne in existing for existing in ne_group)
                    or any(existing in ne for existing in ne_group)
                    or (stripped and any(stripped in existing for existing in ne_group))
                ):
                    ne_group.append(ne)
                    break
            else:
                # No existing group matched: this mention starts a new entity.
                ne_groups.append([ne])

    # Step 2: name each group after its longest surface form.
    # Now, each document-level named entity is a list of appearances in the text.
    doc_entities = {}  # Resolved name as key, unique surface forms as value
    for s_forms in ne_groups:
        longest = max(s_forms, key=len)
        base_form = longest.rstrip("s").rstrip("'").rstrip()
        if base_form in s_forms:
            # Prefer the form without the trailing "s" / "'" when it was seen too.
            longest = base_form
        # Two groups can resolve to the same name; merge them instead of
        # letting the later group overwrite the earlier one. First-seen order
        # is kept so the output is reproducible.
        forms = doc_entities.setdefault(longest, [])
        for form in s_forms:
            if form not in forms:
                forms.append(form)

    # Step 3: reading ["John", "Wayne", "John Wayne"] yields two groups because
    # "Wayne" is not a substring of "John". Fold every name that is a substring
    # of another name into that other name.
    for one, two in product(list(doc_entities), repeat=2):
        if one == two or one not in two:
            continue
        if one not in doc_entities or two not in doc_entities:
            # Already merged away earlier in this loop. `one` can be a
            # substring of several names, and indexing a removed key used to
            # raise KeyError.
            continue
        target_forms = doc_entities[two]
        # Skip forms the target already has, so the uniqueness check below
        # does not fail on a form that ended up in both groups.
        target_forms.extend(f for f in doc_entities[one] if f not in target_forms)
        del doc_entities[one]

    # print(doc_id, doc_entities)

    # Double check: no surface form may belong to two different entities.
    all_surface_forms = [f for s_forms in doc_entities.values() for f in s_forms]
    assert len(all_surface_forms) == len(set(all_surface_forms))

    # Step 4: turn each entity into a record and attach identifiers.
    doc_entities = {
        longest: {
            "surface_forms": s_forms,
            "doc_id": doc_id,
            "entity_id": doc_id + "_" + "_".join(longest.split()),
        }
        for longest, s_forms in doc_entities.items()
    }

    # Step 5: collect the sentences and mentions of each entity and resolve
    # their polarities.
    for ent_data in doc_entities.values():
        forms = ent_data["surface_forms"]
        sents_having = [
            sent for sent in doc_sents
            if any(ne["text"] in forms for ne in sent["nes"])
        ]
        nes_belonging = [
            ne for sent in doc_sents for ne in sent["nes"] if ne["text"] in forms
        ]
        # Every surface form came from a mention in this document, so both
        # collections must be non-empty and cover all forms.
        assert len(sents_having) > 0
        assert len(nes_belonging) >= len(forms)

        # The entity category is the most frequent tag among its mentions.
        ent_data["ne_cat"] = Counter(ne["tag"] for ne in nes_belonging).most_common(1)[0][0]
        ent_data["sentences_pol"] = [s["sentence_pol"] for s in sents_having]
        ent_data["sent_pol_resolved"] = resolve_sent_pols(ent_data["sentences_pol"])
        ent_data["targets_pol"] = [t["tsa_pol"] for t in nes_belonging]
        ent_data["targ_pol_resolved"] = resolve_sent_pols(ent_data["targets_pol"])
        # The rating is read from the document's first sentence record.
        ent_data["doc_rating"] = doc_sents[0]["doc_rating"]

    # One row per entity; the resolved name moves from the index to a "name" column.
    entity_dfs.append(
        pd.DataFrame(doc_entities)
        .T
        .reset_index(level=0)
        .rename({"index": "name"}, axis=1)
    )
    entity_dict.update(doc_entities)


# Stack all documents and fix the column order handed to the annotator.
merged_df = pd.concat(entity_dfs).reset_index()
merged_df = merged_df[[
    "doc_id",
    "doc_rating",
    "entity_id",
    "name",
    "surface_forms",
    "ne_cat",
    "sentences_pol",
    "sent_pol_resolved",
    "targets_pol",
    "targ_pol_resolved",
]]
merged_df["manual_pol"] = ""  # Empty column to be filled in manually

In [76]:
# Display the merged entity table (one row per document-level entity).
merged_df

Unnamed: 0,doc_id,doc_rating,entity_id,name,surface_forms,ne_cat,sentences_pol,sent_pol_resolved,targets_pol,targ_pol_resolved,manual_pol
0,106679,4,106679_Nelly_Furtado,Nelly Furtado,"[Furtado, Nelly Furtado]",PER,"[Neutral, Mixed, Mixed]",Mixed,"[, Negative, Negative]",Negative,
1,106679,4,106679_Geffen,Geffen,[Geffen],PER,[Neutral],Neutral,[],,
2,106679,4,106679_Universal,Universal,[Universal],ORG,[Neutral],Neutral,[],,
3,106679,4,106679_Missy_Elliotts,Missy Elliotts,[Missy Elliotts],PER,[Positive],Positive,[],,
4,106679,4,106679_Timbaland,Timbaland,"[Timbaland, Timbalands]",PER,"[Positive, Mixed]",Positive,"[Positive, ]",Positive,
...,...,...,...,...,...,...,...,...,...,...,...
288,100120,2,100120_Bill_Guttentag,Bill Guttentag,[Bill Guttentag],PER,[Neutral],Neutral,[],,
289,100120,2,100120_Eva_Mendes,Eva Mendes,[Eva Mendes],PER,"[Neutral, Mixed]",Mixed,"[, ]",,
290,100120,2,100120_David_Krumholtz,David Krumholtz,[David Krumholtz],PER,[Neutral],Neutral,[],,
291,100120,2,100120_Rob_Brown,Rob Brown,[Rob Brown],PER,[Neutral],Neutral,[],,


In [70]:
# Persist the entity table as Excel and pickle, and the entity records as JSON.
# All three files share one base path and differ only in their extension.
basepath = os.path.join(tabular_savefolder, "elsa_entities.")
excel_path, pickle_path, json_path = (
    basepath + extension for extension in ("xlsx", "pk", "json")
)

merged_df.to_excel(excel_path)
merged_df.to_pickle(pickle_path)

# ensure_ascii=False keeps non-ASCII characters readable in the JSON file.
with open(json_path, "w", encoding="utf-8") as json_file:
    json.dump(entity_dict, json_file, ensure_ascii=False)


In [None]:
# Now we have extracted all the information we can, and can hand this table, together with the printouts, to the annotator