# Model ELSA with NER and TSA
- Create a dataset with only PER and ORG targets
- Use the NoReC-fine train and dev data that are not in the 50 docs
- Create a training set with only NER targets as targets
- Train TSA model with these data
- Do inference on the 50 docs
- Resolve entities as before
- Count ELSA entity-level performance


Obsidian: `exp_elsa-modelling-from-tsa.md`
Conda: `transform`

## Import the conll-formatted NoReC-fine TSA dataset
- Join train and dev
- split out the 50 documents  

We have annotated 50 documents for sentiment towards each volitional entity. Because this annotation effort was exploratory, we took these documents from the train split. That may not have been the best decision, but we mitigate it by using the rest of the train data together with the dev data for training. We do not touch the official test split for now, in case it later becomes important that those data have remained unseen.

In [1]:
import json
import os
import time
from collections import Counter, defaultdict
import numpy as np
import random
import pandas as pd
import torch
from helpers import *
from transformers import  pipeline
from tqdm import tqdm
from seqeval.metrics import classification_report
import logging
import sys
from dataclasses import dataclass, field
from typing import Optional
import datasets
import numpy as np
from datasets import ClassLabel, Sequence, Value, load_dataset, load_from_disk

In [9]:
# Folder the NoReC-fine TSA CoNLL files (train.conll / dev.conll) are read from.
conll_folder = "norec_tsa/tsa_conll"
# Folder the ELSA CoNLL files are written to.
elsa_target_folder = "data/elsa_conll"
cache_path = "data/elsa_cache.json"  # Delete this file to recreate the data
# Folder the Hugging Face DatasetDict is saved to.
elsa_ds_folder = "data/ds_elsa"
# elsapol_folder = "data/ds_elsapol"
for f in [conll_folder, elsa_target_folder, elsa_ds_folder]:
    if not os.path.exists(f):
        # makedirs (unlike mkdir) also creates missing parent folders such as
        # "data/" or "norec_tsa/", so the notebook runs from a fresh checkout.
        os.makedirs(f)

# Document ids of the 50 annotated documents; their sentences form the "test" split.
elsa_test = ["300040", "107011", "201849", "301323", "106679", "109778", "004340", "102785", "105949", "109227", "101882", "601171", "107972", "103164", "003939", "702913", "201734", "300178", "003717", "600774", "500437", "704907", "200937", "109021", "501037", "302181", "702152", "103447", "702956", "100866", "202792", "602054", "302663", "201470", "004230", "110613", "702761", "001061", "300746", "102095", "304135", "100120", "105165", "501319", "500921", "305169", "200607", "108264", "111035", "107563"]
separator = "\t"  # column separator inside the CoNLL files
ner_model = None  # only instantiated when the cache has to be (re)built
def instanciate_model():
    """Create and return the NER pipeline used to tag the sentences.

    The pipeline is built for the 'ner' task from the
    'saattrupdan/nbailab-base-ner-scandi' checkpoint with
    aggregation_strategy='first'.
    """
    pipeline_kwargs = {
        "task": 'ner',
        "model": 'saattrupdan/nbailab-base-ner-scandi',
        "aggregation_strategy": 'first',
    }
    return pipeline(**pipeline_kwargs)

In [10]:
# Build the sentence cache only when it does not exist yet; delete the file
# at cache_path to force a rebuild (this re-runs the NER model on every sentence).
if not os.path.exists(cache_path):
    ner_model = instanciate_model()

    # Read train and dev and pool them into one list of CoNLL sentence blocks.
    conll_splits = []
    for split in ("train", "dev"):
        split_path = os.path.join(conll_folder, split+".conll")
        with open(split_path, encoding="UTF-8") as rf:
            conll_splits.append(rf.read().strip())
    conll_sents = "\n\n".join(conll_splits).split("\n\n")

    sents = []  # one dict per sentence
    for sent in conll_sents:
        # First line is the id header, e.g. "#sent_id=201911-02-01";
        # every following line is "token<separator>tag".
        header, *token_lines = sent.split("\n")
        assert header.startswith("#sent_id") and separator not in header
        assert all(separator in line for line in token_lines)
        sent_id = header.split("=")[1]
        conll_text = "\n".join(token_lines)
        # conn_tolist and pred_ranges come from `helpers` (not visible here).
        tokens, tsa_tags = conn_tolist(conll_text, sep=separator)
        text = " ".join(tokens)
        sent_data = {
            "sent_id": sent_id,
            "doc_id": sent_id.split("-")[0],
            "conll_text": conll_text,
            "tokens": tokens,
            "tsa_tags": tsa_tags,
            "text": text,
            "ners": pred_ranges(ner_model(text)),
        }
        # Sentences from the 50 annotated documents become the test split.
        sent_data["split"] = "test" if sent_data["doc_id"] in elsa_test else "train"
        sents.append(sent_data)

    with open(cache_path, "w", encoding="utf-8") as wf:
        json.dump(sents, wf, ensure_ascii=False)


## Add the NE predictions to conll files
- Convert the character-based boundaries from the NE pipeline to one tag per token
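- Join the TSA and NER tags: if a PER/ORG entity overlaps a TSA target, the target's polarity is added to the NER tag; entities without an overlapping target are tagged `Neutral`, and TSA targets that overlap no entity are discarded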

In [11]:
def ner_tag(sent):
    """Return one BIO named-entity tag per token of a sentence.

    Receives one sent dict and converts the character-based entity spans in
    sent["ners"] to token-level tags. Filtering for PER and ORG happens here;
    entities with any other tag are ignored.

    Token character offsets are computed on the assumption that the entity
    offsets refer to the tokens joined by single spaces. Entity spans are
    treated as half-open [start, end) character ranges.

    Args:
        sent: dict with "tokens" (list of str) and "ners" (list of dicts
            with "start", "end" and "tag").

    Returns:
        A list with one tag per token: "O", "B-<tag>" or "I-<tag>".
    """
    ne_tags = ["O"] * len(sent["tokens"])

    # Character span [start, end) of every token in the space-joined text.
    token_spans = []
    offset = 0
    for token in sent["tokens"]:
        token_spans.append((offset, offset + len(token)))
        offset += len(token) + 1  # +1 for the joining space

    for ne in sent["ners"]:
        if ne["tag"] not in ("PER", "ORG"):
            continue
        ongoing = False
        for idx, (token_start, token_end) in enumerate(token_spans):
            # Any overlap counts: the NER pipeline can re-tokenize words like
            # Borten-regjeringen, so an entity may cover only part of a token
            # (its start, its middle or its end).
            if token_start < ne["end"] and ne["start"] < token_end:
                ne_tags[idx] = ("I" if ongoing else "B") + "-" + ne["tag"]
                ongoing = True
            else:
                ongoing = False
    return ne_tags



In [12]:

# Reload the cached sentences and report total / test / train sentence counts.
with open(cache_path, encoding = "utf-8") as rf:
    sents = json.load(rf)
n_test = sum(1 for s in sents if s["split"] == "test")
n_train = sum(1 for s in sents if s["split"] == "train")
print(len(sents), n_test, n_train)

# Add the tag layers to every sentence dict in place.
# tag_span, merge_tags, spans_to_list and compresstags come from `helpers` (not visible here).
for sent in sents:
    sent["ne_tags"] = ner_tag(sent)
    ne_tagspans = tag_span(sent["ne_tags"])
    tsa_tagspans = tag_span(sent["tsa_tags"])
    blank_tags = ["O"]*len(sent["ne_tags"])
    merged_spans = merge_tags(ne_tagspans, tsa_tagspans)
    sent["elsa_tags"] = spans_to_list(blank_tags, merged_spans)
    sent["elsapol_tags"] = compresstags(sent["elsa_tags"])

# Persist the enriched sentences back into the cache file.
with open(cache_path, "w", encoding = "utf-8") as wf:
    json.dump(sents, wf, ensure_ascii=False)

# Write elsa conll
conll_sents = {"train": [], "test": []}  # per split: list of sentence conll texts
separator = "\t"
for sent in sents:
    token_lines = [token+separator+tag for token, tag in zip(sent["tokens"], sent["elsa_tags"])]
    sent_lines = ["#sent_id="+sent["sent_id"], *token_lines]
    conll_sents[sent["split"]].append("\n".join(sent_lines))

if not os.path.exists(elsa_target_folder):
    os.mkdir(elsa_target_folder)
for split, c_sents in conll_sents.items():
    path = os.path.join(elsa_target_folder, split+".conll")
    # Sentences are separated by one blank line.
    with open(path, "w", encoding = "utf-8") as wf:
        wf.write("\n\n".join(c_sents))


10165 1345 8820


## Convert ELSA data to hf datasets
- Create conversion table between tags and integers
- Create pd.DataFrame with id, tokens and elsa_tags for each split
- Create dataset, update info / features.
- Create and save datasetdict.

In [13]:
# Spot check of one sentence (not displayed: it is not the cell's last expression).
sents[900]["sent_id"], sents[900]["tokens"], sents[900]["elsa_tags"]

# Collect, per tag layer, the set of labels that occur in the data.
keysets = {key: set() for key in ["tsa_tags", "ne_tags", "elsa_tags", "elsapol_tags"]}
keylists = {key: [] for key in keysets}

for key, tagset in keysets.items():
    for sent in sents:
        tagset.update(sent[key])
    # "O" always comes first; the other labels are ordered by their reversed
    # string, which groups them by suffix.
    keylist = sorted(tagset, key=lambda x: x[::-1])
    keylist.remove("O")
    keylists[key] = ["O", *keylist]

# Each text label gets its index position in the label list of its layer.
label_to_ids = {
    key: {label: position for position, label in enumerate(labels)}
    for key, labels in keylists.items()
}

# Group the sentences by split and build one DataFrame per split.
sents_split = {"train": [], "test": []}
df_splits = {split: pd.DataFrame() for split in sents_split}
for sent in sents:
    sents_split[sent["split"]].append(sent)
for split_sents in sents_split.values():
    print(len(split_sents))

for split, s_sents in sents_split.items():
    frame = df_splits[split]
    frame["id"] = [el["sent_id"] for el in s_sents]
    frame["tokens"] = [el["tokens"] for el in s_sents]
    for col in keysets:
        ids_of = label_to_ids[col]
        # Replace every text label by its integer id.
        new_col = [[ids_of[l] for l in s_sent[col]] for s_sent in s_sents]
        print(split, col, len(new_col))
        frame[col] = new_col

df_splits["train"]



8820
1345
train tsa_tags 8820
train ne_tags 8820
train elsa_tags 8820
train elsapol_tags 8820
test tsa_tags 1345
test ne_tags 1345
test elsa_tags 1345
test elsapol_tags 1345


Unnamed: 0,id,tokens,tsa_tags,ne_tags,elsa_tags,elsapol_tags
0,201911-01-01,"[Philips, 190G6]","[0, 0]","[0, 0]","[0, 0]","[0, 0]"
1,201911-02-01,"[Med, integrerte, høyttalere, som, på, ingen, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,201911-02-02,"[Eller, bedrar, skinnet, ?]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]"
3,201911-03-01,"[De, fleste, skjermer, har, et, diskret, desig...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,201911-03-02,"[Men, 190G6, fra, Philips, er, en, helt, annen...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 9, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 5, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...,...
8815,500718-10-03,"[Dette, igjen, stiller, store, krav, til, diks...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8816,500718-10-04,"[Især, Thomas, Bipin, Olsen, sliter, til, tide...","[0, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, ...","[0, 7, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11,...","[0, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, ..."
8817,500718-10-05,"[Også, Iselin, Shumba, ,, med, sin, sceniske, ...","[0, 3, 4, 0, 0, 0, 0, 0, 0, 3, 4, 4, 0, 3, 4, ...","[0, 3, 4, 0, 0, 0, 0, 0, 0, 3, 4, 4, 0, 3, 4, ...","[0, 7, 8, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 8, ...","[0, 3, 4, 0, 0, 0, 0, 0, 0, 3, 4, 4, 0, 3, 4, ..."
8818,500718-10-06,"[Sistnevnte, viser, seg, også, som, en, fin, k...","[3, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [14]:
# Count the various tags: one frequency list per split and tag layer
for split in ("train", "test"):
    split_sents = [sent for sent in sents if sent["split"] == split]
    for key in ("tsa_tags", "ne_tags", "elsa_tags", "elsapol_tags"):
        tag_counts = Counter(t for sent in split_sents for t in sent[key])
        print(split, key, tag_counts.most_common())

train tsa_tags [('O', 138113), ('B-targ-Positive', 3603), ('I-targ-Positive', 3541), ('B-targ-Negative', 1608), ('I-targ-Negative', 1535)]
train ne_tags [('O', 142116), ('B-PER', 3109), ('I-PER', 1851), ('B-ORG', 882), ('I-ORG', 442)]
train elsa_tags [('O', 142116), ('B-PER-Neutral', 2380), ('I-PER-Neutral', 1400), ('B-ORG-Neutral', 669), ('B-PER-Positive', 565), ('I-PER-Positive', 361), ('I-ORG-Neutral', 321), ('B-PER-Negative', 164), ('B-ORG-Positive', 152), ('I-PER-Negative', 90), ('I-ORG-Positive', 89), ('B-ORG-Negative', 61), ('I-ORG-Negative', 32)]
train elsapol_tags [('O', 142116), ('B-Neutral', 3049), ('I-Neutral', 1721), ('B-Positive', 717), ('I-Positive', 450), ('B-Negative', 225), ('I-Negative', 122)]
test tsa_tags [('O', 20418), ('B-targ-Positive', 504), ('I-targ-Positive', 442), ('B-targ-Negative', 206), ('I-targ-Negative', 181)]
test ne_tags [('O', 20862), ('B-PER', 494), ('I-PER', 296), ('B-ORG', 75), ('I-ORG', 24)]
test elsa_tags [('O', 20862), ('B-PER-Neutral', 381), (

In [15]:
# Build new features dict: string id, string tokens, and one ClassLabel
# sequence per tag layer, named after the label lists in keylists.
new_features = {
    'id': Value(dtype='string', id=None),
    'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
}
new_features.update({
    feat: Sequence(
        feature=ClassLabel(num_classes=len(f_names), names=f_names, id=None),
        length=-1,
        id=None,
    )
    for feat, f_names in keylists.items()
})

# , 'pos_tags': Sequence(feature=ClassLabel(num_classes=47, names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None), 'chunk_tags': Sequence(feature=ClassLabel(num_classes=23, names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(num_classes=9, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}


In [16]:
# Create Dataset
# The features are handed to from_pandas so that the tag columns are really
# stored as ClassLabel sequences. Updating `Dataset.features` in place after
# creation only touches a metadata dict and does not cast the table.
elsa_features = datasets.Features(new_features)
elsa_dses = {"train":None, "test":None}
for split in elsa_dses:
    elsa_dses[split] = datasets.Dataset.from_pandas(df_splits[split], features=elsa_features)
    # print(elsa_dses[split].features)
# The validation split reuses the test sentences; it gets the same typed
# features so that all three splits agree on their schema.
elsa_dses["validation"] = datasets.Dataset.from_pandas(df_splits["test"], features=elsa_features)
dsd = datasets.DatasetDict(elsa_dses)
dsd.save_to_disk(elsa_ds_folder)
dsd

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'tsa_tags', 'ne_tags', 'elsa_tags', 'elsapol_tags'],
        num_rows: 8820
    })
    test: Dataset({
        features: ['id', 'tokens', 'tsa_tags', 'ne_tags', 'elsa_tags', 'elsapol_tags'],
        num_rows: 1345
    })
    validation: Dataset({
        features: ['id', 'tokens', 'tsa_tags', 'ne_tags', 'elsa_tags', 'elsapol_tags'],
        num_rows: 1345
    })
})

In [17]:
# Hand-picked example sentence in the cache format, used to eyeball ner_tag:
# it holds two MISC entities (ignored by the PER/ORG filter in ner_tag) and
# one PER entity, "Alexander Payne", at characters 67-82 of "text".
demo_ner = {'sent_id': '102727-04-02', 'doc_id': '102727', 'conll_text': 'Etter\tO\nfilmer\tO\nsom\tO\n«\tO\nSideways\tO\n»\tO\nog\tO\n«\tO\nAbout\tO\nSchmidt\tO\n»\tO\nmå\tO\nvel\tO\nregissør\tB-targ-Positive\nAlexander\tI-targ-Positive\nPayne\tI-targ-Positive\nsies\tO\nå\tO\nvære\tO\nen\tO\nslags\tO\nekspert\tO\npå\tO\ndette\tO\n.\tO', 'tokens': ['Etter', 'filmer', 'som', '«', 'Sideways', '»', 'og', '«', 'About', 'Schmidt', '»', 'må', 'vel', 'regissør', 'Alexander', 'Payne', 'sies', 'å', 'være', 'en', 'slags', 'ekspert', 'på', 'dette', '.'], 'tsa_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-targ-Positive', 'I-targ-Positive', 'I-targ-Positive', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'text': 'Etter filmer som « Sideways » og « About Schmidt » må vel regissør Alexander Payne sies å være en slags ekspert på dette .', 'ners': [{'start': 19, 'end': 27, 'tag': 'MISC', 'text': 'Sideways'}, {'start': 35, 'end': 48, 'tag': 'MISC', 'text': 'About Schmidt'}, {'start': 67, 'end': 82, 'tag': 'PER', 'text': 'Alexander Payne'}], 'split': 'train'}
# print(demo_ner["text"][67:82])
ne_tags = ner_tag(demo_ner)
# Print token, TSA tag and NE tag side by side, one token per line.
for token, tsa, ner in zip(demo_ner["tokens"], demo_ner["tsa_tags"], ne_tags):
    print(token, tsa, ner)

# Print five randomly drawn sentences that carry at least one ELSA tag,
# showing the text and the three tag layers for each of them.
n = 0
while n < 5:
    sent = random.choice(sents)
    if all(t == "O" for t in sent["elsa_tags"]):
        continue  # nothing tagged in this sentence, draw again
    n += 1
    print()
    for key in ("text", "tsa_tags", "ne_tags", "elsa_tags"):
        print(sent[key])


Etter O O
filmer O O
som O O
« O O
Sideways O O
» O O
og O O
« O O
About O O
Schmidt O O
» O O
må O O
vel O O
regissør B-targ-Positive O
Alexander I-targ-Positive B-PER
Payne I-targ-Positive I-PER
sies O O
å O O
være O O
en O O
slags O O
ekspert O O
på O O
dette O O
. O O

De kan sine klassikere på Brasserie Blanche .
['O', 'O', 'O', 'O', 'O', 'B-targ-Positive', 'I-targ-Positive', 'O']
['O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O']
['O', 'O', 'O', 'O', 'O', 'B-ORG-Positive', 'I-ORG-Positive', 'O']

Og er med på å gjøre Steinar til en fengende og særdeles innholdsrik Beck-episode .
['O', 'O', 'O', 'O', 'O', 'O', 'B-targ-Positive', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'B-PER-Positive', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Tracy er nemlig en konkurrent til en annen danser , Velmas egen datter .
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['B-PE