# Author: ddukic

In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

# spaCy pipeline used to tokenize sentences and align aspect-term offsets

import spacy

nlp = spacy.load("en_core_web_lg")


def prepare_absa(xml_file_path, alignment_mode="strict"):
    """Convert an ABSA XML file into per-sentence tokens and BIO polarity tags.

    Every ``<sentence>`` element is tokenized with the module-level spaCy
    pipeline ``nlp``. Each ``<aspectTerm>`` character span (its ``from`` and
    ``to`` attributes) is mapped onto tokens with ``Doc.char_span``: the first
    token of the span is tagged ``B-<polarity>`` and the following ones
    ``I-<polarity>``. Tokens not covered by any aspect term keep the tag ``"O"``.

    Aspect terms that cannot be mapped onto tokens are reported on stdout and
    left untagged.

    Args:
        xml_file_path: Path of the XML file to parse.
        alignment_mode: Forwarded to ``Doc.char_span``. The default
            ``"strict"`` only accepts offsets that coincide with token
            boundaries (the original behavior). Pass ``"expand"`` or
            ``"contract"`` to snap misaligned offsets to token boundaries
            instead of dropping the term.

    Returns:
        A ``(tokens_all, tags_all)`` tuple of parallel lists: one list of
        token strings and one list of BIO tags per sentence.
    """
    root = ET.parse(xml_file_path).getroot()

    tokens_all = []
    tags_all = []

    for sentence in root.findall(".//sentence"):
        sentence_id = sentence.get("id")
        text = sentence.find("text").text

        terms = [
            {
                "polarity": aspect_term.get("polarity"),
                "from": int(aspect_term.get("from")),
                "to": int(aspect_term.get("to")),
            }
            for aspect_term in sentence.findall(".//aspectTerm")
        ]

        doc = nlp(text)
        bio_tags = ["O"] * len(doc)

        for term in terms:
            span = doc.char_span(
                term["from"], term["to"], alignment_mode=alignment_mode
            )
            if span is None:
                # f-string keeps the message text unchanged but no longer
                # fails when the sentence has no "id" attribute.
                print(f"{sentence_id} Error with match of {term}\n")
                continue

            label = term["polarity"]
            # First token opens the aspect term, the rest continue it. The
            # range below is empty for single-token spans, so no special case
            # is needed.
            bio_tags[span.start] = "B-" + label
            for inside_idx in range(span.start + 1, span.end):
                bio_tags[inside_idx] = "I-" + label

        tokens_all.append([token.text for token in doc])
        tags_all.append(bio_tags)

    return tokens_all, tags_all

In [2]:
# Relative paths of the raw ABSA restaurant XML files (train and gold test).
xml_file_path_train = "../data/raw/absa/Restaurants_Train_v2.xml"
xml_file_path_test = "../data/raw/absa/Restaurants_Test_Gold.xml"

# Only the test file is converted here. To convert the training file instead,
# uncomment the next line and comment out the one after it.
# tokens, tags = prepare_absa(xml_file_path_train)
tokens, tags = prepare_absa(xml_file_path_test)

11610050#517598#0 Error with match of {'polarity': 'positive', 'from': 23, 'to': 27}

35391416#500259#5 Error with match of {'polarity': 'negative', 'from': 10, 'to': 16}



In [3]:
from spacy import displacy
from spacy.tokens import Doc


def viz_text(tokens, tags):
    """Render one tagged sentence with displaCy's entity visualizer.

    Args:
        tokens: Token strings of a single sentence.
        tags: Tags for those tokens, one per token; passed to ``Doc`` as
            its ``ents`` argument.

    Raises:
        AssertionError: If ``tokens`` and ``tags`` differ in length.
    """
    assert len(tokens) == len(tags)

    annotated_doc = Doc(nlp.vocab, words=tokens, ents=tags)

    # All four polarity labels are shown and share one highlight colour.
    polarity_labels = ["positive", "negative", "neutral", "conflict"]
    render_options = {
        "ents": polarity_labels,
        "colors": dict.fromkeys(polarity_labels, "#ff6961"),
    }

    displacy.render(annotated_doc, style="ent", options=render_options)

In [4]:
# Render the first 100 sentences. Slicing (rather than indexing up to 100)
# keeps this cell from raising IndexError when a split has fewer sentences.
for sentence_tokens, sentence_tags in zip(tokens[:100], tags[:100]):
    viz_text(sentence_tokens, sentence_tags)



In [2]:
import pandas as pd
from ast import literal_eval

# Load the processed train / validation / test splits.
train, valid, test = map(
    pd.read_csv,
    (
        "../data/processed/absa/train.csv",
        "../data/processed/absa/valid.csv",
        "../data/processed/absa/test.csv",
    ),
)

# The "tokens" and "tags" columns come back from CSV as strings; parse each
# cell back into a Python object with literal_eval.
for split_frame in (train, valid, test):
    for list_column in ("tokens", "tags"):
        split_frame[list_column] = split_frame[list_column].apply(literal_eval)

In [21]:
# Number of rows in each split, in (train, valid, test) order.
tuple(len(split_frame) for split_frame in (train, valid, test))

(2737, 304, 800)

In [13]:
# Flatten the per-sentence tag lists and count how often each tag occurs.
train["tags"].explode().value_counts()

tags
O             38013
B-positive     1949
I-positive      949
B-negative      726
B-neutral       567
I-neutral       218
I-negative      191
B-conflict       77
I-conflict       23
Name: count, dtype: int64