# Author: ddukic

In [1]:
import xml.etree.ElementTree as ET
import spacy
from spacy import displacy
from spacy.tokens import Doc
import os
import pandas as pd
from tqdm import tqdm
import re
import string

# Load the spaCy pipeline "en_core_web_lg". In the cells shown here only its
# vocab is used (by viz_text, to build Doc objects for rendering).
nlp = spacy.load("en_core_web_lg")


def viz_text(tokens, tags):
    """Render one tokenized sentence with its trigger annotations using displaCy.

    Args:
        tokens: Token strings of a single sentence.
        tags: One tag per token, handed to ``Doc`` as ``ents`` (this notebook
            produces ``"B-Trigger"`` and ``"O"``).

    Raises:
        AssertionError: If ``tokens`` and ``tags`` differ in length.
    """
    assert len(tokens) == len(tags), "tokens and tags must have the same length"

    doc = Doc(nlp.vocab, words=tokens, ents=tags)

    displacy.render(
        doc,
        style="ent",
        options={
            # The entity renderer understands "ents" (label filter) and "colors";
            # the previous "tag" key is not a displaCy option and was ignored.
            # Labels are compared in upper case, hence the upper-case keys.
            "ents": ["TRIGGER"],
            "colors": {"TRIGGER": "#ff6961"},
        },
    )

In [2]:
def check_length_of_triggers(file):
    """Print every multi-word trigger of an XML document and flag unreadable files.

    The trigger text of an event is read from the first child of each element
    found under the root's ``Events`` children. Triggers that contain a space
    are printed.

    Args:
        file: Path or open file object accepted by ``ET.parse``.

    Returns:
        ``False`` when the document was scanned completely, ``True`` when
        parsing or scanning raised an error (for example malformed XML, or an
        event without children or without text).
    """
    try:
        root = ET.parse(file).getroot()
        for x in root:
            if x.tag == "Events":
                for event in x:
                    if len(event[0].text.split(" ")) > 1:
                        print(event[0].text)
        return False
    except Exception:
        # Deliberately best-effort, but not a bare ``except``: KeyboardInterrupt
        # and SystemExit must still be able to stop the notebook.
        return True

In [3]:
def check_set(dataset="train"):
    """Run ``check_length_of_triggers`` over every file of one dataset split.

    Args:
        dataset: Name of the sub-directory of ``../data/raw/evextra`` to scan.

    Prints the number of files for which the check returned ``True`` next to
    the total number of files in the split.
    """
    file_names = os.listdir(os.path.join("../data/raw/evextra/", dataset))

    flagged = 0
    for file_name in file_names:
        full_path = os.path.join("../data/raw/evextra", dataset, file_name)
        with open(full_path, "r") as handle:
            unreadable = check_length_of_triggers(handle)
        if unreadable:
            flagged += 1

    print("Skip ratio: ", flagged, "/", len(file_names))

# Scan the three dataset splits in turn: train, then valid, then test.
for split_name in ("train", "valid", "test"):
    check_set(split_name)

Skip ratio:  0 / 531
Skip ratio:  0 / 76
Skip ratio:  0 / 152


In [4]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list.

    Only one level of nesting is removed.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [5]:
def parse_xml(file):
    """Parse one XML document into tokenized sentences and BIO trigger tags.

    Layout read from the document:
      * every root child tagged ``Events`` holds event elements whose first
        child, when tagged ``Event``, carries the trigger text;
      * every root child tagged ``Sentences`` holds sentence elements whose
        first child is tagged ``Text`` and whose third child holds the tokens;
        a token's text is its first child when that child is tagged ``Value``.

    Triggers are aligned to tokens greedily and in document order: the next
    pending trigger is consumed by the first token equal to it, or by a token
    that merely contains it unless an exact match sits in the nearby window.

    Args:
        file: Open file object (or path) accepted by ``ET.parse``. Its ``name``
            attribute, if present, is only used in the failure message.

    Returns:
        ``(sentences, bio_tags)``: two parallel lists of lists holding the
        token strings and the ``"B-Trigger"`` / ``"O"`` tags. ``([], [])`` is
        returned, after printing a ``fail`` line, when the number of tagged
        tokens differs from the number of triggers.
    """
    # In-memory streams have no ``name``; fall back to a placeholder instead of crashing.
    fname = os.path.basename(str(getattr(file, "name", "<unknown>")))
    root = ET.parse(file).getroot()

    events = []
    sentences = []
    for x in root:
        if x.tag == "Events":
            for event in x:
                # Position in text is mostly useless, so only the trigger text
                # is kept. Events whose first child is not <Event> are skipped:
                # previously they re-appended the preceding trigger (or raised
                # NameError when no trigger had been seen yet).
                if event[0].tag == "Event":
                    events.append(event[0].text)
        if x.tag == "Sentences":
            for sentence in x:
                if sentence[0].tag == "Text":
                    sentences.append(
                        [
                            token[0].text
                            for token in sentence[2]
                            if token[0].tag == "Value"
                        ]
                    )

    bio_tags = []
    event_glue_position = 0  # index of the next trigger waiting to be matched
    for sentence in sentences:
        bio_tags_sentence = []
        for i, token in enumerate(sentence):
            has_pending = event_glue_position < len(events)
            # the dataset is full of errors, so exact matching alone is not enough
            if has_pending and events[event_glue_position] == token:
                bio_tags_sentence.append("B-Trigger")
                event_glue_position += 1
            # fallback: the trigger is only a substring of the token
            elif has_pending and events[event_glue_position] in token:
                # check the neighborhood, heuristic of 5 tokens: if an exact
                # match is close by, leave the trigger for that token
                if events[event_glue_position] in sentence[max(0, i - 5) : i + 5]:
                    bio_tags_sentence.append("O")
                else:
                    # in this dataset there are only one-word triggers
                    bio_tags_sentence.append("B-Trigger")
                    event_glue_position += 1
            else:
                bio_tags_sentence.append("O")

        bio_tags.append(bio_tags_sentence)

    n_tagged = sum(sentence_tags.count("B-Trigger") for sentence_tags in bio_tags)
    if n_tagged != len(events):
        print("fail", fname, n_tagged, "!=", len(events))
        return [], []

    return sentences, bio_tags

In [6]:
# Parse a single training document to try out parse_xml.
# toks holds one token list per sentence and tags the matching BIO tag lists;
# both are empty lists if parse_xml reported a trigger-count mismatch.
with open("../data/raw/evextra/train/PeloponnesianWar.xml") as f:
    toks, tags = parse_xml(f)

In [7]:
# Render the first sentence of the document parsed above with its trigger tags
# (raises IndexError if parse_xml returned empty lists for that file).
viz_text(toks[0], tags[0])

In [8]:
def tokenize_create_bio_tags(dataset="train"):
    """Build one record per sentence for every parsable file of a dataset split.

    Args:
        dataset: Name of the sub-directory of ``../data/raw/evextra`` to process.

    Returns:
        A list of dicts with the keys ``file``, ``sent_id``, ``tokens`` and
        ``bio_tags``. Files for which ``parse_xml`` returns no sentences
        contribute nothing and are reported with ``Skipping!``.
    """
    file_names = os.listdir(os.path.join("../data/raw/evextra/", dataset))

    rows = []
    for file_name in tqdm(file_names):
        full_path = os.path.join("../data/raw/evextra", dataset, file_name)
        with open(full_path, "r") as handle:
            sents, sent_tags = parse_xml(handle)
            if len(sents) == 0:
                print("Skipping!")
                continue
            for sent_idx, (words, labels) in enumerate(zip(sents, sent_tags)):
                # Every token must carry exactly one tag.
                assert len(words) == len(labels)
                row = {
                    "file": file_name,
                    "sent_id": sent_idx,
                    "tokens": words,
                    "bio_tags": labels,
                }
                rows.append(row)
    return rows

In [9]:
# One DataFrame per split, built in the order train, valid, test.
df_train, df_valid, df_test = (
    pd.DataFrame.from_records(tokenize_create_bio_tags(split_name))
    for split_name in ("train", "valid", "test")
)

  1%|          | 3/531 [00:00<00:19, 27.37it/s]

fail article-15909.xml 5 != 6
Skipping!
fail article-633.xml 20 != 24
Skipping!


 22%|██▏       | 119/531 [00:03<00:14, 28.60it/s]

fail article-4063.xml 7 != 33
Skipping!


 31%|███       | 165/531 [00:05<00:12, 30.28it/s]

fail article-19903.xml 8 != 53
Skipping!
fail article-18545.xml 25 != 33
Skipping!


 33%|███▎      | 173/531 [00:05<00:12, 28.11it/s]

fail article-11039.xml 14 != 37
Skipping!


 44%|████▍     | 235/531 [00:07<00:09, 30.14it/s]

fail article-13154.xml 9 != 106
Skipping!


 46%|████▌     | 243/531 [00:08<00:10, 28.32it/s]

fail article-2909.xml 20 != 25
Skipping!


 63%|██████▎   | 334/531 [00:11<00:07, 25.96it/s]

fail article-10759.xml 69 != 93
Skipping!


 66%|██████▌   | 350/531 [00:11<00:06, 29.06it/s]

fail article-13301.xml 8 != 24
Skipping!


 68%|██████▊   | 361/531 [00:12<00:06, 25.32it/s]

fail article-13822.xml 12 != 25
Skipping!


 76%|███████▋  | 405/531 [00:13<00:04, 30.92it/s]

fail article-22966.xml 22 != 24
Skipping!


 80%|███████▉  | 423/531 [00:14<00:03, 33.81it/s]

fail article-10500.xml 2 != 31
Skipping!


 81%|████████  | 431/531 [00:14<00:03, 28.34it/s]

fail article-2736.xml 4 != 11
Skipping!
fail article-15248.xml 4 != 48
Skipping!


 86%|████████▌ | 456/531 [00:15<00:02, 27.84it/s]

fail article-13533.xml 7 != 9
Skipping!


100%|██████████| 531/531 [00:18<00:00, 29.46it/s]
 12%|█▏        | 9/76 [00:00<00:02, 28.35it/s]

fail article-9052.xml 16 != 54
Skipping!


100%|██████████| 76/76 [00:02<00:00, 29.60it/s]
 18%|█▊        | 28/152 [00:01<00:06, 20.25it/s]

fail article-5182.xml 11 != 19
Skipping!


 44%|████▍     | 67/152 [00:02<00:02, 31.24it/s]

fail article-10529.xml 60 != 72
Skipping!


 99%|█████████▊| 150/152 [00:05<00:00, 27.98it/s]

fail article-5402.xml 27 != 30
Skipping!
fail article-15696.xml 40 != 50
Skipping!


100%|██████████| 152/152 [00:05<00:00, 26.46it/s]


In [10]:
# Number of sentence records in each split (train, valid, test).
tuple(map(len, (df_train, df_valid, df_test)))

(8534, 1103, 2482)

In [None]:
# Spot-check 30 reproducibly sampled training sentences: source file name,
# a separator line, then the rendered trigger annotations.
for _, record in df_train.sample(n=30, random_state=42).iterrows():
    print(record["file"], "----------------", sep="\n")
    viz_text(record["tokens"], record["bio_tags"])

In [None]:
# Spot-check 10 reproducibly sampled validation sentences: source file name,
# a separator line, then the rendered trigger annotations.
for _, record in df_valid.sample(n=10, random_state=42).iterrows():
    print(record["file"], "----------------", sep="\n")
    viz_text(record["tokens"], record["bio_tags"])

In [None]:
# Spot-check 30 reproducibly sampled test sentences: source file name,
# a separator line, then the rendered trigger annotations.
for _, record in df_test.sample(n=30, random_state=42).iterrows():
    print(record["file"], "----------------", sep="\n")
    viz_text(record["tokens"], record["bio_tags"])

In [14]:
# Write each split as a JSON array of records. The output directory is created
# first so the export also works when it does not exist yet (e.g. fresh checkout).
os.makedirs("../data/processed/evextra", exist_ok=True)
df_train.to_json("../data/processed/evextra/train.json", orient="records")
df_valid.to_json("../data/processed/evextra/valid.json", orient="records")
df_test.to_json("../data/processed/evextra/test.json", orient="records")