# Author: ddukic

In [1]:
import xml.etree.ElementTree as ET
from collections import defaultdict
import spacy
from spacy import displacy
from spacy.tokens import Doc
import re
import os
from tqdm import tqdm
import pandas as pd

nlp = spacy.load("en_core_web_lg")


def viz_text(tokens, tags):
    assert len(tokens) == len(tags)

    doc = Doc(nlp.vocab, words=tokens, ents=tags)

    displacy.render(
        doc,
        style="ent",
        options={
            "tag": ["Trigger"],
            "colors": {"Trigger": "#ff6961"},
        },
    )

In [2]:
def check_length_of_triggers(file):
    try:
        tree = ET.parse(file)
        root = tree.getroot()
        for x in root:
            if x.tag == "sentence":
                for event in x:
                    if len(event.text.split(" ")) > 1:
                        print(event.text)
        return False
    except:
        return True

In [3]:
def check_set(dataset="train"):
    skip_ratio = 0

    files = os.listdir(os.path.join("../data/raw/ednyt/", dataset))

    for fpath in files:
        with open(os.path.join("../data/raw/ednyt", dataset, fpath), "r") as file:
            if check_length_of_triggers(file):
                skip_ratio += 1
    print("Skip ratio: ", skip_ratio, "/", len(files))

check_set("train")
check_set("valid")
check_set("test")

Skip ratio:  42 / 1900
Skip ratio:  5 / 100
Skip ratio:  1 / 200


In [4]:
def parse_xml(file):
    try:
        tree = ET.parse(file)
        root = tree.getroot()
        events = []
        bio_tags = []
        words = []
        for x in root:
            if x.tag == "sentence":
                sentence = (
                    re.findall(
                        r"<sentence>(.*?)<\/sentence>",
                        ET.tostring(root).decode("utf-8"),
                        re.DOTALL,
                    )[0]
                    .replace("<event>", "")
                    .replace("</event>", "")
                    .replace("\n", "")
                )
                for event in x:
                    events.append(event.text)
                tokens = nlp(sentence)
                counter = 0
                for token in tokens:
                    if token.text != '' and token.text != ' ':
                        words.append(token.text)
                        if counter < len(events) and token.text == events[counter]:
                            # in this dataset there are only one-trigger words
                            bio_tags.append("B-Trigger")
                            counter += 1
                        else:
                            bio_tags.append("O")
                assert len(events) == len([x for x in bio_tags if x == "B-Trigger"])
        return sentence, words, bio_tags
    except:
        return None, None, None

In [5]:
def tokenize_create_bio_tags(dataset="train"):
    skip_ratio = 0

    records = []

    files = os.listdir(os.path.join("../data/raw/ednyt/", dataset))

    for fpath in tqdm(files):
        with open(os.path.join("../data/raw/ednyt", dataset, fpath), "r") as file:
            sentence, toks, tags = parse_xml(file)
            if sentence is None:
                skip_ratio += 1
            else:
                records.append({
                    "file": fpath,
                    "tokens": toks,
                    "bio_tags": tags
                })
    print("Skip ratio: ", skip_ratio, "/", len(files))
    return records

In [6]:
df_train = pd.DataFrame.from_records(tokenize_create_bio_tags("train"))
df_valid = pd.DataFrame.from_records(tokenize_create_bio_tags("valid"))
df_test = pd.DataFrame.from_records(tokenize_create_bio_tags("test"))

100%|██████████| 1900/1900 [00:17<00:00, 108.24it/s]


Skip ratio:  58 / 1900


100%|██████████| 100/100 [00:00<00:00, 120.79it/s]


Skip ratio:  5 / 100


100%|██████████| 200/200 [00:01<00:00, 124.90it/s]

Skip ratio:  2 / 200





In [None]:
# Visualize some examples
for _, record in df_train.sample(20, random_state=42).iterrows():
    print(record["file"])
    print("----------------")
    viz_text(record["tokens"], record["bio_tags"])

In [None]:
# Visualize some examples
for _, record in df_valid.sample(10, random_state=42).iterrows():
    print(record["file"])
    print("----------------")
    viz_text(record["tokens"], record["bio_tags"])

In [None]:
# Visualize some examples
for _, record in df_test.sample(10, random_state=42).iterrows():
    print(record["file"])
    print("----------------")
    viz_text(record["tokens"], record["bio_tags"])

In [10]:
len(df_train), len(df_valid), len(df_test)

(1842, 95, 198)

In [11]:
df_train.to_json("../data/processed/ednyt/train.json", orient="records")
df_valid.to_json("../data/processed/ednyt/valid.json", orient="records")
df_test.to_json("../data/processed/ednyt/test.json", orient="records")