In [4]:
import json
import sys
import numpy as np

UTILITIES = "./utilities"
sys.path.append(UTILITIES)
from utilities.content_processor import tokenise_text
from utilities.paper_access import get_text

In [5]:
# (metrics key, printed label, slice start, slice stop) for each pipeline stage.
# Each start/stop pair selects the entries of kg["times"] that belong to the stage.
_STAGE_SLICES = (
    ("stage_0", "Stage 0 (Input Pre-processing)", 0, 1),
    ("stage_1", "Stage 1 (Entity Mention Extraction)", 1, 6),
    ("stage_2", "Stage 2 (Coreference Resolution)", 6, 13),
    ("stage_3", "Stage 3 (Local Relation Extraction)", 13, 16),
    ("stage_4", "Stage 4 (Global Relation Extraction)", 16, 20),
    ("stage_5", "Stage 5 (Schema Generation)", 20, 25),
    ("stage_6", "Stage 6 (Data Post-processing)", 25, 26),
)


def _count_isolated(nodes, triples):
    """Count the keys of `nodes` that are neither element 0 nor element 2 of any triple."""
    connected = set()
    for triple in triples:
        connected.add(triple[0])
        connected.add(triple[2])
    return sum(1 for node in nodes if node not in connected)


def statistics(index, llm_model, metrics, inputs_dir=None):
    """Print summary statistics for one knowledge-graph file and record them.

    Loads ``{inputs_dir}/kg_test_{index}_{llm_model}.json``, prints a report
    (title, token/entity/mention/relation counts, isolated entities, per-stage
    and total runtime) and appends every numeric value to the matching list in
    ``metrics``.

    Args:
        index: Paper number used in the file name.
        llm_model: Model tag used in the file name and echoed in the title line.
        metrics: Dict mapping each metric name ("tokens", "entities",
            "mentions", "relations_total", "relations_normal",
            "relations_typing", "isolated_entities",
            "isolated_entities_typing", "stage_0" .. "stage_6",
            "total_runtime") to a list; one value is appended to each list.
        inputs_dir: Directory containing the JSON file. When None (the
            default) the notebook-level ``INPUTS`` variable is used, so that
            variable must be defined before the call.

    Returns:
        None. Results are printed and appended to ``metrics`` in place.
    """
    if inputs_dir is None:
        # Resolved at call time; INPUTS lives in the notebook's configuration cell.
        inputs_dir = INPUTS

    # Explicit encoding so that loading does not depend on the platform locale.
    with open(f"{inputs_dir}/kg_test_{index}_{llm_model}.json", encoding="utf-8") as f:
        kg = json.load(f)

    def record(key, label, value):
        # Print one "label: value" report line and store the value under `key`.
        print(f"{label}: {value}")
        metrics[key].append(value)

    # Title (single quotes inside the f-string keep this valid before Python 3.12)
    print(f"Title: {kg['title']} (by {llm_model})")

    # Tokens: total token count over every sentence of every paragraph and section
    tokens = sum(
        len(tokenise_text(get_text(sentence)))
        for section in kg["sections"]
        for paragraph in section["paragraphs"]
        for sentence in paragraph["sentences"]
    )
    record("tokens", "Tokens", tokens)

    # Entities and their mentions
    nodes = kg["nodes"]
    record("entities", "Entities", len(nodes))
    record("mentions", "Mentions", sum(len(entity["mentions"]) for entity in nodes.values()))

    # Relations: ordinary triples, typing triples, and their sum
    triples = kg["triples"]
    triples_typing = kg["triples_typing"]
    record("relations_total", "Relations (Total)", len(triples) + len(triples_typing))
    record("relations_normal", "Relations (Normal)", len(triples))
    record("relations_typing", "Relations (Typing)", len(triples_typing))

    # Isolated entities: nodes that never appear as head or tail of a triple,
    # first w.r.t. ordinary triples only, then also counting typing triples.
    record("isolated_entities", "Isolated Entities", _count_isolated(nodes, triples))
    record(
        "isolated_entities_typing",
        "Isolated Entities (including taxonomy)",
        _count_isolated(nodes, triples + triples_typing),
    )

    # Runtime. Values are divided by 60 and reported as minutes, so the stored
    # timings are presumably in seconds -- TODO confirm against the producer.
    times = kg["times"]
    print()
    for key, label, start, stop in _STAGE_SLICES:
        minutes = sum(times[start:stop]) / 60
        print(f"{label}: {minutes:.4g} minutes")
        metrics[key].append(minutes)

    total_minutes = sum(times) / 60
    print(f"Total Runtime: {total_minutes:.4g} minutes")
    metrics["total_runtime"].append(total_minutes)

In [6]:
# Dataset selection and the directory its knowledge-graph JSON files are read from.
DATASET = "ASKG"
INPUTS = f"./data/input/{DATASET}"

# Papers are processed for indices 1..max_paper; llm_model is part of each file name.
max_paper = 5
llm_model = "ablation"

# One empty list per metric; statistics() appends one value to each list per paper.
metrics = {
    name: []
    for name in (
        "tokens",
        "entities",
        "mentions",
        "relations_total",
        "relations_normal",
        "relations_typing",
        "isolated_entities",
        "isolated_entities_typing",
        "stage_0",
        "stage_1",
        "stage_2",
        "stage_3",
        "stage_4",
        "stage_5",
        "stage_6",
        "total_runtime",
    )
}

index = 1
while index <= max_paper:
    statistics(index, llm_model, metrics)
    # Two blank lines, a horizontal rule, two blank lines after each report.
    print("", "", "--------------------------------------------------", "", "", sep="\n")
    index += 1
# range() leaves the loop variable at the last processed index; mirror that here.
index = max_paper if max_paper >= 1 else index

# Summary over all papers: mean and standard deviation (np.std default) per metric.
for key, value in metrics.items():
    mean_value = np.mean(value)
    std_value = np.std(value)
    print(f"{key}: {mean_value:.4g} +/- {std_value:.4g}")

Title: MEL: Metadata Extractor & Loader (by ablation)
Tokens: 1165
Entities: 130
Mentions: 243
Relations (Total): 212
Relations (Normal): 124
Relations (Typing): 88
Isolated Entities: 71
Isolated Entities (including taxonomy): 37

Stage 0 (Input Pre-processing): 1.466e-06 minutes
Stage 1 (Entity Mention Extraction): 6.336 minutes
Stage 2 (Coreference Resolution): 8.347 minutes
Stage 3 (Local Relation Extraction): 3.207 minutes
Stage 4 (Global Relation Extraction): 11.06 minutes
Stage 5 (Schema Generation): 4.778 minutes
Stage 6 (Data Post-processing): 0.001667 minutes
Total Runtime: 33.73 minutes


--------------------------------------------------


Title: Modeling Actuations in BCI-O: A Context-based Integration of SOSA and IoT-O (by ablation)
Tokens: 3161
Entities: 322
Mentions: 747
Relations (Total): 700
Relations (Normal): 383
Relations (Typing): 317
Isolated Entities: 168
Isolated Entities (including taxonomy): 73

Stage 0 (Input Pre-processing): 2.587e-06 minutes
Stage 1 (Entity