In [1]:
import json
import sys

import numpy as np
import pandas as pd

UTILITIES = "../"
sys.path.append(UTILITIES)
from utilities.content_processor import tokenise_text
from utilities.paper_access import get_text

[nltk_data] Downloading package punkt to /home/eiri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/eiri/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /home/eiri/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/eiri/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/eiri/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/eiri/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/eiri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# (table key, printed label, start, stop) for every pipeline stage. The slice
# times[start:stop] is summed and divided by 60 before being reported as
# minutes, so the raw entries are presumably seconds -- confirm with the
# code that writes kg["times"].
_STAGE_SLICES = (
    ("stage_0", "Stage 0 (Input Pre-processing)", 0, 1),
    ("stage_1", "Stage 1 (Entity Mention Extraction)", 1, 6),
    ("stage_2", "Stage 2 (Coreference Resolution)", 6, 13),
    ("stage_3", "Stage 3 (Local Relation Extraction)", 13, 16),
    ("stage_4", "Stage 4 (Global Relation Extraction)", 16, 20),
    ("stage_5", "Stage 5 (Schema Generation)", 20, 25),
    ("stage_6", "Stage 6 (Data Post-processing)", 25, 26),
)


def _count_isolated_entities(nodes, triples):
    """Count the keys of `nodes` that are neither element 0 nor element 2 of any triple."""
    connected = set()
    for triple in triples:
        connected.add(triple[0])
        connected.add(triple[2])
    return sum(1 for node in nodes if node not in connected)


def statistics(dataset, index, llm_model, table, input_path):
    """Print the statistics of one generated knowledge graph and record them in `table`.

    Reads ``{input_path}/kg_{dataset}_{index}_{llm_model}.json`` and reports the
    token, entity, mention, relation and isolated-entity counts together with
    the per-stage and total runtimes (in minutes).

    Args:
        dataset: Dataset name used in the file name.
        index: Paper index used in the file name.
        llm_model: Model identifier used in the file name and in the title line.
        table: Dict mapping column names to lists. One value is appended to each
            of "titles", "tokens", "entities", "mentions", "relations_total",
            "relations_normal", "relations_typing", "isolated_entities",
            "isolated_entities_typing", "stage_0" .. "stage_6" and
            "total_runtime". Mutated in place.
        input_path: Directory that contains the knowledge-graph JSON files.

    Returns:
        None. Results are printed and appended to `table`.

    Raises:
        OSError: If the JSON file cannot be opened.
        KeyError: If the JSON document or `table` lacks an expected key.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's locale.
    with open(f"{input_path}/kg_{dataset}_{index}_{llm_model}.json", encoding="utf-8") as f:
        kg = json.load(f)

    # Everything is computed before anything is printed or appended, so a
    # malformed file cannot leave `table` with columns of unequal length.
    paper_title = kg["title"]
    title = f"Title: {paper_title} (by {llm_model})"

    nodes = kg["nodes"]
    triples = kg["triples"]
    triples_typing = kg["triples_typing"]
    times = kg["times"]

    tokens = sum(
        len(tokenise_text(get_text(sentence)))
        for section in kg["sections"]
        for paragraph in section["paragraphs"]
        for sentence in paragraph["sentences"]
    )
    mentions = sum(len(entity["mentions"]) for entity in nodes.values())

    counts = [
        ("tokens", "Tokens", tokens),
        ("entities", "Entities", len(nodes)),
        ("mentions", "Mentions", mentions),
        ("relations_total", "Relations (Total)", len(triples) + len(triples_typing)),
        ("relations_normal", "Relations (Normal)", len(triples)),
        ("relations_typing", "Relations (Typing)", len(triples_typing)),
        # Entities that take part in no normal triple.
        ("isolated_entities", "Isolated Entities",
         _count_isolated_entities(nodes, triples)),
        # Entities that take part in neither a normal nor a typing triple.
        ("isolated_entities_typing", "Isolated Entities (including taxonomy)",
         _count_isolated_entities(nodes, triples + triples_typing)),
    ]

    runtimes = [
        (key, label, sum(times[start:stop]) / 60)
        for key, label, start, stop in _STAGE_SLICES
    ]
    runtimes.append(("total_runtime", "Total Runtime", sum(times) / 60))

    # Report and record.
    print(title)
    table["titles"].append(title)

    for key, label, value in counts:
        print(f"{label}: {value}")
        table[key].append(value)

    print()
    for key, label, minutes in runtimes:
        print(f"{label}: {minutes:.4g} minutes")
        table[key].append(minutes)

In [3]:
# Run configuration: which papers to summarise and where to read/write files.
dataset = "ASKG"
max_paper = 10
llm_model = "l"
input_path = "../../data/input"
output_path = f"../../data/raw_results/gen_{dataset}_{llm_model}.csv"

# One list per output column; statistics() appends one value per paper to each.
table = {
    "titles": [],
    "tokens": [],
    "entities": [],
    "mentions": [],
    "relations_total": [],
    "relations_normal": [],
    "relations_typing": [],
    "isolated_entities": [],
    "isolated_entities_typing": [],
    "stage_0": [],
    "stage_1": [],
    "stage_2": [],
    "stage_3": [],
    "stage_4": [],
    "stage_5": [],
    "stage_6": [],
    "total_runtime": []
}

# Paper files are numbered from 1 to max_paper inclusive.
for index in range(1, max_paper + 1):
    statistics(dataset, index, llm_model, table, input_path)
    print()
    print()
    print("--------------------------------------------------")
    print()
    print()


# Append two summary rows, "Mean" and "STD". Both statistics are computed from
# the per-paper values before either is appended, so the summary rows never
# feed back into the calculation.
for key, value in table.items():
    if key == "titles":
        value.extend(["Mean", "STD"])
    else:
        mean = float(np.mean(value))
        # np.std defaults to ddof=0, i.e. the population standard deviation.
        std = float(np.std(value))
        # The "STD" row must hold the standard deviation, not a second mean.
        value.extend([mean, std])


# Convert the dictionary into a DataFrame
df = pd.DataFrame(table)

# Save to CSV file
df.to_csv(output_path, index=False)

print(f"Table saved as {output_path}")


display(df)

Title: MEL: Metadata Extractor & Loader (by l)
Tokens: 1165
Entities: 130
Mentions: 243
Relations (Total): 212
Relations (Normal): 124
Relations (Typing): 88
Isolated Entities: 71
Isolated Entities (including taxonomy): 37

Stage 0 (Input Pre-processing): 1.466e-06 minutes
Stage 1 (Entity Mention Extraction): 6.336 minutes
Stage 2 (Coreference Resolution): 8.347 minutes
Stage 3 (Local Relation Extraction): 3.207 minutes
Stage 4 (Global Relation Extraction): 11.06 minutes
Stage 5 (Schema Generation): 4.778 minutes
Stage 6 (Data Post-processing): 0.001667 minutes
Total Runtime: 33.73 minutes


--------------------------------------------------


Title: Modeling Actuations in BCI-O: A Context-based Integration of SOSA and IoT-O (by l)
Tokens: 3161
Entities: 322
Mentions: 747
Relations (Total): 700
Relations (Normal): 383
Relations (Typing): 317
Isolated Entities: 168
Isolated Entities (including taxonomy): 73

Stage 0 (Input Pre-processing): 2.587e-06 minutes
Stage 1 (Entity Mention Extra

Unnamed: 0,titles,tokens,entities,mentions,relations_total,relations_normal,relations_typing,isolated_entities,isolated_entities_typing,stage_0,stage_1,stage_2,stage_3,stage_4,stage_5,stage_6,total_runtime
0,Title: MEL: Metadata Extractor & Loader (by l),1165.0,130.0,243.0,212.0,124.0,88.0,71.0,37.0,1e-06,6.336494,8.346878,3.207005,11.059271,4.777505,0.001667,33.72882
1,Title: Modeling Actuations in BCI-O: A Context...,3161.0,322.0,747.0,700.0,383.0,317.0,168.0,73.0,3e-06,18.745993,34.314044,9.63609,39.281661,13.182274,0.006508,115.166573
2,Title: Building An Open Source Linux Computing...,3062.0,441.0,753.0,789.0,366.0,423.0,225.0,105.0,5e-06,20.359018,34.132764,11.706237,25.239792,13.067368,0.006661,104.511846
3,Title: TNNT: The Named Entity Recognition Tool...,1486.0,183.0,339.0,404.0,210.0,194.0,104.0,43.0,4e-06,8.151864,12.254302,5.005016,25.311428,7.293719,0.001955,58.018288
4,Title: A Pipeline For Analysing Grant Applicat...,5183.0,583.0,1329.0,1207.0,436.0,771.0,311.0,118.0,4e-06,29.719659,63.921386,16.312191,36.546233,16.659788,0.005867,163.165128
5,Title: Active knowledge graph completion (by l),10529.0,1774.0,2789.0,4854.0,769.0,4085.0,1095.0,312.0,7e-06,64.931245,156.80517,35.83741,33.306689,30.323334,0.016726,321.22058
6,Title: An Analysis of Links in Wikidata (by l),7308.0,1110.0,1662.0,3319.0,528.0,2791.0,696.0,176.0,1.7e-05,39.567852,89.726633,20.40107,30.763815,20.863463,0.007862,201.330713
7,Title: BCI Ontology: A Context-based Sense and...,5103.0,1013.0,1396.0,2376.0,542.0,1834.0,578.0,166.0,2.6e-05,34.563572,71.407441,17.369851,38.024798,23.919947,0.007154,185.292789
8,Title: HDGI: A Human Device Gesture Interactio...,5719.0,950.0,1420.0,2551.0,518.0,2033.0,521.0,143.0,1.7e-05,32.320637,63.894312,17.781657,33.259173,20.364067,0.006908,167.626772
9,Title: Learning SHACL shapes from knowledge gr...,12113.0,1833.0,2899.0,4716.0,922.0,3794.0,1102.0,313.0,1.9e-05,69.642075,155.5198,38.603954,43.399076,33.521842,0.017116,340.703884
