In [1]:
import json
import sys
import numpy as np

UTILITIES = "../"
sys.path.append(UTILITIES)
from utilities.content_processor import tokenise_text
from utilities.paper_access import get_text

[nltk_data] Downloading package punkt to /Users/thinksky/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/thinksky/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/thinksky/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/thinksky/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/thinksky/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/thinksky/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nl

In [2]:
# Slice boundaries into kg["times"] for each pipeline stage, in report order:
# (label, start, stop). The boundaries are taken verbatim from the pipeline's
# timing layout; together they cover indices 0..25.
# NOTE(review): this presumes kg["times"] holds 26 per-step durations in
# seconds -- confirm against the code that writes the "times" field.
_STAGE_SLICES = (
    ("Input Pre-processing", 0, 1),
    ("Entity Mention Extraction", 1, 6),
    ("Coreference Resolution", 6, 13),
    ("Local Relation Extraction", 13, 16),
    ("Global Relation Extraction", 16, 20),
    ("Schema Generation", 20, 25),
    ("Data Post-processing", 25, 26),
)


def _count_isolated_entities(nodes, *triple_lists):
    """Count the nodes that are neither element 0 nor element 2 of any triple."""
    connected = set()
    for triples in triple_lists:
        for triple in triples:
            connected.add(triple[0])
            connected.add(triple[2])
    return sum(1 for node in nodes if node not in connected)


def statistics(dataset, index, llm_model, table, input_path):
    """Print summary statistics for one generated KG and record them in ``table``.

    Loads ``{input_path}/kg_{dataset}_{index}_{llm_model}.json`` and reports:
    the title, the token count of all sentences, the number of entities and
    mentions, relation counts (total / normal / typing), the number of
    isolated entities (with and without typing triples) and the per-stage and
    total runtime, each divided by 60 and labelled as minutes.

    Args:
        dataset: Dataset name used in the JSON file name.
        index: Paper index used in the JSON file name.
        llm_model: Model tag used in the JSON file name and in the title line.
        table: Dict of lists, mutated in place. One value is appended to each
            of the keys "titles", "tokens", "entities", "mentions",
            "relations_total", "relations_normal", "relations_typing",
            "isolated_entities", "isolated_entities_typing",
            "stage_0" .. "stage_6" and "total_runtime".
        input_path: Directory containing the JSON file.

    Returns:
        None. Results are printed and appended to ``table``.

    Raises:
        OSError: If the JSON file cannot be opened.
        KeyError: If the JSON document or ``table`` lacks an expected key.
    """
    with open(
        f"{input_path}/kg_{dataset}_{index}_{llm_model}.json", encoding="utf-8"
    ) as f:
        kg = json.load(f)

    # Values are pulled into locals before formatting: reusing double quotes
    # inside an f-string replacement field only parses on Python 3.12+.
    nodes = kg["nodes"]
    triples = kg["triples"]
    triples_typing = kg["triples_typing"]
    times = kg["times"]

    # Title
    title = f"Title: {kg['title']} (by {llm_model})"
    print(title)
    table["titles"].append(title)

    # Tokens: summed over every sentence of every paragraph of every section.
    tokens = sum(
        len(tokenise_text(get_text(sentence)))
        for section in kg["sections"]
        for paragraph in section["paragraphs"]
        for sentence in paragraph["sentences"]
    )
    print(f"Tokens: {tokens}")
    table["tokens"].append(tokens)

    # Entities
    n_entities = len(nodes)
    print(f"Entities: {n_entities}")
    table["entities"].append(n_entities)

    # Mentions
    mentions = sum(len(entity["mentions"]) for entity in nodes.values())
    print(f"Mentions: {mentions}")
    table["mentions"].append(mentions)

    # Relations
    n_normal = len(triples)
    n_typing = len(triples_typing)
    print(f"Relations (Total): {n_normal + n_typing}")
    table["relations_total"].append(n_normal + n_typing)
    print(f"Relations (Normal): {n_normal}")
    table["relations_normal"].append(n_normal)
    print(f"Relations (Typing): {n_typing}")
    table["relations_typing"].append(n_typing)

    # Isolated entities: nodes that appear in no normal triple.
    isolated = _count_isolated_entities(nodes, triples)
    print(f"Isolated Entities: {isolated}")
    table["isolated_entities"].append(isolated)

    # Isolated entities when typing triples also count as connections.
    isolated_typing = _count_isolated_entities(nodes, triples, triples_typing)
    print(f"Isolated Entities (including taxonomy): {isolated_typing}")
    table["isolated_entities_typing"].append(isolated_typing)

    # Runtime
    print()
    for stage, (label, start, stop) in enumerate(_STAGE_SLICES):
        minutes = sum(times[start:stop]) / 60
        print(f"Stage {stage} ({label}): {minutes:.4g} minutes")
        table[f"stage_{stage}"].append(minutes)

    total_minutes = sum(times) / 60
    print(f"Total Runtime: {total_minutes:.4g} minutes")
    table["total_runtime"].append(total_minutes)

In [3]:
import pandas as pd

# --- Configuration ---------------------------------------------------------
dataset = "ASKG"
max_paper = 10  # papers are indexed 1..max_paper
llm_model = "g"
input_path = "../../data/input"
output_path = f"../../data/raw_results/gen_{dataset}_{llm_model}.csv"

# One list per reported metric; statistics() appends one value per paper.
table = {
    "titles": [],
    "tokens": [],
    "entities": [],
    "mentions": [],
    "relations_total": [],
    "relations_normal": [],
    "relations_typing": [],
    "isolated_entities": [],
    "isolated_entities_typing": [],
    "stage_0": [],
    "stage_1": [],
    "stage_2": [],
    "stage_3": [],
    "stage_4": [],
    "stage_5": [],
    "stage_6": [],
    "total_runtime": []
}

# --- Per-paper statistics --------------------------------------------------
for index in range(1, max_paper + 1):
    statistics(dataset, index, llm_model, table, input_path)
    print()
    print()
    print("--------------------------------------------------")
    print()
    print()

# --- Summary rows ----------------------------------------------------------
# Append a "Mean" and an "STD" row to every column. Both statistics are
# computed from the per-paper values BEFORE anything is appended, so the
# summary rows never feed into each other.
for key, values in table.items():
    if key == "titles":
        values.extend(["Mean", "STD"])
    else:
        mean = float(np.mean(values))
        # np.std defaults to the population standard deviation (ddof=0).
        std = float(np.std(values))
        # The STD row must carry the standard deviation; previously the mean
        # was appended twice and `std` was never used.
        values.extend([mean, std])

# --- Export and display ----------------------------------------------------
# Convert the dictionary into a DataFrame
df = pd.DataFrame(table)

# Save to CSV file
df.to_csv(output_path, index=False)

print(f"Table saved as {output_path}")


display(df)

Title: MEL: Metadata Extractor & Loader (by g)
Tokens: 1165
Entities: 182
Mentions: 302
Relations (Total): 514
Relations (Normal): 293
Relations (Typing): 221
Isolated Entities: 54
Isolated Entities (including taxonomy): 12

Stage 0 (Input Pre-processing): 1.156e-06 minutes
Stage 1 (Entity Mention Extraction): 10.54 minutes
Stage 2 (Coreference Resolution): 9.771 minutes
Stage 3 (Local Relation Extraction): 2.73 minutes
Stage 4 (Global Relation Extraction): 7.188 minutes
Stage 5 (Schema Generation): 17.35 minutes
Stage 6 (Data Post-processing): 0.002701 minutes
Total Runtime: 47.58 minutes


--------------------------------------------------


Title: Modeling Actuations in BCI-O: A Context-based Integration of SOSA and IoT-O (by g)
Tokens: 3161
Entities: 356
Mentions: 757
Relations (Total): 1345
Relations (Normal): 558
Relations (Typing): 787
Isolated Entities: 97
Isolated Entities (including taxonomy): 32

Stage 0 (Input Pre-processing): 2.138e-06 minutes
Stage 1 (Entity Mention Extra

Unnamed: 0,titles,tokens,entities,mentions,relations_total,relations_normal,relations_typing,isolated_entities,isolated_entities_typing,stage_0,stage_1,stage_2,stage_3,stage_4,stage_5,stage_6,total_runtime
0,Title: MEL: Metadata Extractor & Loader (by g),1165.0,182.0,302.0,514.0,293.0,221.0,54.0,12.0,1e-06,10.537639,9.771435,2.729663,7.18791,17.346162,0.002701,47.575512
1,Title: Modeling Actuations in BCI-O: A Context...,3161.0,356.0,757.0,1345.0,558.0,787.0,97.0,32.0,2e-06,22.989779,19.286901,7.299901,8.608724,25.711434,0.003705,83.900447
2,Title: Building An Open Source Linux Computing...,3062.0,525.0,809.0,1524.0,572.0,952.0,153.0,44.0,3e-06,21.460656,21.545553,6.725228,7.138557,31.637743,0.004423,88.512162
3,Title: TNNT: The Named Entity Recognition Tool...,1486.0,241.0,405.0,876.0,375.0,501.0,78.0,27.0,1e-06,11.205289,10.653919,3.075103,7.453258,17.613616,0.002295,50.003482
4,Title: A Pipeline For Analysing Grant Applicat...,5183.0,712.0,1428.0,2088.0,761.0,1327.0,209.0,46.0,4e-06,43.376656,53.727904,12.320806,10.142776,52.32871,0.006591,171.903447
5,Title: Active knowledge graph completion (by g),10529.0,1368.0,2883.0,4287.0,1484.0,2803.0,362.0,105.0,6e-06,69.37624,77.01784,25.812517,10.244763,82.968609,0.014973,265.434949
6,Title: An Analysis of Links in Wikidata (by g),7308.0,903.0,1790.0,3351.0,1033.0,2318.0,236.0,41.0,9e-06,51.100973,52.398132,16.968888,11.127803,155.61227,0.008639,287.216714
7,Title: BCI Ontology: A Context-based Sense and...,5103.0,917.0,1485.0,2779.0,963.0,1816.0,247.0,42.0,3e-06,48.545712,48.615474,15.988655,10.82413,60.485081,0.00822,184.467274
8,Title: HDGI: A Human Device Gesture Interactio...,5719.0,833.0,1464.0,2899.0,931.0,1968.0,206.0,45.0,1e-05,47.902377,42.72479,13.247571,10.304787,53.814917,0.007618,168.002069
9,Title: Learning SHACL shapes from knowledge gr...,12113.0,1609.0,3262.0,5004.0,1594.0,3410.0,467.0,95.0,1.3e-05,73.778494,494.519691,79.282216,59.755554,381.295926,0.015592,1088.647486
