### (Stage 0) Metadata Knowledge Graph Construction
Input: `LLM-KGC_input.json` (read by the code below as `kg.json`)

Output: `LLM-KGC_input.ttl` (written by the code below as `kg.ttl`)

In [103]:
import hashlib
import json
from rdflib import Graph, Namespace, Literal, RDF, RDFS, URIRef
from rdflib.namespace import DC, OWL, XSD, SKOS
import time

# Load the paper description from JSON. The encoding is passed explicitly:
# without it open() falls back to the platform's locale encoding, which can
# mis-decode non-ASCII text on systems whose default is not UTF-8.
with open('kg.json', 'r', encoding='utf-8') as file:
    paper = json.load(file)

# Reference point for the elapsed-time report printed at the end of the run.
start_time = time.time()

### Ontology Creation

In [104]:
# Namespaces for the instance data, the ASKG ontology and schema.org.
ASKG_DATA = Namespace("https://www.anu.edu.au/data/scholarly/")
ASKG_ONTO = Namespace("https://www.anu.edu.au/onto/scholarly#")
SCHEMA = Namespace("http://schema.org/")
# Currently disabled namespaces:
# WD = Namespace("http://www.wikidata.org/entity/")
# DOMO = Namespace("https://www.anu.edu.au/onto/domo#")

# The graph that every triple below is added to.
g = Graph()

# Associate each namespace with its prefix, in a fixed order.
for prefix, ns in (
    ("askg-data", ASKG_DATA),
    ("askg-onto", ASKG_ONTO),
    ("dc", DC),
    ("owl", OWL),
    ("rdf", RDF),
    ("rdfs", RDFS),
    # ("wd", WD),        # wd:entity
    # ("domo", DOMO),    # domo:keyword
    ("skos", SKOS),      # skos:broader
    ("xsd", XSD),        # ^^xsd:string
    ("schema", SCHEMA),  # schema:text
):
    g.bind(prefix, ns)

# # Define RDF classes
# g.add((ASKG_ONTO.Paper, RDF.type, OWL.Class))
# g.add((ASKG_ONTO.Section, RDF.type, OWL.Class))
# g.add((ASKG_ONTO.Paragraph, RDF.type, OWL.Class))
# g.add((ASKG_ONTO.Sentence, RDF.type, OWL.Class))


# # Define RDF properties
# g.add((ASKG_ONTO.hasAuthor, RDF.type, RDF.Property))
# g.add((ASKG_ONTO.hasKeyword, RDF.type, RDF.Property))
# g.add((ASKG_ONTO.hasSection, RDF.type, RDF.Property))
# g.add((ASKG_ONTO.hasParagraph, RDF.type, RDF.Property))
# g.add((ASKG_ONTO.hasSentence, RDF.type, RDF.Property))
# g.add((ASKG_ONTO.hasText, RDF.type, RDF.Property))

# g.add((SCHEMA.text, RDF.type, RDF.Property))

### From JSON to RDF

In [105]:

# Paper
# IRI of the paper itself; the IRIs of its parts are derived from it.
paper_iri = ASKG_DATA[paper["iri"]]
# paper_iri = ASKG_DATA["Paper-1"]


def get_iri(local_iri):
    """Return the IRI of a resource that belongs to the current paper.

    Args:
        local_iri: Identifier of the resource relative to the paper, e.g. the
            ``"iri"`` value of a section in the input JSON.

    Returns:
        The module-level ``paper_iri`` followed by ``"-"`` and ``local_iri``.
    """
    # ``paper_iri`` is only read here, so no ``global`` declaration is needed.
    return paper_iri + "-" + local_iri



# Paper node: its class and title, then one literal per author and keyword.
g.add((paper_iri, RDF.type, ASKG_ONTO.Paper))
g.add((paper_iri, DC.title, Literal(paper["title"], lang="en")))

for prop, field in (
    (ASKG_ONTO.hasAuthor, "authors"),
    (ASKG_ONTO.hasKeyword, "keywords"),
):
    for value in paper[field]:
        g.add((paper_iri, prop, Literal(value, lang="en")))

# Document structure: paper -> sections -> paragraphs -> sentences.
# Each child is described first and then linked from its parent.
for sec in paper["sections"]:
    section_iri = get_iri(sec["iri"])
    g.add((section_iri, RDF.type, ASKG_ONTO.Section))
    g.add((section_iri, DC.title, Literal(sec["subtitle"], lang="en")))

    for para in sec["paragraphs"]:
        paragraph_iri = get_iri(para["iri"])
        g.add((paragraph_iri, RDF.type, ASKG_ONTO.Paragraph))

        for sent in para["sentences"]:
            sentence_iri = get_iri(sent["iri"])
            g.add((sentence_iri, RDF.type, ASKG_ONTO.Sentence))
            sentence_text = Literal(sent["text"], lang="en")
            g.add((sentence_iri, ASKG_ONTO.hasText, sentence_text))

            g.add((paragraph_iri, ASKG_ONTO.hasSentence, sentence_iri))

        g.add((section_iri, ASKG_ONTO.hasParagraph, paragraph_iri))

    g.add((paper_iri, ASKG_ONTO.hasSection, section_iri))


# Declare the three entity kinds as subclasses of askg-onto:Entity.
for entity_subclass in (
    ASKG_ONTO.NamedEntity,
    ASKG_ONTO.GeneralTerm,
    ASKG_ONTO.OtherEntity,
):
    g.add((entity_subclass, RDFS.subClassOf, ASKG_ONTO.Entity))



# Entities
# Maps the "node_type" value found in the input JSON to the ontology class
# the entity is typed with.
ENTITY_CLASS_BY_NODE_TYPE = {
    "named entity": ASKG_ONTO.NamedEntity,
    "general term": ASKG_ONTO.GeneralTerm,
    "other": ASKG_ONTO.OtherEntity,
}

for entity_key, entity in paper["nodes"].items():
    entity_iri = get_iri(entity_key)

    g.add((entity_iri, RDFS.label, Literal(entity["label"], lang="en")))

    for type_label in entity["types"]:
        g.add((entity_iri, ASKG_ONTO.hasTypeLabel, Literal(type_label, lang="en")))
    for alias in entity["aliases"]:
        g.add((entity_iri, SKOS.altLabel, Literal(alias, lang="en")))

    g.add((entity_iri, SCHEMA.description, Literal(entity["description"], lang="en")))

    node_type = entity["node_type"]
    # Only a string can name a known type. The isinstance guard keeps an
    # unhashable value (e.g. a JSON list) from raising TypeError in the lookup,
    # so every unrecognised value ends in the same ValueError.
    entity_class = (
        ENTITY_CLASS_BY_NODE_TYPE.get(node_type)
        if isinstance(node_type, str)
        else None
    )
    if entity_class is None:
        raise ValueError(
            f"Unknown entity type: {node_type!r} (entity {entity_key!r})"
        )
    g.add((entity_iri, RDF.type, entity_class))

    g.add((entity_iri, ASKG_ONTO.hasRelevanceScore, Literal(entity["relevance"], datatype=XSD.float)))

    for mention in entity["mentions"]:
        mention_iri = get_iri(mention["iri"])
        g.add((mention_iri, RDF.type, ASKG_ONTO.Mention))
        g.add((mention_iri, RDFS.label, Literal(mention["local_name"], lang="en")))
        # "reference" is presumably the local IRI of the sentence containing
        # the mention; it is resolved with the same helper as every other
        # paper-scoped IRI.
        sentence_mentioned_in = get_iri(mention["reference"])
        g.add((mention_iri, ASKG_ONTO.mentionedIn, sentence_mentioned_in))
        g.add((entity_iri, ASKG_ONTO.hasMention, mention_iri))



# Predicates
# Every predicate becomes an askg-onto:Predicate resource carrying an English
# label and description.
for predicate_key, predicate in paper["predicates"].items():
    predicate_iri = get_iri(predicate_key)
    g.add((predicate_iri, RDF.type, ASKG_ONTO.Predicate))
    for prop, field in (
        (RDFS.label, "label"),
        (SCHEMA.description, "description"),
    ):
        g.add((predicate_iri, prop, Literal(predicate[field], lang="en")))


# Relations
# Each entry of "triples" is indexed as [subject, predicate, object]; all
# three local identifiers are resolved to paper-scoped IRIs with get_iri,
# the same helper used for every other resource of the paper.
for triple in paper["triples"]:
    subject_iri = get_iri(triple[0])
    predicate_iri = get_iri(triple[1])
    object_iri = get_iri(triple[2])

    g.add((subject_iri, predicate_iri, object_iri))


# Typing triples: the middle element of each entry is ignored and the
# relation is always expressed as skos:broader.
for triple in paper["triples_typing"]:
    subject_iri = get_iri(triple[0])
    predicate_iri = SKOS.broader
    object_iri = get_iri(triple[2])

    g.add((subject_iri, predicate_iri, object_iri))

    
    


# Write the graph as Turtle. Letting rdflib write the destination itself
# avoids depending on whether serialize() returns str or bytes (this differs
# between rdflib versions) and removes the manual encode step.
g.serialize(destination='kg.ttl', format='turtle', encoding='utf-8')

# Report the time elapsed since start_time was recorded.
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.2165839672088623 seconds ---


### RDF Output