[Open this notebook in Google Colab](https://colab.research.google.com/github/kellm-fit/ISWC_tutorial/blob/main/session-1/hands-on/papers_to_kg.ipynb)

## Step 1: Install Dependencies

In [1]:
!pip install -q pandas==2.2.3 rdflib==7.0.0 langchain-text-splitters==0.3.0


[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Step 2: Define SPARQL Query Helper
Create a function to execute SPARQL queries on an RDF graph and return results as a DataFrame.

In [2]:
from rdflib import Graph
from rdflib.plugins.sparql import prepareQuery
import pandas as pd

def sparql_query(rdf_graph: Graph, query: str) -> pd.DataFrame:
    """
    Executes a SPARQL query on a pre-loaded RDF graph and returns the results as a DataFrame.

    The function dynamically infers the column names from the query results.

    Parameters:
        query (str): The SPARQL query to execute.

    Returns:
        pd.DataFrame: A DataFrame containing the results of the SPARQL query.
    """
    try:
        # Prepare and execute the query        
        query = prepareQuery(query)
        results = rdf_graph.query(query)
        
        # Extract variable (column) names from the query result
        columns = results.vars  # Get the variable names from the query results
        
        # Process the results and convert them into a list of dictionaries
        data = []
        for row in results:
            row_data = {str(var): row[var] for var in columns}  # Dynamically build a row dict
            data.append(row_data)
        
        # Convert the data into a DataFrame
        df = pd.DataFrame(data, columns=[str(var) for var in columns])
        return df

    except Exception as e:
        print(f"An error occurred while executing the SPARQL query: {e}")
        return pd.DataFrame()

## Step 3: Load Initial RDF Graph & Run Sample Query
Load a `.ttl` file and retrieve basic details like titles and authors using SPARQL.

In [3]:
g = Graph()
g.parse("dataset/dblp/abs-2401-07237.ttl", format="turtle")

query = """
PREFIX dblp: <https://dblp.org/rdf/schema#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?title ?authorName
WHERE {
    ?publication dblp:title ?title .
    ?publication dblp:hasSignature ?signature .
    ?signature dblp:signatureDblpName ?authorName .
}
"""

# Execute the query and display the results in a DataFrame
df = sparql_query(g, query)
df

Unnamed: 0,title,authorName
0,Distilling Event Sequence Knowledge From Large...,Somin Wadhwa
1,Distilling Event Sequence Knowledge From Large...,Oktie Hassanzadeh
2,Distilling Event Sequence Knowledge From Large...,Debarun Bhattacharjya
3,Distilling Event Sequence Knowledge From Large...,Ken Barker 0002
4,Distilling Event Sequence Knowledge From Large...,Jian Ni


## Step 4: Combine RDF Files
Merge multiple `.ttl` files into one RDF graph with consistent namespaces. Save as `iswc-papers.ttl`.

In [4]:
from rdflib import Graph, Namespace

# List of file paths for the Turtle files to combine
turtle_files = [
    "dataset/dblp/abs-2401-07237.ttl", # Distilling Event Sequence Knowledge From Large Language Models.
    "dataset/dblp/abs-2407-10430.ttl", # Expanding the Scope: Inductive Knowledge Graph Reasoning with Multi-Starting Progressive Propagation.
    "dataset/dblp/abs-2407-16127.ttl", # Finetuning Generative Large Language Models with Discrimination Instructions for Knowledge Graph Completion.
    "dataset/dblp/abs-2407-18752.ttl", # Knowledge Graph Structure as Prompt: Improving Small Language Models Capabilities for Knowledge-based Causal Discovery.
    "dataset/dblp/abs-2407-19998.ttl"  # Do LLMs Really Adapt to Domains? An Ontology Learning Perspective.
    ]

namespaces = {
    "xsd": Namespace("http://www.w3.org/2001/XMLSchema#"),
    "rdf": Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
    "rdfs": Namespace("http://www.w3.org/2000/01/rdf-schema#"),
    "owl": Namespace("http://www.w3.org/2002/07/owl#"),
    "bf": Namespace("http://id.loc.gov/ontologies/bibframe/"),
    "bibo": Namespace("http://purl.org/ontology/bibo/"),
    "bibtex": Namespace("http://purl.org/net/nknouf/ns/bibtex#"),
    "cito": Namespace("http://purl.org/spar/cito/"),
    "datacite": Namespace("http://purl.org/spar/datacite/"),
    "dbo": Namespace("http://dbpedia.org/ontology/"),
    "dc": Namespace("http://purl.org/dc/elements/1.1/"),
    "dct": Namespace("http://purl.org/dc/terms/"),
    "foaf": Namespace("http://xmlns.com/foaf/0.1/"),
    "litre": Namespace("http://purl.org/spar/literal/"),
    "locid": Namespace("http://id.loc.gov/vocabulary/identifiers/"),
    "locrel": Namespace("http://id.loc.gov/vocabulary/relators/"),
    "schema": Namespace("https://schema.org/"),
    "wd": Namespace("http://www.wikidata.org/entity/"),
    "wdt": Namespace("http://www.wikidata.org/prop/direct/"),
    "dblp": Namespace("https://dblp.org/rdf/schema#")
}

# Create a new RDF graph to hold the combined data
combined_graph = Graph()

for prefix, namespace in namespaces.items():
    combined_graph.bind(prefix, namespace)

# Load each file and add its content to the combined graph
for file_path in turtle_files:
    g = Graph()
    g.parse(file_path, format="turtle")
    combined_graph += g  

# Serialize the combined graph to a new TTL file
combined_graph.serialize("iswc-papers.ttl", format="turtle")

print("Combined TTL file created: iswc-papers.ttl")

Combined TTL file created: iswc-papers.ttl


## Step 5: Query Combined RDF Graph
Query the combined RDF file to retrieve paper metadata including title, authors, year, journal, and DOI.

In [5]:
# Load the RDF graph from the TTL file
g = Graph()
g.parse("iswc-papers.ttl", format="turtle")

query = """
PREFIX dblp: <https://dblp.org/rdf/schema#>
PREFIX litre: <http://purl.org/spar/literal/>
PREFIX datacite: <http://purl.org/spar/datacite/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

SELECT ?title ?authorName ?year ?journal ?doi
WHERE {
    # Get the title, year, and journal name
    ?publication dblp:title ?title .
    ?publication dblp:yearOfPublication ?year .
    ?publication dblp:publishedInJournal ?journal .

    # Get the DOI if it exists
    OPTIONAL {
        ?publication dblp:doi ?doi .
    }

    # Retrieve author names through their signatures
    ?publication dblp:hasSignature ?signature .
    ?signature dblp:signatureDblpName ?authorName .
}
ORDER BY ?title ?authorName
"""

# Execute the query and display the results in a DataFrame
df = sparql_query(g, query)
df

Unnamed: 0,title,authorName,year,journal,doi
0,Distilling Event Sequence Knowledge From Large...,Debarun Bhattacharjya,2024,CoRR,https://doi.org/10.48550/ARXIV.2401.07237
1,Distilling Event Sequence Knowledge From Large...,Jian Ni,2024,CoRR,https://doi.org/10.48550/ARXIV.2401.07237
2,Distilling Event Sequence Knowledge From Large...,Ken Barker 0002,2024,CoRR,https://doi.org/10.48550/ARXIV.2401.07237
3,Distilling Event Sequence Knowledge From Large...,Oktie Hassanzadeh,2024,CoRR,https://doi.org/10.48550/ARXIV.2401.07237
4,Distilling Event Sequence Knowledge From Large...,Somin Wadhwa,2024,CoRR,https://doi.org/10.48550/ARXIV.2401.07237
5,Do LLMs Really Adapt to Domains? An Ontology L...,Cuong Xuan Chu,2024,CoRR,https://doi.org/10.48550/ARXIV.2407.19998
6,Do LLMs Really Adapt to Domains? An Ontology L...,Heiko Paulheim,2024,CoRR,https://doi.org/10.48550/ARXIV.2407.19998
7,Do LLMs Really Adapt to Domains? An Ontology L...,Huu Tan Mai,2024,CoRR,https://doi.org/10.48550/ARXIV.2407.19998
8,Expanding the Scope: Inductive Knowledge Graph...,Wei Hu,2024,CoRR,https://doi.org/10.48550/ARXIV.2407.10430
9,Expanding the Scope: Inductive Knowledge Graph...,Yuanning Cui,2024,CoRR,https://doi.org/10.48550/ARXIV.2407.10430


## Step 6: Map Publications to Markdown Files and XML Files
Map each publication URI to its corresponding `.md` and `.xml` file for chunking.

In [6]:
query = """
PREFIX dblp: <https://dblp.org/rdf/schema#>
PREFIX datacite: <http://purl.org/spar/datacite/>

SELECT ?publication
WHERE {    
    ?publication a dblp:Publication .
}
"""
df = sparql_query(g, query)
df

Unnamed: 0,publication
0,https://dblp.org/rec/journals/corr/abs-2407-18752
1,https://dblp.org/rec/journals/corr/abs-2407-10430
2,https://dblp.org/rec/journals/corr/abs-2407-19998
3,https://dblp.org/rec/journals/corr/abs-2407-16127
4,https://dblp.org/rec/journals/corr/abs-2401-07237


In [7]:
file_dict = {
    row['publication']: {
        'md': "dataset/md/" + row['publication'].split('/')[-1] + ".md",
        'gorbid': "dataset/gorbid/" + row['publication'].split('/')[-1] + ".xml"
    }
    for _, row in df.iterrows()
}

file_dict

{rdflib.term.URIRef('https://dblp.org/rec/journals/corr/abs-2407-18752'): {'md': 'dataset/md/abs-2407-18752.md',
  'gorbid': 'dataset/gorbid/abs-2407-18752.xml'},
 rdflib.term.URIRef('https://dblp.org/rec/journals/corr/abs-2407-10430'): {'md': 'dataset/md/abs-2407-10430.md',
  'gorbid': 'dataset/gorbid/abs-2407-10430.xml'},
 rdflib.term.URIRef('https://dblp.org/rec/journals/corr/abs-2407-19998'): {'md': 'dataset/md/abs-2407-19998.md',
  'gorbid': 'dataset/gorbid/abs-2407-19998.xml'},
 rdflib.term.URIRef('https://dblp.org/rec/journals/corr/abs-2407-16127'): {'md': 'dataset/md/abs-2407-16127.md',
  'gorbid': 'dataset/gorbid/abs-2407-16127.xml'},
 rdflib.term.URIRef('https://dblp.org/rec/journals/corr/abs-2401-07237'): {'md': 'dataset/md/abs-2401-07237.md',
  'gorbid': 'dataset/gorbid/abs-2401-07237.xml'}}

## Step 7: Split, Add Chunks, and Save Updated RDF Graph
Use `langchain-text-splitters` to split content into chunks. For each paper, add each chunk as a node linked with `ex:hasChunk`, and sequentially link chunks with `ex:next`. Serialize the updated graph with chunk relationships into `iswc-papers-with-chunks.ttl`.


In [8]:
from rdflib import Graph, URIRef, Literal, Namespace
from langchain_text_splitters import RecursiveCharacterTextSplitter

g = Graph()
g.parse("iswc-papers.ttl", format="turtle")

# # Define namespaces for the custom relationship
SCHEMA = Namespace("http://schema.org/")
EX = Namespace("http://example.org/")  # Custom namespace
g.bind("ex", EX)
g.bind("schema", SCHEMA)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = False,
)

for paper_uri, file in file_dict.items():
    with open(file["md"], encoding="utf-8") as f:
        content = f.read()
    texts = text_splitter.create_documents([content])    
    
    previous_chunk_uri = None
    # Add each chunk as a unique node and relate it to the paper
    for idx, chunk in enumerate(texts):
        # Relate chunk to paper
        chunk_content = chunk.page_content
        chunk_uri = URIRef(f"{paper_uri}/chunk{idx+1}")
                
        g.add((chunk_uri, SCHEMA.text, Literal(chunk_content)))
        g.add((chunk_uri, SCHEMA.isPartOf, paper_uri))

        # Add a "next" relationship to link sequential chunks
        if previous_chunk_uri:            
            g.add((previous_chunk_uri, EX.next, chunk_uri))

        previous_chunk_uri = chunk_uri

g.serialize("iswc-papers.ttl", format="turtle")
print("Updated TTL file with chunks created: iswc-papers.ttl")


Updated TTL file with chunks created: iswc-papers.ttl


## Step 8: Verify Chunk Relationships
Query the graph to verify the chunks and their contents.

In [9]:
query = """
PREFIX schema: <http://schema.org/>

SELECT ?paper ?chunk ?content
WHERE {
    ?chunk schema:isPartOf ?paper .
    ?chunk schema:text ?content .
}
"""
df = sparql_query(g, query)
df

Unnamed: 0,paper,chunk,content
0,https://dblp.org/rec/journals/corr/abs-2407-18752,https://dblp.org/rec/journals/corr/abs-2407-18...,# Knowledge Graph Structure as Prompt: Improvi...
1,https://dblp.org/rec/journals/corr/abs-2407-18752,https://dblp.org/rec/journals/corr/abs-2407-18...,**Keywords:** causal relation · language model...
2,https://dblp.org/rec/journals/corr/abs-2407-18752,https://dblp.org/rec/journals/corr/abs-2407-18...,| smoking |\n|----------|\n| genetics |\n| ...
3,https://dblp.org/rec/journals/corr/abs-2407-18752,https://dblp.org/rec/journals/corr/abs-2407-18...,"In this paper, we investigate the capabilities..."
4,https://dblp.org/rec/journals/corr/abs-2407-18752,https://dblp.org/rec/journals/corr/abs-2407-18...,"domain datasets, and further evaluate the perf..."
...,...,...,...
176,https://dblp.org/rec/journals/corr/abs-2401-07237,https://dblp.org/rec/journals/corr/abs-2401-07...,(Jul 2022). https://doi.org/10.18653/v1/2022.f...
177,https://dblp.org/rec/journals/corr/abs-2401-07237,https://dblp.org/rec/journals/corr/abs-2401-07...,"f""Question: what usually happens after {target..."
178,https://dblp.org/rec/journals/corr/abs-2401-07237,https://dblp.org/rec/journals/corr/abs-2401-07...,return prompt\n```\n\nListing 1.2. Iterative I...
179,https://dblp.org/rec/journals/corr/abs-2401-07237,https://dblp.org/rec/journals/corr/abs-2401-07...,[Submit]\n\nFig. 3. Annotation interface for c...


## Step 9: Enrich KG using Gorbid

In [10]:
from rdflib import Graph, URIRef, Literal, Namespace
import xml.etree.ElementTree as ET

# Initialize the graph and load the existing TTL file with chunks
g = Graph()
g.parse("iswc-papers-with-chunks.ttl", format="turtle")

SCHEMA = Namespace("http://schema.org/")
g.bind("schema", SCHEMA)

BIBO = Namespace("http://purl.org/ontology/bibo/")
DCTERMS = Namespace("http://purl.org/dc/terms/")
g.bind("bibo", BIBO)
g.bind("dcterms", DCTERMS)

# Loop through each paper URI in file_dict and add abstract and keywords
for paper_uri, file in file_dict.items():
    # Process XML file to extract abstract and keywords
    file_path = file["gorbid"]  # Path to the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Define the TEI namespace
    namespace = {'tei': 'http://www.tei-c.org/ns/1.0'}

    # Extract and add abstract
    abstract = root.find('.//tei:abstract/tei:p', namespace)
    abstract_text = abstract.text if abstract is not None else "No abstract found."    
    g.add((paper_uri, SCHEMA.abstract, Literal(abstract_text)))

    # Extract and add keywords
    keywords_elements = root.findall('.//tei:keywords/tei:term', namespace)
    keywords = [term.text for term in keywords_elements if term is not None]
    for keyword in keywords:        
        g.add((paper_uri, SCHEMA.keywords, Literal(keyword)))

# Serialize the updated graph to a new TTL file
g.serialize("iswc-papers.ttl", format="turtle")
print("Updated TTL file with abstract and keywords added: iswc-papers.ttl")


Updated TTL file with abstract and keywords added: iswc-papers.ttl


In [11]:
query = """
PREFIX schema: <http://schema.org/>

SELECT ?paper ?abstract (GROUP_CONCAT(?keyword; separator=", ") AS ?keywords)
WHERE {
    ?paper schema:abstract ?abstract .
    OPTIONAL { ?paper schema:keywords ?keyword }
}
GROUP BY ?paper ?abstract
ORDER BY ?paper
"""
df = sparql_query(g, query)
df

Unnamed: 0,paper,abstract,keywords
0,https://dblp.org/rec/journals/corr/abs-2401-07237,Event sequence models have been found to be hi...,"Knowledge Graphs, Large Language Models, Knowl..."
1,https://dblp.org/rec/journals/corr/abs-2407-10430,Knowledge graphs (KGs) are widely acknowledged...,"Knowledge graphs, Inductive reasoning, Conditi..."
2,https://dblp.org/rec/journals/corr/abs-2407-16127,Traditional knowledge graph (KG) completion mo...,"Knowledge graph completion, Large language mod..."
3,https://dblp.org/rec/journals/corr/abs-2407-18752,Causal discovery aims to estimate causal struc...,"causal relation, language model, knowledge graph"
4,https://dblp.org/rec/journals/corr/abs-2407-19998,Large Language Models (LLMs) have demonstrated...,"ontology learning, LLMs, domain adaptation"
