In [130]:
pip install arxiv langchain langchain_experimental langchain_openai tiktoken intersystems-irispython sqlalchemy-iris sentence_transformers gradio


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting gradio
  Downloading gradio-5.46.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Using cached aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting audioop-lts<1.0 (from gradio)
  Downloading audioop_lts-0.2.2-cp313-abi3-macosx_11_0_arm64.whl.metadata (2.0 kB)
Collecting brotli>=1.1.0 (from gradio)
  Downloading Brotli-1.1.0-cp313-cp313-macosx_10_13_universal2.whl.metadata (5.5 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.117.1-py3-none-any.whl.metadata (28 kB)
Collecting ffmpy (from gradio)
  Using cached ffmpy-0.6.1-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.13.1 (from gradio)
  Downloading gradio_client-1.13.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Using cached groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Using cached pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from 

In [4]:
import arxiv
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
import ast

In [53]:
# Search any topic: replace the keywords below with your own query.
search_query = (
    "GraphRAG OR RAG OR 'knowledge graph' OR 'graph-based retrieval' OR 'graph reasoning' "
)
max_results = 50

# Fetch papers from arXiv, ordered by relevance to the query.
client = arxiv.Client()
search = arxiv.Search(
    query=search_query, max_results=max_results, sort_by=arxiv.SortCriterion.Relevance
)

# One plain dict per paper; the same dicts are reused as chunk metadata below.
docs = []
for result in client.results(search):
    docs.append(
        {
            "title": result.title,
            "abstract": result.summary,
            "url": result.entry_id,
            "published": result.published.date().isoformat(),
            "authors": result.authors,
        }
    )

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=2000, chunk_overlap=50
)

# Text handed to the splitter: abstract, title, then the author names.
# Each author is rendered with str() (the same way the preview loop below prints them)
# and separated from the title; previously str() of the whole list was glued straight
# onto the title with an empty separator, so the chunk carried the list's repr.
chunk_texts = []
for doc in docs:
    author_names = ", ".join(str(author) for author in doc["authors"])
    chunk_texts.append(f"{doc['abstract']} {doc['title']} Authors: {author_names}")

doc_splits = text_splitter.create_documents(chunk_texts, metadatas=docs)

# Preview the first few papers as a sanity check.
docs_to_print = docs[:3]

print(f"Number of papers: {len(docs)}")
print(f"Number of chunks: {len(doc_splits)}")
for i, doc in enumerate(docs_to_print, start=1):
    authors_str = ", ".join([str(author) for author in doc['authors']])
    print(f"Paper {i}:")
    print(f"Title: {doc['title']}")
    print(f"Abstract: {doc['abstract']}")
    print(f"URL: {doc['url']}")
    print(f"Published: {doc['published']}")

    print(f"Authors: {authors_str}")
    print("-" * 50)

   

In [58]:
import re

def _as_author_list(entry):
    """Parse a string-encoded author list back into a Python object; pass anything else through."""
    return ast.literal_eval(str(entry)) if isinstance(entry, str) else entry


# Tabulate the fetched papers and number them in fetch order.
df = pd.DataFrame(docs)
df['docid'] = range(len(df))

# Only string cells are parsed; cells that already hold a non-string value are left as they are.
df['authors'] = df['authors'].map(_as_author_list)

# Remove special characters from author names (the names are later joined with commas)
def clean_author_name(name):
    """Strip everything except letters and whitespace from an author name.

    Args:
        name: The author name. Non-string values are not cleaned, only converted.

    Returns:
        For a string: the name with punctuation, digits and underscores removed and
        surrounding whitespace trimmed. For anything else: ``str(name)``.
    """
    if not isinstance(name, str):
        return str(name)
    # [^\w\s] drops punctuation/symbols; [\d_] drops the digits and underscore that \w
    # would otherwise keep. Unlike an ASCII-only [^a-zA-Z\s] class this preserves
    # accented and other non-ASCII letters, so such names are no longer mangled.
    cleaned_name = re.sub(r"[^\w\s]|[\d_]", "", name)
    return cleaned_name.strip()  # Ensure no leading/trailing spaces

# Flatten each author list into a single comma-separated string of cleaned names.
df['authors'] = df['authors'].map(
    lambda author_list: ", ".join(clean_author_name(str(person)) for person in author_list)
)

# Keep only the columns written to the CSV, with docid first.
column_order = ['docid', 'title', 'abstract', 'url', 'published', 'authors']
df = df[column_order]

output_csv_path = "/Users/fji/Projects/iris-global-graphrag/data/papers.csv"
df.to_csv(output_csv_path, header=True, index=False)

In [19]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI
import os
import getpass
from dotenv import load_dotenv

# Load settings from a local .env file; override=True lets its values replace
# anything already present in the process environment.
load_dotenv(override=True)

# Prompt only when no key is available. The key just loaded from .env used to be
# popped unconditionally right here, which made the .env entry useless and forced
# the prompt on every run.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

OpenAI API Key: ········


In [14]:
# Chat model used to extract the graph from each text chunk.
graph_llm = ChatOpenAI(model_name="gpt-5")
# Restrict extraction to the node types, node properties and relationship types
# listed here; the "AUTHORED" / "COVERS" values are the ones the query helpers
# later in this notebook filter on.
graph_transformer = LLMGraphTransformer(
    llm=graph_llm,
    allowed_nodes=["Paper", "Author", "Topic"],
    node_properties=["title", "abstract", "url", "author", "published"],
    allowed_relationships=[ "COVERS", "INCLUDES","RELATED_TO","AUTHORED"],
)

# One graph document is produced per chunk in doc_splits (one LLM round per chunk,
# presumably slow and billable — TODO confirm before re-running).
graph_documents = graph_transformer.convert_to_graph_documents(doc_splits)

print(f"Graph documents: {len(graph_documents)}")
# NOTE(review): indexing [0] raises IndexError when no graph documents came back.
print(f"Nodes from 1st graph doc:{graph_documents[0].nodes}")

Graph documents: 50
Nodes from 1st graph doc:[Node(id='When To Use Graphs In Rag: A Comprehensive Analysis For Graph Retrieval-Augmented Generation', type='Paper', properties={'title': 'When to use Graphs in RAG: A Comprehensive Analysis for Graph Retrieval-Augmented Generation', 'abstract': 'Graph retrieval-augmented generation (GraphRAG) has emerged as a powerful paradigm for enhancing large language models (LLMs) with external knowledge. It leverages graphs to model the hierarchical structure between specific concepts, enabling more coherent and effective knowledge retrieval for accurate reasoning. Despite its conceptual promise, recent studies report that GraphRAG frequently underperforms vanilla RAG on many real-world tasks. This raises a critical question: Is GraphRAG really effective, and in which scenarios do graph structures provide measurable benefits for RAG systems? To address this, we propose GraphRAG-Bench, a comprehensive benchmark designed to evaluate GraphRAG models on

In [20]:
def canonicalize_authored(graph_documents):
    """Make every AUTHORED relationship point from the Author to the Paper.

    Relationships whose type is "AUTHORED" (compared case-insensitively) and that run
    Paper -> Author have their endpoints swapped in place. All other relationships are
    left untouched.

    Args:
        graph_documents: Iterable of objects with an optional ``relationships``
            attribute (missing or falsy is treated as no relationships).

    Returns:
        The same ``graph_documents`` object that was passed in.
    """
    for graph_doc in graph_documents:
        relationships = getattr(graph_doc, "relationships", None) or []
        for edge in relationships:
            edge_type = getattr(edge, "type", None) or ""
            if edge_type.upper() == "AUTHORED":
                head, tail = edge.source, edge.target
                endpoint_kinds = (getattr(head, "type", None), getattr(tail, "type", None))
                if endpoint_kinds == ("Paper", "Author"):
                    # reversed direction: flip so it reads Author -> Paper
                    edge.source, edge.target = tail, head
    return graph_documents

# Flip any Paper -> Author "AUTHORED" edges so they all read Author -> Paper
# (the relationships are modified in place and the same list is returned).
graph_documents = canonicalize_authored(graph_documents)

In [21]:
import csv

data_path = '/Users/fji/Projects/iris-global-graphrag/data/'
filename = f"{data_path}entities.csv"

# newline='' leaves line endings to the csv module (avoids extra blank lines).
with open(filename, "w", newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["docid", "entityid", "type"])  # header row

    # The position of the graph document in the list is written as the docid.
    for i, doc in enumerate(graph_documents):
        node_list = getattr(doc, 'nodes', None)
        if not isinstance(node_list, list):
            continue  # no usable node list on this document
        for node in node_list:
            try:
                # Nodes missing either attribute are skipped without a message.
                if hasattr(node, 'id') and hasattr(node, 'type'):
                    writer.writerow([i, node.id, node.type])
            except UnicodeEncodeError:
                # Skip rows containing characters that cannot be encoded.
                continue

In [22]:
data_path = '/Users/fji/Projects/iris-global-graphrag/data/'
filename = f"{data_path}relations.csv"

# newline='' leaves line endings to the csv module (avoids extra blank lines).
with open(filename, "w", newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["docid", "source", "sourcetype", "target", "targettype", "type"])  # header row

    # The position of the graph document in the list is written as the docid.
    for i, doc in enumerate(graph_documents):
        relation_list = getattr(doc, 'relationships', None)
        if not isinstance(relation_list, list):
            continue  # nothing to export for this document
        for relation in relation_list:
            try:
                source = relation.source
                target = relation.target

                # Both endpoints need an id and a type to form a complete row.
                endpoints_complete = all(
                    hasattr(endpoint, attribute)
                    for endpoint in (source, target)
                    for attribute in ('id', 'type')
                )
                if not endpoints_complete:
                    print(f"  Missing attributes in relation: {relation}")
                    continue
                writer.writerow([i, source.id, source.type, target.id, target.type, relation.type])
            except Exception as e:
                # Report the problem and carry on with the next relationship.
                print(f"  Error processing relation: {e}")
                continue

## Setup IRIS

In [214]:
import iris
from sqlalchemy import create_engine,text

import os
from urllib.parse import quote_plus

# Connection settings. The defaults are the values this notebook has always used;
# each one can be overridden through an environment variable so real credentials
# do not have to be written into the notebook.
args = {
    'hostname': os.environ.get("IRIS_HOSTNAME", 'localhost'),
    'port': int(os.environ.get("IRIS_PORT", 1972)),
    'namespace': os.environ.get("IRIS_NAMESPACE", 'USER'),
    'username': os.environ.get("IRIS_USERNAME", '_SYSTEM'),
    'password': os.environ.get("IRIS_PASSWORD", 'SYS'),
    'logfile': 'log.txt',
}
# Native connection: used for direct global access through `irispy` below.
conn = iris.connect(**args)

# SQLAlchemy engine for the SQL / vector-search cells. The URL is derived from the
# same settings so the two connections cannot drift apart; user and password are
# URL-quoted so special characters do not break the URL.
url = (
    f"iris://{quote_plus(args['username'])}:{quote_plus(args['password'])}"
    f"@{args['hostname']}:{args['port']}/{args['namespace']}"
)
engine = create_engine(url)

irispy = iris.createIRIS(conn)

In [63]:
# irispy.kill("GraphRelations")

In [206]:
# Copy every paper's metadata into the GraphContent global, one node per
# (docid, field name). Values are read straight from the row instead of through
# per-field temporaries, which used to rebind module-level names such as `url`
# (the engine URL defined in the connection cell).
with open("/Users/fji/Projects/iris-global-graphrag/data/papers.csv", newline='', encoding='utf-8-sig') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        for field in ("title", "abstract", "url", "published", "authors"):
            irispy.set(row[field], "GraphContent", row['docid'], field)


In [207]:
# Load the exported relationships into the GraphRelations global:
#   (docid, "Node", name)           -> node type
#   (docid, "Edge", source, target) -> relationship type
# relations.csv is written as UTF-8 earlier in this notebook, so it is read back
# with the same encoding rather than the platform default, which could mis-decode
# non-ASCII entity names.
with open("/Users/fji/Projects/iris-global-graphrag/data/relations.csv", newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        docid = row['docid']
        source = row['source']
        source_type = row['sourcetype']
        target = row['target']
        target_type = row['targettype']
        relation = row['type']

        irispy.set(source_type, "GraphRelations", docid, "Node", source)
        irispy.set(target_type, "GraphRelations", docid, "Node", target)
        irispy.set(relation, "GraphRelations", docid, "Edge", source, target)

                   

In [208]:

# Reload the exported papers from disk; this rebinds `df` to the CSV contents.
df = pd.read_csv("/Users/fji/Projects/iris-global-graphrag/data/papers.csv")

# Preview the first three rows.
df.head(3)

Unnamed: 0,docid,title,abstract,url,published,authors
0,0,When to use Graphs in RAG: A Comprehensive Ana...,Graph retrieval-augmented generation (GraphRAG...,http://arxiv.org/abs/2506.05690v1,2025-06-06,"Zhishang Xiang, Chuanjie Wu, Qinggang Zhang, S..."
1,1,"Benchmarking Vector, Graph and Hybrid Retrieva...",Generative AI (GenAI) is expected to play a pi...,http://arxiv.org/abs/2507.03608v2,2025-07-04,"Sarat Ahmad, Zeinab Nezami, Maryam Hafeez, Sye..."
2,2,Graph-R1: Towards Agentic GraphRAG Framework v...,Retrieval-Augmented Generation (RAG) mitigates...,http://arxiv.org/abs/2507.21892v1,2025-07-29,"Haoran Luo, Haihong E, Guanting Chen, Qika Lin..."


In [209]:
def _describe_paper(record):
    """Render one paper row as a single 'field: value | field: value' line."""
    field_names = ("docid", "title", "abstract", "url", "published", "authors")
    return " | ".join(f"{name}: {record[name]}" for name in field_names)


# One text blob per paper; this is the text that gets embedded further down.
df["combined"] = df.apply(_describe_paper, axis=1)
df.head(3)

Unnamed: 0,docid,title,abstract,url,published,authors,combined
0,0,When to use Graphs in RAG: A Comprehensive Ana...,Graph retrieval-augmented generation (GraphRAG...,http://arxiv.org/abs/2506.05690v1,2025-06-06,"Zhishang Xiang, Chuanjie Wu, Qinggang Zhang, S...",docid: 0 | title: When to use Graphs in RAG: A...
1,1,"Benchmarking Vector, Graph and Hybrid Retrieva...",Generative AI (GenAI) is expected to play a pi...,http://arxiv.org/abs/2507.03608v2,2025-07-04,"Sarat Ahmad, Zeinab Nezami, Maryam Hafeez, Sye...","docid: 1 | title: Benchmarking Vector, Graph a..."
2,2,Graph-R1: Towards Agentic GraphRAG Framework v...,Retrieval-Augmented Generation (RAG) mitigates...,http://arxiv.org/abs/2507.21892v1,2025-07-29,"Haoran Luo, Haihong E, Guanting Chen, Qika Lin...",docid: 2 | title: Graph-R1: Towards Agentic Gr...


In [210]:
# Save the frame, including the new `combined` column, without the index.
# utf-8-sig writes a byte-order mark at the start of the file.
df.to_csv("/Users/fji/Projects/iris-global-graphrag/data/papers_combined.csv", 
          index=False, encoding="utf-8-sig")

In [200]:
# Start from a clean slate. IF EXISTS keeps a first run (when the table has not
# been created yet) from failing, so the notebook survives Restart & Run All.
with engine.connect() as conn:
    with conn.begin():
        sql = """
                DROP TABLE IF EXISTS paper_content
                """
        result = conn.execute(text(sql))

In [201]:
# Create the table that holds one row per paper plus its embedding.
with engine.connect() as conn:
    with conn.begin():# run the DDL inside a transaction
        # NOTE(review): title and authors are capped at 255 characters and abstract at
        # 2000 — confirm long author lists / abstracts fit before inserting.
        # paper_vector is declared with 384 dimensions, so the embedding model used
        # later must produce vectors of that length — TODO confirm.
        sql = f"""
                CREATE TABLE IF NOT EXISTS paper_content (
                    docid VARCHAR(255),
                    title VARCHAR(255),
                    abstract VARCHAR(2000),
                    url VARCHAR(255),
                    published VARCHAR(255),
                    authors VARCHAR(255),
                    combined VARCHAR(10000),
                    paper_vector VECTOR(FLOAT, 384)

                )
                """
        result = conn.execute(text(sql))

In [202]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for both the papers and, later, the questions.
emb_model = SentenceTransformer('all-MiniLM-L6-v2') 
# Embed the `combined` text of every paper; normalize_embeddings=True is requested
# so the vectors are normalised (the searches below rank by dot product).
embeddings = emb_model.encode(df['combined'].tolist(), normalize_embeddings=True)

# Add the embeddings to the DataFrame (one Python list of floats per row)
df['paper_vector'] = embeddings.tolist()

df.head()

Unnamed: 0,docid,title,abstract,url,published,authors,combined,paper_vector
0,0,When to use Graphs in RAG: A Comprehensive Ana...,Graph retrieval-augmented generation (GraphRAG...,http://arxiv.org/abs/2506.05690v1,2025-06-06,"Zhishang Xiang, Chuanjie Wu, Qinggang Zhang, S...",docid: 0 | title: When to use Graphs in RAG: A...,"[-0.05978081375360489, 0.006263501010835171, 0..."
1,1,"Benchmarking Vector, Graph and Hybrid Retrieva...",Generative AI (GenAI) is expected to play a pi...,http://arxiv.org/abs/2507.03608v2,2025-07-04,"Sarat Ahmad, Zeinab Nezami, Maryam Hafeez, Sye...","docid: 1 | title: Benchmarking Vector, Graph a...","[-0.08074361085891724, -0.01096606906503439, -..."
2,2,Graph-R1: Towards Agentic GraphRAG Framework v...,Retrieval-Augmented Generation (RAG) mitigates...,http://arxiv.org/abs/2507.21892v1,2025-07-29,"Haoran Luo, Haihong E, Guanting Chen, Qika Lin...",docid: 2 | title: Graph-R1: Towards Agentic Gr...,"[-0.04313172027468681, 0.05216117948293686, -0..."
3,3,RAG vs. GraphRAG: A Systematic Evaluation and ...,Retrieval-Augmented Generation (RAG) enhances ...,http://arxiv.org/abs/2502.11371v1,2025-02-17,"Haoyu Han, Harry Shomer, Yu Wang, Yongjia Lei,...",docid: 3 | title: RAG vs. GraphRAG: A Systemat...,"[-0.06944005936384201, 0.07515047490596771, 0...."
4,4,Empowering GraphRAG with Knowledge Filtering a...,"In recent years, large language models (LLMs) ...",http://arxiv.org/abs/2503.13804v1,2025-03-18,"Kai Guo, Harry Shomer, Shenglai Zeng, Haoyu Ha...",docid: 4 | title: Empowering GraphRAG with Kno...,"[-0.052577532827854156, 0.01960759237408638, -..."


In [203]:
# The INSERT statement is identical for every row, so it is built once up front.
sql = text("""
                INSERT INTO paper_content 
                (docid, title, abstract, url, published, authors, combined, paper_vector) 
                VALUES (:docid, :title, :abstract, :url, :published, :authors, :combined, TO_VECTOR(:paper_vector))
            """)
plain_columns = ('docid', 'title', 'abstract', 'url', 'published', 'authors', 'combined')

# Insert all papers inside a single transaction.
with engine.connect() as conn:
    with conn.begin():
        for index, row in df.iterrows():
            params = {column: row[column] for column in plain_columns}
            # The embedding is passed as its string form and converted by TO_VECTOR().
            params['paper_vector'] = str(row['paper_vector'])
            conn.execute(sql, params)

In [204]:
# Build an HNSW index over the embedding column. The DotProduct distance matches
# the VECTOR_DOT_PRODUCT ordering used by the search functions in this notebook.
with engine.connect() as conn:
    with conn.begin():# run the DDL inside a transaction
        sql = f"""
               CREATE INDEX HNSWIndex ON TABLE paper_content (paper_vector) AS HNSW(Distance='DotProduct')
                """
        result = conn.execute(text(sql))

In [176]:
from sqlalchemy import text

def search_papers(engine, search_vector, top_k):
    """Return the `combined` text of the papers most similar to a query vector.

    Args:
        engine: SQLAlchemy engine connected to the database holding `paper_content`.
        search_vector: Query embedding; its ``str()`` form is passed to TO_VECTOR().
        top_k: Number of rows to return. Must be convertible to ``int``.

    Returns:
        A list of `combined` strings, highest dot product first.

    Raises:
        ValueError / TypeError: If ``top_k`` cannot be converted to an integer.
    """
    # top_k is formatted into the SQL text, so force it to an int first: nothing
    # other than a number can then end up inside the statement.
    row_limit = int(top_k)
    sql = text(f"""
        SELECT TOP {row_limit} combined
        FROM paper_content
        ORDER BY VECTOR_DOT_PRODUCT(paper_vector, TO_VECTOR(:search_vector)) DESC
    """)
    with engine.connect() as conn:
        with conn.begin():
            rows = conn.execute(sql, {"search_vector": str(search_vector)}).fetchall()

    # Flatten 1-element rows into a list of strings
    return [r[0] for r in rows]

# test

# search_query = "who has written the most paper"
# search_vector = emb_model.encode(search_query, normalize_embeddings=True).tolist() # Convert search phrase into a vector
# results = search_papers(engine,search_vector,5)
# print(results)

In [85]:
from openai import OpenAI

# Chat model name used by the answer helpers and the agent below.
model = "gpt-4o-mini"
# NOTE(review): this rebinds `client`, which earlier in the notebook held the arXiv client.
client = OpenAI()

def send_to_llm(model, messages, **kwargs):
    """Send a chat-completion request through the module-level OpenAI ``client``.

    Args:
        model: Name of the chat model to use.
        messages: List of chat messages (dicts with "role" and "content").
        **kwargs: Extra keyword arguments forwarded to
            ``client.chat.completions.create`` (for example ``tools``).

    Returns:
        The completion object returned by the client.
    """
    # temperature defaults to 0 but may be overridden by the caller; passing it
    # alongside a hard-coded temperature=0 used to raise a duplicate-keyword TypeError.
    kwargs.setdefault("temperature", 0)
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        **kwargs
    )
    return completion


## RAG

In [87]:
def llm_answer_rag(batch, query, cutoff=True):
    """Ask the LLM to answer ``query`` using the retrieved context in ``batch``.

    Args:
        batch: Retrieved context; it is inserted into the prompt via ``str.format``.
        query: The user's question.
        cutoff: When true, the prompt asks for at most three concise sentences.

    Returns:
        The non-empty lines of the model's reply, each stripped of surrounding whitespace.
    """
    length_rule = "Use three sentences maximum and keep the answer concise." if cutoff else " "
    template = (
        "You are an expert assistant for graph-based academic search. \n"
        "    You are given a graph context of academic papers including authors, abstracts, published date.\n"
        "    Use the following pieces of retrieved context from the database to answer the question.\n"
        "    "
        + length_rule
        + "\n"
        "    Question: {question}  \n"
        "    Context: {context}\n"
        "    Answer:\n"
        "    "
    )
    prompt = template.format(question=query, context=batch)

    # The whole prompt is sent as a single user message.
    completion = send_to_llm(model, [{"role": "user", "content": prompt}])
    response = completion.choices[0].message.content

    stripped_lines = (line.strip() for line in response.split('\n'))
    return [line for line in stripped_lines if line]

In [93]:
def ask_question_rag(query: str, engine, emb_model, top_k: int = 5):
    """Answer a question with plain vector-search RAG.

    The query is embedded, the ``top_k`` most similar papers are fetched with
    ``search_papers``, and their text is given to ``llm_answer_rag``.

    Args:
        query: The user's question.
        engine: SQLAlchemy engine passed on to ``search_papers``.
        emb_model: Embedding model exposing ``encode(..., normalize_embeddings=True)``.
        top_k: Number of papers to retrieve.

    Returns:
        The answer as one string (answer lines joined with single spaces).
    """
    query_embedding = emb_model.encode(query, normalize_embeddings=True).tolist()
    context_chunks = search_papers(engine, query_embedding, top_k)
    answer = llm_answer_rag(context_chunks, query, True)
    # llm_answer_rag returns a list of lines; collapse it into one string.
    return " ".join(answer) if isinstance(answer, list) else answer



In [94]:
# Try plain RAG; note the question asked here is the arXiv search string itself.
response = ask_question_rag(search_query, engine, emb_model, top_k=5)
print(response)

GraphRAG is a framework that enhances retrieval-augmented generation (RAG) by leveraging the structural information of graphs to improve the accuracy and context of responses generated by large language models (LLMs). It addresses challenges such as knowledge gaps and hallucinations by integrating structured knowledge from external graphs. Recent studies propose various enhancements and modular frameworks to optimize GraphRAG's performance across different domains.


## GraphRAG

In [96]:
def get_graph_for_doc(doc_id: int, iris_handle, global_name="^GraphRelations"):
    """Read the nodes and edges stored for one document.

    Args:
        doc_id: Document id used as the first subscript of the global.
        iris_handle: Object exposing ``iterator(global, *subscripts)`` that yields
            (subscript, value) pairs — presumably the IRIS handle created in the
            connection cell.
        global_name: Name of the global holding the graph.

    Returns:
        ``{"doc_id": doc_id, "nodes": [...], "edges": [...]}`` where each node is
        ``{"name", "type"}`` and each edge is ``{"source", "target", "relation"}``.
    """
    # (doc_id, "Node", name) -> node type
    node_records = [
        {"name": node_name, "type": node_kind}
        for node_name, node_kind in iris_handle.iterator(global_name, doc_id, "Node")
    ]
    # (doc_id, "Edge", source, target) -> relation; walk sources, then their targets
    edge_records = [
        {"source": origin, "target": destination, "relation": label}
        for origin, _ in iris_handle.iterator(global_name, doc_id, "Edge")
        for destination, label in iris_handle.iterator(global_name, doc_id, "Edge", origin)
    ]
    return {"doc_id": doc_id, "nodes": node_records, "edges": edge_records}

# # test
# graph = get_graph_for_doc(0, irispy)

# print("Doc ID:", graph["doc_id"])
# print("\nNodes:")
# for n in graph["nodes"]:
#     print(f"  - {n['name']} ({n['type']})")

# print("\nEdges:")
# for e in graph["edges"]:
#     print(f"  - {e['source']} --[{e['relation']}]-> {e['target']}")

In [97]:

def search_papers_id(engine, search_vector, top_k):
    """Return the docids of the papers most similar to a query vector.

    Args:
        engine: SQLAlchemy engine connected to the database holding `paper_content`.
        search_vector: Query embedding; its ``str()`` form is passed to TO_VECTOR().
        top_k: Number of rows to return. Must be convertible to ``int``.

    Returns:
        A list of docid values, highest dot product first.

    Raises:
        ValueError / TypeError: If ``top_k`` cannot be converted to an integer.
    """
    # top_k is formatted into the SQL text, so force it to an int first: nothing
    # other than a number can then end up inside the statement.
    row_limit = int(top_k)
    sql = text(f"""
         SELECT TOP {row_limit} docid FROM paper_content ORDER BY VECTOR_DOT_PRODUCT(paper_vector, TO_VECTOR(:search_vector)) DESC
    """)
    with engine.connect() as conn:
        with conn.begin():
            resultsID = conn.execute(sql, {"search_vector": str(search_vector)}).fetchall()
    # Each row holds a single column: the docid.
    results = [row[0] for row in resultsID]
    return results
    

# # # test
# search_query = "What is Knowledge graph"
# search_vector = emb_model.encode(search_query, normalize_embeddings=True).tolist() 
# results = search_papers_id(engine, search_vector,5)
# print(results)

In [98]:
def get_graphs_for_docs(doc_ids, iris_handle, global_name="^GraphRelations"):
    """Fetch the stored graph of every document in ``doc_ids``.

    Args:
        doc_ids: Iterable of document ids; each is converted with ``int()``.
        iris_handle: Handle passed through to ``get_graph_for_doc``.
        global_name: Name of the global holding the graphs.

    Returns:
        A list with one ``get_graph_for_doc`` result per id, in input order.
    """
    # ids may arrive as strings (e.g. from SQL), so normalise them to int first
    return [
        get_graph_for_doc(int(raw_id), iris_handle, global_name)
        for raw_id in doc_ids
    ]

 
# Smoke test of the two lookups above.
# search_vector was previously only assigned inside commented-out test code, so this
# cell raised NameError on a fresh kernel; it is now computed here from search_query.
search_vector = emb_model.encode(search_query, normalize_embeddings=True).tolist()
doc_ids = search_papers_id(engine, search_vector, top_k=5)
# #test
# print("doc_ids:", doc_ids)

graphs = get_graphs_for_docs(doc_ids, irispy)
# #test
# print(graphs[:1]) 
# print(json.dumps(graphs[:2], ensure_ascii=False, indent=2))  

In [99]:
def get_content_for_docs(doc_ids, irispy, global_name="^GraphContent"):
    """Fetch the stored metadata of every document in ``doc_ids``.

    Args:
        doc_ids: Iterable of document ids, used as given for the lookups.
        irispy: Object exposing ``get(global, *subscripts)`` — presumably the IRIS
            handle created in the connection cell.
        global_name: Name of the global holding the paper metadata.

    Returns:
        One dict per id, in input order: ``doc_id`` (as int) plus each of title,
        abstract, authors, published and url whose stored value is not None,
        converted to ``str``.
    """
    field_names = ("title", "abstract", "authors", "published", "url")
    records = []
    for raw_id in doc_ids:
        record = {"doc_id": int(raw_id)}
        lookups = ((name, irispy.get(global_name, raw_id, name)) for name in field_names)
        # fields with nothing stored are simply left out of the record
        record.update((name, str(stored)) for name, stored in lookups if stored is not None)
        records.append(record)
    return records

# #test
# results = get_content_for_docs(doc_ids, irispy)
# print(results[:10])
# print(json.dumps(results, ensure_ascii=False, indent=2))

In [100]:
def llm_answer_graphrag(batch, query, cutoff=True):
    """Ask the LLM to answer ``query`` using the graph context in ``batch``.

    Args:
        batch: Retrieved graph context; it is inserted into the prompt via ``str.format``.
        query: The user's question.
        cutoff: When true, the prompt asks for at most three concise sentences.

    Returns:
        The non-empty lines of the model's reply, each stripped of surrounding whitespace.
    """
    length_rule = "Use three sentences maximum and keep the answer concise." if cutoff else " "
    template = (
        "You are an expert assistant for graph-based academic search. \n"
        "    You are given a graph context of academic papers including authors, abstracts, published date.\n"
        "    Use the following pieces of retrieved context from a graph database to answer the question.\n"
        "    "
        + length_rule
        + "\n"
        "    Question: {question}  \n"
        "    Graph Context: {graph_context}\n"
        "    Answer:\n"
        "    "
    )
    prompt = template.format(question=query, graph_context=batch)

    # The whole prompt is sent as a single user message.
    completion = send_to_llm(model, [{"role": "user", "content": prompt}])
    response = completion.choices[0].message.content

    stripped_lines = (line.strip() for line in response.split('\n'))
    return [line for line in stripped_lines if line]

In [121]:
def prepare_combined_results(query, engine, emb_model, irispy, top_k=5):
    """Retrieve the papers closest to ``query`` together with their stored graphs.

    Args:
        query: The user's question.
        engine: SQLAlchemy engine passed on to ``search_papers_id``.
        emb_model: Embedding model exposing ``encode(..., normalize_embeddings=True)``.
        irispy: Handle passed on to the content and graph lookups.
        top_k: Number of papers to retrieve.

    Returns:
        ``{"papers": <metadata records>, "graphs": <graph records>}`` for the
        retrieved document ids.
    """
    query_embedding = emb_model.encode(query, normalize_embeddings=True).tolist()
    matching_ids = search_papers_id(engine, query_embedding, top_k=top_k)
    paper_records = get_content_for_docs(matching_ids, irispy)
    graph_records = get_graphs_for_docs(matching_ids, irispy)
    return {"papers": paper_records, "graphs": graph_records}

In [122]:
def ask_question_graphrag(query: str, engine, emb_model, irispy, top_k: int = 5):
    """Answer a question with GraphRAG: vector search plus the stored paper graphs.

    Args:
        query: The user's question.
        engine: SQLAlchemy engine used for the vector search.
        emb_model: Embedding model used to embed the query.
        irispy: Handle used to read paper metadata and graphs.
        top_k: Number of papers to retrieve.

    Returns:
        The answer as one string (answer lines joined with single spaces).
    """
    # Actually call the retrieval helper. Previously the function object itself was
    # assigned here, so the prompt received a function repr instead of any context.
    combined_results = prepare_combined_results(query, engine, emb_model, irispy, top_k=top_k)

    response = llm_answer_graphrag(combined_results, query, True)
    if isinstance(response, list):
        response = " ".join(response)
    return response

# #test
# print(doc_ids)
# results = json.dumps(combined_results, ensure_ascii=False, indent=2)
# print(results[:10000])  

In [106]:
# Try GraphRAG on a sample question (default top_k).
response = ask_question_graphrag("what is graphrag", engine, emb_model, irispy)
print(response)

GraphRAG is a framework that combines retrieval-augmented generation (RAG) with graph-structured data to enhance the accuracy and contextual relevance of responses generated by large language models (LLMs). It leverages the relational information in knowledge graphs to improve the precision of information retrieval and generation tasks. Recent studies have proposed modular frameworks and techniques to address challenges in implementing GraphRAG effectively across various domains.


## Agent

In [107]:
def get_papers_by_author(
    author: str,
    iris_handle,
    relations_global="^GraphRelations",
    content_global="^GraphContent",
    include_content=True,
    case_insensitive=True,
):
    """List the papers linked to ``author`` through an AUTHORED edge.

    Every document in ``relations_global`` is scanned for edges whose source equals
    ``author`` and whose relation is "AUTHORED" (relation compared case-insensitively).

    Args:
        author: Author name to look for (exact match, optionally ignoring case).
        iris_handle: Object exposing ``iterator(...)`` and ``get(...)`` over globals.
        relations_global: Global holding the edges.
        content_global: Global holding the paper metadata.
        include_content: When true, add title/abstract/authors/published/url from
            ``content_global``; a stored title replaces the edge-target title.
        case_insensitive: Compare author names ignoring case.

    Returns:
        A list of dicts with ``doc_id``, ``author``, ``title`` and, optionally, the
        metadata fields.
    """
    def _same_name(left, right):
        if case_insensitive:
            return left.lower() == right.lower()
        return left == right

    metadata_fields = ("title", "abstract", "authors", "published", "url")
    matches = []

    for doc_id, _ in iris_handle.iterator(relations_global):
        for src, _ in iris_handle.iterator(relations_global, doc_id, "Edge"):
            if _same_name(str(src), author):
                for dst, rel in iris_handle.iterator(relations_global, doc_id, "Edge", src):
                    if str(rel).upper() == "AUTHORED":
                        record = {
                            "doc_id": int(doc_id),
                            "author": str(src),
                            "title": str(dst),
                        }
                        if include_content:
                            # enrich with whatever metadata is stored for this document
                            for field in metadata_fields:
                                stored = iris_handle.get(content_global, doc_id, field)
                                if stored is not None:
                                    record[field] = str(stored)
                        matches.append(record)

    return matches
#test
# papers = get_papers_by_author("TEST Name", irispy)

# print(json.dumps(papers, ensure_ascii=False, indent=2))

In [108]:
def get_papers_by_topic(
    irispy,
    topic,
    relations_global="^GraphRelations",
    content_global="^GraphContent",
    edge_root="Edge",
    require_value="COVERS",
    case_insensitive=True,
    include_content=True,
):
    """List the edges that point at ``topic``, with the metadata of their document.

    Every document in ``relations_global`` is scanned for edges whose target equals
    ``topic``. The match is exact (optionally ignoring case); there is no fuzzy or
    substring matching.

    Args:
        irispy: Object exposing ``iterator(...)`` and ``get(...)`` over globals.
        topic: Topic name to look for.
        relations_global: Global holding the edges.
        content_global: Global holding the paper metadata.
        edge_root: Subscript under which edges are stored.
        require_value: Required relation value (compared case-insensitively); a
            falsy value accepts any relation.
        case_insensitive: Compare topic names ignoring case.
        include_content: When true, add title/abstract/authors/published/url from
            ``content_global``. This flag used to be ignored and the metadata was
            always attached.

    Returns:
        A list of dicts with ``doc_id``, ``paper``, ``topic``, ``relation`` and,
        optionally, the metadata fields.
    """
    def _eq(a, b):
        return a.lower() == b.lower() if case_insensitive else a == b

    results = []

    # iterate all doc_ids, then each edge source, then each of its targets
    for doc_id, _ in irispy.iterator(relations_global):
        for paper, _ in irispy.iterator(relations_global, doc_id, edge_root):
            for dst_topic, relation in irispy.iterator(relations_global, doc_id, edge_root, paper):
                if not _eq(str(dst_topic), topic):
                    continue
                if require_value and str(relation).upper() != str(require_value).upper():
                    continue

                # start with relation info
                item = {
                    "doc_id": int(doc_id),
                    "paper": str(paper),
                    "topic": str(dst_topic),
                    "relation": str(relation),
                }

                # add paper metadata only when the caller asked for it
                if include_content:
                    for f in ("title", "abstract", "authors", "published", "url"):
                        v = irispy.get(content_global, doc_id, f)
                        if v is not None:
                            item[f] = str(v)

                results.append(item)

    return results


# # ? it can only find exact match
# #test
# hits = get_papers_by_topic(irispy, "Knowledge Graphs")

# print(json.dumps(hits, ensure_ascii=False, indent=2))

In [110]:
def get_top_authors_by_paper_count(
    irispy,
    limit: int = 10,
    relations_global: str = "^GraphRelations",
    content_global: str = "^GraphContent",
    edge_root: str = "Edge",
    authored_value: str = "AUTHORED",
    case_insensitive: bool = True,
    dedup: bool = True,
):
    """Rank authors by the number of authored edges stored in the graph.

    Args:
        irispy: Object exposing ``iterator(...)`` and ``get(...)`` over globals.
        limit: Maximum number of authors to return; values below 1 are treated as 1.
        relations_global: Global holding the edges.
        content_global: Global holding the paper metadata.
        edge_root: Subscript under which edges are stored.
        authored_value: Relation value marking authorship (compared case-insensitively).
        case_insensitive: Merge author names that differ only by case; the first
            spelling encountered is the one reported.
        dedup: Count each (doc_id, paper) pair at most once per author.

    Returns:
        A list of ``{"author", "count", "papers"}`` dicts sorted by descending count,
        then by lower-cased author name. Each paper entry holds ``doc_id``,
        ``paper_node`` and any stored metadata fields.
    """
    def _authored_edges():
        # Lazily walk doc -> edge source -> edge target, keeping authorship edges only.
        for doc_id, _ in irispy.iterator(relations_global):
            for author, _ in irispy.iterator(relations_global, doc_id, edge_root):
                for paper, rel in irispy.iterator(relations_global, doc_id, edge_root, author):
                    if str(rel).upper() == authored_value.upper():
                        yield doc_id, str(author), str(paper)

    metadata_fields = ("title", "abstract", "authors", "published", "url")
    tallies = {}   # author key -> {"author": display name, "count": int, "papers": [...]}
    counted = {}   # author key -> set of (doc_id, paper) pairs already counted

    for doc_id, author_name, paper_name in _authored_edges():
        key = author_name.lower() if case_insensitive else author_name

        if dedup:
            already_counted = counted.setdefault(key, set())
            marker = (int(doc_id), paper_name)
            if marker in already_counted:
                continue
            already_counted.add(marker)

        tally = tallies.setdefault(key, {"author": author_name, "count": 0, "papers": []})
        tally["count"] += 1

        # paper_node is the name used in the edge subscript; the rest comes from the
        # content global when present
        paper_info = {"doc_id": int(doc_id), "paper_node": paper_name}
        for field in metadata_fields:
            stored = irispy.get(content_global, doc_id, field)
            if stored is not None:
                paper_info[field] = str(stored)
        tally["papers"].append(paper_info)

    ranked = sorted(tallies.values(), key=lambda entry: (-entry["count"], entry["author"].lower()))
    return ranked[:max(1, int(limit))]
# #test
# top = get_top_authors_by_paper_count(irispy, limit=5)
# import json
# print(json.dumps(top, ensure_ascii=False, indent=2))

In [111]:
# Tool (function-calling) schemas offered to the chat model. Each "name" must match
# one of the dispatch branches in run_agent's call_tools, and the parameter names
# match the keyword arguments of the corresponding Python helper.
tools = [
    {
        # Ranking of authors by number of authored papers.
        "type": "function",
        "function": {
            "name": "get_top_authors_by_paper_count",
            "description": "Return the top authors ranked by number of authored papers.",
            "parameters": {
                "type": "object",
                "properties": {
                    "limit": {"type": "integer", "default": 5, "minimum": 1, "maximum": 100}
                },
                "required": []
            }
        },
    },
    {
        # Papers written by one named author.
        "type": "function",
        "function": {
            "name": "get_papers_by_author",
            "description": "Return papers authored by the specified author, optionally with metadata.",
            "parameters": {
                "type": "object",
                "properties": {
                    "author": {"type": "string", "description": "Author full name"},
                    "include_content": {"type": "boolean", "default": True}
                },
                "required": ["author"]
            }
        },
    },
    {
        # Papers connected to one named topic.
        "type": "function",
        "function": {
            "name": "get_papers_by_topic",
            "description": "Return papers related to the specified topic, with relations and metadata.",
            "parameters": {
                "type": "object",
                "properties": {
                    "topic": {"type": "string"},
                    "include_content": {"type": "boolean", "default": True}
                },
                "required": ["topic"]
            }
        },
    },
]

# -------- routing instruction --------
# System prompt for the agent: it gives one worked example per tool and tells the
# model to summarise once the tool results are back.
ROUTER_SYSTEM = (
    "You are a routing assistant for a graph-of-papers. "
    "Choose and call the correct tool(s). Examples:\n"
    "- 'who has written the most paper' -> call get_top_authors_by_paper_count(limit=5)\n"
    "- 'what did Harry Shomer write' -> call get_papers_by_author(author='Harry Shomer')\n"
    "- 'papers about Knowledge Graphs' -> call get_papers_by_topic(topic='Knowledge Graphs')\n"
    "After tool results come back, summarize concisely."
)

In [112]:
def run_agent(user_query: str, irispy, limit_default: int = 5, debug: bool = True, max_steps: int = 3):
    """Answer ``user_query`` by letting the LLM call the graph tools, then summarize.

    Each round sends the conversation to the model through ``send_to_llm`` with
    the module-level ``tools`` schema. If the reply contains tool calls, every
    call is executed and its JSON-encoded result is appended as a ``tool``
    message before the next round. A reply without tool calls is the final
    answer.

    Problems with a tool call (arguments that are not valid JSON, a missing
    required argument, an unknown tool name) are reported back to the model as
    an ``{"error": ...}`` tool result instead of raising, so the model can
    retry or answer without the tool.

    Args:
        user_query: The user's natural-language question.
        irispy: Handle forwarded unchanged to every graph tool.
        limit_default: ``limit`` used for ``get_top_authors_by_paper_count``
            when the model omits it or supplies a non-integer value.
        debug: When true, print each step, each tool call and a 200-character
            preview of every tool result.
        max_steps: Maximum number of model round-trips before giving up.

    Returns:
        The content of the model's final message, or the string
        ``"Agent stopped: max steps reached."`` when no final answer was
        produced within ``max_steps`` rounds.
    """
    messages = [
        {"role": "system", "content": ROUTER_SYSTEM},
        {"role": "user", "content": user_query},
    ]

    def parse_arguments(raw_arguments):
        """Decode a tool call's JSON arguments; return ``(args, error_message)``."""
        try:
            parsed = json.loads(raw_arguments or "{}")
        except (TypeError, ValueError) as exc:  # json.JSONDecodeError is a ValueError
            return None, f"arguments are not valid JSON: {exc}"
        if not isinstance(parsed, dict):
            return None, "arguments must be a JSON object"
        return parsed, None

    def call_tools(name: str, args: dict):
        """Dispatch one tool call by name; argument problems become error dicts."""
        if debug:
            print(f"[Agent] → {name}({args})")
        if name == "get_top_authors_by_paper_count":
            try:
                limit = int(args.get("limit", limit_default))
            except (TypeError, ValueError):
                # The model sent something that is not a number; use the default.
                limit = limit_default
            return get_top_authors_by_paper_count(irispy, limit=limit)
        if name == "get_papers_by_author":
            if "author" not in args:
                return {"error": f"{name}: missing required argument 'author'"}
            return get_papers_by_author(args["author"], irispy, include_content=bool(args.get("include_content", True)))
        if name == "get_papers_by_topic":
            if "topic" not in args:
                return {"error": f"{name}: missing required argument 'topic'"}
            return get_papers_by_topic(irispy, args["topic"],  include_content=bool(args.get("include_content", True)))
        return {"error": f"unknown tool {name}"}

    for step in range(max_steps):
        if debug:
            print(f"[Agent] step {step+1}")

        # Ask the model what to do next.
        resp = send_to_llm(model, messages, tools=tools)
        msg = resp.choices[0].message
        tool_calls = msg.tool_calls or []

        # No tool calls means the model produced its final answer.
        if not tool_calls:
            if debug:
                print("[Agent] ✓ Final answer")
            return msg.content

        # The assistant message carrying the tool calls must precede the tool
        # outputs. SDK objects are converted to plain dicts so the message list
        # stays JSON-serializable.
        messages.append({
            "role": "assistant",
            "content": msg.content or "",
            "tool_calls": [
                {
                    "id": tc.id,
                    "type": "function",
                    "function": {"name": tc.function.name, "arguments": tc.function.arguments},
                }
                for tc in tool_calls
            ],
        })

        # Execute each requested tool and append its result for the next round.
        for tc in tool_calls:
            fn_name = tc.function.name
            fn_args, arg_error = parse_arguments(tc.function.arguments)
            if arg_error is not None:
                if debug:
                    print(f"[Agent] → {fn_name}: {arg_error}")
                result = {"error": f"{fn_name}: {arg_error}"}
            else:
                result = call_tools(fn_name, fn_args)

            # default=str keeps values json cannot encode natively (e.g. dates)
            # from aborting the whole run.
            payload = json.dumps(result, ensure_ascii=False, default=str)
            if debug:
                print(f"[Agent] result: {payload[:200]}...\n")

            messages.append({
                "role": "tool",
                "tool_call_id": tc.id,
                "name": fn_name,
                "content": payload,
            })

    return "Agent stopped: max steps reached."



In [114]:
# Smoke test of the tool-calling agent with a "most papers" question; the
# recorded run below answered it via get_top_authors_by_paper_count.
print(run_agent("who has written the most paper?", irispy))


[Agent] step 1
[Agent] → get_top_authors_by_paper_count({'limit': 5})
[Agent] result: [{"author": "Xiao Huang", "count": 5, "papers": [{"doc_id": 0, "paper_node": "When To Use Graphs In Rag: A Comprehensive Analysis For Graph Retrieval-Augmented Generation", "title": "When to use Graph...

[Agent] step 2
[Agent] ✓ Final answer
The author who has written the most papers is **Xiao Huang**, with a total of **5 papers**. Here are some of the notable papers authored by him:

1. **[When to use Graphs in RAG: A Comprehensive Analysis for Graph Retrieval-Augmented Generation](http://arxiv.org/abs/2506.05690v1)** - This paper discusses the effectiveness of Graph Retrieval-Augmented Generation (GraphRAG) and introduces a benchmark for evaluating its performance.
   
2. **[A Survey of Graph Retrieval-Augmented Generation for Customized Large Language Models](http://arxiv.org/abs/2501.13958v2)** - This survey analyzes the challenges and innovations in customizing large language models using GraphR

In [115]:

# Author lookup; the recorded run below routed this to get_papers_by_author.
print(run_agent("show me papers by Xiao Huang", irispy))

[Agent] step 1
[Agent] → get_papers_by_author({'author': 'Xiao Huang'})
[Agent] result: [{"doc_id": 0, "author": "Xiao Huang", "title": "When to use Graphs in RAG: A Comprehensive Analysis for Graph Retrieval-Augmented Generation", "abstract": "Graph retrieval-augmented generation (Graph...

[Agent] step 2
[Agent] ✓ Final answer
Xiao Huang has authored several papers, including:

1. **When to use Graphs in RAG: A Comprehensive Analysis for Graph Retrieval-Augmented Generation**  
   - **Abstract**: This paper discusses the effectiveness of Graph Retrieval-Augmented Generation (GraphRAG) in enhancing large language models (LLMs) with external knowledge, proposing a benchmark to evaluate its performance.
   - **Published**: June 6, 2025  
   - [Read more](http://arxiv.org/abs/2506.05690v1)

2. **A Survey of Graph Retrieval-Augmented Generation for Customized Large Language Models**  
   - **Abstract**: This survey analyzes Graph-based Retrieval-Augmented Generation (GraphRAG) and its inn

In [116]:
# Topic lookup; the recorded run below routed this to get_papers_by_topic.
print(run_agent("papers about Knowledge Graphs", irispy))

[Agent] step 1
[Agent] → get_papers_by_topic({'topic': 'Knowledge Graphs'})
[Agent] result: [{"doc_id": 1, "paper": "Benchmarking Vector, Graph And Hybrid Retrieval Augmented Generation (Rag) Pipelines For Open Radio Access Networks (Oran)", "topic": "Knowledge Graphs", "relation": "COVERS",...

[Agent] step 2
[Agent] ✓ Final answer
Here are some recent papers related to Knowledge Graphs:

1. **Benchmarking Vector, Graph and Hybrid Retrieval Augmented Generation (RAG) Pipelines for Open Radio Access Networks (ORAN)**  
   - **Authors**: Sarat Ahmad, Zeinab Nezami, Maryam Hafeez, Syed Ali Raza Zaidi  
   - **Published**: 2025-07-04  
   - **Abstract**: This paper evaluates various RAG systems, including GraphRAG, in the context of ORAN, highlighting their performance in multi-hop reasoning.  
   - [Read more](http://arxiv.org/abs/2507.03608v2)

2. **RAG vs. GraphRAG: A Systematic Evaluation and Key Insights**  
   - **Authors**: Haoyu Han, Harry Shomer, et al.  
   - **Published**: 2025

## LLM Agent

In [134]:

def classify_query_llm(user_query: str) -> str:
    """Ask the LLM whether ``user_query`` is an 'aggregation' or a 'general' question.

    The model's reply is stripped and lower-cased. Any reply containing the
    word "aggregation" yields ``"aggregation"``; everything else, including an
    empty or missing reply, yields ``"general"``.
    """
    system_prompt = (
        "Classify the user's question as exactly one word: "
        "'aggregation' (asks for counts, most/least, top, number of) "
        "or 'general' (everything else). Reply with only that word."
    )
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_query},
    ]

    # Sent through the notebook's send_to_llm wrapper, without any tool schema.
    reply = send_to_llm(model, conversation).choices[0].message.content
    normalized = reply.strip().lower() if reply else ""

    if "aggregation" in normalized:
        return "aggregation"
    return "general"

def ask_question_graphrag_agent(user_query: str,
                 irispy,
                 combined_results,
                 search_query: str | None = None,
                 debug: bool = True):
    
    qtype = classify_query_llm(user_query)
    if debug:
        print(f"[Router] classified as: {qtype}")

    if qtype == "aggregation":
        return run_agent(user_query, irispy, debug=debug)
    else:
        q_for_rag = search_query or user_query
        return ask_question_graphrag( q_for_rag, engine, emb_model, irispy)

In [135]:
# End-to-end check of the router on a question that the recorded run below
# classified as "aggregation".
# NOTE(review): in that run the agent called get_top_authors_by_paper_count with
# only a limit, so the "about Knowledge Graphs" part of the question was not
# applied as a filter — verify the printed answer before relying on it.
query = "Who has written the most papers about Knowledge Graphs?"
# combined_results is passed through, but ask_question_graphrag_agent as defined
# above never reads it.
combined_results = prepare_combined_results(query, engine, emb_model, irispy)
print(ask_question_graphrag_agent(query, irispy, combined_results))


[Router] classified as: aggregation
[Agent] step 1
[Agent] → get_top_authors_by_paper_count({'limit': 5})
[Agent] result: [{"author": "Xiao Huang", "count": 5, "papers": [{"doc_id": 0, "paper_node": "When To Use Graphs In Rag: A Comprehensive Analysis For Graph Retrieval-Augmented Generation", "title": "When to use Graph...

[Agent] step 2
[Agent] ✓ Final answer
The author who has written the most papers about Knowledge Graphs is **Xiao Huang**, with a total of **5 papers**. Here are some of the notable papers authored by him:

1. **[When to use Graphs in RAG: A Comprehensive Analysis for Graph Retrieval-Augmented Generation](http://arxiv.org/abs/2506.05690v1)** - This paper discusses the effectiveness of Graph Retrieval-Augmented Generation (GraphRAG) in enhancing large language models with external knowledge.
   
2. **[A Survey of Graph Retrieval-Augmented Generation for Customized Large Language Models](http://arxiv.org/abs/2501.13958v2)** - This survey analyzes the challenges and i

In [136]:
# A question that the recorded run below classified as "general", i.e. it was
# answered by ask_question_graphrag rather than by the tool-calling agent.
query = "which paper covers most topics?"
combined_results = prepare_combined_results(query, engine, emb_model, irispy)
print(ask_question_graphrag_agent(query, irispy, combined_results))


[Router] classified as: general
The paper titled "From Local to Global: A Graph RAG Approach to Query-Focused Summarization" covers the most topics, with a total of 25 distinct topics mentioned in its graph context. This includes areas such as Retrieval-Augmented Generation, Knowledge Graphs, and various aspects of large language models. Its comprehensive approach to query-focused summarization highlights its broad scope in the field.
[Router] classified as: general
[Router] classified as: general
[Router] classified as: aggregation
[Agent] step 1
[Agent] → get_top_authors_by_paper_count({'limit': 5})
[Agent] result: [{"author": "Xiao Huang", "count": 5, "papers": [{"doc_id": 0, "paper_node": "When To Use Graphs In Rag: A Comprehensive Analysis For Graph Retrieval-Augmented Generation", "title": "When to use Graph...

[Agent] step 2
[Agent] ✓ Final answer


In [127]:
# Another question classified as "general" in the recorded run.
# NOTE(review): the recorded output is a one-element list rather than a plain
# string, and this cell's execution count (127) is lower than that of the cell
# defining ask_question_graphrag_agent (134), so the output may come from an
# older definition — re-run to confirm.
query = "who has written paper about graphrag?"
combined_results = prepare_combined_results(query, engine, emb_model, irispy)
print(ask_question_graphrag_agent(query, irispy, combined_results))

[Router] classified as: general
['Several authors have written papers about GraphRAG, including Yukun Cao, Zengyi Gao, and Zhiyang Li in "LEGO-GraphRAG," and Boci Peng, Yun Zhu, and Yongchao Liu in "Graph Retrieval-Augmented Generation: A Survey." Additionally, Haoyu Han, Harry Shomer, and Jiliang Tang contributed to "Empowering GraphRAG with Knowledge Filtering and Integration." Other authors include Shiqi Zhang, Xiaokui Xiao, and Yiqian Huang in "Ket-Rag: A Cost-Efficient Multi-Granular Indexing Framework For Graph-Rag."']


In [128]:
# Per-author count question; the recorded run classified it as "aggregation"
# and called get_papers_by_author, which returned an empty list.
# NOTE(review): "peper" in the query text is a typo; the recorded run still
# extracted the author name 'Fan Ji'.
query = "how many peper has Fan Ji written?"
combined_results = prepare_combined_results(query, engine, emb_model, irispy)
print(ask_question_graphrag_agent(query, irispy, combined_results))

[Router] classified as: aggregation
[Agent] step 1
[Agent] → get_papers_by_author({'author': 'Fan Ji'})
[Agent] result: []...

[Agent] step 2
[Agent] ✓ Final answer
Fan Ji has not authored any papers.


In [131]:
import gradio as gr


def compare_handler(query: str, rag_hist, graphrag_hist):
    """Run one question through both pipelines and extend both chat histories.

    Args:
        query: Question text from the UI textbox; ``None`` is treated as blank.
        rag_hist: Existing RAG chat history as a sequence of
            ``(question, answer)`` pairs; ``None`` is treated as empty.
        graphrag_hist: Existing GraphRAG chat history in the same format.

    Returns:
        ``(rag_history, graphrag_history, status_message)``. For a blank
        question the histories are returned unchanged together with a warning.
        Otherwise each history gains one ``(query, answer)`` pair and the
        status is ``""``. A failure in either pipeline is shown as that
        pipeline's answer text instead of being raised, so one side failing
        does not hide the other side's answer.
    """
    if not (query or "").strip():
        return rag_hist, graphrag_hist, "⚠️ Please enter a question."

    # RAG
    try:
        rag_answer = ask_question_rag(query, engine, emb_model)
    except Exception as e:
        rag_answer = f"RAG error: {e}"

    # GraphRAG Agent
    try:
        # The third argument (combined_results) is not read by
        # ask_question_graphrag_agent, so pass None rather than a notebook
        # global that was computed for a different query, or that does not
        # exist on a fresh kernel.
        graphrag_answer = ask_question_graphrag_agent(query, irispy, None)
    except Exception as e:
        graphrag_answer = f"GraphRAG error: {e}"

    rag_hist = list(rag_hist or []) + [(query, str(rag_answer))]
    graphrag_hist = list(graphrag_hist or []) + [(query, str(graphrag_answer))]
    return rag_hist, graphrag_hist, ""

def clear_histories():
    """Return fresh empty values for the RAG chat, the GraphRAG chat and the status line."""
    empty_rag_history = []
    empty_graphrag_history = []
    no_status = ""
    return empty_rag_history, empty_graphrag_history, no_status



# Stylesheet passed to gr.Blocks(css=...) for the comparison UI: brand colours
# as CSS variables, then rules for the page title, panel titles, chat panels,
# message bubbles, inputs and buttons, plus a rule hiding the Chatbot label.
# NOTE(review): selectors such as .gr-chatbot / .gr-button / .gr-textbox are
# assumed to match the class names emitted by the installed Gradio version —
# TODO confirm in the rendered page.
custom_css = """
:root{
  --brand-blue:  #2f3ea8;   /* navy blue */
  --brand-teal:  #00a6a6;   /* teal */
  --bg:          #f7f9fc;
  --panel:       #ffffff;
  --text:        #0f172a;
  --shadow:      0 6px 16px rgba(17, 24, 39, .08);
}

.gradio-container { background: var(--bg); color: var(--text); }
#title { text-align:center; font-size:24px; font-weight:700; margin: 12px 0 18px; color: var(--brand-blue); }
.panel-title { font-weight:700; margin: 6px 0 10px; color: var(--brand-blue); }

.gr-chatbot {
  background: var(--panel) !important;
  border-radius: 14px !important;
  box-shadow: var(--shadow) !important;
  padding: 8px !important;
  border: 1px solid rgba(47,62,168,.10) !important;
}

.gr-chatbot .message,
.gr-chatbot .message.user,
.gr-chatbot .message.bot{
  background: #fff !important;
  color: var(--text) !important;
  border-radius: 14px !important;
  padding: 10px 14px !important;
  box-shadow: 0 2px 6px rgba(0,0,0,.04);
  border-left: 4px solid var(--brand-teal);
  font-size: 16px !important;
}

/* inputs */
textarea, input, .gr-textbox, .gr-textbox textarea{
  border-radius: 12px !important;
  border: 1px solid rgba(47,62,168,.18) !important;
}
.gr-textbox textarea:focus{ outline: none !important; border-color: var(--brand-blue) !important; }

/* buttons */
.gr-button{ border-radius: 12px !important; font-weight:600 !important; }
button.primary, .gr-button-primary{
  background: var(--brand-teal) !important;
  border: none !important;
  color: #fff !important;
}
button.secondary, .gr-button-secondary{
  background: #fff !important;
  color: var(--brand-blue) !important;
  border: 1px solid rgba(47,62,168,.25) !important;
}

/* hide default Chatbot legend */
.gr-chatbot .label, .gr-chatbot .legend, .gr-chatbot > div:first-child > div:first-child {
  display: none !important;
}
"""


# Build the side-by-side comparison UI: a title, one question box, a RAG chat
# panel next to a GraphRAG chat panel, and Ask/Clear buttons with a status line.
with gr.Blocks(css=custom_css) as demo:
    gr.HTML('<div id="title">RAG vs GraphRAG</div>')

    # Chat histories kept in gr.State; compare_handler reads them as inputs and
    # the .then() steps below write the displayed chat values back into them.
    rag_state = gr.State([])
    graphrag_state = gr.State([])

    with gr.Row():
        query_in = gr.Textbox(
            label="Your question",
            placeholder="e.g., Who has written the most paper?"
        )

    # NOTE(review): both Chatbot components are created without a `type=`
    # argument. The recorded output under this cell echoes these two lines,
    # which looks like a Gradio warning about the default message format —
    # confirm, and if migrating to another format, change compare_handler's
    # (question, answer) tuples at the same time.
    with gr.Row():
        with gr.Column():
            gr.Markdown('<div class="panel-title">RAG</div>')
            rag_chat = gr.Chatbot(label="", height=420)
        with gr.Column():
            gr.Markdown('<div class="panel-title">GraphRAG</div>')
            graphrag_chat = gr.Chatbot(label="", height=420)

    run_btn = gr.Button("Ask Both", variant="primary")
    clear_btn = gr.Button("Clear", variant="secondary")
    status_out = gr.Markdown("")

    # Ask: first compute both answers from the stored histories and show them in
    # the chat panels, then copy the displayed chats back into the state values.
    run_btn.click(
        fn=compare_handler,
        inputs=[query_in, rag_state, graphrag_state],
        outputs=[rag_chat, graphrag_chat, status_out]
    ).then(
        fn=lambda a,b: (a,b),
        inputs=[rag_chat, graphrag_chat],
        outputs=[rag_state, graphrag_state]
    )

    # Clear: empty both chat panels and the status line, then reset both states.
    clear_btn.click(
        fn=clear_histories,
        inputs=[],
        outputs=[rag_chat, graphrag_chat, status_out]
    ).then(
        fn=lambda: ([], []),
        inputs=[],
        outputs=[rag_state, graphrag_state]
    )


  rag_chat = gr.Chatbot(label="", height=420)
  graphrag_chat = gr.Chatbot(label="", height=420)


In [132]:
# Start the Gradio app for the comparison UI; in the recorded run it was served
# at http://127.0.0.1:7860.
demo.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




[Router] classified as: aggregation
[Agent] step 1
[Agent] → get_papers_by_author({'author': 'Fan Ji'})
[Agent] result: []...

[Agent] step 2
[Agent] ✓ Final answer
[Router] classified as: general
