# Breaking Graph Visualization of arXiv Papers into Components

1. Generating the NetworkX Graph
1. Visualizing Using PyVis
1. Visualizing Using Bokeh
1. Exporting as GraphML
1. Exporting as JSON

In [None]:
# install python libraries

%pip install -q networkx pyvis bokeh

## Generating the NetworkX Graph

This function creates and returns a NetworkX graph with nodes, edges, and attributes.

In [None]:
import networkx as nx

def create_networkx_graph():
    """Build the directed NetworkX graph that outlines the LightRAG paper.

    Each section and subsection becomes a node with ``type``, ``summary``
    and ``color`` attributes. Each edge has a ``transition_summary``
    attribute; edges link consecutive sections and link every section to
    its subsections.

    Returns:
        The populated ``nx.DiGraph``.
    """
    section_color = "lightblue"
    subsection_color = "lightgreen"

    outline = nx.DiGraph()

    # Top-level sections in reading order, as (title, summary) pairs.
    top_level = [
        ("Abstract", "Overview of LightRAG's novel approach for retrieval-augmented generation using graph structures."),
        ("1 Introduction", "Motivation and importance of enhancing retrieval-augmented generation with graph-based indexing."),
        ("2 Retrieval-Augmented Generation", "Discussion of the RAG framework, including retrieval and generation components."),
        ("3 The LightRAG Architecture", "Details of LightRAG's architecture, including graph-based text indexing and dual-level retrieval."),
        ("4 Evaluation", "Empirical evaluation of LightRAG using benchmark datasets and comparison with baseline methods."),
        ("5 Related Work", "Discussion of related work in retrieval-augmented generation and large language models for graphs."),
        ("6 Conclusion", "Summary of findings and LightRAG's contributions."),
        ("References", "List of cited works and related research."),
        ("Appendix", "Supplementary material, including extended results and technical details."),
    ]

    for title, text in top_level:
        outline.add_node(title, type="section", summary=text, color=section_color)

    # Chain every section to the one that follows it.
    for (current, _), (following, _) in zip(top_level, top_level[1:]):
        outline.add_edge(current, following, transition_summary=f"Transition from {current} to {following}.")

    # Subsections grouped under the title of their parent section.
    children_by_section = {
        "2 Retrieval-Augmented Generation": [
            ("Comprehensive Information Retrieval", "The indexing function must extract global information for effective query answering."),
            ("Efficient and Low-Cost Retrieval", "The indexed data structure must enable rapid and cost-efficient retrieval."),
            ("Fast Adaptation to Data Changes", "The system must quickly adapt to new information from the external knowledge base."),
        ],
        "3 The LightRAG Architecture": [
            ("Graph-based Text Indexing", "Details the process of extracting entities and relationships using LLMs and constructing a knowledge graph."),
            ("Dual-level Retrieval Paradigm", "Explains the low-level and high-level retrieval strategies for specific and abstract queries."),
            ("Retrieval-Augmented Answer Generation", "Describes how retrieved information is used by the LLM to generate contextually relevant answers."),
            ("Complexity Analysis", "Analyzes the computational complexity of LightRAG's indexing and retrieval processes."),
        ],
        "4 Evaluation": [
            ("Experimental Settings", "Describes the datasets, question generation, baselines, and evaluation metrics used in the experiments."),
            ("Comparison of LightRAG with Existing RAG Methods", "Presents the results of LightRAG compared to baseline methods across various datasets."),
            ("Ablation Studies", "Examines the impact of dual-level retrieval and graph-based indexing on LightRAG's performance."),
            ("Case Study", "Provides specific examples comparing LightRAG with baseline methods."),
            ("Model Cost and Adaptability Analysis", "Analyzes the cost and adaptability of LightRAG in dynamic environments."),
        ],
        "5 Related Work": [
            ("Retrieval-Augmented Generation with LLMs", "Reviews existing RAG approaches and their limitations."),
            ("Large Language Model for Graphs", "Explores the integration of LLMs with graph-structured data."),
        ],
        "Appendix": [
            ("Experimental Data Details", "Provides statistical information about the datasets used in the experiments."),
            ("Case Example of Retrieval-Augmented Generation in LightRAG", "Illustrates the retrieve-and-generate process with an example query."),
            ("Overview of the Prompts Used in LightRAG", "Details the prompts used for graph generation, query generation, keyword extraction, and RAG evaluation."),
            ("Case Study: Comparison Between LightRAG and the Baseline NaiveRAG", "Presents a case study comparing LightRAG with NaiveRAG."),
        ],
    }

    for parent, children in children_by_section.items():
        for title, text in children:
            outline.add_node(title, type="subsection", summary=text, color=subsection_color)
            outline.add_edge(parent, title, transition_summary=f"Transition from {parent} to {title}.")

    # These section-to-section edges were already created by the chaining
    # loop above. Adding them again replaces their generated
    # transition_summary with a hand-written one.
    hand_written = [
        ("2 Retrieval-Augmented Generation", "3 The LightRAG Architecture", "Transition from RAG framework to LightRAG architecture."),
        ("3 The LightRAG Architecture", "4 Evaluation", "Transition from LightRAG architecture to evaluation."),
        ("4 Evaluation", "5 Related Work", "Transition from evaluation to related work."),
        ("5 Related Work", "6 Conclusion", "Transition from related work to conclusion."),
        ("6 Conclusion", "References", "Transition from conclusion to references."),
        ("References", "Appendix", "Transition from references to appendix."),
    ]
    for source, target, text in hand_written:
        outline.add_edge(source, target, transition_summary=text)

    return outline

## Visualizing Using PyVis

This function visualizes the graph using the pyvis library.

- Certain features (text hints/tooltips, etc.) only work reliably with the "save to file" version below.

### display in notebook

In [None]:
from pyvis.network import Network
from IPython.display import display, HTML

# Build the document-structure graph.
G = create_networkx_graph()

# Convert the NetworkX graph to PyVis.
# directed=True makes PyVis treat the DiGraph's edges as directed (the
# Network default is undirected), matching visualize_with_pyvis().
# Use 'in_line' for Colab.
net = Network(notebook=True, directed=True, height="600px", width="100%", cdn_resources='in_line')
net.from_nx(G)

# Customize node appearance: reuse the color stored on each NetworkX node
# and give every node the same size.
for node in net.nodes:
    node["color"] = G.nodes[node["id"]]["color"]
    node["size"] = 20

# Write the interactive page to disk ...
net.save_graph("graph.html")

# ... then read it back so it can be embedded directly in the notebook output.
with open("graph.html", "r", encoding="utf-8") as f:
    html_content = f.read()

display(HTML(html_content))

### save to file

- Text hints (hover tooltips) work consistently when the saved HTML file is downloaded and opened on the desktop.

In [None]:
from pyvis.network import Network

def visualize_with_pyvis(G, filename="document_structure_pyvis.html"):
    """Render ``G`` with PyVis and write the interactive page to ``filename``.

    Args:
        G: NetworkX graph whose nodes have ``color`` and ``summary``
            attributes and whose edges have a ``transition_summary``
            attribute. Node summaries and edge transition summaries are
            used as hover titles.
        filename: Path of the HTML file to write. The default is the name
            this function previously hard-coded.
    """
    net = Network(notebook=True, directed=True, height="750px", width="100%", cdn_resources='in_line')

    # Mirror the NetworkX nodes and edges, using the summaries as hover titles.
    for node, attrs in G.nodes(data=True):
        net.add_node(node, label=node, color=attrs["color"], title=attrs["summary"])

    for source, target, attrs in G.edges(data=True):
        net.add_edge(source, target, title=attrs["transition_summary"])

    # Physics is requested inside the options JSON rather than through
    # net.toggle_physics(): set_options() replaces the whole options object,
    # so a toggle made before this call would be discarded.
    net.set_options("""
    {
      "physics": {
        "enabled": true
      },
      "interaction": {
        "navigationButtons": true
      }
    }
    """)

    # Write the HTML file.
    net.show(filename)

## Visualizing Using Bokeh

This function visualizes the graph using the bokeh library.

In [None]:
from bokeh.io import show, output_file
from bokeh.plotting import figure
from bokeh.models import GraphRenderer, StaticLayoutProvider, Circle, ColumnDataSource, MultiLine, HoverTool, LabelSet

def visualize_with_bokeh(G, filename="document_structure_bokeh.html"):
    """Render ``G`` as an interactive Bokeh plot and show it.

    Nodes are placed with a seeded spring layout, drawn as circles filled
    with each node's ``color`` attribute, and labelled with their ids.
    Hovering shows the node id, ``type`` and ``summary``.

    Args:
        G: NetworkX graph whose nodes have ``type``, ``summary`` and
            ``color`` attributes and whose edges have a
            ``transition_summary`` attribute.
        filename: Path of the HTML file passed to ``output_file``. The
            default is the name this function previously hard-coded.
    """
    # Compute the layout first so the node data source is built with real
    # coordinates instead of placeholders. The fixed seed keeps it repeatable.
    pos = nx.spring_layout(G, seed=42)
    node_ids = list(G.nodes)
    edge_ids = list(G.edges)

    plot = figure(
        title="Document Structure",
        x_range=(-1.5, 1.5),
        y_range=(-1.5, 1.5),
        # "hover" is intentionally not listed here: the configured HoverTool
        # is added below, and listing it here as well would attach a second,
        # default hover tool and show duplicate tooltips.
        tools="pan,wheel_zoom,box_zoom,reset",
        toolbar_location="below",
    )

    graph = GraphRenderer()

    # Node columns: "index" identifies each node; x/y are reused by the labels.
    graph.node_renderer.data_source.data = {
        "index": node_ids,
        "type": [G.nodes[node]["type"] for node in node_ids],
        "summary": [G.nodes[node]["summary"] for node in node_ids],
        "color": [G.nodes[node]["color"] for node in node_ids],
        "x": [pos[node][0] for node in node_ids],
        "y": [pos[node][1] for node in node_ids],
    }

    graph.edge_renderer.data_source.data = {
        "start": [source for source, _ in edge_ids],
        "end": [target for _, target in edge_ids],
        "transition_summary": [G.edges[edge]["transition_summary"] for edge in edge_ids],
    }

    # Give the graph renderer the same coordinates as the x/y columns.
    graph.layout_provider = StaticLayoutProvider(
        graph_layout={node: (pos[node][0], pos[node][1]) for node in node_ids}
    )

    # Style nodes and edges.
    graph.node_renderer.glyph = Circle(radius=0.1, fill_color="color", line_color="black")
    graph.edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_alpha=0.8, line_width=2)

    # The only hover tool on the plot.
    hover = HoverTool(
        tooltips=[
            ("Node", "@index"),
            ("Type", "@type"),
            ("Summary", "@summary"),
        ],
        line_policy="interp",
    )
    plot.add_tools(hover)

    # Text labels next to each node, read from the node data source.
    labels = LabelSet(
        x="x",
        y="y",
        text="index",
        source=graph.node_renderer.data_source,
        text_font_size="10pt",
        text_color="black",
        x_offset=5,
        y_offset=5,
    )
    plot.add_layout(labels)

    plot.renderers.append(graph)

    # Send output to an HTML file, then show the plot.
    output_file(filename)
    show(plot)

## Exporting as GraphML
This function exports the graph as a GraphML file.

In [None]:
def export_as_graphml(G, filename: str = "document_structure.graphml") -> None:
    """Write the graph ``G`` to ``filename`` in GraphML format.

    Args:
        G: The NetworkX graph to export.
        filename: Destination path; defaults to ``document_structure.graphml``.
    """
    nx.write_graphml(G, filename)

## Exporting as JSON

This function exports the graph as a JSON file.

In [None]:
import json

def export_as_json(G, filename="document_structure.json"):
    """Dump ``G`` to ``filename`` as JSON in NetworkX node-link form.

    The edge list is stored under the ``"edges"`` key, and the file is
    indented by four spaces.
    """
    payload = nx.node_link_data(G, edges="edges")

    with open(filename, "w") as out:
        out.write(json.dumps(payload, indent=4))

## Putting It All Together

You can now call these functions in sequence to generate, visualize, and export the graph:

In [None]:
# End-to-end run: build the graph once, then pass the same graph object to
# both visualizers and both exporters.

# Step 1: Create the NetworkX graph
G = create_networkx_graph()

# Step 2: Visualize with PyVis (default output: document_structure_pyvis.html)
visualize_with_pyvis(G)

# Step 3: Visualize with Bokeh (default output: document_structure_bokeh.html)
visualize_with_bokeh(G)

# Step 4: Export as GraphML (default output: document_structure.graphml)
export_as_graphml(G)

# Step 5: Export as JSON (default output: document_structure.json)
export_as_json(G)

## Experiment with additional detail

In [None]:
import networkx as nx

def create_networkx_graph():
    """Build the directed NetworkX graph that outlines the LightRAG paper.

    Each node gets a property dict with ``summary``, ``type`` and ``color``
    keys, in that order. Each edge has a ``transition_summary`` attribute;
    edges link consecutive sections and link every section to its
    subsections.

    Returns:
        The populated ``nx.DiGraph``.
    """
    palette = {
        "section": "lightblue",  # Blue for sections
        "subsection": "lightgreen",  # Green for subsections
    }

    def node_properties(kind, summary):
        # One place to assemble a node's property dict. Extra per-node
        # details can be added here.
        return {"summary": summary, "type": kind, "color": palette[kind]}

    outline = nx.DiGraph()

    # Top-level sections in reading order, as (title, summary) pairs.
    top_level = [
        ("Abstract", "Overview of LightRAG's novel approach for retrieval-augmented generation using graph structures."),
        ("1 Introduction", "Motivation and importance of enhancing retrieval-augmented generation with graph-based indexing."),
        ("2 Retrieval-Augmented Generation", "Discussion of the RAG framework, including retrieval and generation components."),
        ("3 The LightRAG Architecture", "Details of LightRAG's architecture, including graph-based text indexing and dual-level retrieval."),
        ("4 Evaluation", "Empirical evaluation of LightRAG using benchmark datasets and comparison with baseline methods."),
        ("5 Related Work", "Discussion of related work in retrieval-augmented generation and large language models for graphs."),
        ("6 Conclusion", "Summary of findings and LightRAG's contributions."),
        ("References", "List of cited works and related research."),
        ("Appendix", "Supplementary material, including extended results and technical details."),
    ]

    for title, text in top_level:
        outline.add_node(title, **node_properties("section", text))

    # Chain every section to the one that follows it.
    for (current, _), (following, _) in zip(top_level, top_level[1:]):
        outline.add_edge(current, following, transition_summary=f"Transition from {current} to {following}.")

    # Subsections grouped under the title of their parent section.
    children_by_section = {
        "2 Retrieval-Augmented Generation": [
            ("Comprehensive Information Retrieval", "The indexing function must extract global information for effective query answering."),
            ("Efficient and Low-Cost Retrieval", "The indexed data structure must enable rapid and cost-efficient retrieval."),
            ("Fast Adaptation to Data Changes", "The system must quickly adapt to new information from the external knowledge base."),
        ],
        "3 The LightRAG Architecture": [
            ("Graph-based Text Indexing", "Details the process of extracting entities and relationships using LLMs and constructing a knowledge graph."),
            ("Dual-level Retrieval Paradigm", "Explains the low-level and high-level retrieval strategies for specific and abstract queries."),
            ("Retrieval-Augmented Answer Generation", "Describes how retrieved information is used by the LLM to generate contextually relevant answers."),
            ("Complexity Analysis", "Analyzes the computational complexity of LightRAG's indexing and retrieval processes."),
        ],
        "4 Evaluation": [
            ("Experimental Settings", "Describes the datasets, question generation, baselines, and evaluation metrics used in the experiments."),
            ("Comparison of LightRAG with Existing RAG Methods", "Presents the results of LightRAG compared to baseline methods across various datasets."),
            ("Ablation Studies", "Examines the impact of dual-level retrieval and graph-based indexing on LightRAG's performance."),
            ("Case Study", "Provides specific examples comparing LightRAG with baseline methods."),
            ("Model Cost and Adaptability Analysis", "Analyzes the cost and adaptability of LightRAG in dynamic environments."),
        ],
        "5 Related Work": [
            ("Retrieval-Augmented Generation with LLMs", "Reviews existing RAG approaches and their limitations."),
            ("Large Language Model for Graphs", "Explores the integration of LLMs with graph-structured data."),
        ],
        "Appendix": [
            ("Experimental Data Details", "Provides statistical information about the datasets used in the experiments."),
            ("Case Example of Retrieval-Augmented Generation in LightRAG", "Illustrates the retrieve-and-generate process with an example query."),
            ("Overview of the Prompts Used in LightRAG", "Details the prompts used for graph generation, query generation, keyword extraction, and RAG evaluation."),
            ("Case Study: Comparison Between LightRAG and the Baseline NaiveRAG", "Presents a case study comparing LightRAG with NaiveRAG."),
        ],
    }

    for parent, children in children_by_section.items():
        for title, text in children:
            outline.add_node(title, **node_properties("subsection", text))
            outline.add_edge(parent, title, transition_summary=f"Transition from {parent} to {title}.")

    # These section-to-section edges were already created by the chaining
    # loop above. Adding them again replaces their generated
    # transition_summary with a hand-written one.
    hand_written = [
        ("2 Retrieval-Augmented Generation", "3 The LightRAG Architecture", "Transition from RAG framework to LightRAG architecture."),
        ("3 The LightRAG Architecture", "4 Evaluation", "Transition from LightRAG architecture to evaluation."),
        ("4 Evaluation", "5 Related Work", "Transition from evaluation to related work."),
        ("5 Related Work", "6 Conclusion", "Transition from related work to conclusion."),
        ("6 Conclusion", "References", "Transition from conclusion to references."),
        ("References", "Appendix", "Transition from references to appendix."),
    ]
    for source, target, text in hand_written:
        outline.add_edge(source, target, transition_summary=text)

    return outline

In [None]:
from pyvis.network import Network
from IPython.display import display, HTML

# Build the document-structure graph.
G = create_networkx_graph()

# Convert the NetworkX graph to PyVis.
# directed=True makes PyVis treat the DiGraph's edges as directed (the
# Network default is undirected), matching visualize_with_pyvis().
# Use 'in_line' for Colab.
net = Network(notebook=True, directed=True, height="600px", width="100%", cdn_resources='in_line')
net.from_nx(G)

# Customize node appearance and tooltips
for node in net.nodes:
    node_id = node["id"]
    node["color"] = G.nodes[node_id]["color"]  # Set node color
    node["size"] = 20  # Set node size

    # Multi-line plain-text tooltip (no HTML or Markdown markup is used).
    node["title"] = f"""
    Entity: {node_id}
    Type: {G.nodes[node_id]['type']}
    Summary: {G.nodes[node_id]['summary']}
    """

# Customize edge appearance and tooltips
for edge in net.edges:
    source, target = edge["from"], edge["to"]
    edge["color"] = "gray"  # Set edge color
    edge["width"] = 2  # Set edge width

    # Multi-line plain-text tooltip. The summary is looked up on the original
    # NetworkX edge using the PyVis edge's endpoints.
    edge["title"] = f"""
    Transition: {source} → {target}
    Summary: {G.edges[source, target]['transition_summary']}
    """

# Enable physics for interactive visualization
net.toggle_physics(True)

# Write the interactive page to disk ...
net.save_graph("graph.html")

# ... then read it back so it can be embedded directly in the notebook output.
with open("graph.html", "r", encoding="utf-8") as f:
    html_content = f.read()

# Display the HTML content in the notebook
display(HTML(html_content))