In [6]:
# Thesis PDF to Neo4j Graph Pipeline
# This notebook:
# 1. Loads PDFs from the data folder
# 2. Extracts text using PyMuPDF
# 3. Uses GPT-4 to understand themes, categories, and relationships
# 4. Builds a graph using Neo4j
# 5. Visualizes the graph

import os
import json
from pathlib import Path
from typing import List, Dict, Any
import fitz  # PyMuPDF
from openai import OpenAI
from neo4j import GraphDatabase
import networkx as nx
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

print("âœ“ Imports successful")

This notebook takes thesis PDFs, extracts their themes and categories with GPT-4, and stores the result as a Neo4j graph.


In [None]:
# Configuration
# All settings are read from the environment (populated by load_dotenv() in the
# imports cell); the second argument to os.getenv is the fallback value.
DATA_FOLDER = Path("../data")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
# NOTE(review): falls back to the literal "password" when NEO4J_PASSWORD is
# unset — consider requiring the variable instead of shipping a default secret.
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "password")

# Initialize clients
# The OpenAI client is None when no API key is available.
openai_client = OpenAI(api_key=OPENAI_API_KEY) if OPENAI_API_KEY else None
# NOTE(review): NEO4J_URI has a non-empty default, so this is None only when
# the variable is explicitly set to an empty string.
neo4j_driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD)) if NEO4J_URI else None

# Report which pieces of the pipeline are available.
print(f"âœ“ Configuration loaded")
print(f"  Data folder: {DATA_FOLDER.absolute()}")
print(f"  OpenAI client: {'âœ“' if openai_client else 'âœ—'}")
print(f"  Neo4j driver: {'âœ“' if neo4j_driver else 'âœ—'}")

In [None]:
# Step 1: Load PDFs from data folder and extract text

def extract_text_from_pdf(pdf_path: Path) -> str:
    """Extract all text from a PDF file.

    Args:
        pdf_path: Location of the PDF to read.

    Returns:
        The text of every page, concatenated in page order.

    Raises:
        Any exception raised by ``fitz.open`` or ``page.get_text``. The
        document is closed before such an exception propagates.
    """
    doc = fitz.open(pdf_path)
    try:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)
    finally:
        # Release the file handle even when a page fails to extract.
        doc.close()

def load_all_pdfs(data_folder: Path) -> Dict[str, str]:
    """Extract the text of every ``*.pdf`` file directly inside ``data_folder``.

    Args:
        data_folder: Directory to scan (not recursive).

    Returns:
        Mapping of file name to extracted text. Files whose extraction
        raises are reported on stdout and left out of the mapping.
    """
    extracted = {}
    candidates = [*data_folder.glob("*.pdf")]

    if len(candidates) == 0:
        print(f"No PDF files found in {data_folder}")
        return extracted

    for candidate in candidates:
        print(f"Extracting text from: {candidate.name}")
        try:
            contents = extract_text_from_pdf(candidate)
            extracted[candidate.name] = contents
            print(f"  âœ“ Extracted {len(contents)} characters")
        except Exception as e:
            # Best effort: one unreadable PDF must not stop the whole batch.
            print(f"  âœ— Error extracting {candidate.name}: {e}")

    return extracted

# Load all PDFs
# pdf_texts maps each PDF file name to its extracted text; the analysis cell
# iterates over it.
pdf_texts = load_all_pdfs(DATA_FOLDER)
print(f"\nâœ“ Loaded {len(pdf_texts)} PDF(s)")

In [None]:
# Step 4: Query and inspect Neo4j graph directly
# The graph is built and stored in Neo4j. Query it directly with Cypher!

def query_neo4j_stats(driver):
    """Print how many nodes exist per label and how many relationships per type.

    Args:
        driver: Neo4j driver, or a falsy value when Neo4j is not configured
            (a warning is printed and nothing is queried).
    """
    if not driver:
        print("âš  Neo4j driver not configured.")
        return

    def _print_counts(session, cypher, heading, name_key):
        # Run the query first, then print the heading and one line per row.
        rows = session.run(cypher)
        print(heading)
        for row in rows:
            print(f"  {row[name_key]}: {row['count']}")

    with driver.session() as session:
        # Node totals, grouped by each node's first label.
        _print_counts(session, """
            MATCH (n)
            RETURN labels(n)[0] as label, count(n) as count
            ORDER BY count DESC
        """, "\nNodes by type:", "label")

        # Relationship totals, grouped by relationship type.
        _print_counts(session, """
            MATCH ()-[r]->()
            RETURN type(r) as rel_type, count(r) as count
            ORDER BY count DESC
        """, "\nRelationships by type:", "rel_type")

def query_neo4j_examples(driver):
    """Run two sample Cypher queries and print their rows.

    The first lists the themes attached to each document; the second lists up
    to ten theme-to-category relationships.

    Args:
        driver: Neo4j driver, or a falsy value when Neo4j is not configured
            (a warning is printed and nothing is queried).
    """
    if not driver:
        print("âš  Neo4j driver not configured.")
        return

    with driver.session() as db:
        # Example 1: themes grouped per document
        print("\nExample Query 1: All themes in documents")
        themes_per_document = db.run("""
            MATCH (d:Document)-[:CONTAINS_THEME]->(t:Theme)
            RETURN d.name as document, collect(t.name) as themes
        """)
        for row in themes_per_document:
            theme_list = ', '.join(row['themes'])
            print(f"  {row['document']}: {theme_list}")

        # Example 2: how themes connect to categories
        print("\nExample Query 2: Theme-Category relationships")
        theme_category_links = db.run("""
            MATCH (t:Theme)-[r]->(c:Category)
            RETURN t.name as theme, type(r) as relationship, c.name as category
            LIMIT 10
        """)
        for row in theme_category_links:
            print(f"  {row['theme']} --[{row['relationship']}]--> {row['category']}")

# Query Neo4j directly
# NOTE(review): in this notebook's cell order this cell comes before the
# "Step 2" analysis cell and the "Step 3: Build Neo4j graph" cell, so on a
# fresh top-to-bottom run it reports whatever is already stored in Neo4j —
# consider moving it after Step 3.
print("=" * 50)
print("NEO4J GRAPH STATISTICS")
print("=" * 50)
query_neo4j_stats(neo4j_driver)
query_neo4j_examples(neo4j_driver)

print("\nðŸ’¡ Tip: Open Neo4j Browser at http://localhost:7474 to visualize the graph interactively!")
print("   The graph is stored in Neo4j - you can query it with any Cypher query!")

In [None]:
# Step 2: Use GPT-4 to extract themes, categories, and relationships

def analyze_thesis_with_gpt4(text: str, filename: str, max_chars: int = 5000) -> Dict[str, Any]:
    """Use GPT-4 to analyze thesis text and extract themes, categories, and relationships.

    Args:
        text: Full text of the thesis.
        filename: Name of the source file; stored under ``"filename"`` in the result.
        max_chars: Maximum number of characters of ``text`` sent to the model.
            Longer texts are cut and marked with ``"... [continues]"``.

    Returns:
        ``{}`` when no OpenAI client is configured. Otherwise a dict with
        ``"themes"``, ``"categories"`` and ``"relationships"`` (always lists)
        plus ``"filename"``. If the request fails or the reply is not a JSON
        object, the three lists are empty.
    """
    if not openai_client:
        print("âš  OpenAI client not configured. Skipping GPT-4 analysis.")
        return {}

    # Single truncation point: only claim the text continues when it was cut.
    if len(text) > max_chars:
        excerpt = text[:max_chars] + "... [continues]"
    else:
        excerpt = text

    prompt = f"""Analyze the following thesis document and extract:
1. Main themes (key topics/concepts)
2. Categories (subject areas, domains)
3. Relationships between themes and categories

Return a JSON object with this structure:
{{
    "themes": ["theme1", "theme2", ...],
    "categories": ["category1", "category2", ...],
    "relationships": [
        {{"source": "theme1", "target": "category1", "type": "belongs_to", "description": "..."}},
        {{"source": "theme1", "target": "theme2", "type": "related_to", "description": "..."}}
    ]
}}

Thesis text:
{excerpt}
"""

    try:
        response = openai_client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are an expert at analyzing academic theses and extracting structured knowledge graphs. Always return valid JSON."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=2000
        )

        # The message content can be None; treat that as an empty reply.
        result_text = response.choices[0].message.content or ""
        # The model often wraps its JSON in a Markdown code fence; unwrap it.
        if "```json" in result_text:
            result_text = result_text.split("```json")[1].split("```")[0]
        elif "```" in result_text:
            result_text = result_text.split("```")[1].split("```")[0]

        result = json.loads(result_text.strip())
        if not isinstance(result, dict):
            raise ValueError("model reply is not a JSON object")

        # Callers take len() of these keys, so make sure each one is a list.
        for key in ("themes", "categories", "relationships"):
            if not isinstance(result.get(key), list):
                result[key] = []

        result["filename"] = filename
        return result
    except Exception as e:
        # Best effort: a failed document yields an empty analysis, not a crash.
        print(f"Error analyzing {filename} with GPT-4: {e}")
        return {"themes": [], "categories": [], "relationships": [], "filename": filename}

# Analyze all PDFs
# analysis_results maps each file name to the dict returned by
# analyze_thesis_with_gpt4 (an empty dict when no OpenAI client is configured).
analysis_results = {}
for filename, text in pdf_texts.items():
    print(f"\nAnalyzing {filename} with GPT-4...")
    analysis = analyze_thesis_with_gpt4(text, filename)
    analysis_results[filename] = analysis
    # .get() with a default keeps this line working for the empty-dict case.
    print(f"  âœ“ Found {len(analysis.get('themes', []))} themes, {len(analysis.get('categories', []))} categories, {len(analysis.get('relationships', []))} relationships")

print(f"\nâœ“ Analysis complete for {len(analysis_results)} document(s)")

In [None]:
# Optional: Build NetworkX graph for Python visualization
# Note: This is ONLY for visualization purposes. The actual graph is stored in Neo4j.

def query_neo4j_for_visualization(driver) -> nx.Graph:
    """Query Neo4j and build a NetworkX graph ONLY for visualization purposes.

    Each node is keyed by its ``name`` property, falling back to ``filename``
    and then to ``"unknown"``. Edge endpoints use the same fallback chain so
    every edge attaches to a node created by the node query.

    Args:
        driver: Neo4j driver. When falsy, the graph is built from the global
            ``analysis_results`` instead.

    Returns:
        An undirected graph whose nodes carry ``node_type`` (first Neo4j label)
        plus their stored properties, and whose edges carry ``relationship``
        (the Neo4j relationship type) plus their stored properties.
    """
    if not driver:
        print("âš  Neo4j driver not configured. Creating graph from analysis results for visualization.")
        return build_graph_from_analysis(analysis_results)

    G = nx.Graph()

    with driver.session() as session:
        # Get all nodes from Neo4j
        result = session.run("MATCH (n) RETURN labels(n) as labels, properties(n) as props")
        for record in result:
            labels = record["labels"]
            props = record["props"]
            node_id = props.get("name", props.get("filename", "unknown"))
            node_type = labels[0] if labels else "Unknown"
            # Merge into one dict so a stored "node_type" property cannot
            # collide with the explicit keyword argument.
            G.add_node(node_id, **{**props, "node_type": node_type})

        # Get all relationships from Neo4j. coalesce() mirrors the node-id
        # fallback above; a bare a.name would be null for nodes without a
        # name, and NetworkX rejects None as a node.
        result = session.run("""
            MATCH (a)-[r]->(b)
            RETURN coalesce(a.name, a.filename, 'unknown') as source,
                   coalesce(b.name, b.filename, 'unknown') as target,
                   type(r) as rel_type, properties(r) as props
        """)
        for record in result:
            source = record["source"]
            target = record["target"]
            rel_type = record["rel_type"]
            props = record["props"] or {}
            G.add_edge(source, target, **{**props, "relationship": rel_type})

    return G

def build_graph_from_analysis(analysis_results: Dict[str, Dict[str, Any]]) -> nx.Graph:
    """Build NetworkX graph directly from analysis results (fallback for visualization).

    Args:
        analysis_results: Mapping of PDF file name to its analysis dict, read
            through the ``"themes"``, ``"categories"`` and ``"relationships"``
            keys (each optional).

    Returns:
        An undirected graph with one ``Document`` node per file (file name
        with ``".pdf"`` removed), ``Theme`` and ``Category`` nodes linked to
        their document, and one edge per relationship that has both a source
        and a target.
    """
    G = nx.Graph()

    for filename, analysis in analysis_results.items():
        doc_name = filename.replace(".pdf", "")
        G.add_node(doc_name, node_type="Document")

        for theme in analysis.get("themes", []):
            G.add_node(theme, node_type="Theme")
            G.add_edge(doc_name, theme, relationship="CONTAINS_THEME")

        for category in analysis.get("categories", []):
            G.add_node(category, node_type="Category")
            G.add_edge(doc_name, category, relationship="CONTAINS_CATEGORY")

        for rel in analysis.get("relationships", []):
            # Model output is untrusted: skip malformed or incomplete entries
            # so the graph does not gain a blank "" node or self-loop.
            if not isinstance(rel, dict):
                continue
            source = rel.get("source")
            target = rel.get("target")
            if not source or not target:
                continue
            rel_type = rel.get("type") or "RELATED_TO"
            G.add_edge(source, target, relationship=rel_type)

    return G

# Build NetworkX graph ONLY for visualization (graph is already in Neo4j)
# NOTE(review): in this notebook's cell order the "Step 3: Build Neo4j graph"
# cell comes after this one, so on a fresh top-to-bottom run this reads
# whatever Neo4j held beforehand. `graph` is assigned again in the later
# "Step 4" cell.
graph = query_neo4j_for_visualization(neo4j_driver)
print(f"âœ“ NetworkX graph created for visualization: {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges")
print("  (Note: The actual graph database is Neo4j, NetworkX is just for Python visualization)")

In [None]:
# Step 3: Build Neo4j graph

def clear_neo4j_database(driver):
    """Delete every node, together with its relationships, from Neo4j.

    Args:
        driver: Neo4j driver; must not be None (no guard is applied here).
    """
    wipe_everything = "MATCH (n) DETACH DELETE n"
    with driver.session() as db:
        db.run(wipe_everything)
    print("âœ“ Cleared Neo4j database")

def _safe_relationship_type(raw_type, default="RELATED_TO"):
    """Turn a model-supplied relationship type into a safe Cypher identifier."""
    upper = str(raw_type or "").strip().upper()
    # Keep ASCII letters and digits; everything else becomes "_".
    cleaned = "".join(
        ch if (ch.isascii() and ch.isalnum()) else "_" for ch in upper
    ).strip("_")
    return cleaned or default


def create_neo4j_graph(driver, analysis_results: Dict[str, Dict[str, Any]]):
    """Create nodes and relationships in Neo4j based on analysis results.

    The database is cleared first. Then one ``Document`` node is created per
    file, ``Theme``/``Category`` nodes are merged and linked to their
    document, and each relationship between existing Theme/Category nodes is
    merged with its description.

    Args:
        driver: Neo4j driver. When falsy, a warning is printed and nothing is
            written.
        analysis_results: Mapping of PDF file name to its analysis dict, read
            through the ``"themes"``, ``"categories"`` and ``"relationships"``
            keys (each optional).
    """
    if not driver:
        print("âš  Neo4j driver not configured. Skipping graph creation.")
        return

    # Clear existing data. Done before opening our own session because the
    # helper opens one itself.
    clear_neo4j_database(driver)

    with driver.session() as session:
        # Create document nodes
        for filename in analysis_results:
            session.run(
                "CREATE (d:Document {name: $name, filename: $filename})",
                name=filename.replace(".pdf", ""),
                filename=filename
            )

        # Create theme and category nodes, and relationships
        for filename, analysis in analysis_results.items():
            # Create theme nodes
            for theme in analysis.get("themes", []):
                session.run(
                    """
                    MERGE (t:Theme {name: $theme_name})
                    WITH t
                    MATCH (d:Document {filename: $filename})
                    MERGE (d)-[:CONTAINS_THEME]->(t)
                    """,
                    theme_name=theme,
                    filename=filename
                )

            # Create category nodes
            for category in analysis.get("categories", []):
                session.run(
                    """
                    MERGE (c:Category {name: $category_name})
                    WITH c
                    MATCH (d:Document {filename: $filename})
                    MERGE (d)-[:CONTAINS_CATEGORY]->(c)
                    """,
                    category_name=category,
                    filename=filename
                )

            # Create relationships
            for rel in analysis.get("relationships", []):
                # Model output is untrusted: skip entries that are malformed
                # or that could never match a node.
                if not isinstance(rel, dict):
                    continue
                source = rel.get("source")
                target = rel.get("target")
                if not source or not target:
                    continue

                # A relationship type cannot be passed as a query parameter,
                # so it is sanitized and backtick-quoted before interpolation.
                rel_type = _safe_relationship_type(rel.get("type"))
                description = rel.get("description") or ""

                session.run(
                    f"""
                    MATCH (s) WHERE s.name = $source AND (s:Theme OR s:Category)
                    MATCH (t) WHERE t.name = $target AND (t:Theme OR t:Category)
                    MERGE (s)-[r:`{rel_type}` {{description: $description}}]->(t)
                    """,
                    source=source,
                    target=target,
                    description=description
                )

        print("âœ“ Neo4j graph created successfully")

# Create the graph
# Destructive: create_neo4j_graph clears the whole Neo4j database before
# writing the nodes and relationships derived from analysis_results.
create_neo4j_graph(neo4j_driver, analysis_results)

In [None]:
# Step 4: Query Neo4j and build NetworkX graph for visualization

def query_neo4j_graph(driver) -> nx.Graph:
    """Query Neo4j and build a NetworkX graph for visualization.

    Each node is keyed by its ``name`` property, falling back to ``filename``
    and then to ``"unknown"``. Edge endpoints use the same fallback chain so
    every edge attaches to a node created by the node query.

    Args:
        driver: Neo4j driver. When falsy, the graph is built from the global
            ``analysis_results`` instead.

    Returns:
        An undirected graph whose nodes carry ``node_type`` (first Neo4j label)
        plus their stored properties, and whose edges carry ``relationship``
        (the Neo4j relationship type) plus their stored properties.
    """
    if not driver:
        print("âš  Neo4j driver not configured. Creating graph from analysis results.")
        return build_graph_from_analysis(analysis_results)

    G = nx.Graph()

    with driver.session() as session:
        # Get all nodes
        result = session.run("MATCH (n) RETURN labels(n) as labels, properties(n) as props")
        for record in result:
            labels = record["labels"]
            props = record["props"]
            node_id = props.get("name", props.get("filename", "unknown"))
            node_type = labels[0] if labels else "Unknown"
            # Merge into one dict so a stored "node_type" property cannot
            # collide with the explicit keyword argument.
            G.add_node(node_id, **{**props, "node_type": node_type})

        # Get all relationships. coalesce() mirrors the node-id fallback
        # above; a bare a.name would be null for nodes without a name, and
        # NetworkX rejects None as a node.
        result = session.run("""
            MATCH (a)-[r]->(b)
            RETURN coalesce(a.name, a.filename, 'unknown') as source,
                   coalesce(b.name, b.filename, 'unknown') as target,
                   type(r) as rel_type, properties(r) as props
        """)
        for record in result:
            source = record["source"]
            target = record["target"]
            rel_type = record["rel_type"]
            props = record["props"] or {}
            G.add_edge(source, target, **{**props, "relationship": rel_type})

    return G

def build_graph_from_analysis(analysis_results: Dict[str, Dict[str, Any]]) -> nx.Graph:
    """Build NetworkX graph directly from analysis results (fallback).

    NOTE(review): a function with this name is also defined in the earlier
    "Optional: Build NetworkX graph" cell; this later definition replaces it
    once this cell runs.

    Args:
        analysis_results: Mapping of PDF file name to its analysis dict, read
            through the ``"themes"``, ``"categories"`` and ``"relationships"``
            keys (each optional).

    Returns:
        An undirected graph with one ``Document`` node per file (file name
        with ``".pdf"`` removed), ``Theme`` and ``Category`` nodes linked to
        their document, and one edge per relationship that has both a source
        and a target.
    """
    G = nx.Graph()

    for filename, analysis in analysis_results.items():
        doc_name = filename.replace(".pdf", "")
        G.add_node(doc_name, node_type="Document")

        for theme in analysis.get("themes", []):
            G.add_node(theme, node_type="Theme")
            G.add_edge(doc_name, theme, relationship="CONTAINS_THEME")

        for category in analysis.get("categories", []):
            G.add_node(category, node_type="Category")
            G.add_edge(doc_name, category, relationship="CONTAINS_CATEGORY")

        for rel in analysis.get("relationships", []):
            # Model output is untrusted: skip malformed or incomplete entries
            # so the graph does not gain a blank "" node or self-loop.
            if not isinstance(rel, dict):
                continue
            source = rel.get("source")
            target = rel.get("target")
            if not source or not target:
                continue
            rel_type = rel.get("type") or "RELATED_TO"
            G.add_edge(source, target, relationship=rel_type)

    return G

# Build the graph
# Rebinds `graph` (also assigned in the earlier "Optional" cell); the
# visualization and summary cells below read this value.
graph = query_neo4j_graph(neo4j_driver)
print(f"âœ“ Graph built: {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges")

In [None]:
# Step 5: Visualize the graph using matplotlib

def visualize_graph_matplotlib(G: nx.Graph, figsize=(15, 10), seed=42):
    """Visualize the graph using matplotlib.

    Nodes are colored by their ``node_type`` attribute (Document red, Theme
    blue, Category green, anything else gray) and a text legend is printed
    after the figure.

    Args:
        G: Graph to draw.
        figsize: Figure size in inches, passed to matplotlib.
        seed: Seed for the spring layout so repeated runs give the same
            picture. Pass ``None`` for a different random layout each time.
    """
    if G.number_of_nodes() == 0:
        # Nothing to lay out or draw; avoid producing an empty figure.
        print("Graph is empty; nothing to visualize.")
        return

    # An explicit figure/axes keeps this independent of pyplot's current figure.
    fig, ax = plt.subplots(figsize=figsize)

    # Color nodes by type
    node_types = {"Document": "red", "Theme": "blue", "Category": "green"}
    node_colors = [
        node_types.get(G.nodes[node].get("node_type", "Unknown"), "gray")
        for node in G.nodes()
    ]

    # Layout
    pos = nx.spring_layout(G, k=1, iterations=50, seed=seed)

    # Draw nodes, edges and labels onto the same axes
    nx.draw_networkx_nodes(G, pos, ax=ax, node_color=node_colors, node_size=500, alpha=0.8)
    nx.draw_networkx_edges(G, pos, ax=ax, alpha=0.5, edge_color="gray")
    nx.draw_networkx_labels(G, pos, ax=ax, font_size=8, font_weight="bold")

    ax.set_title("Thesis Knowledge Graph", fontsize=16, fontweight="bold")
    ax.axis("off")
    fig.tight_layout()
    plt.show()

    # Print legend
    print("\nNode Types:")
    for node_type, color in node_types.items():
        print(f"  {color}: {node_type}")

# Visualize with matplotlib
# Draws the `graph` built in the previous cell as a static figure.
visualize_graph_matplotlib(graph)

In [None]:
# Alternative: Interactive visualization using Plotly

def visualize_graph_plotly(G: nx.Graph, seed=42):
    """Create an interactive visualization using Plotly.

    Edges are drawn as one line trace. Nodes are drawn as one marker trace
    per ``node_type`` (Document red, Theme blue, Category green); nodes with
    any other or missing type go into a gray "Other" trace so every edge has
    visible endpoints.

    Args:
        G: Graph to draw.
        seed: Seed for the spring layout so repeated runs give the same
            picture. Pass ``None`` for a different random layout each time.
    """
    if G.number_of_nodes() == 0:
        # Nothing to lay out or draw; avoid showing an empty figure.
        print("Graph is empty; nothing to visualize.")
        return

    # Use spring layout
    pos = nx.spring_layout(G, k=1, iterations=50, seed=seed)

    # Prepare edge traces; a None entry breaks the line between two edges.
    edge_x = []
    edge_y = []
    for start, end in G.edges():
        x0, y0 = pos[start]
        x1, y1 = pos[end]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines'
    )

    # Group nodes by type in a single pass over the graph.
    node_types = {"Document": "red", "Theme": "blue", "Category": "green"}
    fallback_name, fallback_color = "Other", "gray"
    nodes_by_type = {type_name: [] for type_name in node_types}
    nodes_by_type[fallback_name] = []
    for node, attrs in G.nodes(data=True):
        node_type = attrs.get("node_type")
        bucket = node_type if node_type in node_types else fallback_name
        nodes_by_type[bucket].append(node)

    node_traces = []
    for type_name, members in nodes_by_type.items():
        if not members:  # Only create trace if there are nodes of this type
            continue
        node_traces.append(go.Scatter(
            x=[pos[node][0] for node in members],
            y=[pos[node][1] for node in members],
            mode='markers+text',
            name=type_name,
            marker=dict(
                size=20,
                color=node_types.get(type_name, fallback_color),
                line=dict(width=2, color='white')
            ),
            text=members,
            textposition="middle center",
            textfont=dict(size=10, color="white"),
            hovertemplate='<b>%{text}</b><extra></extra>'
        ))

    # Create figure. The title size is set through title.font because the
    # older layout.titlefont attribute is deprecated.
    fig = go.Figure(
        data=[edge_trace] + node_traces,
        layout=go.Layout(
            title=dict(
                text='Thesis Knowledge Graph (Interactive)',
                font=dict(size=16)
            ),
            showlegend=True,
            hovermode='closest',
            margin=dict(b=20, l=5, r=5, t=40),
            annotations=[dict(
                text="Drag to pan, scroll to zoom",
                showarrow=False,
                xref="paper", yref="paper",
                x=0.005, y=-0.002,
                xanchor="left", yanchor="bottom",
                font=dict(color="#888", size=10)
            )],
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
        )
    )

    fig.show()

# Visualize with Plotly (interactive)
# Draws the same `graph` as the matplotlib cell, as an interactive figure.
visualize_graph_plotly(graph)

In [None]:
# Summary statistics
# Reports on the in-memory NetworkX `graph`, not on Neo4j directly.
print("=" * 50)
print("GRAPH SUMMARY")
print("=" * 50)
print(f"Total nodes: {graph.number_of_nodes()}")
print(f"Total edges: {graph.number_of_edges()}")

# Count by type
# Nodes without a node_type attribute are counted under "Unknown".
node_type_counts = {}
for node in graph.nodes():
    node_type = graph.nodes[node].get("node_type", "Unknown")
    node_type_counts[node_type] = node_type_counts.get(node_type, 0) + 1

print("\nNodes by type:")
for node_type, count in node_type_counts.items():
    print(f"  {node_type}: {count}")

print("\nâœ“ Pipeline complete!")