## Install Required Packages

Run this cell first to install all required packages. This may take a few minutes.


In [None]:
# Installs every third-party package used by this notebook into the kernel's
# environment (%pip targets the running kernel, unlike !pip).
#
# WARNING: Python 3.14 is NOT supported by these packages; the installations
# below will likely fail on it.
# NOTE(review): this comment previously required "Python 3.11 or 3.12", while the
# message printed below states "3.10 - 3.13" -- confirm the supported range and
# align the two.
# NOTE(review): no versions are pinned, so results may drift between installs.

import sys
print(f"Current Python version: {sys.version}")
print("Required: Python 3.10 - 3.13")

# Core libraries: PDF parsing, ML utilities, graph building/visualisation, embeddings.
%pip install PyMuPDF scikit-learn networkx pyvis pandas sentence-transformers plotly
%pip install numpy scipy tqdm
# llvmlite/numba are installed ahead of hdbscan/umap-learn, and bertopic last
# (presumably so each step's prerequisites are already present -- TODO confirm).
%pip install llvmlite numba
%pip install hdbscan umap-learn
%pip install bertopic


# PDF Topic Knowledge Graph using BERTopic

This notebook extracts text from PDF files, trains a BERTopic model to discover topics, and creates an interactive knowledge graph visualization showing the relationships between documents, topics, and keywords.


## Import Required Libraries


In [None]:
import glob
from pathlib import Path

import fitz  # PyMuPDF
import networkx as nx
import pandas as pd
from bertopic import BERTopic
from pyvis.network import Network
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


## Configuration

Set your parameters here:
- **PDF_FOLDER_PATH**: Path to your PDF files
- **OUTPUT_FILENAME**: Name for the output HTML file
- **MIN_TOPIC_SIZE**: Minimum documents per topic (lower = more topics, higher = fewer topics)


In [None]:
# --- Configuration ---
# Every value a reader may want to tune lives in this cell.

# Folder scanned for "*.pdf" files in Step 1.
# NOTE(review): absolute, machine-specific path -- change it before running elsewhere.
PDF_FOLDER_PATH = r"D:\Github\phd\ML\included" 
# HTML file the interactive graph is written to in Step 5.
OUTPUT_FILENAME = "pdf_knowledge_graph.html"
# Passed to BERTopic as min_topic_size in Step 2.
MIN_TOPIC_SIZE = 10  # Adjust this to get more (lower) or fewer (higher) topics

print("--- Script Started ---")


## Step 1: Extract Text from PDFs

Read all PDF files from the specified folder and extract their text content.


In [None]:
print(f"\n--- Step 1: Extracting Text from PDFs in '{PDF_FOLDER_PATH}' ---")

# Sort the paths so the document order (and therefore the model input) is
# reproducible from run to run; glob order is otherwise filesystem-dependent.
pdf_files = sorted(Path(PDF_FOLDER_PATH).glob("*.pdf"))
print(f"Found {len(pdf_files)} PDF files.")

documents = []  # Extracted text, one entry per successfully read PDF
doc_names = []  # Filenames, kept index-aligned with `documents`

for pdf_path in pdf_files:
    try:
        # The context manager closes the file even when a page fails to parse,
        # which the previous explicit doc.close() did not guarantee.
        with fitz.open(pdf_path) as doc:
            full_text = "".join(page.get_text("text") for page in doc)
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the rest.
        print(f"  [Error] Failed to read {pdf_path}: {e}")
        continue

    if not full_text.strip():
        # Image-only (scanned) PDFs yield no text; an empty document gives the
        # topic model nothing to work with, so leave it out.
        print(f"  [Warning] No extractable text in {pdf_path.name}; skipping.")
        continue

    documents.append(full_text)
    doc_names.append(pdf_path.name)  # Store just the filename

print(f"Successfully extracted text from {len(documents)} documents.")


## Step 2: Train BERTopic Model

Train the BERTopic model to discover topics in the documents. This is the most time-consuming step.


In [None]:
# Stop early when Step 1 produced nothing to model.
if not documents:
    print("\nNo documents were successfully extracted. Exiting.")
    raise Exception("No documents to process")

print(f"\n--- Step 2: Training BERTopic Model (min_topic_size={MIN_TOPIC_SIZE}) ---")

# Bag-of-words model for topic representations: English stop words removed,
# unigrams and bigrams kept.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

# All BERTopic settings gathered in one place.
bertopic_settings = {
    "vectorizer_model": vectorizer_model,
    "embedding_model": "sentence-transformers/all-mpnet-base-v2",
    "min_topic_size": MIN_TOPIC_SIZE,
    "language": "english",
    "verbose": True,
}
topic_model = BERTopic(**bertopic_settings)

# Fitting is the slowest part of the notebook; it returns one topic id per
# document together with the associated probabilities.
topics, probabilities = topic_model.fit_transform(documents)

print("BERTopic model training complete.")


## Step 3: Prepare Data for Knowledge Graph

Extract document-to-topic mappings, topic keywords, and other metadata from the trained model.


In [None]:
print("\n--- Step 3: Preparing Data for Knowledge Graph ---")

# 1. Get Document-to-Topic Mappings.
# get_document_info() accepts (docs, df=None, metadata=None); it has no
# `doc_names` argument, so the filenames are attached as an extra "Filename"
# column through `metadata`. The frame's own "Name" column is the topic label.
print("Getting document info...")
doc_info = topic_model.get_document_info(documents, metadata={"Filename": doc_names})

# 2. Get Topic-to-Keyword Mappings: {topic_id: [(keyword, score), ...]}
print("Getting topic keywords...")
topic_keywords = topic_model.get_topics()

# 3. Get General Topic Info (for names, sizes, etc.)
topic_info = topic_model.get_topic_info()

print(f"Loaded {len(doc_info)} documents and {len(topic_info)} topics (including outliers).")

# Display topic information
print("\nTopic Overview:")
display(topic_info.head(10))


## Step 4: Build the Knowledge Graph

Create a NetworkX graph with:
- **Topic nodes** (red)
- **Document nodes** (blue)
- **Keyword nodes** (green)
- **Edges** connecting documents to topics, topics to keywords, and topics to related topics


In [None]:
print("\n--- Step 4: Building Knowledge Graph with NetworkX ---")

# Minimum cosine similarity between two topics for a 'related_to' edge to be drawn.
TOPIC_SIMILARITY_THRESHOLD = 0.1

# Undirected graph holding topic, document and keyword nodes.
G = nx.Graph()

# --- 1. Add Topic Nodes ---
print("  Adding Topic nodes...")
for _, row in topic_info.iterrows():
    topic_id = row['Topic']
    if topic_id == -1:  # Skip the outlier topic
        continue

    G.add_node(
        f"Topic_{topic_id}",
        type='topic',
        # Scale size with the document count, ensure min size 10; cast to a
        # plain float so the value serialises cleanly when exported.
        size=float(max(row['Count'] / 5, 10)),
        title=f"Topic {topic_id}: {row['Name']}",  # Hover-over text
        color='#f08080'  # Light red for topics
    )

# --- 2. Add Document Nodes and Document-Topic Edges ---
# doc_info has one row per entry of `documents`, in the same order, so the
# filenames in `doc_names` are paired with the topic assignments positionally.
# (doc_info's "Name" column is the topic label, not the filename, so it must
# not be used as the document node id.)
print("  Adding Document nodes and edges...")
for doc_name, topic_id in zip(doc_names, doc_info['Topic']):
    if topic_id == -1:  # Skip outlier documents
        continue

    G.add_node(
        doc_name,
        type='document',
        size=5,
        title=doc_name,
        color='#87ceeb'  # Sky blue for documents
    )

    # Connect the document to its topic
    G.add_edge(doc_name, f"Topic_{topic_id}", type='belongs_to')

# --- 3. Add Keyword Nodes and Topic-Keyword Edges ---
print("  Adding Keyword nodes and edges...")
for topic_id, keywords in topic_keywords.items():
    if topic_id == -1:  # Skip outlier topic
        continue

    for keyword, score in keywords:
        if not keyword:
            # Topics with too few terms are padded with empty-string keywords;
            # those would otherwise show up as a blank node.
            continue

        G.add_node(
            keyword,
            type='keyword',
            size=float(max(score * 100, 5)),  # Scale keyword size, ensure min size 5
            title=f"Keyword: {keyword}",
            color='#90ee90'  # Light green for keywords
        )

        G.add_edge(
            f"Topic_{topic_id}",
            keyword,
            type='has_keyword',
            weight=float(score)
        )

# --- 4. (Optional) Add Topic-to-Topic Edges ---
# Best-effort step: any failure is reported and the graph is kept without
# topic-to-topic edges.
print("  Calculating and adding topic-to-topic similarity edges...")
try:
    # Pairwise cosine similarity of the topics' c-TF-IDF vectors. The rows of
    # c_tf_idf_ are expected to follow the sorted topic ids (outlier topic -1
    # first when present); the size check below guards that assumption.
    ordered_topic_ids = sorted(topic_keywords.keys())
    similarity_matrix = cosine_similarity(topic_model.c_tf_idf_)

    if similarity_matrix.shape[0] != len(ordered_topic_ids):
        raise ValueError(
            f"similarity matrix has {similarity_matrix.shape[0]} rows "
            f"but the model reports {len(ordered_topic_ids)} topics"
        )

    for i, topic_id_i in enumerate(ordered_topic_ids):
        if topic_id_i == -1:  # Skip outliers
            continue

        for j in range(i + 1, len(ordered_topic_ids)):
            topic_id_j = ordered_topic_ids[j]
            if topic_id_j == -1:  # Skip outliers
                continue

            similarity_score = float(similarity_matrix[i, j])

            # Add an edge only for sufficiently similar topic pairs
            if similarity_score > TOPIC_SIMILARITY_THRESHOLD:
                G.add_edge(
                    f"Topic_{topic_id_i}",
                    f"Topic_{topic_id_j}",
                    type='related_to',
                    weight=similarity_score,
                    title=f"Related (Score: {similarity_score:.2f})",
                    color='#cccccc'  # Gray for related topic edges
                )
except Exception as e:
    print(f"  [Warning] Could not calculate topic similarity matrix. Skipping. Error: {e}")


print(f"Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")


## Step 5: Visualize and Save the Graph

Create an interactive HTML visualization using Pyvis and save it to the output file.


In [None]:
print(f"\n--- Step 5: Visualizing Graph and Saving to '{OUTPUT_FILENAME}' ---")

# Create a pyvis network; 'in_line' embeds the JavaScript resources in the
# HTML file instead of linking to a CDN.
nt = Network(notebook=True, height='800px', width='100%', cdn_resources='in_line', heading='PDF Topic Knowledge Graph')

# Load the networkx graph (nodes, edges and their attributes) into pyvis
nt.from_nx(G)

# Add visualization options for better physics
nt.set_options("""
var options = {
  "nodes": {
    "font": {
      "size": 12,
      "face": "Tahoma"
    }
  },
  "edges": {
    "color": {
      "inherit": false
    },
    "smooth": false
  },
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -40000,
      "centralGravity": 0.1,
      "springLength": 120,
      "springConstant": 0.05
    },
    "maxVelocity": 50,
    "minVelocity": 0.75,
    "solver": "barnesHut"
  }
}
""")

# Write the HTML file. In notebook mode show() hands back an embeddable frame;
# keep it, because a value returned in the middle of a cell is not rendered
# automatically.
graph_view = nt.show(OUTPUT_FILENAME)

print("\n--- Script Finished ---")
print("Successfully created and saved interactive knowledge graph to:")
print(f"{Path(OUTPUT_FILENAME).resolve()}")

# Render the interactive graph inline below the messages above.
if graph_view is not None:
    display(graph_view)


## Optional: Explore the Results

You can explore the trained model further using these commands:


In [None]:
# View all topics: the full frame returned by topic_model.get_topic_info() in
# Step 3 (that step printed only its first 10 rows).
topic_info


In [None]:
# View documents and their assigned topics.
# doc_info's "Name" column is the topic label, so the filenames from Step 1
# are attached positionally (doc_info has one row per extracted document, in
# the same order as doc_names) to show which PDF each row refers to.
doc_info.assign(Filename=doc_names)[['Filename', 'Topic', 'Name', 'Top_n_words']].head(20)


In [None]:
# View keywords for a specific topic (replace 0 with your topic number).
# Topic numbers are listed in the 'Topic' column of topic_info; -1 is the
# outlier topic that the graph-building step skips.
topic_model.get_topic(0)
