# SKGB - Semantic Knowledge Graph Builder (Colab Demo)

This notebook demonstrates the full **DynamicKGConstruction** pipeline:

**PDF -> Docling Markdown -> Semantic Chunks -> itext2kg Knowledge Graph -> Visualization**

It runs Ollama with `qwen2.5` locally inside Colab (CPU or GPU).

> **Runtime**: Go to *Runtime -> Change runtime type* and select **T4 GPU** for faster LLM inference (optional but recommended).

## 1. Install Ollama

In [1]:
# Install Ollama
# curl -fsSL https://ollama.com/install.sh | sh
# ollama serve & ollama run qwen2.5:32b & ollama pull nomic-embed-text

>>> Installing ollama to /usr/local
[1m[31mERROR:[m This version requires zstd for extraction. Please install zstd and try again:
  - Debian/Ubuntu: sudo apt-get install zstd
  - RHEL/CentOS/Fedora: sudo dnf install zstd
  - Arch: sudo pacman -S zstd


In [1]:
# # Start the Ollama server in the background
# # import subprocess, time

# # ollama_proc = subprocess.Popen(
# #     ["ollama", "serve"],
# #     stdout=subprocess.DEVNULL,
# #     stderr=subprocess.DEVNULL,
# # )
# time.sleep(3)  # wait for the server to be ready
# print(f"Ollama server started (PID {ollama_proc.pid})")

Ollama server started (PID 2620)


In [None]:
# Pull the models required by the pipeline
# Using qwen2.5 (7b default) - smaller model suitable for Colab
# # Change to qwen2.5:32b if you have enough VRAM
# LLM_MODEL = "qwen2.5"  # ~4.7 GB
# EMBEDDINGS_MODEL = "nomic-embed-text"  # ~274 MB
#ollama serve & ollama pull qwen2.5:32b & ollama pull nomic-embed-text
# !ollama pull {LLM_MODEL}
# !ollama pull {EMBEDDINGS_MODEL}

In [None]:
# Verify Ollama is running and models are available
# !ollama list

In [None]:
## to fix the pipeline

# Force uninstall numpy and reinstall with correct version
# !pip uninstall numpy scipy -y
# !pip cache purge
# !pip install "numpy<2.0" --force-reinstall --no-cache-dir
# !pip install scipy --force-reinstall --no-cache-dir
# !pip install itext2kg --force-reinstall --no-cache-dir

# print("✅ Installation complete!")
# print("⚠️  NOW GO TO: Runtime > Restart session")
# print("⚠️  Then skip this cell and run from Cell 2")

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: scipy 1.16.3
Uninstalling scipy-1.16.3:
  Successfully uninstalled scipy-1.16.3
Files removed: 267
Collecting numpy<2.0
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m107.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
scikit-learn 1.8.0 requires scipy>=1.10.0, which is not installed.
docling 

Collecting scipy
  Downloading scipy-1.17.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m72.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy<2.7,>=1.26.4 (from scipy)
  Downloading numpy-2.4.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Downloading scipy-1.17.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (35.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.0/35.0 MB[0m [31m246.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.4.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m349.9 MB/s[0m eta [36m0:00:00[0m
[?25h^C
^C


## 2. Install DynamicKGConstruction

In [2]:
# Clone the repository
!git clone https://github.com/edwinidrus/DynamicKGConstruction.git 2>/dev/null || echo "Already cloned"
%cd DynamicKGConstruction

Already cloned
/content/DynamicKGConstruction


In [3]:
# Install dependencies
!pip install -q -r requirements.txt

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.0/35.0 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tobler 0.13.0 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
shap 0.50.0 requires numpy>=2, but you have numpy 1.26.4 which is incompatible.
pytensor 2.37.0 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
jaxlib 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
jax 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0m

In [4]:
# Verify the SKGB package imports correctly
from DynamicKGConstruction.skgb import SKGBConfig, run_pipeline
print(f"SKGB imported successfully")

SKGB imported successfully


## 3. Upload a PDF

Upload your own PDF or use the sample download below.

In [5]:
import os
from pathlib import Path

INPUT_DIR = Path("input_docs")
INPUT_DIR.mkdir(exist_ok=True)

# Option A: Upload from your computer
try:
    from google.colab import files
    print("Click the button below to upload a PDF file:")
    uploaded = files.upload()
    for filename, data in uploaded.items():
        dest = INPUT_DIR / filename
        dest.write_bytes(data)
        print(f"Saved: {dest}")
except ImportError:
    print("Not running in Colab - place your PDF in input_docs/ manually")

Click the button below to upload a PDF file:


Saving robotic for resilient supply chain.pdf to robotic for resilient supply chain.pdf
Saved: input_docs/robotic for resilient supply chain.pdf


In [6]:
# Option B: Download a sample PDF (a short Wikipedia article)
# Skip this cell if you already uploaded your own PDF above

SAMPLE_URL = "https://arxiv.org/pdf/1706.03762"  # "Attention Is All You Need"
SAMPLE_PATH = INPUT_DIR / "attention_is_all_you_need.pdf"

if not SAMPLE_PATH.exists():
    !wget -q -O "{SAMPLE_PATH}" "{SAMPLE_URL}"
    print(f"Downloaded sample PDF to {SAMPLE_PATH}")
else:
    print(f"Sample PDF already exists at {SAMPLE_PATH}")

# List all PDFs in the input directory
pdfs = list(INPUT_DIR.glob("*.pdf"))
print(f"\nPDFs in {INPUT_DIR}/: {[p.name for p in pdfs]}")

Downloaded sample PDF to input_docs/attention_is_all_you_need.pdf

PDFs in input_docs/: ['robotic for resilient supply chain.pdf', 'attention_is_all_you_need.pdf']


## 4. Configure and Run the SKGB Pipeline

In [7]:
from pathlib import Path
from DynamicKGConstruction.skgb import SKGBConfig, run_pipeline

# Pick the first PDF found (or set your own path)
pdf_path = list(Path("input_docs").glob("*.pdf"))[0]
print(f"Input PDF: {pdf_path}")

# Create the pipeline configuration
cfg = SKGBConfig.from_out_dir(
    "skgb_output",
    llm_model="qwen2.5:32b",
    # embeddings_model="nomic-embed-text",
    ollama_base_url="http://localhost:11434",
    temperature=0.0,
    ent_threshold=0.8,
    rel_threshold=0.7,
    max_workers=2,        # keep low for Colab
    min_chunk_words=200,
    max_chunk_words=800,
    overlap_words=0,
)

print(f"\nPipeline config:")
print(f"  LLM model:        {cfg.llm_model}")
print(f"  Embeddings model: {cfg.embeddings_model}")
print(f"  Ollama URL:       {cfg.ollama_base_url}")
print(f"  Output dir:       {cfg.out_dir}")

Input PDF: input_docs/robotic for resilient supply chain.pdf

Pipeline config:
  LLM model:        qwen2.5:32b
  Embeddings model: nomic-embed-text
  Ollama URL:       http://localhost:11434
  Output dir:       skgb_output


In [8]:
# Run the full pipeline: PDF -> Markdown -> Chunks -> Knowledge Graph
# This may take several minutes depending on the PDF size and model
result = run_pipeline(pdf_path, cfg)

print("\n" + "=" * 60)
print("Pipeline completed!")
print(f"  Markdown dir:  {result.build_docling_dir}")
print(f"  Chunks JSON:   {result.chunks_json_path}")
print(f"  KG output dir: {result.kg_output_dir}")
print(f"  Neo4j Cypher:  {result.neo4j_cypher_path}")

Processing: input_docs/robotic for resilient supply chain.pdf


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✓ Saved parsed text to: skgb_output/build_docling/robotic for resilient supply chain_pdf.md
Processing: input_docs/attention_is_all_you_need.pdf
✓ Saved parsed text to: skgb_output/build_docling/attention_is_all_you_need_pdf.md

Completed processing 2 files.


RuntimeError: This event loop is already running

## 5. Explore the Results

In [None]:
# List all output files
print("Output files:")
for f in sorted(result.kg_output_dir.rglob("*")):
    if f.is_file():
        size = f.stat().st_size
        print(f"  {f.name:40s} {size:>8,} bytes")

### 5.1 Construction Report

In [None]:
report_path = result.kg_output_dir / "construction_report.txt"
print(report_path.read_text())

### 5.2 Knowledge Graph JSON (Nodes & Edges)

In [None]:
import json

kg_json_path = result.kg_output_dir / "knowledge_graph.json"
kg_data = json.loads(kg_json_path.read_text())

nodes = kg_data.get("nodes", [])
edges = kg_data.get("edges", [])

print(f"Total nodes: {len(nodes)}")
print(f"Total edges: {len(edges)}")
print(f"\n--- First 10 Nodes ---")
for n in nodes[:10]:
    print(f"  {n['name']:40s}  label={n.get('label', '')}")

print(f"\n--- First 10 Edges ---")
for e in edges[:10]:
    print(f"  {e['source'][:25]:25s} --[{e['relation'][:20]}]--> {e['target'][:25]}")

### 5.3 Nodes & Edges as DataFrames

In [None]:
import pandas as pd

df_nodes = pd.read_csv(result.kg_output_dir / "kg_nodes.csv")
df_edges = pd.read_csv(result.kg_output_dir / "kg_edges.csv")

print(f"Nodes shape: {df_nodes.shape}")
display(df_nodes.head(10))

print(f"\nEdges shape: {df_edges.shape}")
display(df_edges.head(10))

### 5.4 Interactive Knowledge Graph Visualization

In [None]:
# Display the PyVis interactive graph inline in Colab
from IPython.display import HTML, display

viz_path = result.kg_output_dir / "kg_visualization.html"
if viz_path.exists():
    display(HTML(viz_path.read_text()))
else:
    print("Visualization file not found. PyVis may not be installed.")

### 5.5 NetworkX Graph Stats

In [None]:
import networkx as nx

G = nx.read_graphml(str(result.kg_output_dir / "knowledge_graph.graphml"))

print(f"Graph type:       {type(G).__name__}")
print(f"Number of nodes:  {G.number_of_nodes()}")
print(f"Number of edges:  {G.number_of_edges()}")
print(f"Density:          {nx.density(G):.4f}")

if G.number_of_nodes() > 0:
    # Top 10 nodes by degree
    degree_sorted = sorted(G.degree(), key=lambda x: x[1], reverse=True)
    print(f"\nTop 10 nodes by degree:")
    for name, deg in degree_sorted[:10]:
        print(f"  {name:40s}  degree={deg}")

### 5.6 Semantic Chunks Preview

In [None]:
chunks = json.loads(result.chunks_json_path.read_text())
print(f"Total chunks: {len(chunks)}\n")

for i, ch in enumerate(chunks[:3]):
    print(f"--- Chunk {i} ---")
    print(f"  ID:      {ch.get('chunk_id', 'N/A')}")
    print(f"  Section: {ch.get('section_title', 'N/A')}")
    content = ch.get('content', '')
    print(f"  Content: {content[:300]}{'...' if len(content) > 300 else ''}")
    print()

## 6. Neo4j Cypher Script

The pipeline generates a Cypher `LOAD CSV` script you can run against a Neo4j instance.

In [None]:
cypher_path = result.neo4j_cypher_path
if cypher_path.exists():
    print(cypher_path.read_text())
else:
    print("Neo4j Cypher file not generated.")

## 7. Download Results

In [None]:
# Zip all outputs for download
import shutil

archive_path = shutil.make_archive("skgb_results", "zip", ".", "skgb_output")
print(f"Archive created: {archive_path}")

try:
    from google.colab import files
    files.download(archive_path)
except ImportError:
    print("Not in Colab - find the zip at:", archive_path)

## 8. Cleanup

In [None]:
# Stop the Ollama server when done
ollama_proc.terminate()
ollama_proc.wait()
print("Ollama server stopped.")