# SKGB - Semantic Knowledge Graph Builder (Google Colab Demo)

This notebook demonstrates the full **DynamicKGConstruction** pipeline:

**PDF -> Docling Markdown -> Semantic Chunks -> itext2kg Knowledge Graph -> Visualization**

It runs Ollama with `qwen2.5` locally inside Colab (CPU or GPU).

> **Runtime**: Go to *Runtime -> Change runtime type* and select **T4 GPU** for faster LLM inference (optional but recommended).

## 1. Install Ollama

In [None]:
# Install Ollama
# sudo apt-get install zstd

# curl -fsSL https://ollama.com/install.sh | sh

# ollama serve & ollama run qwen2.5:32b & ollama pull nomic-embed-text

# ollama serve & ollama run gpt-oss:20b & ollama pull nomic-embed-text

>>> Installing ollama to /usr/local
[1m[31mERROR:[m This version requires zstd for extraction. Please install zstd and try again:
  - Debian/Ubuntu: sudo apt-get install zstd
  - RHEL/CentOS/Fedora: sudo dnf install zstd
  - Arch: sudo pacman -S zstd


In [None]:
from itext2kg.atom.models.knowledge_graph import KnowledgeGraph

In [None]:
# # Start the Ollama server in the background
# # import subprocess, time

# # ollama_proc = subprocess.Popen(
# #     ["ollama", "serve"],
# #     stdout=subprocess.DEVNULL,
# #     stderr=subprocess.DEVNULL,
# # )
# time.sleep(3)  # wait for the server to be ready
# print(f"Ollama server started (PID {ollama_proc.pid})")

Ollama server started (PID 2620)


In [None]:
# Pull the models required by the pipeline
# Using qwen2.5 (7b default) - smaller model suitable for Colab
# # Change to qwen2.5:32b if you have enough VRAM
# LLM_MODEL = "qwen2.5"  # ~4.7 GB
# EMBEDDINGS_MODEL = "nomic-embed-text"  # ~274 MB
#ollama serve & ollama pull qwen2.5:32b & ollama pull nomic-embed-text
# !ollama pull {LLM_MODEL}
# !ollama pull {EMBEDDINGS_MODEL}

In [None]:
# Verify Ollama is running and models are available
# !ollama list

## 2. Install DynamicKGConstruction

In [1]:
# Clone the repository
!git clone https://github.com/edwinidrus/DynamicKGConstruction.git 2>/dev/null || echo "Already cloned"

%cd DynamicKGConstruction

Already cloned
/content/DynamicKGConstruction


In [None]:
# ## get the numpy that workinng with the itext2kg

# !pip uninstall numpy scipy -y
# !pip cache purge
# !pip install "numpy<2.0" --force-reinstall --no-cache-dir
# !pip install scipy --force-reinstall --no-cache-dir
# !pip install itext2kg --force-reinstall --no-cache-dir

In [3]:
# Install dependencies
!pip install -q -r requirements.txt



[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m67.9/67.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m162.6/162.6 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.0/44.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m371.5/371.5 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2

In [4]:
import numpy as np
print(np.__version__)

1.26.4


In [None]:
# # to fix the pipeline

# # Force uninstall numpy and reinstall with correct version
# !pip uninstall numpy scipy -y
# !pip cache purge
# !pip install "numpy<2.0" --force-reinstall --no-cache-dir
# !pip install scipy --force-reinstall --no-cache-dir
# !pip install itext2kg --force-reinstall --no-cache-dir

# print("‚úÖ Installation complete!")
# print("‚ö†Ô∏è  NOW GO TO: Runtime > Restart session")
# print("‚ö†Ô∏è  Then skip this cell and run from Cell 2")

In [3]:
# Verify the SKGB package imports correctly
from DynamicKGConstruction.skgb import SKGBConfig, run_pipeline
print(f"SKGB imported successfully")

SKGB imported successfully


## 3. Upload a PDF

Upload your own PDF or use the sample download below.

In [None]:
# Enable verbose logging so we can see what's happening inside the pipeline.
# The SKGB adapter now includes built-in patches for itext2kg's IndexError bug
# (empty atomic KG lists, failed entity look-ups, etc.) - no manual patching needed.

import logging

# Show adapter-level warnings (quintuple failures, empty KGs, etc.)
logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")
logging.getLogger("DynamicKGConstruction.skgb.adapters.itext2kg_adapter").setLevel(logging.DEBUG)
print("Logging configured ‚Äî adapter debug messages will appear below.")

In [None]:
# (Optional) Verify the itext2kg patches are loadable
# The SKGB adapter applies these automatically when the pipeline runs.
try:
    from itext2kg.atom.models.knowledge_graph import KnowledgeGraph
    print(f"itext2kg KnowledgeGraph: {KnowledgeGraph}")
    print("itext2kg is importable ‚Äî patches will be applied at pipeline start.")
except ImportError as e:
    print(f"itext2kg import issue: {e}")

In [6]:
import os
from pathlib import Path

INPUT_DIR = Path("input_docs")
INPUT_DIR.mkdir(exist_ok=True)

# Option A: Upload from your computer
try:
    from google.colab import files
    print("Click the button below to upload a PDF file:")
    uploaded = files.upload()
    for filename, data in uploaded.items():
        dest = INPUT_DIR / filename
        dest.write_bytes(data)
        print(f"Saved: {dest}")
except ImportError:
    print("Not running in Colab - place your PDF in input_docs/ manually")

Click the button below to upload a PDF file:


Saving 2 Months Working Package.pdf to 2 Months Working Package.pdf
Saved: input_docs/2 Months Working Package.pdf


In [None]:
# Option B: Download a sample PDF (a short Wikipedia article)
# Skip this cell if you already uploaded your own PDF above

SAMPLE_URL = "https://arxiv.org/pdf/1706.03762"  # "Attention Is All You Need"
SAMPLE_PATH = INPUT_DIR / "attention_is_all_you_need.pdf"

if not SAMPLE_PATH.exists():
    !wget -q -O "{SAMPLE_PATH}" "{SAMPLE_URL}"
    print(f"Downloaded sample PDF to {SAMPLE_PATH}")
else:
    print(f"Sample PDF already exists at {SAMPLE_PATH}")

# List all PDFs in the input directory
pdfs = list(INPUT_DIR.glob("*.pdf"))
print(f"\nPDFs in {INPUT_DIR}/: {[p.name for p in pdfs]}")

Downloaded sample PDF to input_docs/attention_is_all_you_need.pdf

PDFs in input_docs/: ['2 Months Working Package.pdf', 'attention_is_all_you_need.pdf']


## 4. Configure and Run the SKGB Pipeline

In [7]:
from pathlib import Path
from DynamicKGConstruction.skgb import SKGBConfig, run_pipeline

# Pick the first PDF found (or set your own path)
pdf_path = list(Path("input_docs").glob("*.pdf"))[0]
print(f"Input PDF: {pdf_path}")

# Create the pipeline configuration
cfg = SKGBConfig.from_out_dir(
    "skgb_output",
    llm_model="qwen2.5:32b",
    # llm_model="gpt-oss:20b",
    # embeddings_model="nomic-embed-text",
    embeddings_model="nomic-embed-text",
    ollama_base_url="http://localhost:11434",
    temperature=0.0,
    ent_threshold=0.8,
    rel_threshold=0.7,
    max_workers=2,        # keep low for Colab
    min_chunk_words=200,
    max_chunk_words=800,
    overlap_words=0,
)

print(f"\nPipeline config:")
print(f"  LLM model:        {cfg.llm_model}")
print(f"  Embeddings model: {cfg.embeddings_model}")
print(f"  Ollama URL:       {cfg.ollama_base_url}")
print(f"  Output dir:       {cfg.out_dir}")

Input PDF: input_docs/2 Months Working Package.pdf

Pipeline config:
  LLM model:        qwen2.5:32b
  Embeddings model: nomic-embed-text
  Ollama URL:       http://localhost:11434
  Output dir:       skgb_output


In [8]:
# Run the full pipeline: PDF -> Markdown -> Chunks -> Knowledge Graph
# This may take several minutes depending on the PDF size and model
result = run_pipeline(pdf_path, cfg)

print("\n" + "=" * 60)
print("Pipeline completed!")
print(f"  Markdown dir:  {result.build_docling_dir}")
print(f"  Chunks JSON:   {result.chunks_json_path}")
print(f"  KG output dir: {result.kg_output_dir}")
print(f"  Neo4j Cypher:  {result.neo4j_cypher_path}")

Processing: input_docs/2 Months Working Package.pdf


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


‚úì Saved parsed text to: skgb_output/build_docling/2 Months Working Package_pdf.md

Completed processing 1 files.
[2026-02-15 22:56:33] [    INFO] [itext2kg.llm_output_parsing.langchain_output_parser] üîç Detected LLM Provider: Unknown
[2026-02-15 22:56:33] [    INFO] [itext2kg.llm_output_parsing.langchain_output_parser] üìä Rate Limiting Config: 5 requests/batch, 4000 tokens/batch
[2026-02-15 22:56:33] [    INFO] [itext2kg.itext2kg.atom.atom] ------- Extracting Quintuples---------
[2026-02-15 22:56:35] [    INFO] [itext2kg.llm_output_parsing.langchain_output_parser] üì¶ Split 18 prompts into 4 batches for Unknown API
[2026-02-15 22:56:35] [    INFO] [itext2kg.llm_output_parsing.langchain_output_parser] üöÄ Processing 18 contexts in 4 batches for Unknown API
[2026-02-15 22:56:35] [    INFO] [itext2kg.llm_output_parsing.langchain_output_parser] üìã Processing batch 1/4 with 5 requests (Unknown)
[2026-02-15 22:59:05] [    INFO] [itext2kg.llm_output_parsing.langchain_output_parser] 



‚úì Interactive visualization saved to skgb_output/kg_output/kg_visualization.html

Pipeline completed!
  Markdown dir:  skgb_output/build_docling
  Chunks JSON:   skgb_output/chunks_output/all_chunks.json
  KG output dir: skgb_output/kg_output
  Neo4j Cypher:  skgb_output/kg_output/neo4j_load.cypher


## 5. Explore the Results

In [9]:
# List all output files
print("Output files:")
for f in sorted(result.kg_output_dir.rglob("*")):
    if f.is_file():
        size = f.stat().st_size
        print(f"  {f.name:40s} {size:>8,} bytes")

Output files:
  construction_report.txt                       396 bytes
  kg_edges.csv                                   57 bytes
  kg_nodes.csv                                   27 bytes
  kg_visualization.html                       4,587 bytes
  knowledge_graph.graphml                       309 bytes
  knowledge_graph.json                           32 bytes
  neo4j_load.cypher                           1,089 bytes


### 5.1 Construction Report

In [10]:
report_path = result.kg_output_dir / "construction_report.txt"
print(report_path.read_text())

KNOWLEDGE GRAPH CONSTRUCTION REPORT

Total Chunks: 18
Total Entities: 0
Total Relations: 0

Processing Date: 2026-02-15 23:02:40

ATOM Parameters:
  - Entity threshold: 0.8
  - Relation threshold: 0.7
  - LLM: qwen2.5:32b (Ollama)
  - Embeddings: nomic-embed-text (Ollama)




### 5.2 Knowledge Graph JSON (Nodes & Edges)

In [11]:
import json

kg_json_path = result.kg_output_dir / "knowledge_graph.json"
kg_data = json.loads(kg_json_path.read_text())

nodes = kg_data.get("nodes", [])
edges = kg_data.get("edges", [])

print(f"Total nodes: {len(nodes)}")
print(f"Total edges: {len(edges)}")
print(f"\n--- First 10 Nodes ---")
for n in nodes[:10]:
    print(f"  {n['name']:40s}  label={n.get('label', '')}")

print(f"\n--- First 10 Edges ---")
for e in edges[:10]:
    print(f"  {e['source'][:25]:25s} --[{e['relation'][:20]}]--> {e['target'][:25]}")

Total nodes: 0
Total edges: 0

--- First 10 Nodes ---

--- First 10 Edges ---


### 5.3 Nodes & Edges as DataFrames

In [12]:
import pandas as pd

df_nodes = pd.read_csv(result.kg_output_dir / "kg_nodes.csv")
df_edges = pd.read_csv(result.kg_output_dir / "kg_edges.csv")

print(f"Nodes shape: {df_nodes.shape}")
display(df_nodes.head(10))

print(f"\nEdges shape: {df_edges.shape}")
display(df_edges.head(10))

Nodes shape: (0, 3)


Unnamed: 0,name,label,has_embeddings



Edges shape: (0, 7)


Unnamed: 0,source,target,relation,t_start,t_end,t_obs,atomic_facts


### 5.4 Interactive Knowledge Graph Visualization

In [13]:
# Display the PyVis interactive graph inline in Colab
from IPython.display import HTML, display

viz_path = result.kg_output_dir / "kg_visualization.html"
if viz_path.exists():
    display(HTML(viz_path.read_text()))
else:
    print("Visualization file not found. PyVis may not be installed.")

### 5.5 NetworkX Graph Stats

In [None]:
import networkx as nx

G = nx.read_graphml(str(result.kg_output_dir / "knowledge_graph.graphml"))

print(f"Graph type:       {type(G).__name__}")
print(f"Number of nodes:  {G.number_of_nodes()}")
print(f"Number of edges:  {G.number_of_edges()}")
print(f"Density:          {nx.density(G):.4f}")

if G.number_of_nodes() > 0:
    # Top 10 nodes by degree
    degree_sorted = sorted(G.degree(), key=lambda x: x[1], reverse=True)
    print(f"\nTop 10 nodes by degree:")
    for name, deg in degree_sorted[:10]:
        print(f"  {name:40s}  degree={deg}")

### 5.6 Semantic Chunks Preview

In [None]:
chunks = json.loads(result.chunks_json_path.read_text())
print(f"Total chunks: {len(chunks)}\n")

for i, ch in enumerate(chunks[:3]):
    print(f"--- Chunk {i} ---")
    print(f"  ID:      {ch.get('chunk_id', 'N/A')}")
    print(f"  Section: {ch.get('section_title', 'N/A')}")
    content = ch.get('content', '')
    print(f"  Content: {content[:300]}{'...' if len(content) > 300 else ''}")
    print()

## 6. Neo4j Cypher Script

The pipeline generates a Cypher `LOAD CSV` script you can run against a Neo4j instance.

In [None]:
cypher_path = result.neo4j_cypher_path
if cypher_path.exists():
    print(cypher_path.read_text())
else:
    print("Neo4j Cypher file not generated.")

## 7. Download Results

In [None]:
# Zip all outputs for download
import shutil

archive_path = shutil.make_archive("skgb_results", "zip", ".", "skgb_output")
print(f"Archive created: {archive_path}")

try:
    from google.colab import files
    files.download(archive_path)
except ImportError:
    print("Not in Colab - find the zip at:", archive_path)

## 8. Cleanup

In [None]:
# Stop the Ollama server when done
ollama_proc.terminate()
ollama_proc.wait()
print("Ollama server stopped.")