# SKGB — Claude Sonnet 4.6 + Ollama Embeddings Demo

Builds a knowledge graph from any document using:

| Component | Model | Provider |
|-----------|-------|----------|
| LLM (entity / relation extraction) | `claude-sonnet-4-6` | Anthropic API |
| Embeddings (entity deduplication) | `nomic-embed-text` | Ollama (local) |

**Pipeline:**
```
Documents → Docling → Semantic Chunks → itext2kg ATOM → Knowledge Graph
```

## Prerequisites
- Python 3.10+
- An **Anthropic API key** (`ANTHROPIC_API_KEY`)
- **Ollama** running locally for embeddings ([install](https://ollama.com/download))

### VS Code + Google Colab
1. Install the **Google Colab** VS Code extension
2. Start a Colab runtime at [colab.research.google.com](https://colab.research.google.com)
3. `Ctrl+Shift+P` → "Colab: Connect to Runtime"
4. In Colab: Runtime → Change runtime type → **T4 GPU** (recommended)

## 1. Environment Detection

In [None]:
import sys

def detect_environment():
    """Report whether this session appears to run in Google Colab and/or VS Code.

    Returns:
        dict: ``{"in_colab": bool, "in_vscode": bool}``.

    Both answers are heuristics:

    * Colab is detected by whether ``google.colab`` can be imported, the same
      probe the upload and download cells of this notebook use.
    * VS Code is detected from environment variables (``VSCODE_*`` or
      ``TERM_PROGRAM=vscode``). A kernel running on a remote runtime may not
      have them, so ``False`` means "no evidence", not "definitely not".
    """
    import os  # local import: this cell only imports ``sys`` at the top

    env = {"in_colab": False, "in_vscode": False}

    try:
        import google.colab  # noqa: F401  (imported only as a presence probe)
    except ImportError:
        pass
    else:
        env["in_colab"] = True

    # ``hasattr(sys, "ps1")`` is deliberately NOT used: it is true for every
    # interactive interpreter, so it says nothing about VS Code.
    env["in_vscode"] = (
        "VSCODE" in sys.prefix
        or os.environ.get("TERM_PROGRAM") == "vscode"
        or any(name.startswith("VSCODE_") for name in os.environ)
    )
    return env

env = detect_environment()
print(f"In Google Colab: {env['in_colab']}")
print(f"In VS Code:      {env['in_vscode']}")

## 2. Anthropic API Key

Set your key via an environment variable (recommended) or enter it at the prompt below.
The key is kept only in this session's process environment; it is never printed in full or written to disk.

In [None]:
import os
import getpass

# Option A: set via environment variable before launching the notebook:
#   export ANTHROPIC_API_KEY="sk-ant-..."
#
# Option B: enter it interactively below (key is NOT printed or saved).

# Strip surrounding whitespace: a pasted key often carries a trailing newline
# or space, which would otherwise be sent to the API verbatim.
key = os.environ.get("ANTHROPIC_API_KEY", "").strip()
if not key:
    key = getpass.getpass("Anthropic API key: ").strip()

# Only store a real value; an empty response must not leave the variable set
# to "" and look configured.
if key:
    os.environ["ANTHROPIC_API_KEY"] = key

# Mask the key for display. "Not set" and "set but suspiciously short" are
# reported separately, and a short key is never echoed at all.
if not key:
    masked = "<not set>"
elif len(key) > 12:
    masked = key[:8] + "..." + key[-4:]
else:
    masked = f"<set, but only {len(key)} characters — check the key>"
print(f"ANTHROPIC_API_KEY: {masked}")

## 3. Ollama Setup (Embeddings Only)

Ollama is used **only for `nomic-embed-text` embeddings** (entity deduplication).
The LLM calls go to the Anthropic API — no large local model required.

> **Alternative:** If you prefer not to run Ollama, set `embeddings_model="text-embedding-3-small"`
> and `embeddings_api_key=<your-openai-key>` in the config cell below.

In [None]:
# Check whether the `ollama` executable is on PATH. `which` prints its location
# when found (its stderr is discarded); otherwise the install hint is echoed.
!which ollama 2>/dev/null || echo "Ollama not found — install from https://ollama.com/download"

# If running in Colab and Ollama is not installed, uncomment the next line to
# run the install script served from ollama.com:
# !curl -fsSL https://ollama.com/install.sh | sh

In [None]:
import http.client
import subprocess
import time
import urllib.request

OLLAMA_URL = "http://localhost:11434"
OLLAMA_STARTUP_TIMEOUT_S = 30.0  # upper bound on how long to wait for `ollama serve`


def ollama_is_up(url=OLLAMA_URL, timeout=2.0):
    """Return True if an HTTP server answers at *url* within *timeout* seconds."""
    try:
        # The context manager closes the response instead of leaking the socket.
        with urllib.request.urlopen(url, timeout=timeout):
            return True
    except (OSError, http.client.HTTPException):
        # URLError, HTTPError, refused connections and timeouts are all OSError.
        return False


# Start Ollama server in background (skip if already running).
# `ollama_proc` stays None unless this cell launched the server, so the cleanup
# cell only stops a process that this session owns.
if ollama_is_up():
    print(f"Ollama server already running at {OLLAMA_URL}")
    ollama_proc = None
else:
    try:
        ollama_proc = subprocess.Popen(
            ["ollama", "serve"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except FileNotFoundError:
        ollama_proc = None
        print("Ollama binary not found — install from https://ollama.com/download")
    else:
        # Poll for readiness instead of sleeping a fixed time: start-up can be
        # slower than a few seconds, and a crashed server should be reported.
        deadline = time.monotonic() + OLLAMA_STARTUP_TIMEOUT_S
        while not ollama_is_up(timeout=1.0):
            if ollama_proc.poll() is not None:
                print(f"Ollama exited during startup (exit code {ollama_proc.returncode})")
                break
            if time.monotonic() >= deadline:
                print(f"Ollama did not respond within {OLLAMA_STARTUP_TIMEOUT_S:.0f}s (PID {ollama_proc.pid})")
                break
            time.sleep(0.5)
        else:
            print(f"Ollama server started (PID {ollama_proc.pid})")

In [None]:
# Pull the embeddings model only (~274 MB).
# The pull is opt-in — uncomment the next line to run it:
# !ollama pull nomic-embed-text

# Show the models the local Ollama install reports.
!ollama list

## 4. Install DynamicKGConstruction

In [None]:
# Clone the repository (skip if already present).
# NOTE(review): stderr is discarded and ANY git failure (network error, bad
# URL, existing directory) prints "Already cloned" — confirm the checkout
# actually exists if later cells fail.
!git clone https://github.com/edwinidrus/DynamicKGConstruction.git 2>/dev/null || echo "Already cloned"
# Make the cloned directory the working directory for the following cells.
# NOTE(review): the path is relative to the current directory, so this cell
# looks intended to run once per session — TODO confirm re-run behaviour.
%cd DynamicKGConstruction

In [None]:
# Install everything listed in requirements.txt in quiet mode (-q).
# This is expected to include langchain-anthropic — TODO confirm against
# requirements.txt.
!pip install -q -r requirements.txt

In [None]:
# Smoke-test the SKGB imports, then tabulate what the centralized model
# registry reports (provider and tier) for a few model names.
from DynamicKGConstruction.skgb import SKGBConfig, run_pipeline, ModelRegistry, LLMProvider, detect_provider
from DynamicKGConstruction.skgb.models import get_model_tier

print("SKGB imported successfully")
print()

# Model names to look up, covering both LLM and embedding models.
demo_models = [
    "claude-sonnet-4-6",
    "claude-opus-4-6",
    "claude-haiku-4-5-20251001",
    "qwen2.5:32b",
    "gpt-4o",
    "nomic-embed-text",
    "text-embedding-3-small",
]

header = f"{'Model':<35} {'Provider':<12} {'Tier'}"
print(header)
print("-" * 60)

# Lazy generator: each model is looked up right before its row is printed.
rows = (
    (model_name, detect_provider(model_name).value, get_model_tier(model_name))
    for model_name in demo_models
)
for model_name, provider, tier in rows:
    print(f"{model_name:<35} {provider:<12} {tier}")

## 5. Upload a Document

Supports PDF, DOCX, PPTX, XLSX, HTML, Markdown, images, and more.

In [None]:
import logging

# Root logging at INFO with a compact "LEVEL:logger:message" layout.
logging.basicConfig(
    format="%(levelname)s:%(name)s:%(message)s",
    level=logging.INFO,
)

# Raise only the itext2kg adapter logger to DEBUG.
adapter_logger = logging.getLogger("DynamicKGConstruction.skgb.adapters.itext2kg_adapter")
adapter_logger.setLevel(logging.DEBUG)

print("Logging configured.")

In [None]:
import os
from pathlib import Path

# Folder that holds the documents fed to the pipeline; created on first run.
INPUT_DIR = Path("input_docs")
INPUT_DIR.mkdir(exist_ok=True)

try:
    # Importing google.colab doubles as the "are we in Colab?" probe.
    from google.colab import files
    print("Running in Colab — use the file upload button:")
    uploaded = files.upload()
    # `uploaded` maps each uploaded file name to its raw bytes.
    for upload_name in uploaded:
        target = INPUT_DIR / upload_name
        target.write_bytes(uploaded[upload_name])
        print(f"Saved: {target}")
except ImportError:
    # Outside Colab there is no upload widget; tell the user where files go.
    print(f"Not in Colab — place your document in '{INPUT_DIR}/' manually.")
    print(f"Current directory: {os.getcwd()}")

In [None]:
# Option: download the "Attention Is All You Need" paper as a sample
SAMPLE_URL = "https://arxiv.org/pdf/1706.03762"
SAMPLE_PATH = INPUT_DIR / "attention_is_all_you_need.pdf"

if not SAMPLE_PATH.exists():
    !wget -q -O "{SAMPLE_PATH}" "{SAMPLE_URL}"
    print(f"Downloaded sample to {SAMPLE_PATH}")
else:
    print(f"Sample already exists at {SAMPLE_PATH}")

docs = list(INPUT_DIR.glob("*"))
print(f"\nDocuments in {INPUT_DIR}/: {[d.name for d in docs]}")

## 6. Configure the Pipeline

### Provider combinations

```python
# A) Claude LLM + Ollama embeddings  (this notebook's default)
llm_model        = "claude-sonnet-4-6"      # Anthropic
embeddings_model = "nomic-embed-text"        # Ollama

# B) Fully local
llm_model        = "qwen2.5:32b"            # Ollama
embeddings_model = "nomic-embed-text"        # Ollama

# C) Fully cloud (OpenAI)
llm_model        = "gpt-4o"                 # OpenAI
embeddings_model = "text-embedding-3-small" # OpenAI
```

> `SKGBConfig.from_out_dir()` auto-detects providers from model names — no extra flags needed.

In [None]:
import os
from pathlib import Path
from DynamicKGConstruction.skgb import SKGBConfig, run_pipeline

# ── Inputs ────────────────────────────────────────────────────────────────
# Take the first regular, non-hidden file in input_docs/ in name order. Raw
# glob order is filesystem-dependent and can also yield directories or
# dotfiles such as .ipynb_checkpoints or .DS_Store.
input_candidates = sorted(
    p for p in Path("input_docs").glob("*")
    if p.is_file() and not p.name.startswith(".")
)
if not input_candidates:
    raise FileNotFoundError(
        "No documents found in 'input_docs/' — upload a file or run the "
        "sample-download cell first."
    )
pdf_path = input_candidates[0]  # name kept for later cells; any supported format is fine
print(f"Input: {pdf_path}")
if len(input_candidates) > 1:
    print(f"  ({len(input_candidates) - 1} other file(s) in input_docs/ not used)")

# ── Model selection (centralized) ─────────────────────────────────────────
LLM_MODEL        = "claude-sonnet-4-6"   # Anthropic — provider auto-detected
EMBEDDINGS_MODEL = "nomic-embed-text"    # Ollama    — provider auto-detected

# ── Build config ──────────────────────────────────────────────────────────
cfg = SKGBConfig.from_out_dir(
    "skgb_output",
    # Model names — providers are auto-detected via ModelRegistry
    llm_model        = LLM_MODEL,
    embeddings_model = EMBEDDINGS_MODEL,
    # API key for Claude (reads ANTHROPIC_API_KEY env var automatically if not passed)
    api_key          = os.environ.get("ANTHROPIC_API_KEY"),
    # Ollama server for embeddings
    ollama_base_url  = "http://localhost:11434",
    # KG construction parameters
    temperature      = 0.0,
    ent_threshold    = 0.8,
    rel_threshold    = 0.7,
    max_workers      = 2,     # keep low to stay within API rate limits
    min_chunk_words  = 200,
    max_chunk_words  = 800,
    overlap_words    = 50,
)

# Echo the resolved configuration so provider auto-detection can be checked.
print(f"\nPipeline config:")
print(f"  LLM model:          {cfg.llm_model}")
print(f"  LLM provider:       {cfg.provider}")
print(f"  Embeddings model:   {cfg.embeddings_model}")
print(f"  Embeddings provider:{cfg.embeddings_provider}")
print(f"  Ollama URL:         {cfg.ollama_base_url}")
print(f"  Output dir:         {cfg.out_dir}")

## 7. Run the Pipeline

Stages:
1. **Docling** — parse document to Markdown
2. **Chunking** — split by headers into semantic chunks
3. **itext2kg ATOM** — extract entities/relations via Claude Sonnet 4.6
4. **Export** — write JSON, CSV, GraphML, HTML visualization, Neo4j Cypher

In [None]:
# Run the pipeline on the selected input with the configuration built above,
# then print where each output location ended up.
result = run_pipeline(pdf_path, cfg)

print("\n" + "=" * 60)
print("Pipeline completed!")

# (label, attribute on `result`) pairs; each attribute is read just before
# its line is printed.
summary_fields = (
    ("  Markdown dir:  ", "build_docling_dir"),
    ("  Chunks JSON:   ", "chunks_json_path"),
    ("  KG output dir: ", "kg_output_dir"),
    ("  Neo4j Cypher:  ", "neo4j_cypher_path"),
)
for label, attr_name in summary_fields:
    print(f"{label}{getattr(result, attr_name)}")

## 8. Explore the Results

In [None]:
# List every file under the KG output directory (recursively, sorted by path)
# with its size in bytes.
print("Output files:")
output_files = (entry for entry in sorted(result.kg_output_dir.rglob("*")) if entry.is_file())
for entry in output_files:
    print(f"  {entry.name:<40s} {entry.stat().st_size:>8,} bytes")

In [None]:
# Print the plain-text construction report from the KG output directory.
report_path = result.kg_output_dir / "construction_report.txt"
print(report_path.read_text())

In [None]:
import json

# Load the exported graph and pull out its node and edge lists (missing keys
# fall back to empty lists).
kg_json_path = result.kg_output_dir / "knowledge_graph.json"
kg_data = json.loads(kg_json_path.read_text())
nodes, edges = kg_data.get("nodes", []), kg_data.get("edges", [])

print(f"Total nodes: {len(nodes)}")
print(f"Total edges: {len(edges)}")

print("\n--- First 10 Nodes ---")
for node in nodes[:10]:
    node_name = node["name"]
    node_label = node.get("label", "")
    print(f"  {node_name:<40s}  label={node_label}")

print("\n--- First 10 Edges ---")
for edge in edges[:10]:
    # Truncate each field so one edge stays on a single readable line.
    src = edge["source"][:25]
    rel = edge["relation"][:20]
    dst = edge["target"][:25]
    print(f"  {src:<25s} --[{rel}]--> {dst}")

In [None]:
import pandas as pd

# Load the node and edge CSV exports into DataFrames.
df_nodes = pd.read_csv(result.kg_output_dir / "kg_nodes.csv")
df_edges = pd.read_csv(result.kg_output_dir / "kg_edges.csv")

# Show the shape and first ten rows of each table; the leading "\n" on the
# second title keeps the blank line between the two previews.
for title, frame in (("Nodes", df_nodes), ("\nEdges", df_edges)):
    print(f"{title} shape: {frame.shape}")
    display(frame.head(10))

In [None]:
# Render the exported interactive (PyVis) graph HTML inline, if it was written.
from IPython.display import HTML, display

viz_path = result.kg_output_dir / "kg_visualization.html"
if not viz_path.exists():
    print("Visualization file not found.")
else:
    display(HTML(viz_path.read_text()))

In [None]:
import networkx as nx

# Load the GraphML export and print basic size and density figures.
graphml_path = result.kg_output_dir / "knowledge_graph.graphml"
G = nx.read_graphml(str(graphml_path))
print(f"Graph type:      {type(G).__name__}")
print(f"Nodes:           {G.number_of_nodes()}")
print(f"Edges:           {G.number_of_edges()}")
print(f"Density:         {nx.density(G):.4f}")

if G.number_of_nodes() > 0:
    # Rank (node, degree) pairs from highest to lowest degree; keep the top ten.
    ranked = sorted(G.degree(), key=lambda pair: pair[1], reverse=True)
    top_nodes = ranked[:10]
    print("\nTop 10 nodes by degree:")
    for node_id, degree in top_nodes:
        print(f"  {node_id:<40s}  degree={degree}")

In [None]:
# Load the chunk list written by the pipeline and preview the first three.
chunks = json.loads(result.chunks_json_path.read_text())
print(f"Total chunks: {len(chunks)}\n")

for idx, chunk in enumerate(chunks[:3]):
    print(f"--- Chunk {idx} ---")
    print(f"  Section: {chunk.get('section_title', 'N/A')}")
    body = chunk.get("content", "")
    preview = body[:300]
    # An ellipsis marks content that was cut at 300 characters.
    ellipsis = "..." if len(body) > 300 else ""
    print(f"  Content: {preview}{ellipsis}")
    print()

## 9. Neo4j Cypher Script

In [None]:
# Print the generated Neo4j Cypher script, or a notice when it is missing.
cypher_path = result.neo4j_cypher_path
print(
    cypher_path.read_text()
    if cypher_path.exists()
    else "Neo4j Cypher file not generated."
)

## 10. Download Results

In [None]:
import shutil

# Zip the skgb_output/ folder (relative to the current directory) into
# skgb_results.zip.
archive_path = shutil.make_archive(
    base_name="skgb_results",
    format="zip",
    root_dir=".",
    base_dir="skgb_output",
)
print(f"Archive created: {archive_path}")

try:
    # Only Colab provides the download helper; elsewhere just report the path.
    from google.colab import files
    files.download(archive_path)
    print("Download initiated.")
except ImportError:
    print(f"Not in Colab — find the archive at: {archive_path}")

## 11. Cleanup

In [None]:
import subprocess


def stop_ollama(proc, timeout=10.0):
    """Terminate *proc* and wait for it to exit, killing it if it lingers.

    Args:
        proc: a ``subprocess.Popen`` handle, or ``None`` when there is nothing
            to stop.
        timeout: seconds to wait after ``terminate()`` before using ``kill()``.

    Returns:
        bool: ``False`` when *proc* is ``None``, ``True`` once the process has
        exited.
    """
    if proc is None:
        return False
    proc.terminate()
    try:
        # A bounded wait keeps the notebook from hanging on a stuck process.
        proc.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        proc.kill()
        proc.wait()
    return True


# Stop the Ollama server (only if we started it in this session)
if "ollama_proc" not in globals():
    # The start cell never ran, so there is no handle to inspect.
    print("Ollama start cell was not run in this session — nothing to stop.")
elif ollama_proc is None:
    print("Ollama was already running before this session — not stopped.")
else:
    try:
        stop_ollama(ollama_proc)
        print("Ollama server stopped.")
    except OSError as e:
        print(f"Could not stop Ollama: {e}")