1. Decompress the .json.gz file from the scraper

In [2]:
import gzip
import json

with gzip.open("scraped_pages.json.gz", "rt", encoding="utf-8") as f:
    data = json.load(f)

print(f"Loaded {len(data)} records")

Loaded 290 records


2. Inspect the structure

In [3]:
import pprint
pprint.pprint(data[0])

{'domain': 'https://pantelis.github.io',
 'scraped_at': '2025-11-18T19:50:40.047790+00:00',
 'text': 'What this course is all about Artificial Intelligence (AI) addresses '
         'one of the ultimate puzzles humans are trying to solve: How is it '
         'possible for a brain, whether biological or electronic, to perceive, '
         'understand, predict and manipulate a world far larger and more '
         'complicated than itself? And how do people create a machine (or '
         'computer) with those properties? To that end, AI researchers try to '
         'understand how seeing, learning, remembering and reasoning can, or '
         'should, be done. This course introduces students to the many AI '
         'concepts and techniques including perception, probabilistic '
         'reasoning over time, logical reasoning, planning with and without '
         'interactions with the environment, reinforcement learning and '
         'natural language understanding. Logistics NJIT C

3. Chunk the text

In [4]:
def chunk_text(text, max_tokens=300):
    words = text.split()
    chunks = []
    current = []

    for w in words:
        current.append(w)
        if len(current) >= max_tokens:
            chunks.append(" ".join(current))
            current = []

    if current:
        chunks.append(" ".join(current))

    return chunks

4. Generate embeddings using Ollama

Before running the embedding code, install and run ollama through the following steps: 

In [None]:
brew install ollama
brew services start ollama

ollama --version
ollama pull nomic-embed-text

Run the embedding logic and check its output

In [5]:
import requests

def embed(text, model="nomic-embed-text"):
    r = requests.post(
        "http://localhost:11434/api/embeddings",
        json={"model": model, "prompt": text}
    )
    return r.json().get("embedding")

# check the embedding outputs
vec = embed("hello world")
print(len(vec))  # should be ~768 dims
vec

768


[-0.1539008617401123,
 -0.030437219887971878,
 -3.913411855697632,
 0.19112876057624817,
 0.1329490840435028,
 1.5933750867843628,
 -0.004853460937738419,
 -0.9825364351272583,
 -0.3337099552154541,
 -1.233954668045044,
 0.011371809989213943,
 0.8944352269172668,
 0.6326407790184021,
 1.8440722227096558,
 1.0348365306854248,
 -1.435691475868225,
 0.23437033593654633,
 -0.6752147078514099,
 -0.976249098777771,
 0.675545871257782,
 -0.08448098599910736,
 -2.1511447429656982,
 -0.17357085645198822,
 0.8686138391494751,
 2.103787422180176,
 -0.3254911005496979,
 -0.341919869184494,
 1.4055447578430176,
 0.1476154923439026,
 -0.501191258430481,
 -0.02702801115810871,
 -0.24861669540405273,
 -0.005052544176578522,
 0.35727426409721375,
 0.8998191952705383,
 0.06280811131000519,
 0.7426153421401978,
 0.3942549228668213,
 0.37292855978012085,
 0.1345541775226593,
 -0.10743553936481476,
 -0.338395893573761,
 0.2736889123916626,
 0.2323034554719925,
 1.5041673183441162,
 -0.0351109616458416,
 -0

5. Process all the pages into chunks then into embeddings

In [6]:
import json

chunks_path = "chunks.jsonl"
embeddings_path = "embeddings.jsonl"

def write_jsonl(path, record):
    with open(path, "a") as f:
        f.write(json.dumps(record) + "\n")

# clear old files if rerunning
open(chunks_path, "w").close()
open(embeddings_path, "w").close()

for record in data:
    url = record.get("url")
    text = record.get("text", "")
    title = record.get("title", "")

    chunks = chunk_text(text, max_tokens=300)

    for i, ch in enumerate(chunks):
        chunk_id = f"{url}#chunk-{i}"

        # 1. Write chunk to chunks.jsonl
        write_jsonl(chunks_path, {
            "id": chunk_id,
            "url": url,
            "title": title,
            "chunk_index": i,
            "text": ch
        })

        # 2. Compute embedding
        vec = embed(ch)

        # 3. Write embedding to embeddings.jsonl
        write_jsonl(embeddings_path, {
            "id": chunk_id,
            "embedding": vec
        })

6. Validate that every chunk has an embedding

In [8]:
import json

# load chunk IDs
chunk_ids = set()
with open("chunks.jsonl") as f:
    for line in f:
        chunk_ids.add(json.loads(line)["id"])

# load embedding IDs
emb_ids = set()
with open("embeddings.jsonl") as f:
    for line in f:
        emb_ids.add(json.loads(line)["id"])

missing = chunk_ids - emb_ids

print("Total chunks:", len(chunk_ids))
print("Total embeddings:", len(emb_ids))
print("Missing embeddings:", len(missing))

if missing:
    print("These are missing:")
    for m in list(missing)[:20]:
        print(" -", m)


Total chunks: 5457
Total embeddings: 5457
Missing embeddings: 0


Next Steps: load into nano graphRAG and run 