<a href="https://colab.research.google.com/github/edgarbc/My_medium_posts/blob/main/llmmify_your_site.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LLMify your site

by Edgar Bermudez

August, 2025.

--

A simple demo that transforms knowledge resources (like websites, PDFs, or other documents) into structured, machine-readable data that LLMs can actually learn from.

This demo is part of a blog post about this idea inspired by Karpathy's post about transforming knowledge for LLMs (https://x.com/karpathy/status/1961128638725923119) and Jeremy Howard's llms.txt idea (https://www.answer.ai/posts/2024-09-03-llmstxt.html).



## Setup imports and install dependencies

In this example I am using my personal website so I can extract info freely but you can adapt it to something you find more useful.

In [None]:
# === 0) Setup ===
!pip -q install trafilatura markdownify tldextract bs4 html5lib lxml \
                 sentence-transformers faiss-cpu rank_bm25 tqdm rich fastapi uvicorn==0.30.6 nest_asyncio

import os, re, json, time, hashlib, urllib.parse
from urllib.parse import urljoin
from collections import deque
from datetime import datetime
from tqdm.auto import tqdm
from rich import print as rprint

import requests, tldextract
from bs4 import BeautifulSoup
import trafilatura
from markdownify import markdownify as html2md

import numpy as np
from sentence_transformers import SentenceTransformer, CrossEncoder
import faiss

# ---- EDIT THIS ----
# Crawl configuration and the shared HTTP session used by the cells below.
START_URL = "https://edgarbermudez.org/"   # your site root
# Default `max_pages` for crawl(): stop after this many HTML pages are collected.
MAX_PAGES   = 200
# Default `depth` for crawl(): links are followed at most this many hops from a seed.
CRAWL_DEPTH = 3
# Passed as `timeout=` to every session.get() call.
TIMEOUT     = 15
# Sent as the User-Agent header on every request made through `session`.
USER_AGENT  = "LLMifyFAISS/0.2 (+demo)"
# Output directory for chunks.jsonl, index.faiss and metadata.json.
SAVE_DIR    = "/content/llmify_faiss"
os.makedirs(SAVE_DIR, exist_ok=True)

# One Session is shared by all requests so the header below is applied everywhere.
session = requests.Session()
session.headers.update({"User-Agent": USER_AGENT})


## Website crawling

The idea is to extract information from the resource. You can adapt this into something that makes sense depending on the knowledge source.

In [None]:
# === 1) Crawl (sitemap first, BFS fallback) ===
def same_domain(u, base):
    """Return True when ``u`` and ``base`` share the same registrable domain.

    Args:
        u: Candidate URL (absolute).
        base: Reference URL, normally the crawl root.

    Returns:
        bool: True if both URLs resolve to the same registrable domain
        (e.g. ``blog.example.com`` and ``example.com``). When neither URL has a
        registrable domain (localhost, bare IPs, non-web schemes), the plain
        hostnames are compared instead so unrelated hosts are not treated as equal.
    """
    def _registrable(url):
        parts = tldextract.extract(url)
        # Prefer the newer property name when the installed tldextract has it;
        # fall back to `registered_domain` on older releases.
        # NOTE(review): newer tldextract releases appear to warn on
        # `registered_domain` -- confirm against the installed version.
        name = getattr(parts, "top_domain_under_public_suffix", None)
        if name is None:
            name = parts.registered_domain
        return name

    site_u, site_base = _registrable(u), _registrable(base)
    if site_u or site_base:
        return site_u == site_base

    # Both registrable domains are empty: "" == "" would wrongly match any two
    # such URLs, so compare hostnames directly.
    try:
        host_u = urllib.parse.urlsplit(u).hostname
        host_base = urllib.parse.urlsplit(base).hostname
    except ValueError:
        return False
    return bool(host_u) and host_u == host_base

def get(url):
    """Fetch ``url`` through the shared session; return the response only on HTTP 200.

    Redirects are followed. Any exception raised while requesting is swallowed
    (best-effort crawling) and reported as ``None``, the same value returned for
    non-200 replies.
    """
    try:
        response = session.get(url, timeout=TIMEOUT, allow_redirects=True)
        ok = response.status_code == 200
    except Exception:
        return None
    return response if ok else None

def discover_from_sitemap(root, max_sitemaps=25):
    """Collect same-domain page URLs advertised by the site's sitemap(s).

    Starts at ``<root>/sitemap.xml``. If a file is a sitemap *index*
    (``<sitemapindex>``), its ``<loc>`` entries are treated as further sitemap
    files to fetch rather than as pages; otherwise every ``<loc>`` is returned
    as a page URL. Without this, an index would seed the crawl with XML URLs
    only, which are all rejected as non-HTML.

    Args:
        root: Site root URL.
        max_sitemaps: Upper bound on sitemap files fetched, to keep the
            discovery step bounded on very large sites.

    Returns:
        list[str]: Page URLs on the same domain as ``root``; empty when no
        usable sitemap is found (callers fall back to BFS from the root).
    """
    queue = deque([urllib.parse.urljoin(root, "/sitemap.xml")])
    fetched, pages = set(), []
    while queue and len(fetched) < max_sitemaps:
        sitemap_url = queue.popleft()
        if sitemap_url in fetched:
            continue
        fetched.add(sitemap_url)
        try:
            r = session.get(sitemap_url, timeout=TIMEOUT)
            if r.status_code != 200 or "xml" not in r.headers.get("Content-Type", ""):
                continue
            soup = BeautifulSoup(r.text, "xml")
            is_index = soup.find("sitemapindex") is not None
            locs = [loc.text.strip() for loc in soup.find_all("loc")]
            locs = [u for u in locs if same_domain(u, root)]
        except Exception:
            # Sitemap discovery is best-effort: skip a broken file, keep the rest.
            continue
        if is_index:
            queue.extend(locs)
        else:
            pages.extend(locs)
    return pages

def crawl(root, max_pages=MAX_PAGES, depth=CRAWL_DEPTH):
    """Breadth-first crawl of ``root``'s domain, seeded from the sitemap when available.

    Args:
        root: Start URL; also defines which domain is in scope.
        max_pages: Stop once this many HTML pages have been collected.
        depth: Maximum link distance from a seed URL (seeds are depth 0).

    Returns:
        list[tuple[str, str]]: ``(url, html_text)`` for each page fetched with
        HTTP 200 and a ``text/html`` content type.
    """
    seen, out = set(), []
    seeds = discover_from_sitemap(root) or [root]
    # Fragments never change the fetched document, so drop them before
    # de-duplicating; otherwise "page#a" and "page#b" are crawled twice.
    dq = deque(
        (urllib.parse.urldefrag(u)[0], 0) for u in seeds if same_domain(u, root)
    )
    while dq and len(out) < max_pages:
        url, d = dq.popleft()
        if url in seen or d > depth:
            continue
        seen.add(url)
        resp = get(url)
        if not resp:
            continue
        ct = resp.headers.get("Content-Type", "")
        if "text/html" not in ct:
            continue
        out.append((url, resp.text))
        if d >= depth:
            continue
        soup = BeautifulSoup(resp.text, "html5lib")
        for a in soup.find_all("a", href=True):
            try:
                nxt = urllib.parse.urldefrag(urljoin(url, a["href"]))[0]
            except ValueError:
                # Malformed href (e.g. broken IPv6 literal): ignore the link.
                continue
            # Only web links are fetchable; this also covers mailto:/tel:/javascript:.
            if not nxt.startswith(("http://", "https://")):
                continue
            if nxt in seen:
                continue
            if same_domain(nxt, root):
                dq.append((nxt, d + 1))
    return out

# Run the crawl with the configured defaults; `pages` is a list of (url, html) tuples
# consumed by the extraction cell.
pages = crawl(START_URL)
rprint(f"[bold green]Crawled {len(pages)} pages[/bold green]")


  return ed.registered_domain == eb.registered_domain


## Knowledge extraction and formatting

The idea is to take the information from the website and convert it into a format that is easy for LLMs to process: Markdown.

In [None]:
# === 2) Extract main content → Markdown + metadata ===
def extract_main_html(html):
    """Return the HTML of the page's primary content region as a string.

    Preference order: the first ``<main>`` element, then the first
    ``<article>``, then ``<body>``; the whole parsed document is the last resort.
    """
    document = BeautifulSoup(html, "html5lib")
    for tag_name in ("main", "article"):
        region = document.find(tag_name)
        if region:
            return str(region)
    body = document.body
    return str(body if body else document)

def html_to_markdown(html):
    """Convert an HTML fragment to Markdown with ATX (``#``) headings.

    Runs of three or more newlines are collapsed to a single blank line and
    the result is stripped of leading/trailing whitespace.
    NOTE(review): verify how the installed markdownify treats the text inside
    tags listed in ``strip`` (script/style) -- TODO confirm it is not emitted.
    """
    markdown = html2md(html, heading_style="ATX", strip=["script","style"])
    blank_runs = re.compile(r"\n{3,}")
    collapsed = blank_runs.sub("\n\n", markdown)
    return collapsed.strip()

def extract_title_date(html):
    """Extract a display title and a publication date string from a page.

    The title is the ``<title>`` text, overridden by the first ``<h1>`` when
    that heading is longer than 5 characters. The date is the first non-blank
    value found among a list of common metadata tags, in priority order.

    Args:
        html: Full page HTML.

    Returns:
        tuple[str, str]: ``(title, date)``. The title is truncated to 200
        characters; the date is whitespace-stripped and ``""`` when no tag
        provides one. The date is returned as found (no format normalization).
    """
    soup = BeautifulSoup(html, "html5lib")
    title = soup.title.text.strip() if soup.title else ""
    h1 = soup.find("h1")
    if h1:
        heading = h1.text.strip()
        if len(heading) > 5:
            title = heading

    date = ""
    selectors = [
        ("meta", {"property": "article:published_time"}),
        ("meta", {"name": "date"}),
        ("time", {}),
        ("meta", {"property": "og:updated_time"}),
        ("meta", {"name": "pubdate"}),
    ]
    for name, attrs in selectors:
        tag = soup.find(name, attrs)
        if tag is None:
            continue
        # Strip before testing: whitespace-only <time> text must not count as a
        # date, otherwise it would stop the search and hide later selectors.
        candidate = (tag.get("content") or tag.get("datetime") or tag.text or "").strip()
        if candidate:
            date = candidate
            break
    return title[:200], date

def chunk_markdown(md, max_chars=1800, overlap=150):
    """Split Markdown into paragraph-aligned chunks of bounded size.

    Paragraphs (separated by blank lines) are packed greedily into chunks of at
    most ``max_chars`` characters. A single paragraph longer than ``max_chars``
    is hard-split into ``max_chars``-sized pieces so no base chunk exceeds the
    limit. Afterwards every chunk except the first is prefixed with the last
    ``overlap`` characters of the previous base chunk (plus a blank line), so
    final chunks may be up to ``overlap + 2`` characters longer than
    ``max_chars``.

    Args:
        md: Markdown text.
        max_chars: Target maximum size of a chunk before overlap is added.
        overlap: Number of trailing characters of the previous chunk to repeat
            at the start of the next one; ``0`` (or a negative value) disables it.

    Returns:
        list[str]: Chunks in document order; ``[]`` for empty input.
    """
    # Pre-split oversized paragraphs so the packing loop never has to emit a
    # unit larger than max_chars.
    pieces = []
    for para in re.split(r"\n\s*\n", md):
        if max_chars > 0 and len(para) > max_chars:
            pieces.extend(
                para[start:start + max_chars]
                for start in range(0, len(para), max_chars)
            )
        else:
            pieces.append(para)

    chunks, buf = [], ""
    for piece in pieces:
        # +2 accounts for the "\n\n" separator between paragraphs.
        if len(buf) + len(piece) + 2 <= max_chars:
            buf += ("\n\n" if buf else "") + piece
        else:
            if buf:
                chunks.append(buf.strip())
            buf = piece
    if buf:
        chunks.append(buf.strip())

    if overlap > 0 and len(chunks) > 1:
        overlapped = [chunks[0]]
        for prev, cur in zip(chunks, chunks[1:]):
            overlapped.append((prev[-overlap:] + "\n\n" + cur).strip())
        chunks = overlapped
    return chunks

def page_id(url):
    """Return a short, stable document id: the first 12 hex chars of the URL's MD5."""
    digest = hashlib.md5(url.encode())
    hex_digest = digest.hexdigest()
    return hex_digest[:12]

# Build one record per chunk for every crawled page, then persist them as JSONL.
records = []
for url, html in tqdm(pages, desc="Extracting"):
    main_html = extract_main_html(html)
    title, published_at = extract_title_date(html)
    md = html_to_markdown(main_html)
    # Skip near-empty pages (navigation stubs, redirects, etc.).
    if len(md) < 200:
        continue
    doc_id = page_id(url)  # same for every chunk of this page
    for i, ch in enumerate(chunk_markdown(md)):
        records.append({
            "doc_id": doc_id,
            "url": url,
            "title": title or url,
            "published_at": published_at,
            "chunk_id": i,
            "chunk_text": ch
        })

os.makedirs(SAVE_DIR, exist_ok=True)
# ensure_ascii=False writes raw non-ASCII characters, so the file encoding must be
# pinned to UTF-8 instead of relying on the platform default.
with open(os.path.join(SAVE_DIR, "chunks.jsonl"), "w", encoding="utf-8") as f:
    for r in records:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")
rprint(f"[bold cyan]Saved {len(records)} chunks[/bold cyan]")


Extracting:   0%|          | 0/8 [00:00<?, ?it/s]

## Knowledge embedding

Now we want to make the extracted knowledge easy to navigate and find.

In [None]:
# === 3) Embeddings + FAISS (cosine) ===
# Embeddings are normalized so that an inner-product index ranks by cosine similarity.
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
texts = [r["chunk_text"] for r in records]
if not texts:
    # Fail early with an actionable message rather than erroring later on an
    # empty embedding array when the index dimension is read.
    raise ValueError("No chunks to embed: the crawl/extraction step produced no records.")
embs = embed_model.encode(texts, batch_size=64, show_progress_bar=True, normalize_embeddings=True).astype("float32")

index = faiss.IndexFlatIP(embs.shape[1])  # inner-product == cosine on normalized vectors
index.add(embs)

# Persist the index and its row-aligned metadata (records[i] describes vector i).
faiss.write_index(index, os.path.join(SAVE_DIR, "index.faiss"))
with open(os.path.join(SAVE_DIR, "metadata.json"), "w") as f:
    json.dump(records, f)
rprint("[bold cyan]FAISS index + metadata saved[/bold cyan]")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

## Knowledge search

Now that the extracted knowledge lives in a searchable space, we can query it and display the retrieved information.

In [None]:
# === 4) Search helpers: plain, MMR, and re-ranking ===
# Cross-encoder used by search() to re-score (query, chunk_text) pairs; loaded once
# here so every query reuses the same model instance.
# NOTE(review): before commenting this out for speed, check that search() tolerates
# a missing `reranker`.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def faiss_search(query, k=20):
    """Return the top-``k`` nearest chunks for ``query`` from the FAISS index.

    Args:
        query: Free-text query.
        k: Maximum number of neighbours to return.

    Returns:
        list[tuple[int, float]]: ``(row_index, similarity)`` pairs in the order
        FAISS returns them. Fewer than ``k`` pairs are returned when the index
        holds fewer vectors; an empty index yields ``[]``.
    """
    # Asking for more neighbours than the index holds makes FAISS pad the result
    # with -1 labels, which would later be read as "last record" by list indexing.
    k = min(k, index.ntotal)
    if k <= 0:
        return []
    q = embed_model.encode([query], normalize_embeddings=True).astype("float32")
    D, I = index.search(q, k)
    return [(int(i), float(s)) for i, s in zip(I[0], D[0]) if i >= 0]

def mmr(query, candidates, lambda_mult=0.7, final_k=5, vectors=None):
    """Pick a diverse subset of ``candidates`` with Maximal Marginal Relevance.

    The first candidate is always taken as-is (candidates are expected to be
    ordered best-first). Each subsequent pick maximizes
    ``lambda_mult * relevance - (1 - lambda_mult) * redundancy``, where
    relevance is the candidate's own score and redundancy is its highest
    dot-product similarity to any already-chosen item.

    Args:
        query: Kept for interface compatibility; relevance comes from the
            scores already attached to ``candidates``.
        candidates: List of ``(row_index, similarity)`` pairs.
        lambda_mult: Trade-off in [0, 1]; higher favours relevance over diversity.
        final_k: Maximum number of items to return.
        vectors: Optional 2-D array of embeddings indexed by ``row_index``.
            Defaults to the module-level ``embs`` matrix.

    Returns:
        list[tuple[int, float]]: Selected candidates in pick order. The input
        list is not modified.
    """
    vecs = embs if vectors is None else vectors
    remaining = list(candidates)
    chosen = []
    while remaining and len(chosen) < final_k:
        if not chosen:
            chosen.append(remaining.pop(0))
            continue
        # Recompute against the *current* remaining items on every round so that
        # row j of the redundancy vector always describes remaining[j].
        remaining_vecs = vecs[[i for i, _ in remaining]]
        chosen_vecs = vecs[[i for i, _ in chosen]]
        redundancy = np.dot(remaining_vecs, chosen_vecs.T).max(axis=1)
        relevance = np.array([s for _, s in remaining])
        scores = lambda_mult * relevance - (1 - lambda_mult) * redundancy
        chosen.append(remaining.pop(int(np.argmax(scores))))
    return chosen

def search(query, k=8, rerank_top=20, diversify=True):
    """Retrieve, optionally diversify, and re-rank chunks for ``query``.

    Pipeline: FAISS nearest neighbours (``rerank_top`` candidates) -> optional
    MMR diversification down to ``k`` -> cross-encoder re-ranking when a
    ``reranker`` is loaded.

    Args:
        query: Free-text query.
        k: Number of records to return.
        rerank_top: Number of FAISS candidates to start from.
        diversify: Apply MMR before re-ranking.

    Returns:
        list[dict]: Up to ``k`` records, best first; ``[]`` when nothing matches.
    """
    # Drop FAISS padding (-1) and anything that has no matching metadata row.
    cands = [
        (i, s) for i, s in faiss_search(query, k=rerank_top)
        if 0 <= i < len(records)
    ]
    if not cands:
        return []
    if diversify:
        cands = mmr(query, cands, lambda_mult=0.7, final_k=k)
    idxs = [i for i, _ in cands]

    # The cross-encoder is optional: if it was never loaded, keep the order
    # produced by FAISS/MMR instead of failing with a NameError.
    ranker = globals().get("reranker")
    if ranker is not None and idxs:
        pairs = [(query, records[i]["chunk_text"]) for i in idxs]
        scores = ranker.predict(pairs)  # higher is better
        idxs = [x for _, x in sorted(zip(scores, idxs), reverse=True)]
    return [records[i] for i in idxs[:k]]

def pretty(hit):
    """Format a record as a Markdown-style link line followed by a text preview.

    The preview is the first 900 characters of ``chunk_text`` and is always
    followed by ``...``.
    """
    header = f"• [{hit['title']}]({hit['url']})"
    preview = hit['chunk_text'][:900]
    return f"{header}\n  ---\n{preview}..."

# Demo query: run the full search pipeline with its defaults and print each hit.
q = "What AI services/products does the author offer and recent projects?"
hits = search(q)
rprint(f"[bold yellow]Top hits for:[/bold yellow] {q}")
for h in hits: rprint(pretty(h))


## Simple interface: FastAPI (optional)

We can use a simple FastAPI interface to display the results for the demo.

In [None]:
# === 5) Minimal API (FastAPI) to serve retrieval from Colab ===
import nest_asyncio, uvicorn
from fastapi import FastAPI
# Presumably required because the notebook kernel already runs an event loop and
# uvicorn needs to start its own inside it -- TODO confirm.
nest_asyncio.apply()

# Application object that the /search route below is registered on.
app = FastAPI(title="LLMify-FAISS API")

@app.get("/search")
def api_search(q: str, k: int = 5):
    """GET /search: run ``search(q, k=k)`` and return title, URL and a 600-char snippet per hit."""
    payload = []
    for hit in search(q, k=k):
        payload.append({
            "title": hit["title"],
            "url": hit["url"],
            "snippet": hit["chunk_text"][:600],
        })
    return payload

# Launch the API on all interfaces, port 8000. This call is active as written;
# comment it out if you do not want this cell to start a server.
uvicorn.run(app, host="0.0.0.0", port=8000)


INFO:     Started server process [435]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [435]


## Improvements



In [None]:
# === 6) Incremental updates (recrawl & merge without rebuilding from scratch) ===
# If you re-run later, you can:
#  - recrawl
#  - extract new pages or detect changed pages
#  - embed only new/changed chunks
#  - add vectors to FAISS and append to metadata

def load_index_and_metadata():
    """Load the persisted FAISS index and its metadata list from ``SAVE_DIR``.

    Returns:
        tuple: ``(index, metadata)`` read from ``index.faiss`` and
        ``metadata.json`` respectively.
    """
    index_path = os.path.join(SAVE_DIR, "index.faiss")
    metadata_path = os.path.join(SAVE_DIR, "metadata.json")
    stored_index = faiss.read_index(index_path)
    with open(metadata_path) as fh:
        stored_records = json.load(fh)
    return stored_index, stored_records

def upsert_pages(new_pages):
    """Add chunks from ``new_pages`` that are not already indexed.

    Pages are extracted and chunked exactly like the initial build. A chunk is
    considered new when the MD5 of its text matches no existing record and no
    chunk accepted earlier in the same call. New chunks are embedded with the
    already-loaded ``embed_model``, appended to the FAISS index, ``records`` and
    ``embs``, and the index and metadata files are rewritten.

    Existing chunks are never removed or replaced, so text that disappeared
    from a changed page stays in the index.

    Args:
        new_pages: Iterable of ``(url, html)`` tuples, as returned by ``crawl``.

    Returns:
        None. Module-level ``index``, ``records`` and ``embs`` are updated.
    """
    global index, records, embs

    # Content hashes already present; also extended while scanning so duplicates
    # inside this batch (e.g. boilerplate shared by several pages) are added once.
    known_hashes = {hashlib.md5(r["chunk_text"].encode()).hexdigest() for r in records}

    add_records = []
    for url, html in tqdm(new_pages, desc="Incremental extract"):
        main_html = extract_main_html(html)
        title, published_at = extract_title_date(html)
        md = html_to_markdown(main_html)
        if len(md) < 200:
            continue
        doc_id = page_id(url)
        for i, ch in enumerate(chunk_markdown(md)):
            rec_hash = hashlib.md5(ch.encode()).hexdigest()
            if rec_hash in known_hashes:
                continue
            known_hashes.add(rec_hash)
            add_records.append({
                "doc_id": doc_id, "url": url, "title": title or url,
                "published_at": published_at, "chunk_id": i, "chunk_text": ch,
                "hash": rec_hash,
            })

    if not add_records:
        rprint("[bold green]No new chunks to add.[/bold green]")
        return

    add_texts = [r["chunk_text"] for r in add_records]
    # Reuse the model loaded for the initial build instead of reloading it.
    add_embs = embed_model.encode(
        add_texts, batch_size=64, show_progress_bar=True, normalize_embeddings=True
    ).astype("float32")
    index.add(add_embs)

    # Keep records and embs row-aligned with the index.
    records.extend(add_records)
    embs = np.concatenate([embs, add_embs], axis=0)

    faiss.write_index(index, os.path.join(SAVE_DIR, "index.faiss"))
    with open(os.path.join(SAVE_DIR, "metadata.json"), "w") as f:
        json.dump(records, f)
    rprint(f"[bold magenta]Added {len(add_records)} new chunks[/bold magenta]")

# Example usage later:
# new_pages = crawl(START_URL, max_pages=50, depth=2)
# upsert_pages(new_pages)
