## Import Statements

In [1]:
import os
import glob
import numpy as np
from typing import List, Dict
from sentence_transformers import SentenceTransformer, CrossEncoder
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings.base import Embeddings
from langchain_community.vectorstores import Neo4jVector
from langchain_community.graphs import Neo4jGraph
from huggingface_hub import login

## Setup Configuration

In [3]:
# --- Neo4j connection -------------------------------------------------------
# Read from the environment; each value is None when its variable is unset.
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# --- Corpus and index -------------------------------------------------------
# Glob pattern handed to load_pdfs(), and the vector index name handed to
# Neo4jVector.from_documents().
PDF_GLOB = "/content/*.pdf"
INDEX_NAME = "essay_chunk_agentspace"

# --- Models -----------------------------------------------------------------
# Model identifiers for the SentenceTransformer embedder and the CrossEncoder
# reranker respectively.
EMBEDDING_MODEL_NAME = "google/embeddinggemma-300m"
RERANKER_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# --- Chunking ---------------------------------------------------------------
# Passed to RecursiveCharacterTextSplitter as chunk_size / chunk_overlap.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

## Embeddings

In [4]:
class GemmaEmbeddings(Embeddings):
    """``Embeddings`` adapter backed by a ``SentenceTransformer`` model.

    Every vector handed back is divided by its L2 norm; a vector whose norm
    is not positive is returned as-is.
    """

    def __init__(self, model_name: str):
        """Load the SentenceTransformer identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def _normalize(self, v):
        """Return ``v`` scaled to unit L2 length, or ``v`` unchanged if its norm is not > 0."""
        length = np.linalg.norm(v)
        if length > 0:
            return v / length
        return v

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Encode ``texts`` in one call and return one normalized list of floats per text."""
        encoded = self.model.encode(texts, convert_to_numpy=True)
        unit_vectors = []
        for row in encoded:
            unit_vectors.append(self._normalize(row).tolist())
        return unit_vectors

    def embed_query(self, text: str) -> List[float]:
        """Encode a single ``text`` and return its normalized vector as a list of floats."""
        encoded = self.model.encode([text], convert_to_numpy=True)
        first_row = encoded[0]
        return self._normalize(first_row).tolist()

## Data Ingestion

In [5]:
def load_pdfs(path_glob: str):
    """Load every PDF matching ``path_glob`` into one flat list of documents.

    Files are visited in sorted path order. ``glob.glob`` itself returns
    matches in arbitrary order, and ``split_documents`` later assigns
    ``chunk_id`` by position, so a stable file order is what keeps chunk ids
    reproducible from one run to the next.

    Args:
        path_glob: Glob pattern selecting the PDF files to load.

    Returns:
        The concatenation of ``PyPDFLoader(file).load()`` for each matching
        file; an empty list when nothing matches.
    """
    docs = []
    for file in sorted(glob.glob(path_glob)):
        docs.extend(PyPDFLoader(file).load())
    return docs


def split_documents(docs):
    """Split ``docs`` into chunks and number them.

    A ``RecursiveCharacterTextSplitter`` configured with ``CHUNK_SIZE`` and
    ``CHUNK_OVERLAP`` produces the chunks; each chunk then gets its position
    in the resulting sequence written to ``metadata["chunk_id"]``.

    Returns:
        The chunk sequence returned by the splitter, with ``chunk_id`` set.
    """
    pieces = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    ).split_documents(docs)

    # Number chunks 0, 1, 2, ... in the order the splitter emitted them.
    position = 0
    for piece in pieces:
        piece.metadata["chunk_id"] = position
        position += 1

    return pieces

## Vector Store

In [6]:
def build_vectorstore(chunks, embeddings):
    """Create a ``Neo4jVector`` store from ``chunks`` via ``from_documents``.

    Connection details come from ``NEO4J_URI`` / ``NEO4J_USERNAME`` /
    ``NEO4J_PASSWORD`` and the index is named ``INDEX_NAME``. Nodes use the
    ``Chunk`` label, with the text in ``text`` and the vector in ``embedding``.

    Args:
        chunks: Documents to store.
        embeddings: Embedding object passed through as ``embedding``.

    Returns:
        Whatever ``Neo4jVector.from_documents`` returns.
    """
    connection = {
        "url": NEO4J_URI,
        "username": NEO4J_USERNAME,
        "password": NEO4J_PASSWORD,
    }
    node_layout = {
        "index_name": INDEX_NAME,
        "node_label": "Chunk",
        "text_node_property": "text",
        "embedding_node_property": "embedding",
    }
    return Neo4jVector.from_documents(
        documents=chunks,
        embedding=embeddings,
        **connection,
        **node_layout,
    )

## Graph Utilities

In [7]:
def build_graph_relationships(chunks):
    """Link existing ``Chunk`` nodes into per-document reading order.

    For every distinct ``metadata["source"]`` (``"unknown"`` when absent):

    * MERGE a ``Document`` node keyed by ``name`` and set its ``chunk_count``;
    * MERGE ``(Chunk)-[:PART_OF]->(Document)`` for each of its chunks;
    * MERGE ``(Chunk)-[:NEXT]->(Chunk)`` between consecutive chunks, ordered
      by ``chunk_id``.

    Chunk nodes are looked up by their ``chunk_id`` property, so they must
    already exist in the database. The relationships are written with
    ``UNWIND`` batches: at most three queries per document instead of one or
    two round-trips per chunk.

    NOTE(review): nothing visible in this notebook creates an index on
    ``Chunk.chunk_id``; if these lookups are slow on a large corpus, consider
    adding one.

    Args:
        chunks: Documents whose ``metadata`` holds ``chunk_id`` and,
            optionally, ``source``.

    Returns:
        The ``Neo4jGraph`` connection used for the writes.
    """
    graph = Neo4jGraph(
        url=NEO4J_URI,
        username=NEO4J_USERNAME,
        password=NEO4J_PASSWORD
    )

    # source name -> chunk ids belonging to that source
    ids_by_source = {}
    for c in chunks:
        src = c.metadata.get("source", "unknown")
        ids_by_source.setdefault(src, []).append(c.metadata["chunk_id"])

    for source, chunk_ids in ids_by_source.items():
        chunk_ids.sort()

        graph.query(
            "MERGE (d:Document {name: $name}) SET d.chunk_count = $n",
            {"name": source, "n": len(chunk_ids)}
        )

        graph.query(
            """
            MATCH (d:Document {name: $source})
            UNWIND $cids AS cid
            MATCH (c:Chunk {chunk_id: cid})
            MERGE (c)-[:PART_OF]->(d)
            """,
            {"source": source, "cids": chunk_ids}
        )

        # Consecutive (current, following) id pairs; empty for a single chunk.
        pairs = [
            {"c1": current, "c2": following}
            for current, following in zip(chunk_ids, chunk_ids[1:])
        ]
        if pairs:
            graph.query(
                """
                UNWIND $pairs AS pair
                MATCH (c1:Chunk {chunk_id: pair.c1})
                MATCH (c2:Chunk {chunk_id: pair.c2})
                MERGE (c1)-[:NEXT]->(c2)
                """,
                {"pairs": pairs}
            )

    return graph

## Retrieval and Reranking

In [8]:
from sentence_transformers import CrossEncoder
import numpy as np

class HybridRetriever:
    """Vector search, MMR diversification and cross-encoder reranking.

    ``retrieve`` pulls candidates from the vector store, thins them with
    maximal marginal relevance (MMR), reranks each survivor together with its
    immediate graph neighbours, and returns the best hits. Hits whose reranker
    score reaches ``strong_score_threshold`` also carry a wider neighbour
    context fetched over ``NEXT`` relationships.
    """

    def __init__(
        self,
        vectorstore,
        graph,
        reranker_model: str,
        mmr_lambda: float = 0.5,
        strong_score_threshold: float = 0.2,
    ):
        """
        Args:
            vectorstore: Store exposing ``similarity_search(query, k=...)`` and
                an ``embedding`` attribute with ``embed_query`` /
                ``embed_documents``.
            graph: Object exposing ``query(cypher, params)``.
            reranker_model: Model name passed to ``CrossEncoder``.
            mmr_lambda: Weight of relevance versus redundancy in MMR
                (1.0 = relevance only).
            strong_score_threshold: Compared directly against the value
                returned by the reranker's ``predict``; hits at or above it
                get a two-hop context.
        """
        self.vectorstore = vectorstore
        self.graph = graph
        self.reranker = CrossEncoder(reranker_model)
        self.mmr_lambda = mmr_lambda
        self.strong_score_threshold = strong_score_threshold

    def _get_context(self, chunk_id: int, k: int):
        """Fetch a chunk's text plus neighbours up to ``k`` NEXT-hops away.

        Returns:
            The first result row (keys ``text``, ``before``, ``after``), or
            ``None`` when ``k`` is not positive, ``chunk_id`` is ``None``, or
            no chunk matches.

        NOTE(review): the query has no ORDER BY, so the order of ``before`` /
        ``after`` is whatever ``collect`` yields.
        """
        # Cypher cannot take a variable-length bound as a parameter, so the hop
        # count is interpolated; int() guarantees only a number is embedded.
        hops = int(k)
        # A non-positive bound would produce an invalid pattern (*1..0 / *1..-1),
        # and a null chunk_id can never match, so skip the round-trip.
        if hops <= 0 or chunk_id is None:
            return None

        query = f"""
        MATCH (c:Chunk {{chunk_id: $cid}})
        OPTIONAL MATCH (b:Chunk)-[:NEXT*1..{hops}]->(c)
        OPTIONAL MATCH (c)-[:NEXT*1..{hops}]->(a:Chunk)
        RETURN c.text AS text,
               [x IN collect(DISTINCT b) | x.text] AS before,
               [x IN collect(DISTINCT a) | x.text] AS after
        """
        res = self.graph.query(query, {"cid": chunk_id})
        return res[0] if res else None

    def _mmr_select(self, query_embedding, doc_embeddings, docs, k, source_penalty=0.0):
        """Greedy MMR shared by ``_mmr`` and ``_mmr_document_aware``.

        Each round picks the unselected candidate maximising
        ``lambda * relevance - (1 - lambda) * redundancy [- source_penalty]``,
        where redundancy is the largest dot product with an already-picked
        embedding (0 before the first pick). Ties go to the lowest index.

        Candidates are tracked by position rather than by ``doc in selected``,
        which avoids repeated object-equality scans and keeps distinct
        candidates that happen to compare equal.
        """
        # Relevance to the query never changes between rounds; compute it once.
        relevance = [np.dot(query_embedding, emb) for emb in doc_embeddings]
        remaining = list(range(len(doc_embeddings)))
        picked = []
        used_sources = set()

        for _ in range(k):
            if not remaining:
                break

            best_idx = None
            best_score = None
            for i in remaining:
                redundancy = 0
                if picked:
                    redundancy = max(
                        np.dot(doc_embeddings[i], doc_embeddings[j]) for j in picked
                    )

                score = (
                    self.mmr_lambda * relevance[i]
                    - (1 - self.mmr_lambda) * redundancy
                )
                # Metadata is only consulted when a source penalty is in effect.
                if source_penalty and docs[i].metadata.get("source") in used_sources:
                    score = score - source_penalty

                # Strict '>' keeps the first (lowest-index) candidate on ties.
                if best_score is None or score > best_score:
                    best_idx, best_score = i, score

            picked.append(best_idx)
            remaining.remove(best_idx)
            if source_penalty:
                used_sources.add(docs[best_idx].metadata.get("source"))

        return [docs[i] for i in picked]

    def _mmr(self, query_embedding, doc_embeddings, docs, k):
        """Return up to ``k`` docs chosen by MMR, in selection order."""
        return self._mmr_select(query_embedding, doc_embeddings, docs, k)

    # TODO: document-aware variant is still being troubleshot; retrieve() does not call it.
    def _mmr_document_aware(self, query_embedding, doc_embeddings, docs, k, doc_penalty: float = 0.5):
        """MMR that also subtracts ``doc_penalty`` from any candidate whose
        ``metadata["source"]`` matches an already-picked doc."""
        return self._mmr_select(
            query_embedding, doc_embeddings, docs, k, source_penalty=doc_penalty
        )

    def retrieve(self, query: str, top_k: int = 20, rerank_k: int = 6):
        """Return the ``rerank_k`` best chunks for ``query``.

        Steps: similarity search for ``top_k`` candidates, MMR down to
        ``rerank_k * 2``, rerank each one joined with its one-hop neighbours,
        keep the top ``rerank_k``.

        Returns:
            A list of dicts with keys ``text``, ``score``, ``source``,
            ``chunk_id`` and ``context``. ``context`` is the two-hop
            neighbourhood row for hits scoring at least
            ``strong_score_threshold``, otherwise ``None``. The list is empty
            when the search finds nothing.
        """
        candidates = self.vectorstore.similarity_search(query, k=top_k)
        if not candidates:
            # Nothing to diversify or rerank; don't call the models on empty input.
            return []

        embedder = self.vectorstore.embedding
        query_emb = embedder.embed_query(query)
        # One batched call through the document-side embedding path, instead of
        # embedding each candidate separately as if it were a query.
        doc_embs = embedder.embed_documents([c.page_content for c in candidates])

        mmr_candidates = self._mmr(
            query_embedding=query_emb,
            doc_embeddings=doc_embs,
            docs=candidates,
            k=rerank_k * 2,
        )
        if not mmr_candidates:
            return []

        rerank_inputs = []
        for c in mmr_candidates:
            context = self._get_context(c.metadata.get("chunk_id"), k=1)

            combined_text = c.page_content
            if context:
                # Drop missing/empty neighbour texts so the join cannot fail on None.
                before = [t for t in (context.get("before") or []) if t]
                after = [t for t in (context.get("after") or []) if t]
                combined_text = " ".join(before + [c.page_content] + after)

            rerank_inputs.append([query, combined_text])

        scores = self.reranker.predict(rerank_inputs)

        reranked = sorted(
            zip(mmr_candidates, scores),
            key=lambda x: x[1],
            reverse=True,
        )[:rerank_k]

        results = []
        for doc, score in reranked:
            cid = doc.metadata.get("chunk_id")

            # Only strong hits get the wider context; k=0 makes _get_context return None.
            context_k = 2 if score >= self.strong_score_threshold else 0
            context = self._get_context(cid, k=context_k)

            results.append({
                "text": doc.page_content,
                "score": float(score),
                "source": doc.metadata.get("source"),
                "chunk_id": cid,
                "context": context,
            })

        return results

## Usage

In [9]:
if __name__ == "__main__":
    # Ingest: read the PDFs and cut them into numbered chunks.
    docs = load_pdfs(PDF_GLOB)
    chunks = split_documents(docs)

    # Index: embed the chunks into Neo4j, then wire up the document graph.
    embeddings = GemmaEmbeddings(EMBEDDING_MODEL_NAME)
    vectorstore = build_vectorstore(chunks, embeddings)
    graph = build_graph_relationships(chunks)

    retriever = HybridRetriever(
        vectorstore=vectorstore,
        graph=graph,
        reranker_model=RERANKER_MODEL_NAME,
    )

    # Query: run one example question and show a short preview of each hit.
    query = "How did Douglass refine his writing?"
    results = retriever.retrieve(query)

    for i, r in enumerate(results, start=1):
        print(f"\n--- Result {i} (score={r['score']:.3f}) ---")
        # Only the first 300 characters of each hit are printed.
        print(r["text"][:300])


Loading weights:   0%|          | 0/314 [00:00<?, ?it/s]

  graph = Neo4jGraph(


Loading weights:   0%|          | 0/105 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: cross-encoder/ms-marco-MiniLM-L-6-v2
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.



--- Result 1 (score=5.360) ---
means of learning and improving his writing. However, the mediums through which he was able 
to practice were severely limited in comparison to other white people his age, which is clearly 
highlighted through the contrast he uses between the conventional items people used to learn and 
what he actu

--- Result 2 (score=3.981) ---
determination. He then used this skill to doctor letters from masters as excusals, eventually 
earning his freedom and writing his own slave narrative, Narrative of the Life of Frederick 
Douglass. The way Franklin and Douglass learned to write and how they utilized their skills in 
writing demonstr

--- Result 3 (score=3.700) ---
similarities end there; Douglass does not hide his hand to get his work published and seen; he 
does so in order for his work to go unnoticed. The content of the work is also drastically 
different, with Franklin writing more long form content to express the nuances of his writing 
style and display

