In [2]:
pip install python-docx neo4j python-dotenv openai numpy tiktoken

Collecting python-docx
  Using cached python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting neo4j
  Using cached neo4j-5.28.2-py3-none-any.whl.metadata (5.9 kB)
Collecting openai
  Using cached openai-1.99.6-py3-none-any.whl.metadata (29 kB)
Collecting tiktoken
  Downloading tiktoken-0.11.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Using cached jiter-0.10.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (5.2 kB)
Using cached python_docx-1.2.0-py3-none-any.whl (252 kB)
Using cached neo4j-5.28.2-py3-none-any.whl (313 kB)
Using cached openai-1.99.6-py3-none-any.whl (786 kB)
Downloading tiktoken-0.11.0-cp312-cp312-macosx_11_0_arm64.whl (996 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m996.7/996.7 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached jiter-0.10.0-cp312-cp312-macosx_11_0_arm64.whl (320 kB)
Installing collected packages: python-docx, neo4j, jiter, tiktoken, openai
Successfully installed

In [3]:
 pip install --upgrade pip

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 25.0.1
    Uninstalling pip-25.0.1:
      Successfully uninstalled pip-25.0.1
Successfully installed pip-25.2
Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install SpeechRecognition

Collecting SpeechRecognition
  Downloading speechrecognition-3.14.3-py3-none-any.whl.metadata (30 kB)
Downloading speechrecognition-3.14.3-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m6.4 MB/s[0m  [33m0:00:05[0mm0:00:01[0m00:01[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.3
Note: you may need to restart the kernel to use updated packages.


In [1]:
pip install pyttsx3

Note: you may need to restart the kernel to use updated packages.


In [3]:
import neo4j 

In [5]:
# %% [markdown]
# Optimized Graph RAG (Notebook-Friendly)
# - Batch embeddings + Neo4j vector index
# - Chunked .docx ingestion
# - Single-turn Q&A (mic or text)
# - Robust error handling and env-driven models

# %% 
import os
import sys
import json
import time
import math
from typing import List, Tuple, Optional, Iterable

import numpy as np
import speech_recognition as sr
import pyttsx3
from docx import Document
from neo4j import GraphDatabase, basic_auth
from dotenv import load_dotenv
from openai import OpenAI

# -------------------- Config --------------------
load_dotenv()

OPENAI_API_KEY      = os.getenv("OPENAI_API_KEY")
NEO4J_URI           = os.getenv("NEO4J_URI")
NEO4J_USER          = os.getenv("NEO4J_USER")
NEO4J_PASSWORD      = os.getenv("NEO4J_PASSWORD")

# Allow model overrides without code changes
EMBEDDING_MODEL     = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")  # small is cheap & solid
CHAT_MODEL          = os.getenv("CHAT_MODEL", "gpt-4o-mini")                  # fast, good context handling

# Chunking
MAX_CHARS_PER_CHUNK = int(os.getenv("MAX_CHARS_PER_CHUNK", "2000"))  # ~ 600-800 tokens
CHUNK_OVERLAP       = int(os.getenv("CHUNK_OVERLAP", "200"))

# Retrieval
TOP_K               = int(os.getenv("TOP_K", "6"))
VECTOR_INDEX_NAME   = os.getenv("VECTOR_INDEX_NAME", "chunkEmbeddingIndex")

# TTS / STT toggles
ENABLE_TTS          = os.getenv("ENABLE_TTS", "1") == "1"

# -------------------- Clients --------------------
if not OPENAI_API_KEY:
    raise RuntimeError("OPENAI_API_KEY is not set.")
client = OpenAI(api_key=OPENAI_API_KEY)

driver = GraphDatabase.driver(NEO4J_URI, auth=basic_auth(NEO4J_USER, NEO4J_PASSWORD))

# TTS engine (lazy init)
_tts_engine = None
def speak(text: str):
    global _tts_engine
    if not ENABLE_TTS:
        return
    if _tts_engine is None:
        _tts_engine = pyttsx3.init()
    _tts_engine.say(text)
    _tts_engine.runAndWait()

# -------------------- Utilities --------------------
def batched(iterable: Iterable, batch_size: int):
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch

def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    denom = (np.linalg.norm(a) * np.linalg.norm(b)) + 1e-12
    return float(np.dot(a, b) / denom)

def chunk_text(text: str, max_chars: int, overlap: int) -> List[str]:
    text = text.replace("\r\n", "\n").strip()
    if len(text) <= max_chars:
        return [text]
    chunks = []
    start = 0
    while start < len(text):
        end = min(len(text), start + max_chars)
        chunks.append(text[start:end])
        if end == len(text):
            break
        start = max(0, end - overlap)
    return [c.strip() for c in chunks if c.strip()]

def extract_docx_text(path: str) -> str:
    try:
        doc = Document(path)
        return "\n".join([p.text for p in doc.paragraphs]).strip()
    except Exception as e:
        print(f"[WARN] Failed to read {path}: {e}")
        return ""

def ensure_vector_index(dimensions: int):
    # Neo4j 5+ native vector index (syntax may vary with version)
    cypher = f"""
    CREATE VECTOR INDEX {VECTOR_INDEX_NAME} IF NOT EXISTS
    FOR (c:Chunk) ON (c.embedding)
    OPTIONS {{
        indexConfig: {{
            `vector.dimensions`: {dimensions},
            `vector.similarity_function`: 'COSINE'
        }}
    }}
    """
    with driver.session() as session:
        try:
            session.run(cypher)
        except Exception as e:
            print(f"[WARN] Could not create vector index (will fallback to client-side similarity): {e}")

def upsert_policy_and_chunks(policy_name: str, chunks: List[str], embeddings: List[List[float]]):
    if not chunks:
        return
    if len(chunks) != len(embeddings):
        raise ValueError("Chunks and embeddings length mismatch.")
    with driver.session() as session:
        session.run(
            """
            MERGE (p:Policy {name: $name})
            ON CREATE SET p.createdAt = timestamp()
            ON MATCH  SET p.updatedAt = timestamp()
            """,
            name=policy_name
        )
        for i, (text, emb) in enumerate(zip(chunks, embeddings)):
            session.run(
                """
                MATCH (p:Policy {name: $name})
                MERGE (c:Chunk {policy: $name, idx: $idx})
                ON CREATE SET c.text = $text, c.embedding = $embedding, c.createdAt = timestamp()
                ON MATCH  SET c.text = $text, c.embedding = $embedding, c.updatedAt = timestamp()
                MERGE (p)-[:HAS_CHUNK]->(c)
                """,
                name=policy_name, idx=i, text=text, embedding=emb
            )

def embed_texts(texts: List[str]) -> List[List[float]]:
    # Batch to respect size limits comfortably
    out: List[List[float]] = []
    for batch in batched(texts, batch_size=64):
        resp = client.embeddings.create(model=EMBEDDING_MODEL, input=batch)
        out.extend([d.embedding for d in resp.data])
    return out

def ingest_policies_from_folder(folder: str):
    if not os.path.isdir(folder):
        print(f"[ERROR] Folder not found: {folder}")
        return

    # Collect all chunks to embed in batches (fewer API calls)
    to_embed: List[Tuple[str, List[str]]] = []  # (policy_name, chunks)
    total_chunks = 0

    for fname in os.listdir(folder):
        if not fname.lower().endswith(".docx"):
            continue
        path = os.path.join(folder, fname)
        text = extract_docx_text(path)
        if not text:
            continue
        chunks = chunk_text(text, MAX_CHARS_PER_CHUNK, CHUNK_OVERLAP)
        if not chunks:
            continue
        to_embed.append((fname, chunks))
        total_chunks += len(chunks)

    if total_chunks == 0:
        print("[INFO] No .docx content to ingest.")
        return

    print(f"[INFO] Embedding {total_chunks} chunks across {len(to_embed)} documents...")
    all_chunks_flat: List[str] = []
    map_offsets: List[Tuple[str, int, int]] = []  # policy_name, start_idx, end_idx
    cursor = 0
    for policy_name, chunks in to_embed:
        all_chunks_flat.extend(chunks)
        map_offsets.append((policy_name, cursor, cursor + len(chunks)))
        cursor += len(chunks)

    embeddings = embed_texts(all_chunks_flat)
    dim = len(embeddings[0])
    ensure_vector_index(dimensions=dim)

    # Write back into Neo4j grouped by policy
    for policy_name, start, end in map_offsets:
        upsert_policy_and_chunks(policy_name, all_chunks_flat[start:end], embeddings[start:end])

    print(f"[INFO] Ingest complete. {total_chunks} chunks stored.")

def vector_query(question_embedding: List[float], top_k: int) -> List[Tuple[str, str, float]]:
    """
    Try Neo4j native vector index first. If not available, fallback to client-side similarity.
    Returns: [(policy_name, text, score), ...]
    """
    # Try native vector index
    with driver.session() as session:
        try:
            res = session.run(
                f"""
                CALL db.index.vector.query($indexName, $k, $q)
                YIELD node, score
                RETURN node.policy AS policy, node.text AS text, score
                """,
                indexName=VECTOR_INDEX_NAME, k=top_k, q=question_embedding
            )
            rows = [(r["policy"], r["text"], float(r["score"])) for r in res]
            if rows:
                return rows
        except Exception as e:
            # Fall through to client-side
            print(f"[WARN] Vector index query failed; falling back to client-side similarity: {e}")

    # Fallback: fetch embeddings client-side and compute cosine
    with driver.session() as session:
        res = session.run("MATCH (c:Chunk) RETURN c.policy AS policy, c.text AS text, c.embedding AS embedding")
        items = [(r["policy"], r["text"], np.array(r["embedding"], dtype=np.float32)) for r in res]

    q = np.array(question_embedding, dtype=np.float32)
    scored = [(p, t, cosine_sim(q, e)) for (p, t, e) in items]
    scored.sort(key=lambda x: x[2], reverse=True)
    return scored[:top_k]

def build_context(hits: List[Tuple[str, str, float]]) -> Tuple[str, List[str]]:
    """
    Compose a grounded context window with light citations.
    """
    blocks = []
    policies = []
    for i, (policy, text, score) in enumerate(hits, 1):
        policies.append(policy)
        blocks.append(f"[{i}] Policy: {policy}\n---\n{text}\n")
    return "\n\n".join(blocks), sorted(list(set(policies)))

def answer_question(question: str, max_tokens: int = 500) -> str:
    q_emb = embed_texts([question])[0]
    hits = vector_query(q_emb, TOP_K)
    context, policy_list = build_context(hits)

    system = (
        "You are a policy expert assistant. "
        "Answer strictly based on the provided context. "
        "If the answer is not contained, state what is missing and ask for clarification. "
        "Cite sources using [#] tags matching the context blocks."
    )
    user = (
        f"Question: {question}\n\n"
        f"Context blocks (ranked):\n{context}\n\n"
        "Respond concisely and include [#] citations."
    )

    resp = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        temperature=0.2,
        max_tokens=max_tokens,
    )
    answer = resp.choices[0].message.content.strip()

    # Store conversation
    with driver.session() as session:
        session.run(
            """
            CREATE (c:Conversation {
                question: $q,
                answer: $a,
                policies: $pols,
                createdAt: timestamp()
            })
            """,
            q=question, a=answer, pols="; ".join(policy_list)
        )
    return answer

# -------------------- Speech-To-Text (optional) --------------------
def get_speech_input(timeout: Optional[int] = None, phrase_time_limit: Optional[int] = None) -> Optional[str]:
    r = sr.Recognizer()
    with sr.Microphone() as source:
        print("Listening... Ask your policy question.")
        try:
            audio = r.listen(source, timeout=timeout, phrase_time_limit=phrase_time_limit)
            text = r.recognize_google(audio)
            print(f"You said: {text}")
            return text
        except sr.WaitTimeoutError:
            print("No speech detected (timeout).")
        except sr.UnknownValueError:
            print("Could not understand the audio.")
        except sr.RequestError as e:
            print(f"STT request error: {e}")
    return None

# -------------------- Public entrypoints for the notebook --------------------
def ingest_folder(folder: str = "policies"):
    """Run once or re-run when docs change."""
    ingest_policies_from_folder(folder)

def ask_once(text: Optional[str] = None, use_mic: bool = False, speak_answer: bool = False) -> Optional[str]:
    """
    Single-turn Q&A for notebooks; no infinite loop.
    - If use_mic=True, captures speech; else uses `text`.
    - Returns the model's answer (and optionally speaks it).
    """
    if use_mic:
        question = get_speech_input()
    else:
        if not text or not text.strip():
            print("[ERROR] Provide a non-empty `text` or set `use_mic=True`.")
            return None
        question = text.strip()

    if not question:
        return None
    if question.lower().strip() in {"exit", "quit"}:
        print("Exit requested.")
        return None

    answer = answer_question(question)
    print("\n--- Answer ---\n" + answer)
    if speak_answer:
        speak(answer)
    return answer

# Example cells you can run in the notebook:
# 1) Ingest documents (run when documents change)
# ingest_folder("policies")
#
# 2) Ask by text
# ask_once(text="What is the PTO carryover policy?")
#
# 3) Ask by mic (single turn)
# ask_once(use_mic=True, speak_answer=True)

In [6]:
ingest_folder("policies")

[INFO] Embedding 36 chunks across 10 documents...
[WARN] Could not create vector index (will fallback to client-side similarity): {code: Neo.ClientError.Security.Unauthorized} {message: The client is unauthorized due to authentication failure.}


AuthError: {code: Neo.ClientError.Security.Unauthorized} {message: The client is unauthorized due to authentication failure.}