In [1]:
pip install python-docx langchain langchain-openai langchain-community faiss-cpu neo4j networkx fastapi uvicorn python-dotenv pydantic tiktoken rank-bm25

Collecting langchain
  Using cached langchain-0.3.27-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-openai
  Using cached langchain_openai-0.3.29-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp312-cp312-macosx_14_0_arm64.whl.metadata (5.0 kB)
Collecting fastapi
  Using cached fastapi-0.116.1-py3-none-any.whl.metadata (28 kB)
Collecting uvicorn
  Using cached uvicorn-0.35.0-py3-none-any.whl.metadata (6.5 kB)
Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting langchain-core<1.0.0,>=0.3.72 (from langchain)
  Using cached langchain_core-0.3.74-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.9 (from langchain)
  Using cached langchain_text_splitters-0.3.9-py3-none-any.whl.metadata (1.9 kB)
Collecting langsmith>=0.1.17 (from langchain)
  Using cached 

In [4]:
# %% Config
from dotenv import load_dotenv, find_dotenv
import os, pathlib, hashlib, json, time
# Load settings from the .env file located by find_dotenv(); override=True is
# passed so values from the file take precedence over already-set variables.
load_dotenv(find_dotenv(), override=True)

# Folder holding the policy documents: $DOCS_PATH when set, otherwise a
# "policies" directory under the current working directory.
DOCS_PATH = os.environ.get("DOCS_PATH", os.path.join(os.getcwd(), "policies"))
assert os.path.isdir(DOCS_PATH), f"Policies folder not found: {DOCS_PATH}"
print(f"[CONFIG] DOCS_PATH = {DOCS_PATH}")

[CONFIG] DOCS_PATH = /Users/drashteeparmar/Drashtee/Drashtee Projects/AI_Powered_Policy_Compliance_System/policies


In [11]:
# %% Neo4j connection (env-driven) + fallback
import os, sys, networkx as nx
from typing import List, Dict, Any, Tuple, Optional

# Connection settings are read from the environment; any of them may be None.
NEO4J_URI = os.getenv("NEO4J_URI")          # e.g., neo4j://localhost:7687 or neo4j+s://<auradb>
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# The neo4j driver is optional. When it cannot be imported, BOTH names are set
# to None so that later code can test them instead of hitting a NameError on
# `basic_auth`.
try:
    from neo4j import GraphDatabase, basic_auth
except Exception:
    GraphDatabase = None
    basic_auth = None

class GraphStore:
    """Graph backend holder: a Neo4j driver when reachable, else an in-memory graph.

    Attributes:
        enabled: True only after a probe query against Neo4j succeeded.
        driver: The open Neo4j driver, or None when Neo4j is not in use.
        G: A networkx MultiDiGraph that is always created, so an in-memory
            graph is available whenever `enabled` is False.
    """

    def __init__(self, uri: Optional[str], user: Optional[str], pwd: Optional[str]):
        """Try to connect to Neo4j; on any failure stay on the in-memory graph.

        Args:
            uri: Neo4j connection URI; a falsy value skips the connection attempt.
            user: Username passed to `basic_auth`.
            pwd: Password passed to `basic_auth`.
        """
        self.enabled = False
        self.driver = None
        self.G = nx.MultiDiGraph()
        if uri and GraphDatabase:
            driver = None
            try:
                driver = GraphDatabase.driver(uri, auth=basic_auth(user, pwd))
                # Probe query: creating the driver alone does not prove the
                # server is reachable or that the credentials are accepted.
                with driver.session() as s:
                    s.run("RETURN 1")
            except Exception as e:
                print("[NEO4J] Connection failed, using in-memory graph:", repr(e))
                # Release the half-initialised driver instead of leaving it
                # open on the instance while `enabled` is False.
                if driver is not None:
                    try:
                        driver.close()
                    except Exception:
                        pass
            else:
                self.driver = driver
                self.enabled = True
                print(f"[NEO4J] Connected: {uri}")
        else:
            print("[NEO4J] Driver not available or URI missing; using in-memory graph.")

    def close(self):
        """Close the Neo4j driver if one is open; safe to call more than once.

        After closing, `driver` is None and `enabled` is False, so code that
        checks `enabled` no longer tries to open sessions on a closed driver.
        """
        driver, self.driver = self.driver, None
        self.enabled = False
        if driver:
            driver.close()

# Single shared graph backend for the notebook, built from the NEO4J_* settings.
graph_store = GraphStore(uri=NEO4J_URI, user=NEO4J_USERNAME, pwd=NEO4J_PASSWORD)

[NEO4J] Connected: bolt://localhost:7687


In [12]:
# %% Entities (LLM + regex fallback)
import re, json
from typing import List, Dict, Any
from openai import OpenAI

# Shared OpenAI client; other cells use it for chat completions and audio transcription.
oai = OpenAI()  # reads OPENAI_API_KEY from env
print("[OPENAI] Client initialized from .env")

# Prompt template for entity extraction. `{text}` marks where the section text goes.
# NOTE(review): the template also contains literal JSON braces, which str.format()
# would try to parse as replacement fields — confirm how the slot is filled.
ENTITY_PROMPT = """Extract key entities from the text. Return JSON:
{"entities":[{"name":"string","type":"Policy|Procedure|Fee|TimeWindow|Reward|Other"}]}.
Text:
```{text}```"""

# Keyword rules used when the LLM path is unavailable: (pattern, entity) pairs.
# `3%` is followed by a negative lookahead rather than `\b`: a word boundary
# after `%` only exists when a word character follows, so "3% fee" never matched.
_FALLBACK_RULES = (
    (re.compile(r"\bforeign transaction\b|\bfx fee\b|\b3%(?!\w)", re.I),
     {"name": "Foreign Transaction Fee", "type": "Fee"}),
    (re.compile(r"\bdispute\b|\b60 days?\b", re.I),
     {"name": "Dispute Window", "type": "TimeWindow"}),
    (re.compile(r"\brewards?\b|\bpoints\b|\b1\.5x\b", re.I),
     {"name": "Rewards", "type": "Reward"}),
)


def _regex_entities(text: str) -> List[Dict[str, Any]]:
    """Return the fallback entities whose keyword pattern occurs in `text`."""
    return [dict(entity) for pattern, entity in _FALLBACK_RULES if pattern.search(text)]


def _clean_entities(items: Any) -> List[Dict[str, Any]]:
    """Keep only dict entries with a non-empty string name; default the type to "Other"."""
    cleaned: List[Dict[str, Any]] = []
    if not isinstance(items, list):
        return cleaned
    for item in items:
        if not isinstance(item, dict):
            continue
        name = item.get("name")
        if isinstance(name, str) and name.strip():
            cleaned.append({**item, "name": name.strip(), "type": item.get("type") or "Other"})
    return cleaned


def llm_entities(text: str) -> List[Dict[str, Any]]:
    """Extract typed entities from `text` with the LLM, falling back to keyword rules.

    Args:
        text: Section text. Only the first 4000 characters are sent to the model;
            the keyword fallback scans the full text.

    Returns:
        A list of dicts that each carry a non-empty "name" and a "type".
        Any failure on the LLM path (request error, non-JSON reply, unexpected
        shape) is reported with a log line and answered by the keyword rules.
    """
    try:
        # The template contains literal JSON braces, so str.format() raises on
        # it before any request is made. Substitute the single slot directly.
        prompt = ENTITY_PROMPT.replace("{text}", text[:4000])
        resp = oai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,
        )
        # `content` can be None; treat that like an empty (unparseable) reply.
        raw = (resp.choices[0].message.content or "").strip()
        data = json.loads(re.sub(r"```json|```", "", raw))
        return _clean_entities(data.get("entities", []))
    except Exception as e:
        # Deliberate best-effort: log why the LLM path failed, then use the rules.
        print("[ENTITIES] LLM extraction failed, using regex fallback:", repr(e))
        return _regex_entities(text)

[OPENAI] Client initialized from .env


In [13]:
# %% Graph ingestion
from collections import defaultdict

def graph_upsert(chunks: List['SectionChunk']):
    """Write documents, sections, section order and entity mentions into the graph.

    For every chunk the entities are extracted with `llm_entities`. The data
    is then written to Neo4j when `graph_store.enabled` is True, otherwise to
    the in-memory graph `graph_store.G`. Both paths are safe to re-run on the
    same chunks: Neo4j uses MERGE, and the in-memory edges are keyed by their
    label so a repeated upsert updates an edge instead of adding a parallel one.

    Args:
        chunks: Section chunks; the attributes read here are `doc_id`,
            `section_id`, `heading`, `order` and `text`.
    """
    # Extract entities per section
    sec_entities: Dict[str, List[Dict[str, Any]]] = {}
    for c in chunks:
        # An entity without a name cannot be keyed in the graph; drop it here
        # so neither backend fails on ent["name"] later.
        sec_entities[c.section_id] = [
            ent for ent in llm_entities(c.text)
            if isinstance(ent, dict) and ent.get("name")
        ]

    if graph_store.enabled:
        with graph_store.driver.session() as s:
            # Basic schema
            s.run("CREATE CONSTRAINT IF NOT EXISTS FOR (d:Document) REQUIRE d.doc_id IS UNIQUE;")
            s.run("CREATE CONSTRAINT IF NOT EXISTS FOR (s:Section) REQUIRE s.section_id IS UNIQUE;")
            s.run("CREATE CONSTRAINT IF NOT EXISTS FOR (e:Entity) REQUIRE e.name IS UNIQUE;")
            # Upsert Docs & Sections
            for c in chunks:
                s.run("MERGE (d:Document {doc_id:$doc})", {"doc": c.doc_id})
                s.run("""
                    MERGE (sc:Section {section_id:$sid})
                    SET sc.heading=$h
                    WITH sc
                    MATCH (d:Document {doc_id:$doc})
                    MERGE (d)-[:HAS_SECTION]->(sc)
                """, {"sid": c.section_id, "h": c.heading, "doc": c.doc_id})
            # FOLLOWS (sequential): link neighbours in (doc_id, order) sequence,
            # never across a document boundary.
            chunks_sorted = sorted(chunks, key=lambda x: (x.doc_id, x.order))
            for a, b in zip(chunks_sorted, chunks_sorted[1:]):
                if a.doc_id == b.doc_id:
                    s.run("""
                        MATCH (sa:Section {section_id:$a}),(sb:Section {section_id:$b})
                        MERGE (sa)-[:FOLLOWS]->(sb)
                    """, {"a": a.section_id, "b": b.section_id})
            # Entities + MENTIONS
            for sid, ents in sec_entities.items():
                for ent in ents:
                    s.run("MERGE (e:Entity {name:$n}) SET e.type=$t", {"n": ent["name"], "t": ent.get("type","Other")})
                    s.run("""
                        MATCH (s:Section {section_id:$sid}),(e:Entity {name:$n})
                        MERGE (s)-[:MENTIONS]->(e)
                    """, {"sid": sid, "n": ent["name"]})
        print("[GRAPH] Neo4j upsert complete.")
    else:
        G = graph_store.G
        # `key=<label>` makes each (u, v, label) edge unique in the multigraph,
        # so calling this function again does not pile up duplicate edges.
        for c in chunks:
            G.add_node(c.doc_id, label="Document")
            G.add_node(c.section_id, label="Section", heading=c.heading)
            G.add_edge(c.doc_id, c.section_id, key="HAS_SECTION", label="HAS_SECTION")
        chunks_sorted = sorted(chunks, key=lambda x: (x.doc_id, x.order))
        for a, b in zip(chunks_sorted, chunks_sorted[1:]):
            if a.doc_id == b.doc_id:
                G.add_edge(a.section_id, b.section_id, key="FOLLOWS", label="FOLLOWS")
        for c in chunks:
            for ent in sec_entities[c.section_id]:
                eid = f"ENT::{ent['name']}"
                G.add_node(eid, label="Entity", type=ent.get("type","Other"))
                G.add_edge(c.section_id, eid, key="MENTIONS", label="MENTIONS")
        print(f"[GRAPH] In-memory graph nodes={G.number_of_nodes()} edges={G.number_of_edges()}")

# Call after your (re)indexing step has produced `chunks`
# If you keep only `docs`, regenerate the current set of `chunks` for graph:
def rebuild_chunks_from_folder(folder: str) -> List['SectionChunk']:
    """Parse every .docx document directly inside `folder` into section chunks.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        The concatenated chunks of all documents, in file-name order.
        Returns an empty list when the folder holds no .docx documents.
    """
    all_chunks: List['SectionChunk'] = []
    # os.listdir order is arbitrary; sorting makes repeated runs comparable.
    for name in sorted(os.listdir(folder)):
        if not name.lower().endswith(".docx"):
            continue
        # "~$name.docx" is the lock file Word keeps next to an open document.
        # It carries the .docx extension but is not a parseable document.
        if name.startswith("~$"):
            continue
        path = os.path.join(folder, name)
        if os.path.isfile(path):
            all_chunks += parse_docx_heading_aware(path)
    return all_chunks

# Populate the graph right away from the documents currently in DOCS_PATH
_chunks_for_graph = rebuild_chunks_from_folder(folder=DOCS_PATH)
graph_upsert(chunks=_chunks_for_graph)

[INGEST] 6. Foreign Transaction Fees and Currency Conversion Policy.docx → 11 chunks
[INGEST] 1. Cardholder Agreement Terms.docx → 11 chunks
[INGEST] 8. Promotional Offers and Balance Transfers Policy .docx → 11 chunks
[INGEST] 5. Dispute Resolution Policy.docx → 11 chunks
[INGEST] 4. Rewards Program Terms.docx → 11 chunks
[INGEST] 9. Privacy and Data Sharing Policy.docx → 11 chunks
[INGEST] Card_Benefits_and_Fees.docx → 2 chunks
[INGEST] 3. Fraud Protection and Chargeback Policy.docx → 11 chunks
[INGEST] 10. Legal and Compliance Policy.docx → 11 chunks
[INGEST] 2. Billing & Payment Policy.docx → 11 chunks
[INGEST] 7. Card Benefits (Travel Insurance, Purchase Protection, and Warranty).docx → 9 chunks
[GRAPH] Neo4j upsert complete.


In [14]:
# %% Graph expansion utilities
def graph_expand_sections(section_ids: List[str], hops: int = 1) -> List[str]:
    """Grow a list of section ids with sections reachable through the graph.

    Runs `hops` rounds; each round starts from the sections found in the
    previous round (the seeds for the first round).

    - Neo4j (`graph_store.enabled`): a round follows *outgoing*
      FOLLOWS/HAS_SECTION/MENTIONS relationships one and two steps away from
      each frontier section, and keeps the nodes labelled Section.
    - In-memory graph: a round looks at edges leaving *and* entering each
      frontier section, one step away, and keeps neighbours labelled Section.

    NOTE(review): the two backends therefore do not expand identically —
    confirm which traversal is the intended one.

    Args:
        section_ids: Seed section ids.
        hops: Number of expansion rounds.

    Returns:
        The seeds plus every discovered section id, in no particular order
        (the list is built from a set). A falsy `section_ids` is returned as is.
    """
    if not section_ids:
        return section_ids

    if graph_store.enabled:
        with graph_store.driver.session() as s:
            reached = set(section_ids)
            frontier = set(section_ids)
            for _ in range(hops):
                discovered = set()
                for sid in list(frontier):
                    res = s.run("""
                        MATCH (s:Section {section_id:$sid})
                        OPTIONAL MATCH (s)-[:FOLLOWS|HAS_SECTION|MENTIONS]->(x)
                        OPTIONAL MATCH (x)-[:FOLLOWS|HAS_SECTION|MENTIONS]->(y)
                        WITH COLLECT(DISTINCT x) + COLLECT(DISTINCT y) AS n
                        UNWIND n AS z
                        WITH DISTINCT z
                        WHERE z:Section
                        RETURN z.section_id as sid
                    """, {"sid": sid})
                    # Only ids not reached in earlier rounds go to the next frontier.
                    discovered.update(r["sid"] for r in res if r["sid"] not in reached)
                reached |= discovered
                frontier = discovered
            return list(reached)

    # In-memory fallback
    G = graph_store.G
    rel_labels = {"FOLLOWS","HAS_SECTION","MENTIONS"}

    def is_section(node) -> bool:
        return G.nodes[node].get("label")=="Section"

    reached = set(section_ids)
    frontier = set(section_ids)
    for _ in range(hops):
        neighbours = set()
        for sid in list(frontier):
            neighbours.update(
                dst for _src, dst, data in G.out_edges(sid, data=True)
                if data.get("label") in rel_labels and is_section(dst)
            )
            neighbours.update(
                src for src, _dst, data in G.in_edges(sid, data=True)
                if data.get("label") in rel_labels and is_section(src)
            )
        reached |= neighbours
        frontier = neighbours
    return list(reached)

# Hybrid + Graph expansion: replace your hybrid_retrieve with this wrapper
def hybrid_retrieve_graph(query: str, k_dense: int=5, k_kw: int=5, graph_hops: int=1, max_extra: int=4):
    """Hybrid retrieval followed by graph expansion, capped for synthesis.

    The hits from `hybrid_retrieve` always come first, in the order that
    function returned them (presumably best-first — confirm against
    `hybrid_retrieve`). Sections added by `graph_expand_sections` follow in
    corpus order (their position in `docs`). This way the size cap can only
    drop graph neighbours, never the hits the query actually retrieved.

    Args:
        query: User question.
        k_dense: Passed through to `hybrid_retrieve`.
        k_kw: Passed through to `hybrid_retrieve`.
        graph_hops: Expansion rounds passed to `graph_expand_sections`.
        max_extra: How many documents beyond max(k_dense, k_kw) may be returned.

    Returns:
        At most max(k_dense, k_kw) + max_extra documents, one per section id.
    """
    base = hybrid_retrieve(query, k_dense=k_dense, k_kw=k_kw)  # uses FAISS+BM25
    seed_sids = [d.metadata["section_id"] for d in base]
    expanded_sids = graph_expand_sections(seed_sids, hops=graph_hops)

    out = []
    seen = set()
    # 1) retrieved hits, deduplicated by section id, original order kept
    for d in base:
        sid = d.metadata["section_id"]
        if sid not in seen:
            out.append(d); seen.add(sid)
    # 2) graph neighbours, materialized back to Documents from the corpus
    sid_set = set(expanded_sids)
    for d in docs:
        sid = d.metadata["section_id"]
        if sid in sid_set and sid not in seen:
            out.append(d); seen.add(sid)
    print(f"[GRAPH-RETRIEVE] seeds={len(seed_sids)} expanded={len(expanded_sids)} final={len(out)}")
    # keep it tight for synthesis
    return out[: max(k_dense, k_kw) + max(0, max_extra)]

In [15]:
# %% Synthesis (grounded)
# Instruction header for answer synthesis. synthesize_answer embeds it at the
# top of the user prompt it builds.
SYSTEM = """You are a customer-support QA assistant.
- Ground all answers ONLY on provided CONTEXT.
- If insufficient, say "needs clarification" and ask a precise follow-up.
- Output BOTH: (1) concise natural-language answer with [doc_id:section_id] citations, and (2) a strict JSON block matching the schema.
"""

# Shape of the JSON object the model is asked to return; inserted verbatim into
# the prompt as text. Nothing in this cell validates replies against it.
JSON_SCHEMA = """
{
  "answer": "string",
  "support": [{"doc_id":"string","section":"string","snippet":"string","score": 0.0}],
  "entities": [{"name":"string","type":"string"}],
  "disposition": "final|needs_clarification",
  "confidence": 0.0,
  "recommended_next_steps": ["string"]
}
"""

# Conversation log: answer_query appends one dict per turn and
# synthesize_answer passes it to summarize_history.
memory: List[Dict[str, Any]] = []

def summarize_history(history: List[Dict[str, Any]]) -> str:
    """Render the most recent turn as a two-line "Prev Q / Prev A" summary.

    Args:
        history: Turn dicts; the "user" and "answer" keys of the last one are read.

    Returns:
        "" when there is no history, otherwise the last question and the first
        400 characters of the last answer.
    """
    if history:
        latest = history[-1]
        question = latest.get('user','')
        answer_head = latest.get('answer','')[:400]
        return f"Prev Q: {question}\nPrev A: {answer_head}"
    return ""

def remember_salient_ids(retrieved_docs: List[Any]) -> List[str]:
    """Return the distinct section ids of the retrieved documents.

    Args:
        retrieved_docs: Objects exposing `metadata["section_id"]`.

    Returns:
        Section ids without duplicates, in first-seen order. A dict is used
        for the dedup instead of a set so the result is the same on every run.
    """
    return list(dict.fromkeys(d.metadata["section_id"] for d in retrieved_docs))

# Markdown code-fence markers the model wraps around its JSON block.
_FENCE_RE = re.compile(r"```json|```")


def _fallback_payload(answer: str, support: List[Dict[str, Any]], disposition: str, confidence: float) -> Dict[str, Any]:
    """Build the schema-shaped payload used when the model's JSON is missing or unusable."""
    return {"answer": answer, "support": support, "entities": [], "disposition": disposition, "confidence": confidence, "recommended_next_steps": []}


def _split_answer_and_json(txt: str, support: List[Dict[str, Any]]) -> Tuple[str, Dict[str, Any]]:
    """Separate the prose answer from the JSON object in a model reply."""
    first = txt.find("{")
    if first == -1:
        nat = txt.strip()
        return nat, _fallback_payload(nat, support, "needs_clarification", 0.3)

    # The reply usually opens a ```json fence right before the object. Without
    # this cleanup the fence marker is left dangling at the end of the prose.
    nat = _FENCE_RE.sub("", txt[:first]).strip()
    try:
        # raw_decode parses exactly one JSON value and ignores whatever follows
        # it (closing fence, trailing remarks), which json.loads would reject.
        js, _end = json.JSONDecoder().raw_decode(txt[first:])
        if not isinstance(js, dict):
            raise ValueError("model JSON is not an object")
        if not js.get("support"):
            js["support"] = support
    except Exception:
        js = _fallback_payload(nat, support, "final", 0.6)
    # A JSON-only reply has no prose part; reuse the JSON's own answer text.
    if not nat and isinstance(js.get("answer"), str):
        nat = js["answer"].strip()
    return nat, js


def synthesize_answer(user_query: str, retrieved_docs: List[Any]) -> Tuple[str, Dict[str, Any]]:
    """Ask the model for a grounded answer plus a structured JSON payload.

    Args:
        user_query: The user's question.
        retrieved_docs: Context documents exposing `page_content` and
            `metadata["doc_id"]` / `metadata["section_id"]`.

    Returns:
        (nat, js): the natural-language answer without code-fence markers, and
        the parsed JSON object. When the reply holds no JSON, `js` is a
        fallback payload with disposition "needs_clarification"; when the JSON
        cannot be parsed, a fallback payload with disposition "final". A
        missing or empty "support" list is replaced by the retrieved snippets.
    """
    context_blocks, support = [], []
    for d in retrieved_docs:
        sid = d.metadata["section_id"]; docid = d.metadata["doc_id"]
        context_blocks.append(f"[{docid}:{sid}] {d.page_content}")
        support.append({"doc_id": docid, "section": sid, "snippet": d.page_content[:400], "score": 0.0})

    context_text = "\n\n".join(context_blocks) if context_blocks else "(no context)"
    hist_summary = summarize_history(memory)

    prompt = f"""{SYSTEM}
JSON schema:
{JSON_SCHEMA}

HISTORY:
{hist_summary}

CONTEXT:
{context_text}

USER QUESTION:
{user_query}

Respond with:
1) 2-4 sentence answer with [doc_id:section_id] citations.
2) A JSON object strictly following the schema.
"""
    resp = oai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role":"system","content":"You are precise and follow schemas."},
                  {"role":"user","content":prompt}],
        temperature=0.0,
    )
    # `content` can be None; treat that as an empty reply.
    txt = resp.choices[0].message.content or ""
    nat, js = _split_answer_and_json(txt, support)

    print("[SYNTH] Answer:")
    print(nat)
    return nat, js

def answer_query(query: str, graph_hops: int=1):
    """Run one question through retrieval and synthesis, and log the turn.

    Args:
        query: The user's question.
        graph_hops: Graph expansion rounds passed to `hybrid_retrieve_graph`.

    Returns:
        The (answer text, JSON payload) pair produced by `synthesize_answer`.
        As a side effect one entry is appended to the module-level `memory`.
    """
    hits = hybrid_retrieve_graph(query, k_dense=5, k_kw=5, graph_hops=graph_hops)
    answer_text, payload = synthesize_answer(query, hits)
    turn = {"user": query, "answer": answer_text, "salient_ids": remember_salient_ids(hits)}
    memory.append(turn)
    return answer_text, payload

In [17]:
pip install python-multipart

Collecting python-multipart
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Downloading python_multipart-0.0.20-py3-none-any.whl (24 kB)
Installing collected packages: python-multipart
Successfully installed python-multipart-0.0.20
Note: you may need to restart the kernel to use updated packages.


In [18]:
# %% STT + API
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import JSONResponse
import tempfile
import uvicorn

# Create the FastAPI app only once; re-running this cell keeps the existing one.
# NOTE(review): the route decorators below run again on every re-run and add
# their routes to the reused app a second time — confirm that is acceptable.
if "app" not in globals():
    app = FastAPI(title="Customer Support Graph-RAG")

def transcribe_audio(file_path: str) -> Dict[str, Any]:
    """Transcribe an audio file with the `whisper-1` model.

    Args:
        file_path: Path of the audio file to send.

    Returns:
        {"text": str, "segments": list of dict}. Both are empty when the
        response carries no text / no segments.
    """
    with open(file_path, "rb") as f:
        tr = oai.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            response_format="verbose_json"
        )
    text = getattr(tr, "text", "") or ""
    # `segments` may be missing or present-but-None; `or []` covers both, where
    # the hasattr() check alone would try to iterate None.
    segs = [dict(s) for s in (getattr(tr, "segments", None) or [])]
    print(f"[STT] text_len={len(text)} segments={len(segs)}")
    return {"text": text, "segments": segs}

@app.post("/stt")
async def stt_endpoint(audio: UploadFile = File(...), graph_hops: int = 1):
    """Transcribe an uploaded audio file and answer the transcribed question.

    Args:
        audio: Uploaded audio file.
        graph_hops: Graph expansion rounds passed to `answer_query`.

    Returns:
        JSON with "transcript", "answer" and "json"; HTTP 400 with an "error"
        field when the transcription is empty.
    """
    # Keep the upload's extension on the temp file: the transcription service
    # presumably uses the file name to recognise the audio format (TODO confirm).
    # Only a short alphanumeric extension is accepted from the client-supplied name.
    suffix = os.path.splitext(audio.filename or "")[1]
    if not (1 < len(suffix) <= 9 and suffix[1:].isalnum()):
        suffix = ""
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(await audio.read())
        tmp_path = tmp.name
    try:
        stt = transcribe_audio(tmp_path)
    finally:
        # delete=False above means nobody else removes the file; without this
        # every request leaves one upload behind in the temp directory.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
    q = stt["text"].strip()
    if not q:
        return JSONResponse({"error":"empty transcription"}, status_code=400)
    nat, js = answer_query(q, graph_hops=graph_hops)
    return JSONResponse({"transcript": q, "answer": nat, "json": js})

@app.post("/chat")
async def chat_endpoint(query: str = Form(...), graph_hops: int = 1):
    """Answer a text question submitted as the form field `query`.

    Returns JSON with the answer text under "answer" and the structured
    payload under "json".
    """
    print(f"[CHAT] {query}")
    answer_text, payload = answer_query(query, graph_hops=graph_hops)
    body = {"answer": answer_text, "json": payload}
    return JSONResponse(body)

# Optional: hot reindex without restart (works with your dynamic indexer)
@app.post("/reindex")
def reindex_api(force: bool=False):
    """Rebuild the retrieval index, then refresh the graph from DOCS_PATH.

    Args:
        force: Passed through to `reindex_policies`.

    Returns:
        JSON {"ok": True, "docs": <number of entries in `docs`>}.
    """
    print("[API] /reindex called")
    reindex_policies(force=force)
    # The graph is rebuilt from the folder contents, not from the index.
    graph_upsert(rebuild_chunks_from_folder(DOCS_PATH))
    doc_count = len(docs)
    return JSONResponse({"ok": True, "docs": doc_count})

In [19]:
# %% E2E smoke tests (text)
# Two questions are sent through the full pipeline (retrieval, graph expansion,
# synthesis). Each call also appends a turn to `memory`, so the second question
# is answered with the first one as history.
q1 = "If I use my card abroad, will I be charged a fee?"
nat, js = answer_query(q1, graph_hops=1)
print("\n--- NAT ---\n", nat)
# Only the first 1000 characters of the pretty-printed JSON are shown.
print("\n--- JSON ---\n", json.dumps(js, indent=2)[:1000])

q2 = "What is the dispute reporting window?"
# The JSON payload of the second answer is discarded; only the prose is printed.
nat, _ = answer_query(q2, graph_hops=1)
print("\n--- NAT ---\n", nat)

[RETRIEVE] dense=5 kw=5 → fused=10
[GRAPH-RETRIEVE] seeds=5 expanded=5 final=5
[SYNTH] Answer:
Yes, if you use your card abroad, you will incur a 3% foreign transaction fee on all purchases made in a foreign currency or outside your home country. This fee is calculated as 3% of the total transaction amount and will be added to your statement in your home currency [6. Foreign Transaction Fees and Currency Conversion Policy-35ea3262:6. Foreign Transaction Fees and Currency Conversion Policy-35ea3262::sec1]. Additionally, if you make cash withdrawals from an ATM outside your home country, a $5 fixed fee will apply along with the 3% foreign transaction fee [6. Foreign Transaction Fees and Currency Conversion Policy-35ea3262:6. Foreign Transaction Fees and Currency Conversion Policy-35ea3262::sec3].

```json

--- NAT ---
 Yes, if you use your card abroad, you will incur a 3% foreign transaction fee on all purchases made in a foreign currency or outside your home country. This fee is calcula

In [20]:
import os, tempfile, time, json
from typing import Dict, Any, Optional

# --- 1) Transcribe an audio FILE with Whisper ---
def transcribe_audio_file(path: str) -> Dict[str, Any]:
    """Transcribe an existing audio file with the `whisper-1` model.

    Args:
        path: Path of the audio file; must exist.

    Returns:
        {"text": str, "segments": list of dict}. Both are empty when the
        response carries no text / no segments.

    Raises:
        AssertionError: If `path` does not exist.
    """
    assert os.path.exists(path), f"Audio file not found: {path}"
    with open(path, "rb") as f:
        tr = oai.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            response_format="verbose_json"
        )
    text = getattr(tr, "text", "") or ""
    # `segments` may be missing or present-but-None; `or []` covers both, where
    # the hasattr() check alone would try to iterate None.
    segs = [dict(s) for s in (getattr(tr, "segments", None) or [])]
    print(f"[STT:file] text_len={len(text)} segments={len(segs)}")
    return {"text": text, "segments": segs}

# --- 2) Capture mic → WAV → transcribe ---
def record_microphone(seconds: float = 8.0, samplerate: int = 16000) -> str:
    """
    Records from the default microphone for `seconds` and writes a temp WAV.
    Returns path to the WAV file.

    The recording is mono, 16-bit integer samples. The file is created with
    delete=False, so the caller is responsible for removing it.
    """
    import sounddevice as sd
    from scipy.io.wavfile import write as wav_write

    print(f"[REC] Recording {seconds}s @ {samplerate}Hz… Speak now.")
    audio = sd.rec(int(seconds * samplerate), samplerate=samplerate, channels=1, dtype='int16')
    sd.wait()  # block until the recording has finished
    # Only the reserved file name is needed. Close the handle right away
    # instead of leaving it open for the life of the kernel; wav_write then
    # opens the path itself.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        wav_path = tmp.name
    wav_write(wav_path, samplerate, audio)
    print(f"[REC] Saved: {wav_path}")
    return wav_path

def stt_to_answer_from_file(path: str, graph_hops: int = 1) -> Dict[str, Any]:
    """Transcribe an audio file and answer the spoken question.

    Args:
        path: Audio file to transcribe.
        graph_hops: Graph expansion rounds passed to `answer_query`.

    Returns:
        {"transcript", "answer", "json"}; all three are empty when the
        transcription yields no text.
    """
    transcript = transcribe_audio_file(path)["text"].strip()
    if transcript:
        print(f"[PIPELINE] User said: {transcript}")
        answer_text, payload = answer_query(transcript, graph_hops=graph_hops)
        return {"transcript": transcript, "answer": answer_text, "json": payload}
    print("[STT] Empty transcript.")
    return {"transcript": "", "answer": "", "json": {}}

def stt_to_answer_from_mic(seconds: float = 8.0, graph_hops: int = 1) -> Dict[str, Any]:
    """Record from the microphone, transcribe the recording and answer it.

    Args:
        seconds: Recording length passed to `record_microphone`.
        graph_hops: Graph expansion rounds passed to `answer_query`.

    Returns:
        The dict produced by `stt_to_answer_from_file`. The temporary WAV file
        is removed afterwards, whether or not answering succeeded.
    """
    wav_path = record_microphone(seconds=seconds)
    try:
        return stt_to_answer_from_file(wav_path, graph_hops=graph_hops)
    finally:
        # Best-effort cleanup: ignore file-system errors only. A bare `except`
        # here would also swallow KeyboardInterrupt and SystemExit.
        try:
            os.remove(wav_path)
        except OSError:
            pass

In [22]:
pip install sounddevice scipy

Collecting sounddevice
  Downloading sounddevice-0.5.2-py3-none-macosx_10_6_x86_64.macosx_10_6_universal2.whl.metadata (1.6 kB)
Downloading sounddevice-0.5.2-py3-none-macosx_10_6_x86_64.macosx_10_6_universal2.whl (108 kB)
Installing collected packages: sounddevice
Successfully installed sounddevice-0.5.2
Note: you may need to restart the kernel to use updated packages.


In [25]:
# --- Mic demo: speak a question like "Do I pay a foreign transaction fee abroad?" ---
# Records 6 seconds from the default microphone, then transcribes and answers.
result = stt_to_answer_from_mic(seconds=6, graph_hops=1)
print("\n[TRANSCRIPT]\n", result["transcript"])
print("\n[ANSWER]\n", result["answer"])
# Only the first 1200 characters of the pretty-printed JSON are shown.
print("\n[JSON]\n", json.dumps(result["json"], indent=2)[:1200])

[REC] Recording 6s @ 16000Hz… Speak now.
[REC] Saved: /var/folders/6v/dnjkft556qngc04m0chvx21h0000gn/T/tmp8gogcbwh.wav
[STT:file] text_len=58 segments=1
[PIPELINE] User said: Does the foreign transactions fee apply to the cardholder?
[RETRIEVE] dense=5 kw=5 → fused=6
[GRAPH-RETRIEVE] seeds=5 expanded=5 final=5
[SYNTH] Answer:
Yes, a foreign transaction fee of 3% applies to all purchases made in a foreign currency or outside of the cardholder’s home country. This fee is calculated as 3% of the total transaction amount and will be added to the cardholder’s statement in their home currency [6. Foreign Transaction Fees and Currency Conversion Policy-35ea3262:6. Foreign Transaction Fees and Currency Conversion Policy-35ea3262::sec1].

```json

[TRANSCRIPT]
 Does the foreign transactions fee apply to the cardholder?

[ANSWER]
 Yes, a foreign transaction fee of 3% applies to all purchases made in a foreign currency or outside of the cardholder’s home country. This fee is calculated as 3% of t