# Graph - RAG chatbot

# Insert information into the vector and graph databases

### Initializing databases and parser

In [None]:
import os
import json
import uuid
from pathlib import Path
from typing import List, Dict, Any
from dotenv import load_dotenv
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
import psycopg2
from neo4j import GraphDatabase
from openai import OpenAI

# Pull settings from a .env file into the process environment before reading them.
load_dotenv()

# Directory that is scanned for input documents.
INPUT_DIR = os.getenv("INPUT_DIR", "data")

# Embedding model name and its configured vector dimensionality.
EMBED_MODEL = os.getenv("EMBED_MODEL", "text-embedding-3-small")
EMBED_DIM = int(os.getenv("EMBED_DIM", "1536"))

# Required API keys: a missing variable raises KeyError at import time.
LLAMA_API_KEY = os.environ["LLAMA_CLOUD_API_KEY"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# PostgreSQL connection settings, with local-development defaults.
PG = dict(
    host=os.getenv("POSTGRES_HOST", "localhost"),
    port=int(os.getenv("POSTGRES_PORT", "5432")),
    dbname=os.getenv("POSTGRES_DB", "lawdb"),
    user=os.getenv("POSTGRES_USER", "postgres"),
    password=os.getenv("POSTGRES_PASSWORD", "postgres"),
)

# Neo4j connection settings, with local-development defaults.
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.getenv("NEO4J_USERNAME", "neo4j")
NEO4J_PASS = os.getenv("NEO4J_PASSWORD", "neo4j")

# File extensions that are routed to the document parser.
SUPPORTED_EXTS = {".pdf", ".docx", ".pptx", ".xlsx", ".csv"}

def mk_uuid() -> uuid.UUID:
    """Return a newly generated random (version 4) UUID."""
    fresh_id = uuid.uuid4()
    return fresh_id

### Embed text and parse documents

In [None]:
def embed_texts(
    client: "OpenAI",
    texts: List[str],
    *,
    batch_size: int = 128,
) -> List[List[float]]:
    """Embed ``texts`` with the configured embedding model.

    The texts are sent in consecutive batches so that a large corpus is not
    pushed through a single request, and an empty input returns immediately
    without calling the API.

    Args:
        client: OpenAI client used to call ``embeddings.create``.
        texts: Strings to embed.
        batch_size: Maximum number of texts sent per request.

    Returns:
        One embedding per input text, in the same order as ``texts``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    vectors: List[List[float]] = []
    # An empty ``texts`` produces an empty range, so no request is made.
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        resp = client.embeddings.create(model=EMBED_MODEL, input=batch)
        vectors.extend(item.embedding for item in resp.data)
    return vectors

def _page_chunk(file_name, title, page_count, page_number, text) -> Dict[str, Any]:
    """Build one chunk record; every record is the first chunk (index 0) of its page."""
    return {
        "file_name": file_name,
        "title": title,
        "page_count": page_count,
        "page_number": page_number,
        "chunk_index": 0,
        "text": text,
    }


def parse_with_llamaparse(input_dir: str) -> List[Dict[str, Any]]:
    """Parse every file under ``input_dir`` and return one text chunk per page.

    Files whose extension is in ``SUPPORTED_EXTS`` are routed through
    LlamaParse with ``result_type="json"``. When a loaded document's text is a
    JSON object with a ``"pages"`` list, one chunk is emitted per non-blank
    page. Otherwise the whole document text becomes a single chunk with
    ``page_number`` set to ``None``.

    Args:
        input_dir: Directory handed to ``SimpleDirectoryReader``.

    Returns:
        A list of dicts with the keys ``file_name``, ``title``, ``page_count``,
        ``page_number``, ``chunk_index`` and ``text``.
    """
    # NOTE(review): language="no" is presumably Norwegian - confirm.
    parser = LlamaParse(api_key=LLAMA_API_KEY, result_type="json", language="no", num_workers=4)
    file_extractor = {ext: parser for ext in SUPPORTED_EXTS}
    docs = SimpleDirectoryReader(input_dir=input_dir, file_extractor=file_extractor).load_data()

    per_page_chunks: List[Dict[str, Any]] = []
    for d in docs:
        meta = d.metadata or {}
        file_name = meta.get("file_name") or meta.get("filename") or "unknown"
        title = meta.get("title")
        page_count = meta.get("page_count")
        raw_text = d.text or ""

        # Text that is not valid JSON falls through to the whole-document
        # branch below. Only decoding failures are swallowed.
        json_obj = None
        if raw_text:
            try:
                json_obj = json.loads(raw_text)
            except (TypeError, ValueError):
                json_obj = None

        if isinstance(json_obj, dict) and "pages" in json_obj:
            for p in json_obj.get("pages") or []:
                if not isinstance(p, dict):
                    # A malformed page entry is skipped instead of aborting the run.
                    continue
                page_number = p.get("page_number")
                if page_number is None:
                    # LlamaParse JSON pages appear to carry the number under "page".
                    page_number = p.get("page")
                text = p.get("text") or ""
                if not isinstance(text, str) or not text.strip():
                    continue
                per_page_chunks.append(
                    _page_chunk(file_name, title, page_count, page_number, text)
                )
        elif raw_text.strip():
            per_page_chunks.append(
                _page_chunk(file_name, title, page_count, None, raw_text)
            )
    return per_page_chunks

### Functions for inserting into database

In [None]:
def ensure_document(cur, doc_row):
    """Upsert one row into the ``documents`` table.

    Inserts the document. If a row with the same ``id`` already exists, its
    ``file_name``, ``title`` and ``page_count`` are overwritten instead.

    Args:
        cur: Database cursor whose ``execute`` method runs the statement.
        doc_row: Mapping with the keys ``id``, ``file_name``, ``title`` and
            ``page_count``. The id is sent as a string.
    """
    upsert_sql = """
        INSERT INTO documents (id, file_name, title, page_count)
        VALUES (%s, %s, %s, %s)
        ON CONFLICT (id) DO UPDATE SET
          file_name = EXCLUDED.file_name,
          title = EXCLUDED.title,
          page_count = EXCLUDED.page_count
        """
    values = (
        str(doc_row["id"]),
        doc_row["file_name"],
        doc_row["title"],
        doc_row["page_count"],
    )
    cur.execute(upsert_sql, values)

def insert_chunk(cur, chunk_row):
    """Insert one row into ``doc_chunks``; an existing ``id`` is left untouched.

    The embedding is rendered as a bracketed, comma-separated text literal
    with eight decimal places per component (presumably the vector column's
    text input format - confirm against the schema).

    Args:
        cur: Database cursor whose ``execute`` method runs the statement.
        chunk_row: Mapping with the keys ``id``, ``doc_id``, ``page_number``,
            ``chunk_index``, ``text`` and ``embedding``. Both ids are sent as
            strings.
    """
    components = [f"{value:.8f}" for value in chunk_row["embedding"]]
    vector_literal = "[{}]".format(",".join(components))
    insert_sql = """
        INSERT INTO doc_chunks (id, doc_id, page_number, chunk_index, chunk_text, embedding)
        VALUES (%s, %s, %s, %s, %s, %s)
        ON CONFLICT (id) DO NOTHING
        """
    values = (
        str(chunk_row["id"]),
        str(chunk_row["doc_id"]),
        chunk_row["page_number"],
        chunk_row["chunk_index"],
        chunk_row["text"],
        vector_literal,
    )
    cur.execute(insert_sql, values)

def neo4j_upsert_document(session, doc_row):
    """MERGE a ``Document`` node keyed by ``id`` and overwrite its properties.

    Args:
        session: Object whose ``run`` method executes the Cypher statement
            with keyword parameters.
        doc_row: Mapping with the keys ``id``, ``file_name``, ``title`` and
            ``page_count``. The id is sent as a string.
    """
    cypher = """
        MERGE (d:Document {id: $id})
        SET d.file_name = $file_name,
            d.title = $title,
            d.page_count = $page_count
        """
    properties = {
        "id": str(doc_row["id"]),
        "file_name": doc_row["file_name"],
        "title": doc_row["title"],
        "page_count": doc_row["page_count"],
    }
    session.run(cypher, **properties)

# Page number stored in the graph for chunks that have no page number.
# Cypher MERGE rejects null property values, so None cannot be used as a key.
_UNKNOWN_PAGE_NUMBER = -1


def neo4j_upsert_page_and_chunk(session, doc_row, chunk_row):
    """MERGE the ``Page`` and ``Chunk`` nodes for one chunk and link them.

    The statement matches the ``Document`` by id, merges a ``Page`` keyed by
    ``(doc_id, number)`` and a ``Chunk`` keyed by
    ``(doc_id, page_number, chunk_index)``, overwrites the chunk's ``text``
    and ``length``, and merges the ``HAS_PAGE`` / ``HAS_CHUNK`` relationships.

    Args:
        session: Object whose ``run`` method executes the Cypher statement
            with keyword parameters.
        doc_row: Mapping providing the document ``id``.
        chunk_row: Mapping with the keys ``page_number``, ``chunk_index`` and
            ``text``. A ``page_number`` of ``None`` is stored as
            ``_UNKNOWN_PAGE_NUMBER``.
    """
    page_number = chunk_row["page_number"]
    if page_number is None:
        # Chunks without a page number would otherwise make both MERGE
        # clauses fail on a null property value.
        page_number = _UNKNOWN_PAGE_NUMBER

    session.run(
        """
        MATCH (d:Document {id: $doc_id})
        MERGE (p:Page {doc_id: $doc_id, number: $page_number})
          ON CREATE SET p.created_at = timestamp()
        MERGE (d)-[:HAS_PAGE]->(p)
        MERGE (c:Chunk {doc_id: $doc_id, page_number: $page_number, chunk_index: $chunk_index})
          ON CREATE SET c.created_at = timestamp()
        SET c.text = $text,
            c.length = $length
        MERGE (p)-[:HAS_CHUNK {index: $chunk_index}]->(c)
        """,
        doc_id=str(doc_row["id"]),
        page_number=page_number,
        chunk_index=chunk_row["chunk_index"],
        text=chunk_row["text"],
        length=len(chunk_row["text"]),
    )

In [None]:
def main():
    """Parse the input directory, embed every chunk and load both databases.

    Steps:
        1. Parse ``INPUT_DIR`` into chunks; return early when there are none.
        2. Group chunks by file name and assign each file a document id.
        3. Embed all chunk texts.
        4. Write documents and chunks to PostgreSQL and Neo4j.

    PostgreSQL writes run in one transaction, and Neo4j writes run in one
    explicit transaction that is committed only after the PostgreSQL commit
    succeeds. A failure while writing therefore rolls back both stores.

    Raises:
        RuntimeError: If the number of embeddings differs from the number of
            chunks.
    """
    from collections import defaultdict

    chunks = parse_with_llamaparse(INPUT_DIR)
    if not chunks:
        return

    by_file: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
    for ch in chunks:
        by_file[ch["file_name"]].append(ch)

    # NOTE(review): ids come from mk_uuid(), so each run creates new document
    # and chunk ids; re-running on the same files adds rows instead of
    # updating them.
    doc_map: Dict[str, uuid.UUID] = {fname: mk_uuid() for fname in by_file}

    client = OpenAI(api_key=OPENAI_API_KEY)
    embs = embed_texts(client, [c["text"] for c in chunks])
    if len(embs) != len(chunks):
        # zip() would silently drop the unmatched chunks.
        raise RuntimeError(
            f"expected {len(chunks)} embeddings but received {len(embs)}"
        )
    for c, e in zip(chunks, embs):
        c["embedding"] = e

    pg_conn = psycopg2.connect(**PG)
    try:
        pg_conn.autocommit = False
        neo_driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASS))
        try:
            with neo_driver.session() as neo_sess, neo_sess.begin_transaction() as neo_tx:
                # "with pg_conn" commits on a clean exit and rolls back when
                # an exception escapes the block.
                with pg_conn, pg_conn.cursor() as cur:
                    for fname, group in by_file.items():
                        doc_id = doc_map[fname]
                        first = group[0]
                        doc_row = {
                            "id": doc_id,
                            "file_name": fname,
                            "title": first.get("title"),
                            "page_count": first.get("page_count"),
                        }
                        ensure_document(cur, doc_row)
                        # The Neo4j helpers only call .run(), which the
                        # transaction object also provides.
                        neo4j_upsert_document(neo_tx, doc_row)
                        for ch in group:
                            chunk_row = {
                                "id": mk_uuid(),
                                "doc_id": doc_id,
                                "page_number": ch.get("page_number"),
                                "chunk_index": ch.get("chunk_index", 0),
                                "text": ch["text"],
                                "embedding": ch["embedding"],
                            }
                            insert_chunk(cur, chunk_row)
                            neo4j_upsert_page_and_chunk(neo_tx, doc_row, chunk_row)
                # PostgreSQL is committed at this point; now persist the graph.
                neo_tx.commit()
        finally:
            neo_driver.close()
    finally:
        pg_conn.close()


if __name__ == "__main__":
    main()