In [None]:
import os, re, zipfile, subprocess
from pathlib import Path
from typing import List, Dict, Any
import pandas as pd
from lxml import etree
import fitz
from dotenv import load_dotenv, find_dotenv
from sentence_transformers import SentenceTransformer 
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.documents import Document

In [None]:
# Relative path of the directory passed to Chroma as its persist directory.
CHROMA_DIR = "store/chroma"
# Embedding backend selector read by get_embedder().
EMBED_BACKEND = "openai"   # "openai" | "bge"
# Embedding model name used when EMBED_BACKEND == "openai".
OPENAI_EMBED_MODEL = "text-embedding-3-small"
# SentenceTransformer model name used by the non-OpenAI backend.
BGE_MODEL = "BAAI/bge-m3"

In [None]:
# Find the .env file whether this runs as a script or inside a notebook kernel:
# use the script's directory when __file__ exists, otherwise the current
# working directory.
HERE = Path(__file__).resolve().parent if "__file__" in globals() else Path.cwd()
env_path = (HERE / ".env")
if env_path.exists():
    load_dotenv(dotenv_path=env_path, override=False)
else:
    # Also try a search based on the current working directory.
    load_dotenv(find_dotenv(usecwd=True), override=False)

# Fail fast if the key did not actually reach the runtime environment.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key or not api_key.startswith("sk-"):
    raise RuntimeError("[ENV] OPENAI_API_KEY가 런타임에 설정되지 않았습니다. .env 위치/CWD/커널 확인 필요.")
# Notebook outputs are saved and shared along with the file, so never echo any
# part of the secret itself; report only that it is present and how long it is.
print(f"[ENV] OPENAI_API_KEY loaded: sk-… ({len(api_key)} chars)")

In [None]:
def get_embedder():
    """Return the embedding backend selected by ``EMBED_BACKEND``.

    Returns:
        For ``"openai"``: an ``OpenAIEmbeddings`` configured with
        ``OPENAI_EMBED_MODEL`` and the module-level ``api_key``.
        For ``"bge"``: a small adapter exposing ``embed_documents`` and
        ``embed_query`` on top of a ``SentenceTransformer`` loaded from
        ``BGE_MODEL``.

    Raises:
        ValueError: If ``EMBED_BACKEND`` is neither ``"openai"`` nor ``"bge"``.
            A mistyped backend must not silently fall through to a different
            embedding model than the one the caller intended.
    """
    if EMBED_BACKEND == "openai":
        return OpenAIEmbeddings(
            model=OPENAI_EMBED_MODEL,
            api_key=api_key,
            max_retries=8,
            request_timeout=60,
        )

    if EMBED_BACKEND == "bge":
        class STEmb:
            """Adapter giving a SentenceTransformer the embed_* method pair."""

            def __init__(self, name):
                self.m = SentenceTransformer(name)

            def embed_documents(self, ts):
                """Embed a list of texts; returns a list of normalized vectors."""
                return self.m.encode(ts, batch_size=32, normalize_embeddings=True).tolist()

            def embed_query(self, t):
                """Embed a single query text; returns one normalized vector."""
                return self.m.encode([t], normalize_embeddings=True).tolist()[0]

        return STEmb(BGE_MODEL)

    raise ValueError(
        f"Unknown EMBED_BACKEND {EMBED_BACKEND!r}; expected 'openai' or 'bge'."
    )

In [None]:
def build_retriever(filters: Dict[str, Any] | None = None, k: int = 6):
    """Create a retriever over the persisted ``bidmate_rag`` Chroma collection.

    Args:
        filters: Optional metadata filter. When truthy it is forwarded to the
            retriever as ``search_kwargs["filter"]``; ``None`` or an empty
            dict means no filter is set.
        k: Value passed to the retriever as ``search_kwargs["k"]``.

    Returns:
        The retriever produced by ``Chroma.as_retriever``.
    """
    store = Chroma(
        collection_name="bidmate_rag",
        embedding_function=get_embedder(),
        persist_directory=CHROMA_DIR,
    )

    # Assemble every search option first so the retriever is created fully
    # configured; the metadata filter is only included when one was given.
    search_options: Dict[str, Any] = {"k": k}
    if filters:
        search_options["filter"] = filters

    return store.as_retriever(search_kwargs=search_options)

In [None]:
def _build_context(docs, max_docs: int = 5, max_chars: int = 1200):
    """Assemble the prompt context from the top retrieved documents.

    Only the first ``max_docs`` documents are considered, at most one chunk per
    ``filename`` metadata value is kept, and each kept chunk is truncated to
    ``max_chars`` characters.

    Returns:
        ``(context, used_docs)``: the concatenated context text and the
        documents that actually contributed to it, in retrieval order.
    """
    seen_files = set()
    used_docs = []
    parts = []
    for d in docs[:max_docs]:
        fn = d.metadata.get("filename")
        if fn in seen_files:  # keep a single chunk per file
            continue
        seen_files.add(fn)
        used_docs.append(d)
        tag = f"[{fn}|{d.metadata.get('agency')}|{d.metadata.get('published_at')}]"
        parts.append(f"{tag}\n{d.page_content[:max_chars]}\n\n")
    return "".join(parts), used_docs


def main():
    """Run an interactive question/answer loop over the indexed documents.

    Each question is sent to the retriever, the retrieved chunks are turned
    into a tagged context, and the LLM is asked to answer from that context.
    The loop ends on an empty question, end-of-input, or Ctrl-C.
    """
    # LLM
    # NOTE(review): some model families reject or ignore a non-default
    # temperature; confirm temperature=0 is honored for this model.
    llm = ChatOpenAI(model="gpt-5-mini", temperature=0, api_key=api_key)

    # Reuse the shared factory instead of rebuilding the store here.
    # To restrict results by metadata, pass a filter, e.g.
    # build_retriever(filters={"agency": "<agency name>", "file_format": "pdf"}, k=6)
    retriever = build_retriever(k=6)

    while True:
        try:
            q = input("\nQ> ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C (or an interrupted kernel) ends the session
            # cleanly instead of surfacing a traceback.
            break
        if not q:
            break
        docs = retriever.invoke(q)

        context, used_docs = _build_context(docs)

        prompt = (
            "다음 컨텍스트만 근거로 한국어로 정확하고 간결하게 답하세요. "
            "반드시 근거로 사용한 출처를 대괄호로 표시하세요.\n\n"
            f"{context}\n질문: {q}\n\n답변:"
        )
        ans = llm.invoke(prompt).content
        print("\nA>", ans)
        print("\n[SOURCES]")
        # List only the documents that were actually placed in the context, so
        # the printed sources match what the model saw.
        for d in used_docs:
            print("-", d.metadata.get("filename"), d.metadata.get("agency"), d.metadata.get("published_at"))


# Start the interactive question/answer loop when this module is the entry point.
if __name__ == "__main__":
    main()
