In [None]:
# requirements.txt
# langchain, chromadb, openai, tiktoken, sentence-transformers, pandas

import json
import os

from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import JSONLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Read the OpenAI key from the OPENAI_API_KEY environment variable so a real
# secret never has to be written into this file. The original placeholder
# string is kept as the fallback when the variable is not set.
OPEN_API_KEY = os.environ.get("OPENAI_API_KEY", "openai_api_key")

# Load the JSON documents.
# NOTE(review): the comprehension below indexes every entry with ["text"], so
# docs.json is presumably a list of objects that each carry a "text" key.
with open('docs.json', encoding='utf-8') as f:
    raw_data = json.load(f)

documents = [d["text"] for d in raw_data]

# Pre-process the documents: split the raw texts into overlapping chunks
# (chunk_size=300, chunk_overlap=30) before embedding them.
splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=30)
docs = splitter.create_documents(documents)

# Korean embedding model
embedding_model = HuggingFaceEmbeddings(model_name="jhgan/ko-sroberta-multitask")

# Build the vector store, persisted under ./chroma_db
db = Chroma.from_documents(docs, embedding=embedding_model, persist_directory="./chroma_db")
db.persist()

# Load the OpenAI chat model
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, openai_api_key=OPEN_API_KEY)

# Assemble the RAG question-answering chain.
# The number of retrieved chunks has to be passed through `search_kwargs`;
# a bare `k=3` keyword on as_retriever() is not forwarded to the search call.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(search_type="similarity", search_kwargs={"k": 3}),
    return_source_documents=True,
)

query = "개인정보 수집 시 필요한 법적 조건은?"
result = qa_chain(query)

# Print the generated answer followed by the source documents it was based on.
print("답변:", result['result'])
print("출처 문서:", result['source_documents'])
