In [1]:
# 0. 라이브러리 임포트
# =============================
import os
import torch
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. 문서 로딩 (PDF 또는 TXT)
# =============================
file_path = "RAG용 데이터셋.txt"    # 👉 txt/pdf 지원
ext = os.path.splitext(file_path)[1].lower()

if ext == ".pdf":
    loader = PyPDFLoader(file_path)
elif ext == ".txt":
    loader = TextLoader(file_path, encoding="utf-8")
else:
    raise ValueError(f"지원하지 않는 파일 형식: {ext}")

documents = loader.load()
print(f"총 {len(documents)} 문서 로딩 완료")

총 1 문서 로딩 완료


In [3]:
# 2. 텍스트 분할 (더 작게 쪼개기)
# =============================
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,   # ✅ 토큰 과다 방지
    chunk_overlap=30
)
docs = text_splitter.split_documents(documents)
print(f"총 {len(docs)} 청크 생성 완료")

# =============================
# 3. 임베딩 + 벡터DB
# =============================
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    model_kwargs={"device": "cpu"}   # ✅ CPU에서 임베딩
)

vectorstore = FAISS.from_documents(docs, embeddings)
vectorstore.save_local("gyeonggijeon_faiss_opt")
print("✅ 벡터스토어 저장 완료")

총 79 청크 생성 완료


  embeddings = HuggingFaceEmbeddings(


✅ 벡터스토어 저장 완료


In [None]:
# 4. LLaMA3.1 모델 로드 (4bit)
# =============================
HF_TOKEN = ""  # 본인 HuggingFace 토큰 입력
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)

bnb_config = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",          # 자동 분산 로딩(GPU/CPU)
    torch_dtype=torch.bfloat16,
    token=HF_TOKEN,
    **bnb_config
)

rag_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    max_new_tokens=64,   # ✅ 답변 짧게 제한
    temperature=0.7,
    top_p=0.9
)

llm = HuggingFacePipeline(pipeline=rag_pipeline)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 4/4 [00:21<00:00,  5.35s/it]
Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=rag_pipeline)


In [5]:
# 5. RAG QA 체인 구성 (stuff 체인)
# =============================
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})  # ✅ 문서 1개만 검색

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",     # ✅ map_reduce 대신 stuff → 속도 개선
    return_source_documents=False
)

In [115]:
# 6. 질의응답 실행
# =============================
query = "'태실'에는 예종대왕의 유골이 묻혀있나요?"
result = qa_chain.invoke({"query": query})

print("🙋 질문:", query)
print("📝 답변:", result["result"].strip())

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


🙋 질문: '태실'에는 예종대왕의 유골이 묻혀있나요?
📝 답변: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

기전 정전과 조경묘만 남아 있다가 최근 들어 부속건물을 복원하여 지금의 조경묘가
된 것이지요.
전주가 조선왕조의 발원지임을 상징하는 조경묘~!! 잠시 관람 하신 후 어진 박
물관으로 안내해드리겠습니다

Question: '태실'에는 예종대왕의 유골이 묻혀있나요?
Helpful Answer: 안뇨. (I don't know.) The text doesn't mention '태실' at all. It talks about '조경묘' which seems to be the name of a historical site, and mentions that it was recently restored, but it doesn't mention anything about '태실' or the
