In [1]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import multiprocessing
import os

In [2]:
# Locate the quantized HyperCLOVA GGUF file under the parent of the working directory.
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, ".."))
MODEL_PATH = os.path.join(
    project_root,
    "ai_models",
    "hyperclova",
    "hyperclova-seed-text-1.5b-q4-k-m.gguf",
)

In [3]:
# Load the shared system prompt (base_system_prompt.txt).
# Paths are relative to the notebook's working directory.
with open("./prompts/system/base_system_prompt.txt", "r", encoding="utf-8") as f:
    base_system_prompt = f.read()

# Load the question-answering task prompt (qa_prompt.txt).
with open("./prompts/tasks/qa_prompt.txt", "r", encoding="utf-8") as f:
    qa_prompt = f.read()

In [4]:
# Embedding model loaded from a local BGE-m3-ko directory; device is set to CPU
# and output vectors are normalized at encode time.
embeddings = HuggingFaceEmbeddings(
    model_name="../ai_models/base_models/BGE-m3-ko",
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),
)

# NOTE(review): allow_dangerous_deserialization=True opts in to loading the
# pickled part of the saved index — only use it with an index built locally
# from trusted data.
vectorstore = FAISS.load_local(
    "./faiss_index",
    embeddings,
    allow_dangerous_deserialization=True,
)

# Retriever that filters results by relevance score (threshold 0.5).
retriever = vectorstore.as_retriever(
    search_kwargs=dict(score_threshold=0.5),
    search_type="similarity_score_threshold",
)

  embeddings = HuggingFaceEmbeddings(


In [5]:
# Stream generated tokens to stdout as they are produced.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Local llama.cpp model used as the generator for the RAG chain.
llm = LlamaCpp(
    model_path=MODEL_PATH,
    temperature=0.7,
    max_tokens=512,
    top_p=1,
    callback_manager=callback_manager,
    verbose=True,
    n_ctx=2048,  # context length
    # Leave one core free for the rest of the system, but never request fewer
    # than one thread (cpu_count() - 1 is 0 on a single-core machine).
    n_threads=max(1, multiprocessing.cpu_count() - 1),
)

llama_model_loader: loaded meta data with 33 key-value pairs and 218 tensors from f:\chat_test\ai_models\hyperclova\hyperclova-seed-text-1.5b-q4-k-m.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Hyperclova Seed Text 1.5b
llama_model_loader: - kv   3:                           general.basename str              = hyperclova-seed-text
llama_model_loader: - kv   4:                         general.size_label str              = 1.5B
llama_model_loader: - kv   5:                            general.license str              = other
llama_model_loader: - kv   6:                       general.license.name str              = hypercl

In [6]:
# Assemble the prompt text: system prompt, a {context} placeholder, then the QA
# task prompt.
# NOTE(review): both prompt files are spliced in verbatim, so any braces inside
# them are parsed as template placeholders — presumably qa_prompt supplies the
# {question} slot; confirm against the prompt file.
template = base_system_prompt + "\n{context}\n\n" + qa_prompt
prompt_template = PromptTemplate.from_template(
    template_format="f-string",
    template=template,
)

# Wire retriever, prompt and LLM together into a retrieval QA chain.
rag_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",                             # one of "stuff", "map_reduce", "refine"
    chain_type_kwargs={"prompt": prompt_template},  # pass the PromptTemplate through
    retriever=retriever,                            # any langchain-compatible retriever
    llm=llm,                                        # any langchain-compatible LLM (llama-cpp, OpenAI, ...)
)


In [7]:
# Run one query through the chain and display only the "result" entry of the response.
response = rag_chain.invoke("규제샌드박스에 대해 알려줘줘")
response.get("result")


No relevant docs were retrieved using the relevance score threshold 0.5


 규제 샌드박스 박스에 대해 알려드리겠습니다.

다음은 질문에 대한 답변입니다:
- "규제 샌드박스 박스는 무엇인가요?"
- "규제 샌드박스 박스에서는 어떤 규제가 이루어지나요?"

규제 샌드박스 박스에 대해 알려드리도록 하겠습니다.

llama_perf_context_print:        load time =    5666.40 ms
llama_perf_context_print: prompt eval time =    5665.67 ms /   288 tokens (   19.67 ms per token,    50.83 tokens per second)
llama_perf_context_print:        eval time =    2299.07 ms /    63 runs   (   36.49 ms per token,    27.40 tokens per second)
llama_perf_context_print:       total time =    8164.61 ms /   351 tokens


' 규제 샌드박스 박스에 대해 알려드리겠습니다.\n\n다음은 질문에 대한 답변입니다:\n- "규제 샌드박스 박스는 무엇인가요?"\n- "규제 샌드박스 박스에서는 어떤 규제가 이루어지나요?"\n\n규제 샌드박스 박스에 대해 알려드리도록 하겠습니다.'