In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
os.environ["LLAMA_SET_ROWS"] = "131072"

In [2]:
# %pip install --no-cache-dir llama-cpp-python==0.3.13 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124
# %pip install --no-cache-dir llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124

In [3]:
import torch
import pandas as pd
from pathlib import Path
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFDirectoryLoader, CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

import multiprocessing
from langchain_community.chat_models import ChatLlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# RAG 체인 구성
from langchain.chains import RetrievalQA



print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))
print((torch.cuda._get_nvml_device_index(0)))

False


  return torch._C._cuda_getDeviceCount() > 0


RuntimeError: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 803: system has unsupported display driver / cuda driver combination

In [None]:
# Path 객체에는 -r 옵션(재귀적 검색)이 없습니다.
# 만약 특정 디렉토리 내에서 재귀적으로 파일을 찾고 싶다면 glob 메서드나 rglob 메서드를 사용해야 합니다.
# 아래는 예시입니다.
# /AIHUB/PCNRND/home/chatbot/ai_models/base_models/embeddings/dragonkue/BGE-m3-ko
root_path = Path(".").resolve().parent.parent
embedding_model_path = root_path / "ai_models" / "base_models" / "embeddings" /"dragonkue" / "BGE-m3-ko"
print(embedding_model_path)

/AIHUB/PCNRND/home/chatbot/ai_models/base_models/embeddings/dragonkue/BGE-m3-ko


# 

In [5]:
# 임베딩 모델 초기화
embeddings = HuggingFaceEmbeddings(
    model_name=str(embedding_model_path),
    model_kwargs={'device': 'cuda:0'},
    encode_kwargs={'normalize_embeddings': True}
)   

In [6]:
faiss_index_directory = "./data/faiss_pdf_directory"
vectorstore = FAISS.load_local(faiss_index_directory, embeddings, allow_dangerous_deserialization=True)
retriever = vectorstore.as_retriever(    
    search_type="similarity_score_threshold", 
    search_kwargs={"score_threshold": 0.5, "k": 20}
)

In [7]:
# /AIHUB/PCNRND/home/chatbot/ai_models/gguf_models/Bllossom/llama-3.2-Korean-Bllossom-3B-gguf-Q4_K_M/llama-3.2-Korean-Bllossom-3B-gguf-Q4_K_M.gguf
llm_path = str(root_path / "ai_models" / "gguf_models" / "Bllossom" / "llama-3.2-Korean-Bllossom-3B-gguf-Q4_K_M" / "llama-3.2-Korean-Bllossom-3B-gguf-Q4_K_M.gguf")

print(llm_path)

/AIHUB/PCNRND/home/chatbot/ai_models/gguf_models/Bllossom/llama-3.2-Korean-Bllossom-3B-gguf-Q4_K_M/llama-3.2-Korean-Bllossom-3B-gguf-Q4_K_M.gguf


In [8]:
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# RAG 시스템을 위한 LLM 설정 (KV 캐시 최적화 포함)
llm = ChatLlamaCpp(
    # backend="cuda", 
    temperature=0.1,  # RAG에서는 더 일관된 답변을 위해 낮은 temperature 사용
    model_path=llm_path,  # PosixPath를 문자열로 변환
    n_ctx=131072,  # 컨텍스트 크기를 적절히 조정 (32768은 메모리 부담이 큼)
    n_gpu_layers=-1,
    n_batch=512,  # KV 캐시 최적화를 위해 더 작은 배치 크기
    # max_tokens=512,  # 더 긴 답변을 위해 토큰 수 증가
    n_threads=multiprocessing.cpu_count() - 1,
    repeat_penalty=1.1,  # RAG에서는 적당한 반복 패널티
    top_p=0.9,  # RAG에서는 더 높은 top_p로 다양한 답변 생성
    callback_manager=callback_manager,
    
    # KV 캐시 최적화를 위한 설정들
    use_mlock=True,  # 메모리 잠금으로 성능 향상
    use_mmap=True,   # 메모리 맵핑 사용

)

llama_model_loader: loaded meta data with 33 key-value pairs and 255 tensors from /AIHUB/PCNRND/home/chatbot/ai_models/gguf_models/Bllossom/llama-3.2-Korean-Bllossom-3B-gguf-Q4_K_M/llama-3.2-Korean-Bllossom-3B-gguf-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Llama 3.2 Korean Bllossom 3B
llama_model_loader: - kv   3:                           general.basename str              = llama-3.2-Korean-Bllossom
llama_model_loader: - kv   4:                         general.size_label str              = 3B
llama_model_loader: - kv   5:                            general.license str              = llama3.2
llama_model_loade

llama_model_loader: - kv  26:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv  27:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  28:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
llama_model_loader: - kv  29:                tokenizer.ggml.bos_token_id u32              = 128000
llama_model_loader: - kv  30:                tokenizer.ggml.eos_token_id u32              = 128009
llama_model_loader: - kv  31:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
llama_model_loader: - kv  32:               general.quantization_version u32              = 2
llama_model_loader: - type  f32:   58 tensors
llama_model_loader: - type q4_K:  168 tensors
llama_model_loader: - type q6_K:   29 tensors
print_info: file format = GGUF V3 (latest)
prin

In [9]:
# /AIHUB/PCNRND/home/chatbot/ai_models/base_models/llama-3.2-Korean-Bllossom-3B-gguf-Q4_K_M/llama-3.2-Korean-Bllossom-3B-gguf-Q4_K_M.gguf
# /AIHUB/PCNRND/home/chatbot/ai_models/base_models/llama-3.2-Korean-Bllossom-3B-gguf-Q4_K_M/llama-3.2-Korean-Bllossom-3B-gguf-Q4_K_M.gguf
# /AIHUB/PCNRND/home/chatbot/ai_models/gguf_models/MLP-KTLim/llama-3-Korean-Bllossom-8B-gguf-Q4_K_M/llama-3-Korean-Bllossom-8B-Q4_K_M.gguf

In [10]:
# 프롬프트 구성
question = "피씨엔 AI 연구과제에 대해 알려줘"
prompt = f"<|im_start|>system\n당신은 도움이 되는 AI 어시스턴트입니다. 질문에 대해 3문장 정도 정확하고 유용한 답변을 제공해주세요.<|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"

In [11]:
from sentence_transformers import CrossEncoder

In [12]:
# 2) 1차 검색기로부터 top_k 문서 가져오기
top_k = 20
retrieved_docs = vectorstore.similarity_search(question, k=top_k)

In [13]:
retrieved_docs

[Document(id='c5714078-54bd-4c07-b9f2-2488ed808e0a', metadata={'producer': 'Microsoft® PowerPoint® Microsoft 365용', 'creator': 'Microsoft® PowerPoint® Microsoft 365용', 'creationdate': '2025-06-09T09:29:11+09:00', 'title': 'PCN 회사소개서', 'author': 'pcn;이수아', 'subject': '', 'keywords': '', 'moddate': '2025-06-09T09:29:11+09:00', 'source': 'data/pdf/PCN_회사소개서_0609.pdf', 'total_pages': 36, 'page': 17, 'page_label': '18'}, page_content='18Digital Value Service Company Why PCN? 성공적으로 국가R&D 사업수행. 최근 빅데이터, Io T, AI 등 4차 산업과 관련된 기술 요구가 증가하면서 이와 관련된 국가 R&D도 확장되고 있습니다.  피씨엔은 이와 관련된 R&D 사업을 지속적으로 수행하고 있으며 그 과정에서 얻어진 기술 및 모듈을 활용하여 특허 등록, 솔루션화, 기술 이전 등 다양한 방향으로 사업화를 진행하면서 국가 R&D에 기여하고 있습니다'),
 Document(id='b16b86a9-dd16-4bc5-a081-5b830733fff2', metadata={'producer': 'Microsoft® PowerPoint® Microsoft 365용', 'creator': 'Microsoft® PowerPoint® Microsoft 365용', 'creationdate': '2025-06-09T09:29:11+09:00', 'title': 'PCN 회사소개서', 'author': 'pcn;이수아', 'subject': '', 'keywords': '', 'moddate': '2025-06-09T09:2

In [14]:
# 3) CrossEncoder 기반 Re-ranker 준비
# /AIHUB/PCNRND/home/chatbot/ai_models/base_models/embeddings/dragonkue/bge-reranker-v2-m3-ko
reranker_model_path = root_path / "ai_models" / "base_models" / "embeddings" / "dragonkue" / "bge-reranker-v2-m3-ko"
reranker = CrossEncoder(reranker_model_path, device='cuda:0')  # GPU 3번으로 설정

In [15]:
# 4) (query, passage) 쌍으로 점수 계산
pairs = [(question, doc.page_content) for doc in retrieved_docs]
scores = reranker.predict(pairs)

# 5) 점수를 기준으로 정렬
scored_docs = list(zip(retrieved_docs, scores))
scored_docs.sort(key=lambda x: x[1], reverse=True)

In [16]:
# 6) 상위 N개 선택
top_n = 5
final_docs = [doc for doc, score in scored_docs[:top_n]]

In [17]:
retriever = vectorstore.as_retriever(search_kwargs={"k": top_n})
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,  # 로컬 LLM이면 교체
    retriever=retriever,
    return_source_documents=True
)


In [None]:
context = "\n\n".join([doc.page_content for doc in final_docs])
prompt = f"다음 문서를 참고하여 질문에 답변해줘:\n\n{context}\n\n질문: {question}"
result = qa_chain.invoke(prompt)