In [None]:
# TensorFlow와 GPU 관련 패키지 설치
pip install tensorflow tensorflow-gpu

# 기타 라이브러리 설치
pip install numpy pandas chardet faiss-gpu sentence-transformers langchain langchain-community pypdf pdfplumber
pip install langchain-ollama tf-keras


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting numpy
  Using cached numpy-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.3 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.1.3 which is incompatible.
langchain 0.3.9 requires numpy<2,>=1.22.4; python_version < "3.12", but you have numpy 2.1.3 which is incompatible.
langchain-community 0.3.8 requires numpy<2,>=1.22.4; python_version < "3.12", but you have numpy 2.1.3 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-2.1.3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting numpy<2,>=1.22.4
  Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.1.3
    Uninstalling numpy-2.1.3:
      Successfully uninstalled numpy-2.1.3
Successfully installed numpy-1.26.4


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable


In [None]:
from langchain.docstore.document import Document
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the insurance-policy PDF (PyPDFLoader returns the loaded documents)
pdf_path = "/home/joonhai/DSDC/한화 개인용 자동차보험.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Split into chunks of up to 500 characters with a 30-character overlap,
# then drop the first 11 chunks (adjust the offset if needed)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=30,
)
chunks = text_splitter.split_documents(documents)[11:]

# Rebuild every chunk with newlines flattened to spaces and tag it with a
# sequential doc_id (0-based position after the offset above)
normalized_chunks = []
for position, piece in enumerate(chunks):
    flattened_text = piece.page_content.replace('\n', ' ').strip()
    rebuilt = Document(page_content=flattened_text, metadata=piece.metadata)
    rebuilt.metadata['doc_id'] = position
    normalized_chunks.append(rebuilt)
chunks = normalized_chunks

print(f"생성된 텍스트 청크 수: {len(chunks)}")


생성된 청크 수: 702


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import faiss

# Embedding model used both for indexing the chunks and for embedding queries
embedding = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")

# Build the complete vector store on the CPU first. Besides the raw FAISS
# index, this also creates the docstore and the index-position -> docstore-id
# mapping that are required to turn search hits back into Document objects.
cpu_store = FAISS.from_documents(documents=chunks, embedding=embedding)
cpu_index = cpu_store.index

# Copy the raw index to GPU device 0
res = faiss.StandardGpuResources()
gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)

# GPU-backed vector store. The FAISS wrapper's constructor takes
# (embedding_function, index, docstore, index_to_docstore_id); passing only
# (index, embedding) swaps the first two arguments and omits the two required
# ones, so everything is passed by keyword and the docstore/mapping built
# above are reused.
vectorstore = FAISS(
    embedding_function=embedding,
    index=gpu_index,
    docstore=cpu_store.docstore,
    index_to_docstore_id=cpu_store.index_to_docstore_id,
)


In [None]:
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Chat model served by Ollama
llm = ChatOllama(
    model="llama3.1",
    temperature=0.8,
    num_predict=500,
)

# Prompt: answer from the retrieved context only; the template expects the
# variables {context} and {input}
prompt_text = """
다음 컨텍스트를 바탕으로 질문에 답변해주세요. 관련 정보가 없으면 "답변할 수 없습니다."라고 답변해주세요.
컨텍스트: {context}
질문: {input}
답변:
"""
retrieval_prompt = ChatPromptTemplate.from_template(prompt_text)

# Assemble the RAG chain: similarity retriever (top 3 chunks) feeding a
# "stuff" chain that places the retrieved documents into the prompt
combine_docs_chain = create_stuff_documents_chain(llm, retrieval_prompt)
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=3),
)
rag_chain = create_retrieval_chain(retriever, combine_docs_chain)


In [None]:
import pandas as pd
import chardet
from sentence_transformers import CrossEncoder

# Location of the QA evaluation CSV
qa_csv_path = "/home/joonhai/DSDC/자동차보험_qa_dataset.csv"

# Detect the file's encoding from its raw bytes before parsing it
with open(qa_csv_path, 'rb') as raw_handle:
    raw_bytes = raw_handle.read()
result = chardet.detect(raw_bytes)
print(f"감지된 인코딩: {result['encoding']}")

# Parse with the detected encoding, then keep rows whose doc_id lies in
# the inclusive range 1..100
df_qa_test = pd.read_csv(qa_csv_path, encoding=result['encoding'])
doc_id_in_range = df_qa_test['doc_id'].between(1, 100)
df_filtered_qa = df_qa_test[doc_id_in_range]

print(f"필터링된 데이터셋 크기: {len(df_filtered_qa)}")

# Cross-encoder similarity scoring.
# Already-constructed CrossEncoder instances, keyed by (model_name, device),
# so that a model is built once instead of on every scoring call.
_CROSS_ENCODER_CACHE = {}


def calculate_cross_encoder_similarity(query: str, prediction: str, model_name="BAAI/bge-reranker-v2-m3", device="cuda") -> float:
    """Score how well ``prediction`` matches ``query`` with a cross-encoder.

    Args:
        query: First text of the pair (the caller passes the reference answer).
        prediction: Second text of the pair (the generated answer).
        model_name: Name of the cross-encoder model to construct.
        device: Device string handed to ``CrossEncoder``; defaults to "cuda",
            which is what this function previously hard-coded.

    Returns:
        The model's score for the single (query, prediction) pair, converted
        to a plain Python ``float``.
    """
    cache_key = (model_name, device)
    cross_encoder_model = _CROSS_ENCODER_CACHE.get(cache_key)
    if cross_encoder_model is None:
        # First use of this model/device combination: construct and remember it.
        cross_encoder_model = CrossEncoder(model_name, device=device)
        _CROSS_ENCODER_CACHE[cache_key] = cross_encoder_model

    sentence_pairs = [[query, prediction]]
    similarity_scores = cross_encoder_model.predict(sentence_pairs)
    # predict() yields one score per pair; coerce to float to honour the
    # declared return type.
    return float(similarity_scores[0])


In [None]:
from multiprocessing import Pool, cpu_count

# Answer every QA row with the RAG chain and score the generated answer
def evaluate_doc_ids(df_qa_subset):
    """Run the RAG chain over each row of ``df_qa_subset`` and score the answers.

    For every row, the 'question' value is sent to the module-level
    ``rag_chain`` and the chain's 'answer' is compared with the row's
    'answer' value via ``calculate_cross_encoder_similarity``.

    Returns:
        A DataFrame built from one record per input row, with the keys
        doc_id, question, ground_truth, prediction and similarity
        (an empty DataFrame when the input has no rows).
    """
    def _evaluate_row(row):
        question, ground_truth = row['question'], row['answer']
        prediction = rag_chain.invoke({"input": question})['answer']
        similarity = calculate_cross_encoder_similarity(ground_truth, prediction)
        return {
            "doc_id": row['doc_id'],
            "question": question,
            "ground_truth": ground_truth,
            "prediction": prediction,
            "similarity": similarity,
        }

    records = [_evaluate_row(row) for _, row in df_qa_subset.iterrows()]
    return pd.DataFrame(records)

# Per-batch worker entry point
def process_batch(batch):
    """Evaluate one batch of QA rows by delegating to ``evaluate_doc_ids``."""
    batch_result = evaluate_doc_ids(batch)
    return batch_result

# Split the filtered QA set into consecutive batches of `batch_size` rows
batch_size = 10
batches = [df_filtered_qa.iloc[i:i + batch_size] for i in range(0, len(df_filtered_qa), batch_size)]

if __name__ == "__main__":
    print("배치 단위 평가를 시작합니다...")
    # The batches are evaluated one after another inside the kernel process
    # instead of through multiprocessing.Pool:
    #   * Pool forks the kernel after the tokenizers, the CUDA context and the
    #     GPU FAISS index have already been initialised. Forked children
    #     cannot re-initialise CUDA, and the tokenizers library warns about
    #     possible deadlocks in exactly this situation.
    #   * One worker per CPU core would also load one cross-encoder copy per
    #     worker onto the same GPU.
    #   * NOTE(review): threads are not used either, because the shared GPU
    #     FAISS index is documented as not thread-safe — revisit with a
    #     CPU index or per-thread GPU resources if parallelism is needed.
    parallel_results = [process_batch(batch) for batch in batches]

    # Merge and save the results. pd.concat raises on an empty list, so an
    # empty evaluation set yields an empty frame with the expected columns.
    if parallel_results:
        df_parallel_results = pd.concat(parallel_results, ignore_index=True)
    else:
        df_parallel_results = pd.DataFrame(
            columns=["doc_id", "question", "ground_truth", "prediction", "similarity"]
        )
    output_path = "/home/joonhai/DSDC/parallel_results.csv"
    df_parallel_results.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"평가 완료: 결과 저장 경로 - {output_path}")
