In [2]:
from glob import glob
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import BSHTMLLoader
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
import os
import pandas as pd
from charset_normalizer import detect

# Step 1: Load environment variables (python-dotenv)
load_dotenv()

# Step 2: Parameter grid for the experiment.
# Every chunk size is combined with every chunk overlap; one record per
# (chunk_size, chunk_overlap, query) is collected in `results`.
results: list[dict] = []
chunk_overlaps: list[int] = [50, 100, 150, 200]
chunk_sizes: list[int] = [500, 1000, 1500]

# Step 3: PDF and HTML file loading
def load_documents():
    """Load the source corpus from the local PDF and HTML folders.

    PDFs matching ``./췌장암_pdf/*.pdf`` are read with ``PyMuPDFLoader``;
    documents whose text is empty or whitespace-only are discarded.
    HTML files matching ``./췌장암_html/*.html`` are read with
    ``BSHTMLLoader``, using the encoding reported by
    ``charset_normalizer.detect`` for the raw bytes of each file.

    Loading is best-effort: a file that raises is reported on stdout and
    skipped, and the remaining files are still processed. The number of
    HTML files that loaded successfully is printed at the end.

    Returns:
        tuple[list, list]: ``(pdf_docs, html_docs)``, the loaded documents
        of each kind. Both lists are empty when no files match.
    """
    pdf_docs = []
    # Sorted so the corpus order does not depend on filesystem listing
    # order, which keeps repeated experiment runs comparable.
    for pdf in sorted(glob('./췌장암_pdf/*.pdf')):
        try:
            # The loader is built inside the try (as in the HTML loop below)
            # so that a failure while constructing it skips only this file.
            documents = PyMuPDFLoader(pdf).load()
            pdf_docs.extend(doc for doc in documents if doc.page_content.strip())
        except Exception as e:
            print(f"Error: {pdf}: {e}")

    html_docs = []
    success_count = 0
    for html_file in sorted(glob('./췌장암_html/*.html')):
        try:
            # Detect the encoding automatically from the raw bytes
            with open(html_file, 'rb') as f:
                detected_encoding = detect(f.read())['encoding']

            # Load the file using the detected encoding
            documents = BSHTMLLoader(html_file, open_encoding=detected_encoding).load()
            success_count += 1
            html_docs.extend(documents)
        except Exception as e:
            print(f"Error loading HTML {html_file}: {e}")

    print(f'인코딩 된 HTML파일 수: {success_count}')

    return pdf_docs, html_docs

# Load the corpus once; the same documents are re-split for every configuration
pdf_docs, html_docs = load_documents()

# Step 4: Test different chunk sizes and overlaps
# The embedding model and the chat model are shared by all configurations
embedding = OpenAIEmbeddings(model='text-embedding-3-large')
llm = ChatOpenAI(model='gpt-4o-mini', temperature=0)

# The prompt and the test queries are the same for every configuration, so they
# are built once here instead of on every pass through the nested loops.
# NOTE: the template lines keep their original 8-space indentation on purpose;
# that whitespace is part of the prompt text sent to the model.
template = """
        Your response should be in JSON format.
        You are an empathetic chatbot designed to provide information and support regarding diseases.
        You aim to maintain a consistent and friendly style while ensuring that explanations are easy for anyone to understand.
        When explaining your symptoms and how to deal with them, please answer clearly and accurately. Depending on the situation, use jargon to explain, but don't lie, be precise and detailed.
        You will do best to provide appropriate answers to questions.
        Your goal is to provide the answers I seek and offer the assistance.
        At the end of each response, You'll provide key terms related to the question, including diseases and medications.
        Put a line break after the period.
        Please refrain from unnecessary words.
        Please answer me only once.
        don't say it over and over again
        Don't use repeated phrases.
        Don't rewrite the question at the end.
        Don't answer anything after writing the keywords.
        Don't add questions at the end.
        Stop answering after listing keyword words.
        Please answer in 1000 words or less.
        Please must reply in Korean.
        you are designed to output json.


        Answer the question based only on the following context: {context}

        Question: {input}
        Output Format (JSON)
        {{
        "question": "Write the original user-submitted questions.",
        "answer": "Full response to original question.",
        "sources": "When writing your answer, organize it into sentences and record the context in which you wrote it.",
        "source_documents": [
            {{
            "title": "The title of the source document.",
            "page": "The page number of the source document.",
            "content": "The relevant content extracted from the source document.",
            "url": "The URL or file path of the source document."
            }}]
        }}
        """
prompt = ChatPromptTemplate.from_template(template)

# Test queries
queries = [
    "췌장암의 원인은 무엇인가요?",
    "췌장암 1기와 2기 3기 차이를 알려주세요",
    "췌장암이 유전과 관련이 있을 가능성이 있나요? 가족들에게 유전자 검사를 권해야 할까요?",
    "췌장암 치료를 위해 현재 연구 중인 새로운 약물이나 임상시험에 참여할 수 있는 방법이 있나요?",
    "췌장암으로 인해 발생하는 통증이나 소화 문제를 완화하기 위해 사용할 수 있는 방법이 무엇인가요?",
    "췌장암이 당뇨병과 관련이 있다는 이야기를 들었는데, 제 혈당 관리가 암 치료에 어떤 영향을 미칠 수 있나요?",
    "치료 과정 중에도 일상생활을 최대한 유지하고 생활의 질을 높이기 위해 추천할 만한 활동이나 프로그램이 있을까요?",
]

for chunk_size in chunk_sizes:
    for chunk_overlap in chunk_overlaps:
        # Initialize the text splitter for this configuration
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

        # Split the documents
        pdf_splits = text_splitter.split_documents(pdf_docs)
        html_splits = text_splitter.split_documents(html_docs)

        # Build a fresh in-memory collection for this configuration.
        # The collection name is deterministic, so a collection left over from
        # an earlier run of this cell in the same kernel would otherwise be
        # reopened and get the whole corpus appended a second time, which
        # duplicates documents and skews retrieval. Drop any leftover first.
        collection_name = f"pancreas_{chunk_size}_{chunk_overlap}"
        Chroma(embedding_function=embedding, collection_name=collection_name).delete_collection()
        vectordb = Chroma(embedding_function=embedding, collection_name=collection_name)

        # The two groups are added in separate calls, as before, to keep each
        # insert batch small; an empty group is skipped instead of being
        # handed to the vector store.
        if pdf_splits:
            vectordb.add_documents(pdf_splits)
        if html_splits:
            vectordb.add_documents(html_splits)

        # Setup the retriever: the LLM rewrites the question into several
        # variants and the top-3 hits of each variant are merged.
        retriever_from_llm = MultiQueryRetriever.from_llm(
            retriever=vectordb.as_retriever(search_kwargs={"k": 3}),
            llm=llm
        )

        # Setup the chain: the raw question is passed through unchanged as
        # `input` while the retriever fills `context`.
        setup_and_retrieval = RunnableParallel(
            {"context": retriever_from_llm, "input": RunnablePassthrough()}
        )
        chain = setup_and_retrieval | prompt | llm | StrOutputParser()

        for query in queries:
            # Best-effort: a failing query is recorded as an "Error: ..." row
            # so one bad call does not abort the whole parameter sweep.
            try:
                response = chain.invoke(query)
            except Exception as e:
                response = f"Error: {e}"
            results.append({
                "chunk_size": chunk_size,
                "chunk_overlap": chunk_overlap,
                "query": query,
                "response": response
            })

# Step 5: Save results to Excel
# The path is held in one variable so the confirmation message always names
# the file that was actually written (the message previously named a
# different file than the one saved).
output_path = "chunk_size_overlap_tests_2.xlsx"
df = pd.DataFrame(results)
df.to_excel(output_path, index=False)
print(f"Results saved to '{output_path}'")




인코딩 된 HTML파일 수: 42
Results saved to 'chunk_size_overlap_tests.xlsx'
