Document를 보고 질문에 대한 정답 document 인덱스를 만들어야 함.

In [45]:
import os
import pandas as pd
from tqdm import tqdm

import openai
from langchain_community.vectorstores import Milvus

from langchain_openai import OpenAIEmbeddings, ChatOpenAI




In [46]:
# Experiment configuration. The first five values are combined into the
# experiment name that locates the stored embeddings.
company = 'OPENAI'                           # embedding provider label
embedding_models = 'text-embedding-3-large'  # embedding model name
# Flags encoded into the experiment name as h<flag>/t<flag>/s<flag>.
# NOTE(review): presumably "headers included", "tables included" and
# "documents split" -- confirm against the embedding pipeline.
header_include = True
table_include = True
split_yn = False
document_name = "2024 IONIQ5"
# The original variable name was misspelled; keep it as an alias so that any
# existing reference to `documnet_name` keeps working.
documnet_name = document_name

In [47]:
# Build the experiment identifier from the configuration values, then load the
# embeddings parquet file stored in that experiment's result directory.
exp_name = "_".join([
    company,
    embedding_models,
    f"h{header_include}",
    f"t{table_include}",
    f"s{split_yn}",
])
exp_dir = f"../experiment/result/{exp_name}"

embedding_result_path = f"{exp_dir}/embeddings.parquet"
df = pd.read_parquet(embedding_result_path)

In [105]:
# Keep only rows whose body text differs from each of its own headings
# (h1, h2 and h3); rows where the text equals a heading are dropped.
body_text = df['doc_contents']
differs_from_headings = (
    (df['h1'] != body_text)
    & (df['h2'] != body_text)
    & (df['h3'] != body_text)
)
df = df[differs_from_headings]

# Renumber rows from 0 and also expose that numbering as an 'index' column.
df = df.reset_index(drop=True).reset_index()

# Missing-value handling: empty string for table text, empty list for URL columns.
df['table_contents'] = df['table_contents'].fillna('')
for url_column in ('img_urls', 'table_img_urls'):
    # Non-None values must provide .tolist() (presumably numpy arrays from parquet).
    df[url_column] = df[url_column].apply(
        lambda urls: [] if urls is None else urls.tolist()
    )

# LLM을 활용한 Evaluation Test Set 생성

In [62]:
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI


In [213]:
# Use the ChatOpenAI class from `langchain_openai` (the same package the
# notebook's first import cell uses). The legacy `langchain.chat_models`
# import is deprecated and would shadow the newer class for every later cell.
from langchain_openai import ChatOpenAI

# Deterministic (temperature=0) chat model used by the question/answer chain.
llm = ChatOpenAI(api_key=os.environ['OPENAI_API_KEY'], temperature=0, model="gpt-3.5-turbo")

In [246]:
# Prompt for the question-generation step. Placeholders: {chapter},
# {majorheading}, {minorheading}, {context} and {num_questions_per_chunk}.
# The stray backslashes that used to precede "quiz" and "across" were removed:
# "\a" is the BEL control character in a normal Python string, so the prompt
# contained "\x07cross", and "\q" is an invalid escape sequence.
retriever_prompt_template = """
\n\nHuman: Here is the context information, inside <context></context> XML tags.
Please don't make questions with the contents of the table.

<chapter>{chapter}</chapter>Given the chapter name
<majorheading>{majorheading}</majorheading>Given the major heading of chapter.
<minorheading>{minorheading}</minorheading>Given the minor heading of chapter.
<context>{context}</context>Given the context information and not prior knowledge.
generate only questions based on the below query.
You are a Professor. Your task is to setup {num_questions_per_chunk} questions for an upcoming quiz/examination.
The questions should be diverse in nature across the document.
The questions should not contain options, start with "-"
Restrict the questions to the context information provided.
Write in Korean. 

\n\nAssistant:"""

# Declare every placeholder used by the question-generation template. The
# template also contains {chapter}, {majorheading} and {minorheading}, so
# listing only two variables disagrees with the template and is rejected by
# LangChain versions that validate the declared variables against it.
PROMPT_RETRIEVER = PromptTemplate(
    template=retriever_prompt_template,
    input_variables=[
        "chapter",
        "majorheading",
        "minorheading",
        "context",
        "num_questions_per_chunk",
    ],
)

In [247]:
# Prompt for the answer-generation step. Placeholders: {context} (the source
# passage) and {question} (the question text to answer).
# NOTE(review): the rules "Write as much as you can" and "Use three sentences
# maximum and keep the answer concise" contradict each other -- decide which
# one is intended. "The each answers" is also ungrammatical. The text is left
# untouched here because changing it changes what the model is asked to do.
generation_prompt_template = """
Here is the context, inside <context></context> XML tags.

<context>
{context}
</context>
Only using the context as above, answer the following question with the rules as below:
    - Don't insert XML tag such as <context> and </context> when answering.
    - Write as much as you can
    - Be courteous and polite
    - Only answer the question if you can find the answer in the context with certainty.
    - Skip the preamble
    - Use three sentences maximum and keep the answer concise.
    - If the answer is not in the context, just say "Could not find answer in given contexts."
    - The each answers should start with "-"
    - Answer in Korean.
Question:
{question}
Answer:"""

# Prompt object for the answer step; it is filled with the passage ("context")
# and the question text ("question").
PROMPT_GENERATION = PromptTemplate(
    input_variables=["context", "question"],
    template=generation_prompt_template,
)

In [248]:
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains import SequentialChain


# Step 1: generate questions from a passage; the text is stored under "question".
chain1 = LLMChain(
    llm=llm,
    prompt=PROMPT_RETRIEVER,
    output_key="question",
    verbose=False,
)
# Step 2: answer those questions from the same passage; stored under "answer".
chain2 = LLMChain(
    llm=llm,
    prompt=PROMPT_GENERATION,
    output_key="answer",
    verbose=False,
)

# Run both steps in order and return the passage together with the generated
# question and answer.
chain = SequentialChain(
    chains=[chain1, chain2],
    input_variables=[
        "chapter",
        "majorheading",
        "minorheading",
        "context",
        "num_questions_per_chunk",
    ],
    output_variables=['context', 'question', 'answer'],
    verbose=False,
)

In [249]:
# Start both output columns as empty strings; the generation step fills them in.
for qa_column in ('question', 'answer'):
    df[qa_column] = ''

In [258]:
def _questions_for_length(n_chars: int) -> int:
    """Return how many questions to request for a passage of ``n_chars`` characters.

    Fewer than 512 characters -> 1, fewer than 1024 -> 2, otherwise 3.
    """
    if n_chars < 512:
        return 1
    if n_chars < 1024:
        return 2
    return 3


def make_qa_from_pandas(pdf: pd.DataFrame) -> pd.DataFrame:
    """Generate a question/answer pair for every row of ``pdf``.

    For each row the module-level ``chain`` is called with the row's headings
    (``h1``, ``h2``, ``h3``) and passage (``doc_contents``); the returned
    ``question`` and ``answer`` are written back to the same row of ``pdf``.

    The previous version iterated over ``pdf`` but wrote to, and returned, the
    global ``df``; it only worked when called with ``df`` itself. Calling it
    with ``df`` still behaves as before.

    Args:
        pdf: Frame with columns ``h1``, ``h2``, ``h3`` and ``doc_contents``.
            It is modified in place; ``question`` / ``answer`` columns are
            created (as empty strings) when absent.

    Returns:
        The same ``pdf`` object, with ``question`` and ``answer`` filled in.
    """
    # Create the output columns up front so that row-wise assignment below
    # never has to enlarge the frame.
    for column in ('question', 'answer'):
        if column not in pdf.columns:
            pdf[column] = ''

    # `total` lets tqdm show a real progress bar; iterrows() has no length.
    for i, row in tqdm(pdf.iterrows(), total=len(pdf)):
        context = row['doc_contents']
        result = chain({
            'chapter': row['h1'],
            'majorheading': row['h2'],
            'minorheading': row['h3'],
            'context': context,
            'num_questions_per_chunk': _questions_for_length(len(context)),
        })

        pdf.loc[i, 'question'] = result['question']
        pdf.loc[i, 'answer'] = result['answer']
    return pdf

In [250]:
# NOTE(review): this list is never filled in this cell; it is kept in case
# other cells read it.
bad_request_index = []

# Trial run over the rows labelled 0..10 only. The limit is applied to the
# iteration itself: the earlier `if i == 10: break` sat after a `continue`, so
# it was skipped whenever row 10 had table images and the loop then ran (and
# called the LLM) for every remaining row.
# Assumes the index is the 0..n-1 range produced by the earlier reset_index().
last_trial_index = 10
trial_rows = df.loc[:last_trial_index]

for i, row in tqdm(trial_rows.iterrows(), total=len(trial_rows)):
    # Rows that reference table images are left without question/answer.
    if row['table_img_urls']:
        continue

    context = row['doc_contents']
    # Longer passages get more questions: <512 chars -> 1, <1024 -> 2, else 3.
    if len(context) < 512:
        num_questions_per_chunk = 1
    elif len(context) < 1024:
        num_questions_per_chunk = 2
    else:
        num_questions_per_chunk = 3

    result = chain({
        'chapter': row['h1'],
        'majorheading': row['h2'],
        'minorheading': row['h3'],
        'context': context,
        'num_questions_per_chunk': num_questions_per_chunk,
    })

    df.loc[i, 'question'] = result['question']
    df.loc[i, 'answer'] = result['answer']


10it [00:50,  5.05s/it]


In [261]:
# Show the generated question text for the row labelled 3.
print(df.loc[3, 'question'])

- 전기 자동차의 주요 장치 중 하나인 OBC는 무엇을 하는 장치인가요?
- 고전압 부품 및 구동용(고전압) 배터리를 분리하거나 손상시키면 어떤 위험성이 있을까요?


In [262]:
# Show the generated answer text for the row labelled 3.
print(df.loc[3, 'answer'])


- OBC는 구동용(고전압) 배터리를 충전하는 장치입니다.
- 고전압 부품 및 구동용(고전압) 배터리를 분리하거나 손상시키면 감전 등의 사고가 발생해 심각한 부상을 입을 수 있으며 차량의 성능 및 내구에 영향을 줄 수 있습니다. 
- 고전압 부품 및 구동용(고전압) 배터리의 점검 및 정비가 필요할 경우 당사 직영하이테크센터나 블루핸즈에서 점검을 받아야 합니다.


In [263]:
# Write headings, passage and generated Q/A for rows labelled up to 10 to a
# cp949-encoded CSV in the home directory.
export_columns = ['h1', 'h2', 'h3', 'doc_contents', 'question', 'answer']
df.loc[:10, export_columns].to_csv("~/test.csv", index=False, encoding='cp949')

# RAGAS를 위한 custom dataloader 생성

In [50]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

In [30]:
from langchain_community.document_loaders import DataFrameLoader

# Wrap the DataFrame rows as LangChain documents; the text of each document is
# taken from the 'doc_table_contents' column.
loader = DataFrameLoader(
    df,
    page_content_column="doc_table_contents",
)
documents = loader.load()


In [32]:
# OpenAI 임베딩 모델 설정
embedding_model = OpenAIEmbeddings(model=embedding_models,
                                  openai_api_key=os.environ['OPENAI_API_KEY'])
llm = ChatOpenAI(model_name="gpt-4-turbo", temperature=0)  # Modify model_name if you have access to GPT-4


In [33]:

# Build the RAGAS test-set generator; the same chat model is used both to
# generate and to critique samples.
generator = TestsetGenerator.from_langchain(
    embeddings=embedding_model,
    generator_llm=llm,
    critic_llm=llm,
)

In [None]:


# Request 10 synthetic samples: half simple, a quarter reasoning and a quarter
# multi-context.
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=10,
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
)

# Pull the questions and reference answers out as plain Python lists.
test_df = testset.to_pandas()
test_questions = test_df["question"].tolist()
test_groundtruths = test_df["ground_truth"].tolist()
test_df.head()