In [None]:
# API 키를 환경변수로 관리하기 위한 설정 파일
from dotenv import load_dotenv

# LangSmith 추적을 설정합니다. https://smith.langchain.com
# !pip install langchain-teddynote
from langchain_teddynote import logging

# API 키 정보 로드
load_dotenv()

# 프로젝트 이름을 입력합니다.
# logging.langsmith("rag_evaluation")

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader
import csv

file_path = "../data/csv_data/rental_data_with_null.csv"


def get_csv_headers(file_path):
    with open(file_path, mode="r", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile)
        headers = next(reader)  # 첫 번째 줄(헤더) 가져오기
    return headers


headers = get_csv_headers(file_path)

# CSV 로더 생성
loader = CSVLoader(
    file_path=file_path,
    csv_args={
        "delimiter": ",",
        "quotechar": '"',
        "fieldnames": headers,
    },
    # source_column="place",
    content_columns=headers,
    metadata_columns=[
        "place",
        "oneroom_half_year",
        "oneroom_year",
        "tworoom_half_year",
        "tworoom_year",
    ],
)
docs = loader.load()
print(docs[1].metadata)

In [None]:
i = 1

for doc in docs[i:]:
    row = doc.page_content.split("\n")
    row_str = "<row>"
    for element in row:
        splitted_element = element.split(":")
        value = splitted_element[-1]
        col = ":".join(splitted_element[:-1])
        row_str += f"<{col}>{value.strip()}</{col}>"
    row_str += "</row>\n\n"

    docs[i].page_content = row_str
    i += 1
    # print(ret[i].page_content)
    # ret += row_str

ret = docs

In [None]:
# # 각 문서에 파일 이름을 추가합니다.
# for i in ret[1:]:
#     i.metadata["filename"] = i.metadata["source"]

ret[1].metadata

In [None]:
from ragas.testset.persona import Persona

persona_manager = Persona(
    name="Manager",
    role_description="Wants to know about the information related to my own room.",
)

personas = [persona_manager]
personas

In [None]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
generator_embeddings = LangchainEmbeddingsWrapper(
    OpenAIEmbeddings(model="text-embedding-3-small")
)

In [None]:
from ragas.testset import TestsetGenerator

generator = TestsetGenerator(
    llm=generator_llm, embedding_model=generator_embeddings, persona_list=personas
)

# dataset = generator.generate_with_langchain_docs(ret[1:], testset_size=10)

In [None]:
from ragas.testset.synthesizers.single_hop.specific import (
    SingleHopSpecificQuerySynthesizer,
)

distribution = [
    (SingleHopSpecificQuerySynthesizer(llm=generator_llm), 1.0),
]

for query, _ in distribution:
    prompts = await query.adapt_prompts("korean", llm=generator_llm)
    query.set_prompts(**prompts)

In [None]:
dataset = generator.generate_with_langchain_docs(
    ret[1:], testset_size=10, query_distribution=distribution
)

In [None]:
dataset.to_pandas().head()

In [None]:
dataset.to_pandas().to_csv("../data/csv_data/new_ragas_dataset_1.csv", index=False)

In [27]:
from datasets import load_dataset

# 절대 경로 설정
csv_file_path = "../data/csv_data/new_ragas_dataset.csv"

# 로컬 CSV 파일로 데이터셋 로드
dataset = load_dataset("csv", data_files=csv_file_path)

# 데이터셋 정보 출력
print(dataset)



# from datasets import load_dataset
# dataset = load_dataset(
#     "explodinggradients/amnesty_qa",
#     "english_v3",
# )
# print(dataset)
# print(dataset["eval"][0])

DatasetDict({
    train: Dataset({
        features: ['user_input', 'reference_contexts', 'reference', 'synthesizer_name'],
        num_rows: 10
    })
})


In [28]:
from ragas import EvaluationDataset

eval_dataset = EvaluationDataset.from_hf_dataset(dataset["train"])

ValidationError: 1 validation error for SingleTurnSample
reference_contexts
  Input should be a valid list [type=list_type, input_value="['<row><name>우리오...oom_year></row>\\n\\n']", input_type=str]
    For further information visit https://errors.pydantic.dev/2.10/v/list_type

In [None]:
from ragas.metrics import (
    LLMContextRecall,
    Faithfulness,
    FactualCorrectness,
    SemanticSimilarity,
)
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from ragas import evaluate

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
evaluator_embeddings = LangchainEmbeddingsWrapper(
    OpenAIEmbeddings(model="text-embedding-3-small")
)

metrics = [
    LLMContextRecall(llm=evaluator_llm),
    FactualCorrectness(llm=evaluator_llm),
    Faithfulness(llm=evaluator_llm),
    SemanticSimilarity(embeddings=evaluator_embeddings),
]

results = evaluate(dataset=eval_dataset, metrics=metrics)

ValueError: The metric [factual_correctness] that is used requires the following additional columns ['response'] to be present in the dataset.