In [None]:
import sys
import os

# Make the parent directory importable; presumably this is where the project
# packages imported below (`models`, `database`, ...) live — TODO confirm.
sys.path.append("..")

# Directories handed to VectorDatabase as `database_directory`
# (one store for the manifestos, one for the debates).
DATABASE_DIR_MANIFESTOS = "../../data/manifestos/chroma/openai"
DATABASE_DIR_DEBATES = "../../data/debates/chroma/openai"
# CSV with the evaluation questions; its "question" column is what gets evaluated.
TEST_DATA_PATH = "../../data/questions/eval_questions.csv"

# Define the experiment and the run here. Both names are used to build the file paths of the saved dataset and metrics.

In [None]:
# Folder name of the experiment: outputs are written to Experiments/<experiment_name>/.
experiment_name = "3_retrieval_fetch_k"
# Identifier of this run: suffix of the saved dataset_<run> and metrics_<run>.csv files.
experiment_run_name = "3e_fetch_k_3"

In [None]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm


from models.RAG import RAG
from database.vector_database import VectorDatabase
from models.embedding import ManifestoBertaEmbeddings
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_mistralai.chat_models import ChatMistralAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI


from datasets import Dataset, DatasetDict, load_from_disk

In [None]:
def format_response(response):
    """Regroup a per-party chain response into per-field dictionaries.

    Args:
        response: Mapping ``party -> result``. Every result must provide the
            keys ``"question"``, ``"context"``, ``"docs"`` and ``"answer"``,
            where ``"docs"`` is itself a mapping ``source -> documents``.

    Returns:
        A dict with four keys:
            ``"question"``: the question stored in the first party entry,
            ``"context"``: ``{party: context}``,
            ``"docs"``: ``{source: {party: documents}}`` (the sources are
            taken from the first party entry),
            ``"answer"``: ``{party: answer}``.

    Raises:
        ValueError: If ``response`` is empty.
        KeyError: If a party entry lacks an expected key or source.
    """
    # Fail with a clear message instead of leaking a bare StopIteration
    # from next(iter(...)) when there is no party entry at all.
    if not response:
        raise ValueError("response must contain at least one party")

    # The first entry is the reference for the question and the source names.
    first_entry = next(iter(response.values()))
    sources = first_entry["docs"].keys()

    return {
        "question": first_entry["question"],
        "context": {party: entry["context"] for party, entry in response.items()},
        "docs": {
            source: {party: entry["docs"][source] for party, entry in response.items()}
            for source in sources
        },
        "answer": {party: entry["answer"] for party, entry in response.items()},
    }

# Create chain

In [None]:
from models.generation import generate_chain

# Select an embedding_model
########################################################################
# The same embedding model instance is passed to both vector databases below.
# Alternative kept for reference:
# embedding_model = ManifestoBertaEmbeddings()
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")


# Select an LLM
#########################################################################
# The answer-generating model: at most 300 tokens per answer, temperature 0.
LARGE_LANGUAGE_MODEL = ChatOpenAI(
    model_name="gpt-3.5-turbo", max_tokens=300, temperature=0
)

# Alternative LLMs kept for reference:
# LARGE_LANGUAGE_MODEL = ChatMistralAI(
#     name="open-mixtral-8x7b", max_tokens=300, temperature=0
# )

# LARGE_LANGUAGE_MODEL = ChatAnthropic(
#     model_name="claude-3-haiku-20240307", max_tokens=300, temperature=0
# )


# One vector database per source type.
# NOTE(review): data_path points to "../data/..." while the DATABASE_DIR_*
# constants point to "../../data/..." — confirm that both resolve as intended.
db_manifestos = VectorDatabase(
    data_path="../data/manifestos",
    embedding_model=embedding_model,
    database_directory=DATABASE_DIR_MANIFESTOS,
    source_type="manifestos",
)

db_debates = VectorDatabase(
    data_path="../data/debates",
    embedding_model=embedding_model,
    database_directory=DATABASE_DIR_DEBATES,
    source_type="debates",
)

# Build the chain over both databases; answers are requested in German.
# NOTE(review): the run is named "3e_fetch_k_3", but only k=3 is passed here —
# confirm whether generate_chain should also receive a fetch_k argument.
chain = generate_chain(
    [db_manifestos, db_debates],
    llm=LARGE_LANGUAGE_MODEL,
    language="Deutsch",
    k=3,
)


# Alternative entry point via the RAG class, kept for reference:
# rag = RAG(
#     [db_manifestos, db_debates], llm=LARGE_LANGUAGE_MODEL, k=3, language="Deutsch"
# )

# Create dataset with question, context, and answer for chain

In [None]:
recreate_dataset = True
if recreate_dataset:
    # Rebuild the evaluation dataset: every test question is sent through the
    # chain once, and question, retrieved contexts and answer are stored
    # separately for each party.
    df_test = pd.read_csv(TEST_DATA_PATH)

    # Parties to evaluate
    parties = ["afd", "spd", "linke", "gruene", "fdp", "cdu"]

    # Questions to evaluate
    questions = df_test["question"]

    # Column-wise result store, one entry per party
    dataset_dict = {}
    for party in parties:
        dataset_dict[party] = {"question": [], "contexts": [], "answer": []}

    # Query the chain for each question and distribute the result per party
    for question in tqdm(questions):
        print(question)
        response = format_response(chain.invoke(question))

        for party in parties:
            # Only the manifesto documents are recorded as contexts.
            manifesto_docs = response["docs"]["manifestos"][party]
            contexts = [doc.page_content for doc in manifesto_docs]
            answer = response["answer"][party]

            record = {"question": question, "contexts": contexts, "answer": answer}
            for column, value in record.items():
                dataset_dict[party][column].append(value)

    # One Dataset split per party
    dataset = DatasetDict(
        {name: Dataset.from_dict(columns) for name, columns in dataset_dict.items()}
    )

    # Persist under the experiment/run specific path
    dataset.save_to_disk(f"Experiments/{experiment_name}/dataset_{experiment_run_name}")

In [None]:
# Load the dataset from the path the creation cell saves to, so the evaluation
# below always works on the persisted copy (also when recreate_dataset is False).
dataset = load_from_disk(f"Experiments/{experiment_name}/dataset_{experiment_run_name}")

# Import Ragas and translate prompts to German

In [None]:
import ragas
from ragas.metrics import faithfulness, answer_relevancy, context_relevancy
from evaluation import Evaluator

# LLM handed to ragas.adapt for adapting the metric prompts.
openai_model_adaption = ChatOpenAI(model_name="gpt-4")
# Adapt the prompts of the three listed metrics to German.
ragas.adapt(
    metrics=[faithfulness, answer_relevancy, context_relevancy],
    language="german",
    llm=openai_model_adaption,
)

# Project-local evaluator; its context_relevancy method is what the metrics
# loop calls for every question/party pair.
# NOTE(review): whether Evaluator uses the adapted ragas prompts is not visible here — confirm.
evaluator = Evaluator()

# Compute metrics with ragas

In [None]:
from ragas.run_config import RunConfig

# Configuration for ragas.evaluate (only needed by the disabled call below).
# The timeout field of RunConfig is called `timeout`; assigning `max_timeout`
# merely created an unused attribute, so the limit was never applied. Passing
# the values as constructor arguments also makes a misspelled field fail loudly.
run_config = RunConfig(
    timeout=60,  # in s
    max_retries=3,
    max_wait=20,  # in s
)

# Define which parties should be evaluated
parties = ["afd", "spd", "linke", "gruene", "fdp", "cdu"]

# Define which metrics should be evaluated (only used by ragas.evaluate)
list_of_metrics = [faithfulness, answer_relevancy]

# The dataset cell appends every question to every party split, so all splits
# share the same questions in the same order. Use the first evaluated party
# as the single reference for both the texts and the count.
questions = list(dataset[parties[0]]["question"])
num_questions = len(questions)

# Nested results: {question: {party: score mapping}}
results_dict = {}

for i in tqdm(range(num_questions)):
    question = questions[i]
    results_dict[question] = {}
    print(f"Question: {question}")
    for party in parties:
        # Alternative via ragas (uses list_of_metrics and run_config):
        # ragas_score = ragas.evaluate(
        #     dataset[party].select([i]), metrics=list_of_metrics, run_config=run_config
        # )
        # Evaluate one row at a time so that scores can be stored per question.
        ragas_score = evaluator.context_relevancy(dataset[party].select([i]))

        results_dict[question][party] = ragas_score
    print(results_dict[question])

In [None]:
# Flatten the nested results {question: {party: {metric: value}}} into a
# mapping keyed by (question, party, metric) tuples.
# The loop variables are deliberately NOT called `parties`/`metrics`: reusing
# `parties` here would overwrite the global list of party names with a dict.
flattened_data = {}
for question, party_scores in results_dict.items():
    for party, metric_values in party_scores.items():
        for metric, value in metric_values.items():
            flattened_data[(question, party, metric)] = value

# Create a multi-index Series and pivot it: the first unstack() moves the
# metric level into the columns, the second one the party level. The result
# has one row per question and (metric, party) columns.
index = pd.MultiIndex.from_tuples(flattened_data.keys())
df = pd.Series(flattened_data, index=index).unstack().unstack()

file_name = f"Experiments/{experiment_name}/metrics_{experiment_run_name}.csv"

df.to_csv(file_name)

In [None]:
# Check if file saved correctly.
# The saved frame has two header rows (metric, party) but only ONE index
# column (the question), because both unstack() calls moved the party and
# metric levels into the columns. Reading with index_col=[0, 1] would wrongly
# pull the first data column into the index.
df = pd.read_csv(file_name, header=[0, 1], index_col=0)
df.head()