In [5]:
import sys
import os

# Make the parent directory importable (presumably so that the project-local
# packages used by this notebook resolve). Guarded so that re-running the cell
# does not append the same entry to sys.path again.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Directory of the vector database for the manifestos.
DATABASE_DIR_MANIFESTOS = "../../data/manifestos/chroma/openai"
# NOTE(review): this is identical to DATABASE_DIR_MANIFESTOS and looks like a
# copy-paste leftover; confirm the intended directory for the debates database.
DATABASE_DIR_DEBATES = "../../data/manifestos/chroma/openai"
# Directory that holds the question CSV files.
TEST_DATA_DIR = "../../data/questions"

In [21]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from database.vector_database import VectorDatabase
from models.embedding import ManifestoBertaEmbeddings
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_mistralai.chat_models import ChatMistralAI


from datasets import Dataset, DatasetDict

# Create simple chain

In [7]:
from models.generation import generate_chain_with_balanced_retrieval

# Embedding model handed to the vector database.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
# Alternative: embedding_model = ManifestoBertaEmbeddings()

# Chat model handed to the chain.
llm_settings = {"model_name": "gpt-3.5-turbo", "max_tokens": 2000, "temperature": 0}
LARGE_LANGUAGE_MODEL = ChatOpenAI(**llm_settings)
# Alternative:
# LARGE_LANGUAGE_MODEL = ChatMistralAI(
#     name="open-mixtral-8x7b", max_tokens=2000, temperature=0
# )

# NOTE(review): data_path starts with "../data" while the DATABASE_DIR_*
# constants start with "../../data" — confirm that both locations are intended.
manifesto_db_settings = {
    "data_path": "../data/manifestos",
    "embedding_model": embedding_model,
    "database_directory": DATABASE_DIR_MANIFESTOS,
    "source_type": "manifestos",
}
db_manifestos = VectorDatabase(**manifesto_db_settings)

# Chain over the manifesto database only; return_context=True so that the
# retrieved documents are available in the response alongside the answers.
chain = generate_chain_with_balanced_retrieval(
    [db_manifestos],
    k=5,
    language="Deutsch",
    return_context=True,
    llm=LARGE_LANGUAGE_MODEL,
)

reloaded database


In [27]:
# Build, for every party, a dataset of (question, retrieved contexts, answer).

simple_questions_path = os.path.join(TEST_DATA_DIR, "simple_questions.csv")
complex_questions_path = os.path.join(TEST_DATA_DIR, "complex_questions.csv")
df_test_simple = pd.read_csv(simple_questions_path)
df_test_complex = pd.read_csv(complex_questions_path)

parties = ["afd", "spd", "linke", "gruene", "fdp", "cdu"]

# Only the simple questions are sent through the chain here.
questions = df_test_simple["EINFACHE FRAGEN"]

# One column-oriented record per party, filled question by question.
dataset_dict = {}
for party in parties:
    dataset_dict[party] = {"question": [], "contexts": [], "answer": []}

for question in tqdm(questions):
    # A single chain call yields documents and answers for all parties.
    response = chain.invoke(question)

    for party in parties:
        party_docs = response["docs"]["manifestos"][party]
        party_contexts = [doc.page_content for doc in party_docs]
        party_answer = response["answer"][party]

        party_columns = dataset_dict[party]
        party_columns["question"].append(question)
        party_columns["contexts"].append(party_contexts)
        party_columns["answer"].append(party_answer)

# Wrap the per-party columns into one DatasetDict keyed by party.
party_datasets = {}
for party in parties:
    party_datasets[party] = Dataset.from_dict(dataset_dict[party])
dataset = DatasetDict(party_datasets)

  0%|          | 0/33 [00:00<?, ?it/s]

In [28]:
# Persist the dataset to the local "dataset" directory.
dataset_output_dir = "dataset"
dataset.save_to_disk(dataset_output_dir)

Saving the dataset (0/1 shards):   0%|          | 0/33 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/33 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/33 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/33 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/33 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/33 [00:00<?, ? examples/s]

In [30]:
from ragas import adapt
from ragas.metrics import answer_correctness, faithfulness

# LLM used to adapt the metric prompts to German.
openai_model_adaption = ChatOpenAI(model_name="gpt-4")

metrics_to_adapt = [faithfulness]
adapt(metrics=metrics_to_adapt, language="german", llm=openai_model_adaption)

# NOTE(review): only `faithfulness` is adapted above, yet the prompt printed
# here belongs to `answer_correctness` — confirm which metric was meant.
correctness_prompt_text = answer_correctness.correctness_prompt.to_string()
print(correctness_prompt_text)

Extract following from given question and ground truth
            "TP": statements that are present in both the answer and the ground truth,
            "FP": statements present in the answer but not found in the ground truth,
            "FN": relevant statements found in the ground truth but omitted in the answer, 
        
Output in only valid JSON format.

question: "What powers the sun and what is its primary function?"
answer: "The sun is powered by nuclear fission, similar to nuclear reactors on Earth, and its primary function is to provide light to the solar system."
ground_truth: "The sun is actually powered by nuclear fusion, not fission. In its core, hydrogen atoms fuse to form helium, releasing a tremendous amount of energy. This energy is what lights up the sun and provides heat and light, essential for life on Earth. The sun's light also plays a critical role in Earth's climate system and helps to drive the weather and ocean currents."
Extracted statements: {{"TP": ["The

In [32]:
from ragas import evaluate

# Score each party's dataset with the faithfulness metric. Results are stored
# party by party, so an interrupted run still keeps the parties already done.
result_dict = {}
for party in parties:
    print(f"Evaluating party: {party}")
    party_dataset = dataset[party]
    result_dict[party] = evaluate(party_dataset, metrics=[faithfulness])

Evaluating party: afd


Evaluating:   0%|          | 0/33 [00:00<?, ?it/s]

Evaluating party: spd


Evaluating:   0%|          | 0/33 [00:00<?, ?it/s]

Exception in thread Exception in threading.excepthook:
Exception ignored in thread started by: <bound method Thread._bootstrap of <Runner(Thread-8, stopped 10927239168)>>
Traceback (most recent call last):
  File "/Users/christianliedl/anaconda3/envs/bundestag/lib/python3.11/threading.py", line 1002, in _bootstrap
    self._bootstrap_inner()
  File "/Users/christianliedl/anaconda3/envs/bundestag/lib/python3.11/threading.py", line 1047, in _bootstrap_inner
    self._invoke_excepthook(self)
  File "/Users/christianliedl/anaconda3/envs/bundestag/lib/python3.11/threading.py", line 1359, in invoke_excepthook
    local_print("Exception in threading.excepthook:",
  File "/Users/christianliedl/anaconda3/envs/bundestag/lib/python3.11/site-packages/ipykernel/iostream.py", line 604, in flush
    self.pub_thread.schedule(self._flush)
  File "/Users/christianliedl/anaconda3/envs/bundestag/lib/python3.11/site-packages/ipykernel/iostream.py", line 267, in schedule
    self._event_pipe.send(b"")
  Fil

KeyboardInterrupt: 

In [33]:
# Display the evaluation results collected so far (keyed by party).
result_dict

{'afd': {'faithfulness': 0.6667}}

# Load test data

# Create Eval chain

In [7]:
from ragas.langchain import RagasEvaluatorChain
from ragas.metrics import (
    answer_relevancy,
    context_recall,
    context_relevancy,
    faithfulness,
)

# Build one evaluator chain per selected metric, keyed by the metric's name.
# Only context_relevancy is selected at the moment.
selected_metrics = [context_relevancy]
eval_chains = {}
for metric in selected_metrics:
    eval_chains[metric.name] = RagasEvaluatorChain(metric=metric)

# Run eval chain for test data

In [57]:
# Parties to evaluate. This must be defined before `metrics` is initialised:
# the previous ordering read `sources` before assigning it, which raises a
# NameError on a fresh kernel.
sources = ["gruene", "spd", "cdu", "afd", "fdp", "linke"]

# One list of scores per (metric, party) combination, e.g.
# "<metric name>_score_<party>".
metrics = {
    f"{name}_score_{party}": [] for party in sources for name in eval_chains
}

for question in df_test_simple["EINFACHE FRAGEN"]:
    print("_____________________________________")
    print(question)

    for party in sources:
        # Keys are named to be compatible with the eval chain. No generated
        # answer is supplied ("result" is left empty); only the documents
        # retrieved for this party are passed along with the question.
        output = {
            "query": question,
            "source_documents": db_manifestos.database.similarity_search(
                query=question, k=5, filter={"party": party}
            ),
            "result": "",
        }

        # Evaluate one metric at a time
        for name, eval_chain in eval_chains.items():
            score_name = f"{name}_score"
            scores = eval_chain(output)[score_name]
            print(f"{score_name}_{party}: {scores}")
            metrics[f"{score_name}_{party}"].append(scores)

_____________________________________
Was denken die Parteien darüber, mehr Geld für die Armee auszugeben, um Europa sicher zu halten?
context_relevancy_score: 0.0
context_relevancy_score: 0.05128205128205128
context_relevancy_score: 0.11458333333333333
context_relevancy_score: 0.03278688524590164
context_relevancy_score: 0.0
context_relevancy_score: 0.0
_____________________________________
Was sagen die Parteien, wie wir Transgender-Menschen helfen und dafür sorgen können, dass alle fair behandelt werden?
context_relevancy_score: 0.11494252873563218
context_relevancy_score: 0.10256410256410256
context_relevancy_score: 0.0
context_relevancy_score: 0.0
context_relevancy_score: 0.12903225806451613
context_relevancy_score: 0.2222222222222222
_____________________________________
Sollen wir in Europa mehr für den Schutz unserer Umwelt tun?
context_relevancy_score: 0.06796116504854369


KeyboardInterrupt: 

In [12]:
from openai import OpenAI

client = OpenAI()

# Ask the chat model directly to judge the relevance of the retrieved
# documents. Only the first question is processed (`.iloc[:1]`).
for question in df_test_simple["EINFACHE FRAGEN"].iloc[:1]:

    sources = ["gruene", "spd", "cdu", "afd", "fdp", "linke"]

    print("_____________________________________")
    print(question)
    # Same key names as used for the eval chain input
    output = {"query": question}

    for party in sources:
        retrieved_docs = db_manifestos.database.similarity_search(
            query=question, k=5, filter={"party": party}
        )
        output["source_documents"] = retrieved_docs

        # The embedded whitespace is part of the prompt text and is kept as is.
        instruction = (
            "Du hilfst mir bei der Evaluation eines RAG systems. Bewerte, ob die folgenden Dokumente relevant sind für die Frage. Antworte mit einer List, in der für jedes Dokument entweder 0 (nicht relevant) oder 1 (relevant) ausgegeben wird.\n"
            "        Beispiel: [0, 1, 1, 1, 0]"
        )

        # Number the documents starting at 1 and concatenate them.
        context = "".join(
            f"Dokument {number}: {doc.page_content}\n\n\n\n            "
            for number, doc in enumerate(retrieved_docs, start=1)
        )
        prompt = f"{instruction}\n\n\nHier sind die Dokumente:\n{context}\n\n\nHier ist die Frage:{question}"

        # The whole prompt is sent as a single system message.
        judge_messages = [{"role": "system", "content": prompt}]
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            temperature=0,
            messages=judge_messages,
        )

        print(completion.choices[0].message)

_____________________________________
Was denken die Parteien darüber, mehr Geld für die Armee auszugeben, um Europa sicher zu halten?
ChatCompletionMessage(content='1: 1\n2: 1\n3: 0\n4: 0\n5: 1', role='assistant', function_call=None, tool_calls=None)
ChatCompletionMessage(content='1, 1, 0, 1, 0', role='assistant', function_call=None, tool_calls=None)
ChatCompletionMessage(content='1, 1, 1, 0, 1', role='assistant', function_call=None, tool_calls=None)
ChatCompletionMessage(content='1, 0, 1, 1, 0', role='assistant', function_call=None, tool_calls=None)
ChatCompletionMessage(content='1, 2, 3, 4, 5', role='assistant', function_call=None, tool_calls=None)
ChatCompletionMessage(content='1, 1, 1, 1, 0', role='assistant', function_call=None, tool_calls=None)


ChatCompletionMessage(content='1, 1, 1, 1, 1', role='assistant', function_call=None, tool_calls=None)


In [37]:
# Collect the per-question scores into a frame, attach the question text and
# persist the result. The index is not written: it carries no information and
# would come back as a spurious "Unnamed: 0" column when the file is re-read.
df_metrics = pd.DataFrame(metrics).assign(
    question=df_test_simple["EINFACHE FRAGEN"]
)
df_metrics.to_csv("metrics.csv", index=False)

In [41]:
# Reload the persisted scores and report the mean of every score column.
df_metrics = pd.read_csv("metrics.csv")

# Take the score columns from the loaded file rather than from the in-memory
# `metrics` dict, so this cell also works on a fresh kernel. The question text
# and any index column written by an earlier save ("Unnamed: ...") are skipped.
score_columns = [
    column
    for column in df_metrics.columns
    if column != "question" and not column.startswith("Unnamed")
]

{f"{column}_mean": np.mean(df_metrics[column]) for column in score_columns}

{'faithfulness_mean': 0.6958333333333333,
 'answer_relevancy_mean': 0.8132587018852587,
 'context_relevancy_mean': 0.08362864874686092}