In [23]:
import sys
import os

sys.path.append("..")

DATABASE_DIR_MANIFESTOS = "../../data/manifestos/chroma/jina-embeddings-v2-base-de"
DATABASE_DIR_DEBATES = "../../data/debates/chroma/jina-embeddings-v2-base-de"
TEST_DATA_PATH = "../../data/questions/eval_questions.csv"

# Here we define the experimental run! This is important for the filenames, etc!

In [24]:
experiment_name = "4_embedding_model"
experiment_run_name = "4d_jina-embeddings-v2-base-de"

In [25]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm


from models.RAG import RAG
from database.vector_database import VectorDatabase
from models.embedding import (
    ManifestoBertaEmbeddings,
    SentenceTransformerEmbedding,
    JinaAIEmbedding,
)
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_mistralai.chat_models import ChatMistralAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI


from datasets import Dataset, DatasetDict, load_from_disk

In [26]:
def format_response(response):
    q = response[next(iter(response))]["question"]
    c = {party: response[party]["context"] for party in response.keys()}
    d = {
        source: {party: response[party]["docs"][source]
                 for party in response.keys()}
        for source in response[list(response.keys())[0]]["docs"].keys()
    }
    a = {party: response[party]["answer"] for party in response.keys()}
    response = {"question": q, "context": c, "docs": d, "answer": a}
    return response

# Create chain

In [27]:
from models.generation import generate_chain

# Select an embedding_model
########################################################################
# embedding_model = ManifestoBertaEmbeddings()
# embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
# embedding_model = SentenceTransformerEmbedding("multi-qa-mpnet-base-dot-v1")
embedding_model = JinaAIEmbedding()

# Select an LLM
#########################################################################
LARGE_LANGUAGE_MODEL = ChatOpenAI(
    model_name="gpt-3.5-turbo", max_tokens=300, temperature=0
)

# LARGE_LANGUAGE_MODEL = ChatMistralAI(
#     name="open-mixtral-8x7b", max_tokens=300, temperature=0
# )

# LARGE_LANGUAGE_MODEL = ChatAnthropic(
#     model_name="claude-3-haiku-20240307", max_tokens=300, temperature=0
# )


db_manifestos = VectorDatabase(
    data_path="../data/manifestos",
    embedding_model=embedding_model,
    database_directory=DATABASE_DIR_MANIFESTOS,
    source_type="manifestos",
)

db_debates = VectorDatabase(
    data_path="../data/debates",
    embedding_model=embedding_model,
    database_directory=DATABASE_DIR_DEBATES,
    source_type="debates",
)

chain = generate_chain(
    [db_manifestos, db_debates],
    llm=LARGE_LANGUAGE_MODEL,
    language="Deutsch",
    k=3,
)


# rag = RAG(
#     [db_manifestos, db_debates], llm=LARGE_LANGUAGE_MODEL, k=3, language="Deutsch"
# )

reloaded database
reloaded database


# Create dataset with question, context, and answer for chain

In [28]:
recreate_dataset = True
if recreate_dataset:
    # Create dataset with question, context and answer

    # Load all test questions
    # df_test_simple = pd.read_csv(os.path.join(TEST_DATA_DIR, "simple_questions.csv"))
    # df_test_complex = pd.read_csv(os.path.join(TEST_DATA_DIR, "complex_questions.csv"))
    df_test = pd.read_csv(TEST_DATA_PATH)

    # Here we select, which parties we want to evaluate
    parties = ["afd", "spd", "linke", "gruene", "fdp", "cdu"]

    # Here we select the questions that should be evaluated
    questions = df_test["question"]

    # Dictionary to save results
    dataset_dict = {
        party: {"question": [], "contexts": [], "answer": []} for party in parties
    }

    # Loop through all questions and get chain answer
    for question in tqdm(questions):
        print(question)
        response = chain.invoke(question)
        response = format_response(response)

        # for pary in parties:
        for party in parties:
            contexts = [
                doc.page_content for doc in response["docs"]["manifestos"][party]
            ]
            answer = response["answer"][party]

            dataset_dict[party]["question"].append(question)
            dataset_dict[party]["contexts"].append(contexts)
            dataset_dict[party]["answer"].append(answer)

    dataset = DatasetDict(
        {party: Dataset.from_dict(dataset_dict[party]) for party in parties}
    )

    # Save
    dataset.save_to_disk(
        f"Experiments/{experiment_name}/dataset_{experiment_run_name}")

  0%|          | 0/24 [00:00<?, ?it/s]

Sollen die Militärausgaben erhöht werden?
Sollte es ein Selbstbestimmungsrecht für Trans-Personen geben?
Sollen wir in Europa mehr für den Natur- und Artenschutz tun?
Wie soll die Landwirtschaft unterstützt werden? 
Wie soll Europa Handel mit anderen Ländern treiben? (z.B. Handelsabkommen)
Wie sollen Flüchtlinge in Europa verteilt werden? 
Wie stehen die Parteien zur gleichgeschlechtlichen Ehe?
Wie sollten wir Fischfang betreiben damit das Meer und die Tiere geschützt werden?
Was wollen die Parteien tun um das Tierwohl in der Viehhaltung zu verbessern? 
Wie sollte die Zusammenarbeit mit den USA aussehen?
Wie können wir Jungendarbeitslosigkeit reduzieren? 
Wie sehen die Pläne zum Erreichen der Klimaneutralität aus?
Wie soll das Verhältnis zu China aussehen im Bezug auf Protektionismus?
Wie sollen Kameras und Technik dabei helfen Verbrechen zu stoppen ohne dass unsere Privatsphäre gestört wird?
Sollten mehr Länder der EU beitreten?
Wie sollen europäische high-tech Innovationen gefördert 

Saving the dataset (0/1 shards):   0%|          | 0/24 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/24 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/24 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/24 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/24 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/24 [00:00<?, ? examples/s]

In [29]:
dataset = load_from_disk(f"Experiments/{experiment_name}/dataset_{experiment_run_name}")

# Import Ragas and translate prompts to German

In [30]:
import ragas
from ragas.metrics import faithfulness, answer_relevancy, context_relevancy
from evaluation import Evaluator

# llm used for adaptation
openai_model_adaption = ChatOpenAI(model_name="gpt-4")
ragas.adapt(
    metrics=[faithfulness, answer_relevancy, context_relevancy],
    language="german",
    llm=openai_model_adaption,
)

evaluator = Evaluator()

# Compute metrics with ragas

In [31]:
from ragas.run_config import RunConfig

# Configuration for ragas.evaluate
run_config = RunConfig()
run_config.max_retries = 3
run_config.max_timeout = 60  # in s
run_config.max_wait = 20  # in s

num_questions = len(dataset["afd"])
questions = [dataset["fdp"][i]["question"] for i in range(num_questions)]

results_dict = {}
# Define which parties should be evaluated
parties = ["afd", "spd", "linke", "gruene", "fdp", "cdu"]

# Define which metrics should be evaluated
list_of_metrics = [faithfulness, answer_relevancy]

for i in tqdm(range(num_questions)):
    results_dict.update({questions[i]: {}})
    print(f"Question: {questions[i]}")
    for party in parties:
        # print(f"Evaluating party: {party}")
        # ragas_score = ragas.evaluate(
        #     dataset[party].select([i]), metrics=list_of_metrics, run_config=run_config
        # )
        ragas_score = evaluator.context_relevancy(dataset[party].select([i]))

        results_dict[questions[i]].update({party: ragas_score})
    print(results_dict[questions[i]])

  0%|          | 0/24 [00:00<?, ?it/s]

Question: Sollen die Militärausgaben erhöht werden?
Exception occurred
Exception occurred
Exception occurred
{'afd': {'context_relevancy': 0.6666666666666666}, 'spd': {'context_relevancy': 0.0}, 'linke': {'context_relevancy': None}, 'gruene': {'context_relevancy': 0.6666666666666666}, 'fdp': {'context_relevancy': None}, 'cdu': {'context_relevancy': None}}
Question: Sollte es ein Selbstbestimmungsrecht für Trans-Personen geben?
Exception occurred
{'afd': {'context_relevancy': 0.0}, 'spd': {'context_relevancy': 0.6666666666666666}, 'linke': {'context_relevancy': 0.6666666666666666}, 'gruene': {'context_relevancy': 0.6666666666666666}, 'fdp': {'context_relevancy': 0.3333333333333333}, 'cdu': {'context_relevancy': None}}
Question: Sollen wir in Europa mehr für den Natur- und Artenschutz tun?
{'afd': {'context_relevancy': 0.0}, 'spd': {'context_relevancy': 0.6666666666666666}, 'linke': {'context_relevancy': 0.6666666666666666}, 'gruene': {'context_relevancy': 1.0}, 'fdp': {'context_relevanc

In [32]:
# Flatten the dictionary
flattened_data = {}
for question, parties in results_dict.items():
    for party, metrics in parties.items():
        for metric, value in metrics.items():
            flattened_data[(question, party, metric)] = value

# Create a multi-index DataFrame
index = pd.MultiIndex.from_tuples(flattened_data.keys())
df = pd.Series(flattened_data, index=index).unstack().unstack()

file_name = f"Experiments/{experiment_name}/metrics_{experiment_run_name}.csv"

df.to_csv(file_name)

In [33]:
# Check if file saved correctly
df = pd.read_csv(file_name, header=[0, 1], index_col=[0, 1])
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,context_relevancy,context_relevancy,context_relevancy,context_relevancy,context_relevancy
Unnamed: 0_level_1,Unnamed: 1_level_1,cdu,fdp,gruene,linke,spd
Soll Atomkraft weiterhin genutzt werden?,0.333333,0.666667,0.333333,0.333333,,0.666667
Sollen Geflüchtete ausgewiesen werden wenn sie sich strafbar machen?,0.333333,0.0,0.666667,0.0,0.666667,0.333333
Sollen die Militärausgaben erhöht werden?,0.666667,,,0.666667,,0.0
Sollen wir in Europa mehr für den Natur- und Artenschutz tun?,0.0,0.666667,0.666667,1.0,0.666667,0.666667
Sollte es ein Selbstbestimmungsrecht für Trans-Personen geben?,0.0,,0.333333,0.666667,0.666667,0.666667
