In [1]:
import sys
import os
import glob
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import plotly.express as px
import datetime

# Put the parent directory on the import path; presumably this is what lets
# the project-local imports further down (database, models, evaluation)
# resolve -- confirm against the repository layout.
sys.path.append(os.path.abspath(".."))

# Directories handed to VectorDatabase as `database_directory` for the
# manifesto and debate collections (judging by the path, Chroma stores built
# with OpenAI embeddings).
DATABASE_DIR_MANIFESTOS = "../../data/manifestos/chroma/openai"
DATABASE_DIR_DEBATES = "../../data/debates/chroma/openai"
# Folder the question CSVs (simple_questions.csv, complex_questions.csv) are read from.
TEST_DATA_DIR = "../../data/questions"
# Root folder under which each evaluation run creates its own results folder.
TEST_METRICS_DIR = "../../data/metrics"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read both question sets (simple first, then complex) from TEST_DATA_DIR.
df_test_simple, df_test_complex = (
    pd.read_csv(os.path.join(TEST_DATA_DIR, csv_name))
    for csv_name in ("simple_questions.csv", "complex_questions.csv")
)

In [3]:
from database.vector_database import VectorDatabase
from models.embedding import ManifestoBertaEmbeddings
from langchain_openai import OpenAIEmbeddings
from evaluation import Evaluator

In [4]:
# Open the two persisted vector databases with one shared embedding model.
# Construction order is unchanged: manifestos first, then debates.

embedding_model = OpenAIEmbeddings()

db_manifestos, db_debates = (
    VectorDatabase(
        embedding_model=embedding_model,
        database_directory=directory,
        source_type=kind,
    )
    for kind, directory in (
        ("manifestos", DATABASE_DIR_MANIFESTOS),
        ("debates", DATABASE_DIR_DEBATES),
    )
)

reloaded database
reloaded database


# Evaluate context relevancy

In [6]:
# Score the relevancy of the retrieved context for every (question, party)
# pair, once per vector database, and write one CSV per database.
evaluator = Evaluator()
# Label used in the results folder name. The databases queried here are the
# OpenAI-embedded ones, so the run must not be filed under "manifestoberta".
model_name = "openai"
questions = df_test_simple["EINFACHE FRAGEN"]
k = 5  # number of documents retrieved per (question, party) query
sources = ["gruene", "spd", "cdu", "afd", "fdp", "linke"]

# Resolve the output folder once, before the long-running loop, so both
# databases land in the same folder even if the run crosses midnight.
# makedirs also creates a missing TEST_METRICS_DIR and tolerates re-runs.
date = str(datetime.date.today())
save_folder = os.path.join(TEST_METRICS_DIR, f"{date}_{model_name}")
os.makedirs(save_folder, exist_ok=True)

for db in [db_manifestos, db_debates]:

    # One list of scores per party; the lists stay aligned with `questions`.
    context_relevancy_dict = {key: [] for key in sources}

    # Loop over all simple questions
    for question in tqdm(questions, desc=db.source_type):
        # Retrieve and evaluate context for each party
        for party in sources:
            context_docs = db.database.similarity_search(
                query=question, k=k, filter={"party": party}
            )

            result = evaluator.context_relevancy(question, context_docs)
            context_relevancy_dict[party].append(result)

    # Save results: one column per party plus the question text.
    context_relevancy_df = pd.DataFrame(context_relevancy_dict)
    context_relevancy_df["Frage"] = list(questions)

    # The row index is written on purpose: the summary cell further down
    # expects (and drops) the resulting "Unnamed: 0" column.
    context_relevancy_df.to_csv(
        os.path.join(save_folder, f"context_relevancy_{db.source_type}.csv")
    )

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

In [9]:
# Write the current relevancy table again. After the evaluation loop `db` and
# `context_relevancy_df` hold the values of its last iteration, so this only
# rewrites that database's CSV.
context_relevancy_df.to_csv(
    path_or_buf=os.path.join(
        save_folder, f"context_relevancy_{db.source_type}.csv"
    )
)

In [52]:
# Summarise the saved scores: for every results CSV print the mean score per
# party and the mean across parties.
# os.listdir returns entries in arbitrary order, so sort for a stable report.
for file in sorted(os.listdir(save_folder)):
    # Skip anything that is not a results table (checkpoint folders, plots, ...),
    # which would otherwise make read_csv fail.
    if not file.endswith(".csv"):
        continue

    scores = pd.read_csv(os.path.join(save_folder, file))
    # Drop the question text and the saved row index before averaging;
    # errors="ignore" keeps this working for CSVs written without an index.
    df = scores.drop(columns=["Frage", "Unnamed: 0"], errors="ignore").mean(axis=0)
    print(file.split("_")[-1])
    print(f"Overall: {df.mean()}")
    print(df)
    print()

manifestos.csv
Overall: 0.7545454545454545
gruene    0.809091
spd       0.836364
cdu       0.663636
afd       0.663636
fdp       0.709091
linke     0.845455
dtype: float64

debates.csv
Overall: 0.7560606060606062
gruene    0.800000
spd       0.827273
cdu       0.654545
afd       0.690909
fdp       0.718182
linke     0.845455
dtype: float64



'debates.csv'

In [36]:
# Display the per-party means left in `df` by the summary loop, i.e. those of
# whichever results file that loop processed last.
df

Unnamed: 0    10.500000
gruene         0.800000
spd            0.827273
cdu            0.654545
afd            0.690909
fdp            0.718182
linke          0.845455
dtype: float64