In [38]:
import sys
import os
import numpy as np
import pandas as pd
# Put the parent directory of the notebook's working directory on the import
# path (presumably where the `models` / `database` packages imported below
# live -- confirm against the repository layout).
parent_dir = os.path.abspath('..')
sys.path.append(parent_dir)

# Locations of the persisted vector store and of the evaluation question sets,
# relative to the notebook's working directory.
DATABASE_DIR = "../../data/manifestos/chroma"
TEST_DATA_DIR = "../../data/questions"

# Create simple chain

In [23]:
# Here we import our modules
from models.embedding import ManifestoBertaEmbeddings
from database.vector_database import VectorDatabaseCreator
# Alias the factory on import: binding its result to `retriever` below would
# otherwise overwrite the function with the object it returned, so any later
# call to the factory outside this cell would fail.
from models.retrieval import retriever as build_retriever
from models.generation import generate_chain

# get embedding module
embedding_model = ManifestoBertaEmbeddings()

# get database_creator (configured with the embedding model and DATABASE_DIR)
database_creator = VectorDatabaseCreator(
    embedding_model=embedding_model,
    db_directory=DATABASE_DIR)

# load database
database = database_creator.load_database()

# get retriever; `retriever` stays bound to the retriever instance, exactly as
# before. k=10 is presumably the number of documents fetched per query --
# TODO confirm in models.retrieval.
retriever = build_retriever(db=database,
                            search_type="similarity",
                            k=10)

# generate langchain: two chains come back; `chain_with_source` is the one the
# evaluation loop later in this notebook invokes.
chain, chain_with_source = generate_chain(
    retriever=retriever, output_parser="str")

Some weights of XLMRobertaModel were not initialized from the model checkpoint at manifesto-project/manifestoberta-xlm-roberta-56policy-topics-sentence-2023-1-1 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Load test data

In [24]:
import pandas as pd

# Both question sets are CSV files stored side by side in TEST_DATA_DIR.
simple_questions_path = os.path.join(TEST_DATA_DIR, "simple_questions.csv")
complex_questions_path = os.path.join(TEST_DATA_DIR, "complex_questions.csv")

df_test_simple = pd.read_csv(simple_questions_path)
df_test_complex = pd.read_csv(complex_questions_path)

# Create evaluation chains

In [25]:
from ragas.metrics import faithfulness, answer_relevancy, context_relevancy, context_recall
from ragas.langchain import RagasEvaluatorChain

# Build one evaluator chain per selected ragas metric, keyed by the metric's
# own `name` attribute (context_recall is imported but not evaluated here).
selected_metrics = (faithfulness, answer_relevancy, context_relevancy)

eval_chains = {}
for metric in selected_metrics:
    eval_chains[metric.name] = RagasEvaluatorChain(metric=metric)

# Run eval chain for test data

In [27]:
# One list of scores per metric, filled in question order.
metrics = {name: [] for name in eval_chains}

for question in df_test_simple["EINFACHE FRAGEN"]:
    print("_____________________________________")
    print(question)
    output = chain_with_source.invoke(question)

    # Rename keys to be compatible with eval_chain
    output["query"] = output.pop("question")
    output["result"] = output.pop("answer")
    output["source_documents"] = output.pop("context")

    # Evaluate one metric at a time
    for name, eval_chain in eval_chains.items():
        score_name = f"{name}_score"
        # Run the evaluator exactly once per (question, metric): calling it
        # separately for the print and for the append doubles the evaluation
        # cost and lets the logged value differ from the stored one.
        score = eval_chain(output)[score_name]
        print(f"{score_name}: {score}")
        metrics[name].append(score)

_____________________________________
Was denken die Parteien darüber, mehr Geld für die Armee auszugeben, um Europa sicher zu halten?
faithfulness_score: 0.75
answer_relevancy_score: 0.8481909739343152
context_relevancy_score: 0.25882352941176473
_____________________________________
Was sagen die Parteien, wie wir Transgender-Menschen helfen und dafür sorgen können, dass alle fair behandelt werden?
faithfulness_score: 0.6666666666666666
answer_relevancy_score: 0.8598844241881297
context_relevancy_score: 0.04716981132075472
_____________________________________
Sollen wir in Europa mehr für den Schutz unserer Umwelt tun?
faithfulness_score: 1.0
answer_relevancy_score: 0.8885471136150095
context_relevancy_score: 0.08823529411764706
_____________________________________
Wie wollen die Parteien den Bauern helfen, so zu arbeiten, dass es gut für die Natur ist und sie trotzdem Geld verdienen können?
faithfulness_score: 0.6666666666666666
answer_relevancy_score: 0.8544237070459714
context_r

In [37]:
# Collect the per-question scores into a frame, attach the question text
# (aligned on the row index) and persist the result next to the notebook.
df_metrics = pd.DataFrame(metrics).assign(
    question=df_test_simple["EINFACHE FRAGEN"]
)
df_metrics.to_csv("metrics.csv")

In [41]:
# Reload the persisted scores. index_col=0 consumes the unnamed index column
# that `to_csv` wrote, instead of loading it as a spurious "Unnamed: 0" column.
df_metrics = pd.read_csv("metrics.csv", index_col=0)

# Take the metric names from the file itself rather than from the in-memory
# `metrics` dict, so this cell works on a fresh kernel without re-running the
# expensive evaluation loop. Every column except the question text is a score.
score_columns = [col for col in df_metrics.columns if col != "question"]

{f"{col}_mean": np.mean(df_metrics[col]) for col in score_columns}

{'faithfulness_mean': 0.6958333333333333,
 'answer_relevancy_mean': 0.8132587018852587,
 'context_relevancy_mean': 0.08362864874686092}