In [31]:
import pandas as pd
import ast
import os

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams, LLMTestCase

In [32]:
with open("keys/openai_api_key.txt", "r") as file:
    openai_key = file.read().strip()

os.environ["OPENAI_API_KEY"] = openai_key

In [33]:
correctness_metric = GEval(
    name="Correctness",
    criteria="Given the correct context and the actual answer. Determine whether the actual output is factually correct based on the expected output.",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    verbose_mode=False,
)

# Vanilla RAG

In [34]:
pd.set_option("display.max_colwidth", None)

In [35]:
results = pd.read_csv('vanilla.csv', index_col=0)
results["Correct Documents"] = results["Correct Documents"].apply(lambda x : x.split("-")[0])

In [36]:
correct = 0
for idx in results.index:
    if results.loc[idx, "Correct Documents"] == results.loc[idx, "Retrieved Context"]:
        correct += 1
    else:
        print(results.loc[idx, "Correct Documents"], results.loc[idx, "Retrieved Context"])

print()
print(correct / len(results["Retrieved Context"]))

AAL/2014/page_89.pdf AAL/2014/page_130.pdf
AAL/2014/page_80.pdf AAL/2014/page_130.pdf
AAL/2014/page_89.pdf AAL/2014/page_201.pdf
AAL/2014/page_92.pdf AAL/2014/page_130.pdf
AAL/2014/page_18.pdf AAL/2014/page_84.pdf
AAL/2014/page_89.pdf AAL/2014/page_130.pdf
AAL/2014/page_80.pdf AAL/2014/page_130.pdf
AAL/2014/page_92.pdf AAL/2014/page_201.pdf
AAL/2014/page_89.pdf AAL/2014/page_130.pdf
AAL/2014/page_80.pdf AAL/2014/page_130.pdf
AAL/2014/page_92.pdf AAL/2014/page_83.pdf
AAL/2014/page_18.pdf AAL/2014/page_73.pdf
AAL/2014/page_80.pdf AAL/2014/page_130.pdf
AAL/2014/page_15.pdf AAL/2014/page_16.pdf
AAL/2014/page_15.pdf AAL/2014/page_16.pdf
AAL/2014/page_15.pdf AAL/2014/page_16.pdf
AAL/2014/page_15.pdf AAL/2014/page_16.pdf

0.32


In [None]:
for idx, row in results.iterrows():
    
    input_text = row["Golden Context"]
    actual_output = row["Generated Answer"]
    expected_output = row["Correct Answer"]

    test_case = LLMTestCase(
        input=input_text,
        actual_output=actual_output,
        expected_output=expected_output,
    )
    
    correctness_metric.measure(test_case)

    results.loc[idx, "Correctness Score"] = correctness_metric.score
    results.loc[idx, "Correctness Reasoning"] = correctness_metric.reason

In [None]:
results.to_csv("eval_vanilla.csv")

# Colpali RAG

In [19]:
results = pd.read_csv("colpali.csv", index_col=0)
results["Correct Documents"] = results["Correct Documents"].apply(lambda x : x.split("-")[0])

In [20]:
correct = 0
for idx in results.index:
    if results.loc[idx, "Correct Documents"] == results.loc[idx, "Retrieved Context"]:
        correct += 1
    else:
        print(results.loc[idx, "Correct Documents"], results.loc[idx, "Retrieved Context"])

print()
print(correct / len(results["Retrieved Context"]))

AAL/2014/page_89.pdf AAL/2014/page_80.pdf
AAL/2014/page_89.pdf AAL/2014/page_130.pdf
AAL/2014/page_18.pdf AAL/2014/page_84.pdf
AAL/2014/page_89.pdf AAL/2014/page_80.pdf
AAL/2014/page_92.pdf AAL/2014/page_130.pdf
AAL/2014/page_89.pdf AAL/2014/page_80.pdf
AAL/2014/page_92.pdf AAL/2014/page_83.pdf
AAL/2014/page_15.pdf AAL/2014/page_16.pdf

0.68


In [None]:
for idx, row in results.iterrows():
    
    input_text = row["Golden Context"]
    actual_output = row["Generated Answer"]
    expected_output = row["Correct Answer"]

    test_case = LLMTestCase(
        input=input_text,
        actual_output=actual_output,
        expected_output=expected_output,
    )
    
    correctness_metric.measure(test_case)

    results.loc[idx, "Correctness Score"] = correctness_metric.score
    results.loc[idx, "Correctness Reasoning"] = correctness_metric.reason

In [None]:
results.to_csv("eval_colpali.csv")

# Hybrid RAG

In [21]:
import pandas as pd

In [22]:
vanilla = pd.read_csv("eval_vanilla.csv", index_col=0)

In [23]:
colpali = pd.read_csv("eval_colpali.csv", index_col=0)

In [24]:
vanilla["Correctness Score"].mean()

0.5483227519398131

In [25]:
colpali["Correctness Score"].mean()

0.6039156988492254