In [1]:
import pandas as pd
import ast
import os

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams, LLMTestCase

In [2]:
with open("openai_api_key.txt", "r") as file:
    openai_key = file.read().strip()

os.environ["OPENAI_API_KEY"] = openai_key

In [3]:
correctness_metric = GEval(
    name="Correctness",
    criteria="Given the correct context and the actual answer. Determine whether the actual output is factually correct based on the expected output.",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)

# Vanilla RAG

In [4]:
pd.set_option("display.max_colwidth", None)

In [15]:
results = pd.read_csv('vanilla.csv', index_col=0)
results["Correct Documents"] = results["Correct Documents"].apply(lambda x : x.split("-")[0])
results

Unnamed: 0,Retrieved Context,Correct Documents,Generated Answer,Correct Answer,Golden Context
15,AAL/2014/page_130.pdf,AAL/2014/page_89.pdf,"Approximately 66% \n\nTo arrive at this answer, I looked at the total reorganization items, net for both American Airlines Group Inc. and American Airlines, Inc. for the years ended December 31, 2013. I then calculated the total amount of the labor-related deemed claim for each company and divided it by the total reorganization items, net to find the percentage. For American Airlines Group Inc., the labor-related deemed claim was $1,733 million out of a total of $2,640 million, which is approximately 66%.",65.6%,['the labor-related deemed claim ( 1 ) of 2013 is $ 1733 ;'\n 'the total reorganization items net of 2013 is $ 2640 ;']
105,AAL/2014/page_202.pdf,AAL/2014/page_80.pdf,"Approximately 66% \n\nTo arrive at this answer, I first identified the labor-related deemed claim amount, which was $1,733 million. Then, I found the total reorganization items, which was $2,640 million. I divided the labor-related deemed claim by the total reorganization items and multiplied by 100 to get the percentage, which is approximately 66%.",65.3%,['the labor-related deemed claim ( 1 ) of 2013 is $ 1733 ;'\n 'the total reorganization items net of 2013 is $ 2655 ;']
161,AAL/2014/page_201.pdf,AAL/2014/page_89.pdf,"Answer: 7.5%\n\nTo arrive at this answer, I looked at the total reorganization items, net for American Airlines, Inc., which was $2,640 million in 2013. Then I found the amount of professional fees listed in the table for that year, which was $199 million. I divided $199 million by $2,640 million and multiplied by 100 to get the percentage, which is 7.5%.",7.5%,['the professional fees of 2013 is 199 ;'\n 'the total reorganization items net of 2013 is $ 2640 ;']
266,AAL/2014/page_18.pdf,AAL/2014/page_18.pdf,"$25,760 million\n\nTo arrive at this answer, I looked at the ""Operating expenses"" section of the ""AMERICAN AIRLINES GROUP INC. CONSOLIDATED STATEMENTS OF OPERATIONS"" table for the year 2013. The total mainline operating expenses for 2013 were $25,760 million.",31381,['year the 2013 ( a ) of gallons is 3608 ; the 2013 ( a ) of average price per gallon is 3.08 ; the 2013 ( a ) of aircraft fuel expense is 11109 ; the 2013 ( a ) of percent of total mainline operating expenses is 35.4 ;']
908,AAL/2014/page_59.pdf,AAL/2014/page_59.pdf,"Answer: 52% \n\nExplanation: To find the growth rate on the AMEX Airline Index from 12/31/2013 to 12/31/2014, we compare the values provided in the table. \n- The value for the AMEX Airline Index on 12/31/2013 was $100. \n- The value for the AMEX Airline Index on 12/31/2014 was $152. \n\nTo calculate the growth rate:\n- Subtract the initial value from the final value: $152 - $100 = $52\n- Divide the difference by the initial value: $52 / $100 = 0.52 or 52% \n\nTherefore, the growth rate on the AMEX Airline Index from 12/31/2013 to 12/31/2014 was 52%.",49%,['the amex airline index of 12/9/2013 is 100 ; the amex airline index of 12/31/2013 is 102 ; the amex airline index of 12/31/2014 is 152 ;'\n 'the s&p 500 of 12/9/2013 is 100 ; the s&p 500 of 12/31/2013 is 102 ; the s&p 500 of 12/31/2014 is 114 ;']
1164,AAL/2014/page_130.pdf,AAL/2014/page_92.pdf,"Decreased by $1,626 million.\n\nSteps:\n1. Look at the table of reorganization items, net for American Airlines Group Inc. for the years 2013 and 2012.\n2. Find the line item for ""Aircraft and facility financing renegotiations and rejections"" for both years.\n3. Calculate the difference between the amounts for 2012 and 2013 to determine the decrease. \n $1,950 million (2012) - $325 million (2013) = $1,626 million",-83.6%,['the aircraft and facility financing renegotiations and rejections ( 2 ) ( 3 ) of 2013 is 320 ; the aircraft and facility financing renegotiations and rejections ( 2 ) ( 3 ) of 2012 is 1951 ;']
1696,AAL/2014/page_84.pdf,AAL/2014/page_18.pdf,"$10,592 million\n\nSteps taken:\n1. Look at the table provided for American Airlines, Inc. consolidated statements of operations for the year ended December 31, 2014.\n2. Find the line item ""Total operating expenses"" under ""Operating expenses"" section.\n3. The total mainline operating expenses in 2014 was $10,592 million.",31903.6,['year the 2014 of gallons is 3644 ; the 2014 of average price per gallon is $ 2.91 ; the 2014 of aircraft fuel expense is $ 10592 ; the 2014 of percent of total mainline operating expenses is 33.2% ( 33.2 % ) ;']
1753,AAL/2014/page_219.pdf,AAL/2014/page_219.pdf,"$5 million\n\nSteps taken to arrive at the answer:\n1. Look at the section titled ""NOTES TO CONSOLIDATED FINANCIAL STATEMENTS OF AMERICAN AIRLINES, INC.""\n2. Find the section discussing the unrecognized tax benefit at December 31, 2013.\n3. Note that the unrecognized tax benefit at December 31, 2013 was $5 million.",$ 5,['the unrecognized tax benefit at january 1 of 2014 is $ 5 ; the unrecognized tax benefit at january 1 of 2013 is $ 5 ;']
2065,AAL/2014/page_130.pdf,AAL/2014/page_89.pdf,"Approximately 12%. \n\nTo arrive at this answer, I first noted that the total reorganization items net for American Airlines Group Inc. in 2013 was $2,640 million and that the amount related to aircraft and facility financing renegotiations and rejections was $320 million. I then divided $320 million by $2,640 million and multiplied the result by 100 to find the percentage, which is approximately 12%.",12.1%,['the aircraft and facility financing renegotiations and rejections ( 2 ) ( 3 ) of 2013 is 320 ;'\n 'the total reorganization items net of 2013 is $ 2640 ;']
2088,AAL/2014/page_92.pdf,AAL/2014/page_92.pdf,"$451 million\n\nExplanation: \nTo find the increase in total reorganization items net from 2012 to 2013, we need to subtract the total for 2012 from the total for 2013. \nTotal reorganization items net for 2013: $2,640 million \nTotal reorganization items net for 2012: $2,179 million \n$2,640 million - $2,179 million = $461 million \nTherefore, the total reorganization items net increased by $461 million from 2012 to 2013.",21.2%,['the total reorganization items net of 2013 is $ 2640 ; the total reorganization items net of 2012 is $ 2179 ;']


In [16]:
correct = 0
for idx in results.index:
    if results.loc[idx, "Correct Documents"] == results.loc[idx, "Retrieved Context"]:
        correct += 1
    else:
        print(results.loc[idx, "Correct Documents"], results.loc[idx, "Retrieved Context"])

print(correct / len(results["Retrieved Context"]))

AAL/2014/page_89.pdf AAL/2014/page_130.pdf
AAL/2014/page_80.pdf AAL/2014/page_202.pdf
AAL/2014/page_89.pdf AAL/2014/page_201.pdf
AAL/2014/page_92.pdf AAL/2014/page_130.pdf
AAL/2014/page_18.pdf AAL/2014/page_84.pdf
AAL/2014/page_89.pdf AAL/2014/page_130.pdf
AAL/2014/page_80.pdf AAL/2014/page_130.pdf
AAL/2014/page_92.pdf AAL/2014/page_201.pdf
AAL/2014/page_89.pdf AAL/2014/page_130.pdf
AAL/2014/page_80.pdf AAL/2014/page_202.pdf
AAL/2014/page_92.pdf AAL/2014/page_83.pdf
AAL/2014/page_18.pdf AAL/2014/page_73.pdf
AAL/2014/page_80.pdf AAL/2014/page_130.pdf
AAL/2014/page_15.pdf AAL/2014/page_16.pdf
0.44


In [7]:
generated = results["Generated Answer"]
expected = results["Correct Answer"]
golden_context = results["Golden Context"]

print(golden_context.head(1))
print(expected.head(1))
print(generated.head(1))


15    ['the labor-related deemed claim ( 1 ) of 2013 is $ 1733 ;'\n 'the total reorganization items net of 2013 is $ 2640 ;']
Name: Golden Context, dtype: object
15    65.6%
Name: Correct Answer, dtype: object
15    Approximately 66% \n\nTo arrive at this answer, I looked at the total reorganization items, net for both American Airlines Group Inc. and American Airlines, Inc. for the years ended December 31, 2013. I then calculated the total amount of the labor-related deemed claim for each company and divided it by the total reorganization items, net to find the percentage. For American Airlines Group Inc., the labor-related deemed claim was $1,733 million out of a total of $2,640 million, which is approximately 66%.
Name: Generated Answer, dtype: object


In [8]:
test_case = LLMTestCase(
    input=golden_context.iloc[3],
    actual_output=generated.iloc[3],
    expected_output=expected.iloc[3],
)

correctness_metric.measure(test_case)

Output()

In [9]:
print(correctness_metric.score)
print(correctness_metric.reason)

0.11023543634060554
The actual output provides a total operating expense figure that is not mentioned in the input, which focuses on fuel expense details.


# Colpali RAG

In [10]:
results = pd.read_csv("colpali.csv", index_col=0)
results["Correct Documents"] = results["Correct Documents"].apply(lambda x : x.split("-")[0])

In [11]:
correct = 0
for idx in results.index:
    if results.loc[idx, "Correct Documents"] == results.loc[idx, "Retrieved Context"]:
        correct += 1
    else:
        print(results.loc[idx, "Correct Documents"], results.loc[idx, "Retrieved Context"])

print(correct / len(results["Retrieved Context"]))

AAL/2014/page_89.pdf AAL/2014/page_80.pdf
AAL/2014/page_89.pdf AAL/2014/page_130.pdf
AAL/2014/page_18.pdf AAL/2014/page_84.pdf
AAL/2014/page_89.pdf AAL/2014/page_80.pdf
AAL/2014/page_92.pdf AAL/2014/page_130.pdf
AAL/2014/page_89.pdf AAL/2014/page_80.pdf
AAL/2014/page_92.pdf AAL/2014/page_83.pdf
AAL/2014/page_15.pdf AAL/2014/page_16.pdf
0.68


In [12]:
generated = results["Generated Answer"]
expected = results["Correct Answer"]
golden_context = results["Golden Context"]

print(golden_context.head(1))
print(expected.head(1))
print(generated.head(1))

15    ['the labor-related deemed claim ( 1 ) of 2013 is $ 1733 ;'\n 'the total reorganization items net of 2013 is $ 2640 ;']
Name: Golden Context, dtype: object
15    65.6%
Name: Correct Answer, dtype: object
15    65%\n\n**Steps:**\n\n1. Locate the section titled "Reorganization Items, Net" in the table.\n2. Identify the "Labor-related deemed claim" amount: $1,733 million.\n3. Find the "Total reorganization items, net": $2,655 million.\n4. Calculate the percentage: \((1,733 / 2,655) \times 100 = 65.27\%\).\n5. Round to two significant figures: 65%.
Name: Generated Answer, dtype: object


In [13]:
test_case = LLMTestCase(
    input=golden_context.iloc[3],
    actual_output=generated.iloc[3],
    expected_output=expected.iloc[3],
)

correctness_metric.measure(test_case)

Output()

In [14]:
print(correctness_metric.score)
print(correctness_metric.reason)

0.01937513503406006
The actual output of $282 million does not align with any context or figures provided in the input.


# Hybrid RAG