In [55]:
import json
import os
import pandas as pd
import numpy as np

from dotenv import load_dotenv
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sklearn.metrics import precision_score, recall_score, f1_score
from typing import List, Dict, Any
from openai import OpenAI
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from copy import deepcopy

### 1. Configure Environment
---

In [2]:
# Set pandas display options for better readability
pd.set_option('display.width', -1)
pd.set_option('max_colwidth', 1000)

In [3]:
# Load environment variables from .env file
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

In [4]:
# Validate the API key format
if api_key and api_key.startswith('sk-proj-') and len(api_key) > 10:
    print("API key looks good so far")
else:
    print("Issue detected with API key")

API key looks good so far


In [5]:
# Initialize OpenAI client
openai = OpenAI()
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

### 2. Processing Functions
---

In [6]:
def table_to_string(table):
    """Convert table to string representation."""
    return "\n".join(" | ".join(str(cell) for cell in row) for row in table)

In [None]:
def process_json_entry(entry):
    """Process a single JSON entry to extract relevant fields. Returns None if the qa field is an empty dictionary"""

    # Remove entries where the qa field is empty
    if not entry.get("qa") or entry["qa"] == {}:
        return None
    
    entry_id = entry.get("id", None)
    parts = []

    # Extract and merge pre-text
    if "pre_text" in entry and entry["pre_text"]:
        parts.append(" ".join(entry["pre_text"]))
    
    # Convert table to string
    table_str = table_to_string(entry["table"]) if "table" in entry and entry["table"] else ""
    if table_str:
        parts.append(table_str)
    
    # Extract and merge post-text
    if "post_text" in entry and entry["post_text"]:
        parts.append(" ".join(entry["post_text"]))
    
    # Extract and merge QA details
    if "qa" in entry and entry["qa"]:
        qa_parts = []
        if "question" in entry["qa"]:
            qa_parts.append(f"Question: {entry['qa']['question']}")
        if "answer" in entry["qa"]:
            qa_parts.append(f"Answer: {entry['qa']['answer']}")
        if "exe_ans" in entry["qa"]:
            qa_parts.append(f"Execution Answer: {entry['qa']['exe_ans']}")
        parts.append("\n".join(qa_parts))
    
    # Extract dialogue break
    dialogue_break = entry.get("annotation", {}).get("dialogue_break", "")
    if dialogue_break:
        parts.append(f"Dialogue Break: {dialogue_break}")
    
    full_text = "\n\n".join(parts)
    return {
        "id": entry_id, 
        "text": full_text, 
        "possible_questions": dialogue_break, 
        "table": table_str,
        "qa": entry.get("qa", {})
    }

In [None]:
def get_table_description(table_content, document_context):
    """Generate table description using GPT-4o-mini."""
    prompt = f"""
    Given the following table and its context from the original document,
    provide a detailed description of the table. Then, include the table in markdown format.

    Original Document Context:
    {document_context}

    Table Content:
    {table_content}

    Please provide:
    1. A comprehensive description of the table.
    2. The table in markdown format.
    """
    
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that describes tables and formats them in markdown."},
            {"role": "user", "content": prompt}
        ]
    )
    return response.choices[0].message.content

In [9]:
def safe_get_table_description(row):
    """Safely generate table description with error handling."""
    table_content = row["table"]
    document_context = row["text"]
    if table_content and table_content.strip():
        try:
            return get_table_description(table_content, document_context)
        except Exception as e:
            print(f"Error processing table description for id {row['id']}: {e}")
            return ""
    return ""

In [49]:
def flatten_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
    """
    Flatten complex metadata into simple types that Chroma can handle.
    """
    flattened = {}
    
    if 'table' in metadata:
        flattened['table'] = str(metadata['table'])
    
    # Extract useful information from the qa dictionary
    if 'qa' in metadata:
        qa = metadata['qa']
        if 'question' in qa:
            flattened['question'] = str(qa['question'])
        if 'answer' in qa:
            flattened['answer'] = str(qa['answer'])
        if 'explanation' in qa:
            flattened['explanation'] = str(qa['explanation'])
        if 'exe_ans' in qa:
            flattened['exe_ans'] = float(qa['exe_ans'])
    
    # Add any table description as a string
    if 'table_description' in metadata:
        flattened['table_description'] = str(metadata['table_description'])
    
    return flattened

### 3. MultiModal RAG Metrics
---

In [None]:
def calculate_bleu(reference, candidate):
    """Calculate BLEU score."""
    smoothie = SmoothingFunction().method1
    reference_tokens = reference.lower().split()
    candidate_tokens = candidate.lower().split()
    return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothie)

In [60]:
def calculate_rouge(reference, candidate):
    """Calculate ROUGE scores."""
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, candidate)
    return {
        'rouge1': {'f': scores['rouge1'].fmeasure},
        'rouge2': {'f': scores['rouge2'].fmeasure},
        'rougeL': {'f': scores['rougeL'].fmeasure}
    }

In [None]:
def calculate_factual_correctness(reference, candidate):
    """Calculate factual correctness score."""
    import re
    
    def extract_numbers(text):
        return set(float(x) for x in re.findall(r'-?\d*\.?\d+', text))
    
    ref_numbers = extract_numbers(reference)
    cand_numbers = extract_numbers(candidate)
    
    if not ref_numbers:
        return 1.0 if not cand_numbers else 0.0
    
    if not cand_numbers:
        return 0.0
    
    correct_numbers = ref_numbers.intersection(cand_numbers)
    return len(correct_numbers) / len(ref_numbers)

In [None]:
def evaluate_qa_response(reference_answer, generated_answer, source_context, retrieved_context):
    """Evaluate QA response using multiple metrics."""
    bleu = calculate_bleu(reference_answer, generated_answer)
    rouge_scores = calculate_rouge(reference_answer, generated_answer)
    factual = calculate_factual_correctness(reference_answer, generated_answer)
    
    source_tokens = set(source_context.lower().split())
    retrieved_tokens = set(retrieved_context.lower().split())
    
    context_precision = len(source_tokens.intersection(retrieved_tokens)) / len(retrieved_tokens) if retrieved_tokens else 0
    context_recall = len(source_tokens.intersection(retrieved_tokens)) / len(source_tokens) if source_tokens else 0
    
    return {
        'bleu': bleu,
        'rouge1_f': rouge_scores['rouge1']['f'],
        'rouge2_f': rouge_scores['rouge2']['f'],
        'rougeL_f': rouge_scores['rougeL']['f'],
        'factual_correctness': factual,
        'context_precision': context_precision,
        'context_recall': context_recall,
        'context_f1': 2 * (context_precision * context_recall) / (context_precision + context_recall) if (context_precision + context_recall) > 0 else 0
    }

### 4. Setup Question and Answer Chain
---

In [52]:
def setup_qa_chain(documents: List[Document], persist_directory: str = "vectorstore"):
    """Set up the QA chain with vector store and embedding model."""
    FIN_PROMPT_TEMPLATE = """
    You are a financial analyst assistant. Use the following pieces of context to answer the question about financial data in tables. 
    If you don't know the answer, just say that you don't know, don't try to make up an answer.

    Context: {context}

    Question: {question}

    Provide a detailed answer with numerical calculations when applicable:
    """

    PROMPT = PromptTemplate(
        template=FIN_PROMPT_TEMPLATE,
        input_variables=["context", "question"]
    )

    llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
        chain_type_kwargs={"prompt": PROMPT},
        return_source_documents=True
    )
    
    return qa_chain

### 5. Load and Process Data
---

In [25]:
with open('./data/convfinqatrain.json', 'r') as f:
        json_data = json.load(f)
test_data = json_data.copy()
test_data = test_data[:50]  # Using first 50 examples for calculating metrics

In [26]:
# Process entries and exclude any entries with missing qa
processed_data = [processed for processed in (process_json_entry(entry) for entry in test_data) if processed is not None]
df = pd.DataFrame(processed_data)

In [27]:
df.head()

Unnamed: 0,id,text,possible_questions,table,qa
0,Single_JKHY/2009/page_28.pdf-3,"26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings on our re...","[what is the net cash from operating activities in 2009?, what about in 2008?, what is the difference?, what percentage change does this represent?]",2008 | year ended june 30 2009 2008 | year ended june 30 2009 2008 | year ended june 30 2009\nnet income | $ 103102 | $ 104222 | $ 104681\nnon-cash expenses | 74397 | 70420 | 56348\nchange in receivables | 21214 | -2913 ( 2913 ) | -28853 ( 28853 )\nchange in deferred revenue | 21943 | 5100 | 24576\nchange in other assets and liabilities | -14068 ( 14068 ) | 4172 | 17495\nnet cash from operating activities | $ 206588 | $ 181001 | $ 174247,"{'question': 'what was the percentage change in the net cash from operating activities from 2008 to 2009', 'answer': '14.1%', 'explanation': '', 'ann_table_rows': [6], 'ann_text_rows': [], 'steps': [{'op': 'minus2-1', 'arg1': '206588', 'arg2': '181001', 'res': '25587'}, {'op': 'divide2-2', 'arg1': '#0', 'arg2': '181001', 'res': '14.1%'}], 'program': 'subtract(206588, 181001), divide(#0, 181001)', 'gold_inds': {'table_6': '2008 the net cash from operating activities of year ended june 30 2009 2008 is $ 206588 ; the net cash from operating activities of year ended june 30 2009 2008 is $ 181001 ; the net cash from operating activities of year ended june 30 2009 is $ 174247 ;'}, 'exe_ans': 0.14136, 'program_re': 'divide(subtract(206588, 181001), 181001)'}"
1,Single_RSG/2008/page_114.pdf-2,"substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) .\n\n | year ended december 31 2008 ( unaudited ) | year ended december 31 2007 ( unaudited )\nrevenue | $ 9362.2 | $ 9244....","[what were revenues in 2008?, what were they in 2007?, what was the net change?, what is the percent change?]",| year ended december 31 2008 ( unaudited ) | year ended december 31 2007 ( unaudited )\nrevenue | $ 9362.2 | $ 9244.9\nincome from continuing operations available to common stockholders | 285.7 | 423.2\nbasic earnings per share | .76 | 1.10\ndiluted earnings per share | .75 | 1.09,"{'question': 'what was the percent of the growth in the revenues from 2007 to 2008', 'answer': '1.3%', 'explanation': 'the percent growth of the revenue is the difference between the 2 divide by the oldest amount', 'ann_table_rows': [1], 'ann_text_rows': [], 'steps': [{'op': 'minus2-1', 'arg1': '9362.2', 'arg2': '9244.9', 'res': '117.3'}, {'op': 'divide2-2', 'arg1': '#0', 'arg2': '9244.9', 'res': '1.3%'}], 'program': 'subtract(9362.2, 9244.9), divide(#0, 9244.9)', 'gold_inds': {'table_1': 'the revenue of year ended december 31 2008 ( unaudited ) is $ 9362.2 ; the revenue of year ended december 31 2007 ( unaudited ) is $ 9244.9 ;'}, 'exe_ans': 0.01269, 'program_re': 'divide(subtract(9362.2, 9244.9), 9244.9)'}"
2,Single_AAPL/2002/page_23.pdf-1,"in a new business model such as the retail segment is inherently risky , particularly in light of the significant investment involved , the current economic climate , and the fixed nature of a substantial portion of the retail segment's operating expenses . results for this segment are dependent upon a number of risks and uncertainties , some of which are discussed below under the heading ""factors that may affect future results and financial condition."" backlog in the company's experience , the actual amount of product backlog at any particular time is not a meaningful indication of its future business prospects . in particular , backlog often increases in anticipation of or immediately following new product introductions because of over- ordering by dealers anticipating shortages . backlog often is reduced once dealers and customers believe they can obtain sufficient supply . because of the foregoing , backlog cannot be considered a reliable indicator of the company's ability to a...","[what was the total of net sales in 2001?, and what was that in 2000?, what was, then, the change in the total of net sales over the year?, and how much does this change represent in relation to that total in 2000, in percentage?]",| 2002 | 2001 | 2000\nnet sales | $ 5742 | $ 5363 | $ 7983\ncost of sales | 4139 | 4128 | 5817\ngross margin | $ 1603 | $ 1235 | $ 2166\ngross margin percentage | 28% ( 28 % ) | 23% ( 23 % ) | 27% ( 27 % ),"{'question': 'what was the percentage change in net sales from 2000 to 2001?', 'answer': '-32%', 'explanation': '', 'ann_table_rows': [1], 'ann_text_rows': [], 'steps': [{'op': 'minus1-1', 'arg1': '5363', 'arg2': '7983', 'res': '-2620'}, {'op': 'divide1-2', 'arg1': '#0', 'arg2': '7983', 'res': '-32%'}], 'program': 'subtract(5363, 7983), divide(#0, 7983)', 'gold_inds': {'table_1': 'the net sales of 2002 is $ 5742 ; the net sales of 2001 is $ 5363 ; the net sales of 2000 is $ 7983 ;'}, 'exe_ans': -0.3282, 'program_re': 'divide(subtract(5363, 7983), 7983)'}"
3,Single_UPS/2009/page_33.pdf-2,"( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options . shareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing . the following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 index , and the dow jones transportation average . the comparison of the total cumulative return on investment , which is the change in the quarterly stock price plus reinvested divide...","[what was the change in the performance of the united parcel service inc . from 2004 to 2009?, and how much does this change represent in relation to that performance in 2004, in percentage?, what was the performance value of the s&p 500 index in 2009?, what was, then, the change in that performance from 2004 to 2009?, and how much does this change represent in relation to that performance in 2004, in percentage?, what is, then, the difference between the percent representation of the united parcel service inc . and the s&p 500 index?]",| 12/31/04 | 12/31/05 | 12/31/06 | 12/31/07 | 12/31/08 | 12/31/09\nunited parcel service inc . | $ 100.00 | $ 89.49 | $ 91.06 | $ 87.88 | $ 70.48 | $ 75.95\ns&p 500 index | $ 100.00 | $ 104.91 | $ 121.48 | $ 128.15 | $ 80.74 | $ 102.11\ndow jones transportation average | $ 100.00 | $ 111.65 | $ 122.61 | $ 124.35 | $ 97.72 | $ 115.88,"{'question': 'what was the difference in percentage cumulative return on investment for united parcel service inc . compared to the s&p 500 index for the five year period ended 12/31/09?', 'answer': '-26.16%', 'explanation': '', 'ann_table_rows': [1, 2], 'ann_text_rows': [], 'steps': [{'op': 'minus2-1', 'arg1': '75.95', 'arg2': 'const_100', 'res': '-24.05'}, {'op': 'divide2-2', 'arg1': '#0', 'arg2': 'const_100', 'res': '-24.05%'}, {'op': 'minus2-3', 'arg1': '102.11', 'arg2': 'const_100', 'res': '2.11'}, {'op': 'divide2-4', 'arg1': '#2', 'arg2': 'const_100', 'res': '2.11%'}, {'op': 'minus2-5', 'arg1': '#1', 'arg2': '#3', 'res': '-26.16%'}], 'program': 'subtract(75.95, const_100), divide(#0, const_100), subtract(102.11, const_100), divide(#2, const_100), subtract(#1, #3)', 'gold_inds': {'table_1': 'the united parcel service inc . of 12/31/04 is $ 100.00 ; the united parcel service inc . of 12/31/05 is $ 89.49 ; the united parcel service inc . of 12/31/06 is $ 91.06 ; the united parce..."
4,Single_CE/2010/page_134.pdf-2,"tax returns for 2001 and beyond are open for examination under statute . currently , unrecognized tax benefits are not expected to change significantly over the next 12 months . 19 . stock-based and other management compensation plans in april 2009 , the company approved a global incentive plan which replaces the company 2019s 2004 stock incentive plan . the 2009 global incentive plan ( 201cgip 201d ) enables the compensation committee of the board of directors to award incentive and nonqualified stock options , stock appreciation rights , shares of series a common stock , restricted stock , restricted stock units ( 201crsus 201d ) and incentive bonuses ( which may be paid in cash or stock or a combination thereof ) , any of which may be performance-based , with vesting and other award provisions that provide effective incentive to company employees ( including officers ) , non-management directors and other service providers . under the 2009 gip , the company no longer can grant r...","[how many shares are subject to outstanding awards is under the 2009 global incentive plan?, what about under the 2004 stock incentive plan?, how many total shares are subject to outstanding awards?, what about under the 2004 stock incentive plan?, what proportion does this represent?]",| shares available for awards | shares subject to outstanding awards\n2009 global incentive plan | 2322450 | 2530454\n2004 stock incentive plan | - | 5923147,"{'question': 'what portion of the total shares subject to outstanding awards is under the 2009 global incentive plan?', 'answer': '70.1%', 'explanation': '', 'ann_table_rows': [1, 2], 'ann_text_rows': [], 'steps': [{'op': 'add2-1', 'arg1': '2530454', 'arg2': '5923147', 'res': '8453601'}, {'op': 'divide2-2', 'arg1': '5923147', 'arg2': '#0', 'res': '70.1%'}], 'program': 'add(2530454, 5923147), divide(5923147, #0)', 'gold_inds': {'table_1': 'the 2009 global incentive plan of shares available for awards is 2322450 ; the 2009 global incentive plan of shares subject to outstanding awards is 2530454 ;', 'table_2': 'the 2004 stock incentive plan of shares available for awards is - ; the 2004 stock incentive plan of shares subject to outstanding awards is 5923147 ;'}, 'exe_ans': 0.70067, 'program_re': 'divide(5923147, add(2530454, 5923147))'}"


In [29]:
# Generate table descriptions
df["table_description"] = df.apply(safe_get_table_description, axis=1)

In [30]:
df["expanded_text"] = df["text"] + "\n\n" + df["table_description"]
df.head()

Unnamed: 0,id,text,possible_questions,table,qa,table_description,expanded_text
0,Single_JKHY/2009/page_28.pdf-3,"26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings on our re...","[what is the net cash from operating activities in 2009?, what about in 2008?, what is the difference?, what percentage change does this represent?]",2008 | year ended june 30 2009 2008 | year ended june 30 2009 2008 | year ended june 30 2009\nnet income | $ 103102 | $ 104222 | $ 104681\nnon-cash expenses | 74397 | 70420 | 56348\nchange in receivables | 21214 | -2913 ( 2913 ) | -28853 ( 28853 )\nchange in deferred revenue | 21943 | 5100 | 24576\nchange in other assets and liabilities | -14068 ( 14068 ) | 4172 | 17495\nnet cash from operating activities | $ 206588 | $ 181001 | $ 174247,"{'question': 'what was the percentage change in the net cash from operating activities from 2008 to 2009', 'answer': '14.1%', 'explanation': '', 'ann_table_rows': [6], 'ann_text_rows': [], 'steps': [{'op': 'minus2-1', 'arg1': '206588', 'arg2': '181001', 'res': '25587'}, {'op': 'divide2-2', 'arg1': '#0', 'arg2': '181001', 'res': '14.1%'}], 'program': 'subtract(206588, 181001), divide(#0, 181001)', 'gold_inds': {'table_6': '2008 the net cash from operating activities of year ended june 30 2009 2008 is $ 206588 ; the net cash from operating activities of year ended june 30 2009 2008 is $ 181001 ; the net cash from operating activities of year ended june 30 2009 is $ 174247 ;'}, 'exe_ans': 0.14136, 'program_re': 'divide(subtract(206588, 181001), 181001)'}","### Table Description\n\nThe table summarizes the net cash from operating activities for the fiscal years ended June 30, 2009, 2008, and 2007. Each row in the table presents different metrics related to cash flows, specifically focusing on components that influence net cash generated from operations. \n\n1. **Net Income**: This shows the total income after expenses for each fiscal year. It decreased slightly from 2008 to 2009.\n \n2. **Non-Cash Expenses**: These represent expenses that affect net income but do not involve cash transactions. There was an increase from 2007 to 2008 and from 2008 to 2009.\n \n3. **Change in Receivables**: This indicates the changes in money owed to the company from customers. A notable positive change of $21,214 in 2009 compared to a negative change of $2,913 in 2008 suggests improved cash collection practices.\n \n4. **Change in Deferred Revenue**: This metric reflects the changes in revenue that has been collected but not yet earned. It shows sig...","26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings on our re..."
1,Single_RSG/2008/page_114.pdf-2,"substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) .\n\n | year ended december 31 2008 ( unaudited ) | year ended december 31 2007 ( unaudited )\nrevenue | $ 9362.2 | $ 9244....","[what were revenues in 2008?, what were they in 2007?, what was the net change?, what is the percent change?]",| year ended december 31 2008 ( unaudited ) | year ended december 31 2007 ( unaudited )\nrevenue | $ 9362.2 | $ 9244.9\nincome from continuing operations available to common stockholders | 285.7 | 423.2\nbasic earnings per share | .76 | 1.10\ndiluted earnings per share | .75 | 1.09,"{'question': 'what was the percent of the growth in the revenues from 2007 to 2008', 'answer': '1.3%', 'explanation': 'the percent growth of the revenue is the difference between the 2 divide by the oldest amount', 'ann_table_rows': [1], 'ann_text_rows': [], 'steps': [{'op': 'minus2-1', 'arg1': '9362.2', 'arg2': '9244.9', 'res': '117.3'}, {'op': 'divide2-2', 'arg1': '#0', 'arg2': '9244.9', 'res': '1.3%'}], 'program': 'subtract(9362.2, 9244.9), divide(#0, 9244.9)', 'gold_inds': {'table_1': 'the revenue of year ended december 31 2008 ( unaudited ) is $ 9362.2 ; the revenue of year ended december 31 2007 ( unaudited ) is $ 9244.9 ;'}, 'exe_ans': 0.01269, 'program_re': 'divide(subtract(9362.2, 9244.9), 9244.9)'}","### Description of the Table\n\nThe table presents unaudited pro forma financial information for the years ended December 31, 2008, and December 31, 2007. This information is intended to provide an illustrative comparison of selected financial metrics that would have been applicable if the merger with Allied had been completed on January 1, 2007. The figures are presented in millions of dollars, except for share and per share amounts.\n\nThe table includes the following categories:\n\n- **Revenue**: This row shows the total revenue for both years, with revenue increasing from $9,244.9 million in 2007 to $9,362.2 million in 2008, indicating a modest growth in revenue.\n \n- **Income from Continuing Operations Available to Common Stockholders**: This figure reflects the earnings available to common shareholders from ongoing operations, which decreased from $423.2 million in 2007 to $285.7 million in 2008.\n \n- **Basic Earnings Per Share**: This represents the earnings allocated to...","substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) .\n\n | year ended december 31 2008 ( unaudited ) | year ended december 31 2007 ( unaudited )\nrevenue | $ 9362.2 | $ 9244...."
2,Single_AAPL/2002/page_23.pdf-1,"in a new business model such as the retail segment is inherently risky , particularly in light of the significant investment involved , the current economic climate , and the fixed nature of a substantial portion of the retail segment's operating expenses . results for this segment are dependent upon a number of risks and uncertainties , some of which are discussed below under the heading ""factors that may affect future results and financial condition."" backlog in the company's experience , the actual amount of product backlog at any particular time is not a meaningful indication of its future business prospects . in particular , backlog often increases in anticipation of or immediately following new product introductions because of over- ordering by dealers anticipating shortages . backlog often is reduced once dealers and customers believe they can obtain sufficient supply . because of the foregoing , backlog cannot be considered a reliable indicator of the company's ability to a...","[what was the total of net sales in 2001?, and what was that in 2000?, what was, then, the change in the total of net sales over the year?, and how much does this change represent in relation to that total in 2000, in percentage?]",| 2002 | 2001 | 2000\nnet sales | $ 5742 | $ 5363 | $ 7983\ncost of sales | 4139 | 4128 | 5817\ngross margin | $ 1603 | $ 1235 | $ 2166\ngross margin percentage | 28% ( 28 % ) | 23% ( 23 % ) | 27% ( 27 % ),"{'question': 'what was the percentage change in net sales from 2000 to 2001?', 'answer': '-32%', 'explanation': '', 'ann_table_rows': [1], 'ann_text_rows': [], 'steps': [{'op': 'minus1-1', 'arg1': '5363', 'arg2': '7983', 'res': '-2620'}, {'op': 'divide1-2', 'arg1': '#0', 'arg2': '7983', 'res': '-32%'}], 'program': 'subtract(5363, 7983), divide(#0, 7983)', 'gold_inds': {'table_1': 'the net sales of 2002 is $ 5742 ; the net sales of 2001 is $ 5363 ; the net sales of 2000 is $ 7983 ;'}, 'exe_ans': -0.3282, 'program_re': 'divide(subtract(5363, 7983), 7983)'}","### Comprehensive Description of the Table\n\nThe table presents a summary of financial performance metrics over three fiscal years: 2000, 2001, and 2002. It focuses on the company's net sales, cost of sales, gross margin, and gross margin percentage for each of those years. \n\n1. **Net Sales**: This row shows the total revenue generated by the company from sales of its products. In 2000, net sales were $7,983 million, which decreased to $5,363 million in 2001—a significant drop of approximately 32%. However, there was a slight recovery in 2002, with net sales increasing to $5,742 million.\n\n2. **Cost of Sales**: This row lists the total expenses directly associated with the production of goods sold. The costs remained relatively stable from 2000 to 2001, changing from $5,817 million in 2000 to $4,128 million in 2001. In 2002, the cost slightly increased to $4,139 million.\n\n3. **Gross Margin**: This row indicates the gross profit of the company calculated as net sales minus the...","in a new business model such as the retail segment is inherently risky , particularly in light of the significant investment involved , the current economic climate , and the fixed nature of a substantial portion of the retail segment's operating expenses . results for this segment are dependent upon a number of risks and uncertainties , some of which are discussed below under the heading ""factors that may affect future results and financial condition."" backlog in the company's experience , the actual amount of product backlog at any particular time is not a meaningful indication of its future business prospects . in particular , backlog often increases in anticipation of or immediately following new product introductions because of over- ordering by dealers anticipating shortages . backlog often is reduced once dealers and customers believe they can obtain sufficient supply . because of the foregoing , backlog cannot be considered a reliable indicator of the company's ability to a..."
3,Single_UPS/2009/page_33.pdf-2,"( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options . shareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing . the following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 index , and the dow jones transportation average . the comparison of the total cumulative return on investment , which is the change in the quarterly stock price plus reinvested divide...","[what was the change in the performance of the united parcel service inc . from 2004 to 2009?, and how much does this change represent in relation to that performance in 2004, in percentage?, what was the performance value of the s&p 500 index in 2009?, what was, then, the change in that performance from 2004 to 2009?, and how much does this change represent in relation to that performance in 2004, in percentage?, what is, then, the difference between the percent representation of the united parcel service inc . and the s&p 500 index?]",| 12/31/04 | 12/31/05 | 12/31/06 | 12/31/07 | 12/31/08 | 12/31/09\nunited parcel service inc . | $ 100.00 | $ 89.49 | $ 91.06 | $ 87.88 | $ 70.48 | $ 75.95\ns&p 500 index | $ 100.00 | $ 104.91 | $ 121.48 | $ 128.15 | $ 80.74 | $ 102.11\ndow jones transportation average | $ 100.00 | $ 111.65 | $ 122.61 | $ 124.35 | $ 97.72 | $ 115.88,"{'question': 'what was the difference in percentage cumulative return on investment for united parcel service inc . compared to the s&p 500 index for the five year period ended 12/31/09?', 'answer': '-26.16%', 'explanation': '', 'ann_table_rows': [1, 2], 'ann_text_rows': [], 'steps': [{'op': 'minus2-1', 'arg1': '75.95', 'arg2': 'const_100', 'res': '-24.05'}, {'op': 'divide2-2', 'arg1': '#0', 'arg2': 'const_100', 'res': '-24.05%'}, {'op': 'minus2-3', 'arg1': '102.11', 'arg2': 'const_100', 'res': '2.11'}, {'op': 'divide2-4', 'arg1': '#2', 'arg2': 'const_100', 'res': '2.11%'}, {'op': 'minus2-5', 'arg1': '#1', 'arg2': '#3', 'res': '-26.16%'}], 'program': 'subtract(75.95, const_100), divide(#0, const_100), subtract(102.11, const_100), divide(#2, const_100), subtract(#1, #3)', 'gold_inds': {'table_1': 'the united parcel service inc . of 12/31/04 is $ 100.00 ; the united parcel service inc . of 12/31/05 is $ 89.49 ; the united parcel service inc . of 12/31/06 is $ 91.06 ; the united parce...","### Table Description\n\nThe table presents cumulative total returns for three different investment options over a five-year period, from December 31, 2004, to December 31, 2009. The three options compared are:\n\n1. **United Parcel Service Inc. (UPS)** - Represented as ""united parcel service inc."" in the table.\n2. **S&P 500 Index** - A stock market index that measures the stock performance of 500 large companies listed on stock exchanges in the United States.\n3. **Dow Jones Transportation Average** - A stock market index that measures the performance of transportation sector companies.\n\nThe values shown in the table represent the dollar amount that an investment of $100 on December 31, 2004, would yield at the end of each subsequent year listed in the columns. The values are displayed in dollar amounts, and each row provides the cumulative investment value at the end of each specified date for the respective investment option.\n\nThe table reveals trends over the five-year per...","( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options . shareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing . the following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 index , and the dow jones transportation average . the comparison of the total cumulative return on investment , which is the change in the quarterly stock price plus reinvested divide..."
4,Single_CE/2010/page_134.pdf-2,"tax returns for 2001 and beyond are open for examination under statute . currently , unrecognized tax benefits are not expected to change significantly over the next 12 months . 19 . stock-based and other management compensation plans in april 2009 , the company approved a global incentive plan which replaces the company 2019s 2004 stock incentive plan . the 2009 global incentive plan ( 201cgip 201d ) enables the compensation committee of the board of directors to award incentive and nonqualified stock options , stock appreciation rights , shares of series a common stock , restricted stock , restricted stock units ( 201crsus 201d ) and incentive bonuses ( which may be paid in cash or stock or a combination thereof ) , any of which may be performance-based , with vesting and other award provisions that provide effective incentive to company employees ( including officers ) , non-management directors and other service providers . under the 2009 gip , the company no longer can grant r...","[how many shares are subject to outstanding awards is under the 2009 global incentive plan?, what about under the 2004 stock incentive plan?, how many total shares are subject to outstanding awards?, what about under the 2004 stock incentive plan?, what proportion does this represent?]",| shares available for awards | shares subject to outstanding awards\n2009 global incentive plan | 2322450 | 2530454\n2004 stock incentive plan | - | 5923147,"{'question': 'what portion of the total shares subject to outstanding awards is under the 2009 global incentive plan?', 'answer': '70.1%', 'explanation': '', 'ann_table_rows': [1, 2], 'ann_text_rows': [], 'steps': [{'op': 'add2-1', 'arg1': '2530454', 'arg2': '5923147', 'res': '8453601'}, {'op': 'divide2-2', 'arg1': '5923147', 'arg2': '#0', 'res': '70.1%'}], 'program': 'add(2530454, 5923147), divide(5923147, #0)', 'gold_inds': {'table_1': 'the 2009 global incentive plan of shares available for awards is 2322450 ; the 2009 global incentive plan of shares subject to outstanding awards is 2530454 ;', 'table_2': 'the 2004 stock incentive plan of shares available for awards is - ; the 2004 stock incentive plan of shares subject to outstanding awards is 5923147 ;'}, 'exe_ans': 0.70067, 'program_re': 'divide(5923147, add(2530454, 5923147))'}","### Description of the Table\n\nThe table presents information regarding the shares available for awards and shares subject to outstanding awards under two different stock incentive plans as of December 31, 2010. It compares the 2009 Global Incentive Plan (GIP) and the 2004 Stock Incentive Plan (SIP). \n\n1. **Shares Available for Awards**: This column indicates how many shares are currently available to be awarded under each plan. \n - For the 2009 GIP, there are 2,322,450 shares available.\n - For the 2004 SIP, there are no shares available for new awards (indicated by a dash).\n\n2. **Shares Subject to Outstanding Awards**: This column shows the number of shares that have been awarded but are yet to be fully utilized or completed under each plan. \n - The 2009 GIP has 2,530,454 shares subject to outstanding awards.\n - The 2004 SIP has a significantly higher number, with 5,923,147 shares subject to outstanding awards.\n\nOverall, the table highlights the allocation of sh...","tax returns for 2001 and beyond are open for examination under statute . currently , unrecognized tax benefits are not expected to change significantly over the next 12 months . 19 . stock-based and other management compensation plans in april 2009 , the company approved a global incentive plan which replaces the company 2019s 2004 stock incentive plan . the 2009 global incentive plan ( 201cgip 201d ) enables the compensation committee of the board of directors to award incentive and nonqualified stock options , stock appreciation rights , shares of series a common stock , restricted stock , restricted stock units ( 201crsus 201d ) and incentive bonuses ( which may be paid in cash or stock or a combination thereof ) , any of which may be performance-based , with vesting and other award provisions that provide effective incentive to company employees ( including officers ) , non-management directors and other service providers . under the 2009 gip , the company no longer can grant r..."


### 6. Load into ChromaDB Vector Store
---

In [38]:
# Create documents for ChromaDB vector store
documents = []
documents = [
    Document(
        page_content=row["expanded_text"],
        metadata={
            "id": row["id"],
            "table": row["table"],
            "table_description": row["table_description"],
            "qa": row["qa"]}
        )
        for _, row in df.iterrows()]

In [48]:
documents[:5]

[Document(metadata={'id': 'Single_JKHY/2009/page_28.pdf-3', 'table': '2008 | year ended june 30 2009 2008 | year ended june 30 2009 2008 | year ended june 30 2009\nnet income | $ 103102 | $ 104222 | $ 104681\nnon-cash expenses | 74397 | 70420 | 56348\nchange in receivables | 21214 | -2913 ( 2913 ) | -28853 ( 28853 )\nchange in deferred revenue | 21943 | 5100 | 24576\nchange in other assets and liabilities | -14068 ( 14068 ) | 4172 | 17495\nnet cash from operating activities | $ 206588 | $ 181001 | $ 174247', 'table_description': "### Table Description\n\nThe table summarizes the net cash from operating activities for the fiscal years ended June 30, 2009, 2008, and 2007. Each row in the table presents different metrics related to cash flows, specifically focusing on components that influence net cash generated from operations. \n\n1. **Net Income**: This shows the total income after expenses for each fiscal year. It decreased slightly from 2008 to 2009.\n  \n2. **Non-Cash Expenses**: Th

In [41]:
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

persist_directory = "vectorstore"

In [50]:
# Filter complex metadata before creating documents
filtered_documents = []
for doc in documents:
    # Create a copy of the document to avoid modifying the original
    filtered_doc = deepcopy(doc)
    
    # If the document has metadata, filter it
    if hasattr(filtered_doc, 'metadata'):
        filtered_doc.metadata = flatten_metadata(filtered_doc.metadata)
    
    filtered_documents.append(filtered_doc)

In [53]:
filtered_documents[:5]

[Document(metadata={'table': '2008 | year ended june 30 2009 2008 | year ended june 30 2009 2008 | year ended june 30 2009\nnet income | $ 103102 | $ 104222 | $ 104681\nnon-cash expenses | 74397 | 70420 | 56348\nchange in receivables | 21214 | -2913 ( 2913 ) | -28853 ( 28853 )\nchange in deferred revenue | 21943 | 5100 | 24576\nchange in other assets and liabilities | -14068 ( 14068 ) | 4172 | 17495\nnet cash from operating activities | $ 206588 | $ 181001 | $ 174247', 'question': 'what was the percentage change in the net cash from operating activities from 2008 to 2009', 'answer': '14.1%', 'explanation': '', 'exe_ans': 0.14136, 'table_description': "### Table Description\n\nThe table summarizes the net cash from operating activities for the fiscal years ended June 30, 2009, 2008, and 2007. Each row in the table presents different metrics related to cash flows, specifically focusing on components that influence net cash generated from operations. \n\n1. **Net Income**: This shows the 

In [51]:
vectorstore = Chroma.from_documents(
    documents=filtered_documents,
    embedding=embedding_model,
    persist_directory=persist_directory
)
vectorstore.persist()

  vectorstore.persist()


In [56]:
# Setup QA chain
qa_chain = setup_qa_chain(filtered_documents)

  llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)


### 7. Evaluate System
---

In [61]:
# Evaluate system
all_results = []
for entry in processed_data:
    question = entry["qa"]["question"]
    result = qa_chain({"query": question})
        
    evaluation = evaluate_qa_response(
        reference_answer=entry["qa"]["answer"],
        generated_answer=result["result"],
        source_context=entry["text"],
        retrieved_context="\n".join([doc.page_content for doc in result["source_documents"]])
        )
        
    result_dict = {
        "question": question,
        "reference_answer": entry["qa"]["answer"],
        "generated_answer": result["result"],
        **evaluation}
    all_results.append(result_dict)

In [63]:
# Create results DataFrame
results_df = pd.DataFrame(all_results)
results_df.head()

Unnamed: 0,question,reference_answer,generated_answer,bleu,rouge1_f,rouge2_f,rougeL_f,factual_correctness,context_precision,context_recall,context_f1
0,what was the percentage change in the net cash from operating activities from 2008 to 2009,14.1%,"To calculate the percentage change in net cash from operating activities from 2008 to 2009, we can use the following formula:\n\n\[\n\text{Percentage Change} = \left( \frac{\text{New Value} - \text{Old Value}}{\text{Old Value}} \right) \times 100\n\]\n\nFrom the provided data:\n\n- Net cash from operating activities in 2008: **$181,001**\n- Net cash from operating activities in 2009: **$206,588**\n\nNow, we can plug these values into the formula:\n\n1. Calculate the difference:\n \[\n \text{Difference} = 206,588 - 181,001 = 25,587\n \]\n\n2. Calculate the percentage change:\n \[\n \text{Percentage Change} = \left( \frac{25,587}{181,001} \right) \times 100 \approx 14.14\%\n \]\n\nThus, the percentage change in net cash from operating activities from 2008 to 2009 is approximately **14.1%**.",0.0,0.033058,0.016807,0.033058,1.0,0.433333,1.0,0.604651
1,what was the percent of the growth in the revenues from 2007 to 2008,1.3%,"To calculate the percentage growth in revenues from 2007 to 2008, we need the revenue figures for both years. However, the provided context does not specify the exact revenue amounts for 2007 and 2008. \n\nThe context mentions that revenues in the credit union systems and services business segment increased by 14% from fiscal 2007 to fiscal 2008, but it does not provide the actual revenue figures for those years.\n\nWithout the specific revenue amounts for 2007 and 2008, I cannot calculate the percentage growth in revenues. Therefore, I don't know the answer.",0.0,0.0,0.0,0.0,0.0,0.132629,0.385666,0.19738
2,what was the percentage change in net sales from 2000 to 2001?,-32%,"To calculate the percentage change in net sales from 2000 to 2001, we can use the following formula:\n\n\[\n\text{Percentage Change} = \left( \frac{\text{New Value} - \text{Old Value}}{\text{Old Value}} \right) \times 100\n\]\n\nIn this case:\n- The net sales in 2000 (Old Value) = $7,983 million\n- The net sales in 2001 (New Value) = $5,363 million\n\nNow, we can plug in the values:\n\n\[\n\text{Percentage Change} = \left( \frac{5,363 - 7,983}{7,983} \right) \times 100\n\]\n\nCalculating the difference:\n\n\[\n5,363 - 7,983 = -2,620\n\]\n\nNow, substituting back into the formula:\n\n\[\n\text{Percentage Change} = \left( \frac{-2,620}{7,983} \right) \times 100 \approx -32.8\%\n\]\n\nThus, the percentage change in net sales from 2000 to 2001 is approximately **-32.8%**. \n\nTherefore, the answer is:\n**-32.8%**.",0.0,0.015267,0.0,0.015267,0.0,0.430504,1.0,0.601892
3,what was the difference in percentage cumulative return on investment for united parcel service inc . compared to the s&p 500 index for the five year period ended 12/31/09?,-26.16%,"To calculate the difference in percentage cumulative return on investment for United Parcel Service Inc. (UPS) compared to the S&P 500 index for the five-year period ended December 31, 2009, we can follow these steps:\n\n1. **Determine the cumulative return for UPS and the S&P 500 index:**\n - For UPS:\n - Value at 12/31/04: $100.00\n - Value at 12/31/09: $75.95\n - Change in value: $75.95 - $100.00 = -$24.05\n - Percentage change: \((-24.05 / 100) \times 100 = -24.05\%\)\n\n - For the S&P 500 index:\n - Value at 12/31/04: $100.00\n - Value at 12/31/09: $102.11\n - Change in value: $102.11 - $100.00 = $2.11\n - Percentage change: \((2.11 / 100) \times 100 = 2.11\%\)\n\n2. **Calculate the difference in percentage cumulative return:**\n - Difference = Percentage change for UPS - Percentage change for S&P 500\n - Difference = \(-24.05\% - 2.11\% = -26.16\%\)\n\nThus, the difference in percentage cumulative return on investment for United Parcel ...",0.0,0.021858,0.01105,0.021858,1.0,0.446341,1.0,0.617201
4,what portion of the total shares subject to outstanding awards is under the 2009 global incentive plan?,70.1%,"To determine the portion of the total shares subject to outstanding awards that is under the 2009 Global Incentive Plan (GIP), we need to use the data provided in the context.\n\nFrom the table, we have the following information:\n\n- Shares subject to outstanding awards under the 2009 Global Incentive Plan: **2,530,454**\n- Shares subject to outstanding awards under the 2004 Stock Incentive Plan: **5,923,147**\n\nFirst, we calculate the total shares subject to outstanding awards:\n\n\[\n\text{Total shares subject to outstanding awards} = \text{Shares under 2009 GIP} + \text{Shares under 2004 SIP}\n\]\n\[\n\text{Total shares subject to outstanding awards} = 2,530,454 + 5,923,147 = 8,453,601\n\]\n\nNext, we calculate the portion of the total shares that are under the 2009 Global Incentive Plan:\n\n\[\n\text{Portion under 2009 GIP} = \frac{\text{Shares under 2009 GIP}}{\text{Total shares subject to outstanding awards}}\n\]\n\[\n\text{Portion under 2009 GIP} = \frac{2,530,454}{8,453,6...",0.0,0.0,0.0,0.0,0.0,0.33875,1.0,0.506069


In [64]:
# Calculate aggregate metrics
aggregate_metrics = {
    "mean_bleu": results_df["bleu"].mean(),
    "mean_rouge1": results_df["rouge1_f"].mean(),
    "mean_rouge2": results_df["rouge2_f"].mean(),
    "mean_rougeL": results_df["rougeL_f"].mean(),
    "mean_factual": results_df["factual_correctness"].mean(),
    "mean_context_precision": results_df["context_precision"].mean(),
    "mean_context_recall": results_df["context_recall"].mean(),
    "mean_context_f1": results_df["context_f1"].mean()
}
    
# Print results
print("\nAggregate Evaluation Metrics:")
for metric, value in aggregate_metrics.items():
    print(f"{metric}: {value:.4f}")


Aggregate Evaluation Metrics:
mean_bleu: 0.0001
mean_rouge1: 0.0199
mean_rouge2: 0.0070
mean_rougeL: 0.0191
mean_factual: 0.5385
mean_context_precision: 0.3244
mean_context_recall: 0.9842
mean_context_f1: 0.4826
