In [2]:
import json
import os
import pandas as pd

from dotenv import load_dotenv
from typing import List, Dict, Any
from openai import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [3]:
# Notebook display tuning, applied in a single call (set_option accepts
# alternating name/value pairs): keep the width override and let each column
# show up to 1000 characters before pandas truncates it.
pd.set_option(
    'display.width', -1,
    'max_colwidth', 1000,
)

In [4]:
# Read settings from a local .env file (override=True is passed to load_dotenv)
# and fetch the OpenAI key from the environment.
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Cheap sanity check only (presence, prefix, length); nothing is sent to the API here.
key_looks_valid = bool(api_key) and api_key.startswith('sk-proj-') and len(api_key) > 10
print("API key looks good so far" if key_looks_valid else "Issue detected with API key")

# Client constructed without an explicit key argument.
# NOTE(review): the name `openai` is the same as the package name; kept because
# later cells may rely on it.
openai = OpenAI()

API key looks good so far


In [5]:
# Module-level OpenAI client used by get_table_description().
# The os.environ[...] lookup raises KeyError if OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

### 1. Load JSON data
---

In [6]:
# Load JSON data
# The encoding is given explicitly: JSON text is UTF-8, and without it open()
# falls back to the platform locale encoding, which can mis-decode or fail on
# non-ASCII characters.
with open('./data/convfinqatrain.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)
# Only the first five entries are processed in the cells below.
data = json_data[:5]

### 2. Helper functions for processing financial documents stored as JSON
---

In [7]:
def table_to_string(table):
    """Render a table (an iterable of rows) as plain text.

    Each cell is converted with ``str``; cells in a row are joined with
    " | " and rows are joined with newlines. An empty table gives "".
    """
    rendered_rows = []
    for row in table:
        rendered_rows.append(" | ".join(map(str, row)))
    return "\n".join(rendered_rows)

In [8]:
def process_json_entry(entry: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten one JSON entry into id, merged text, dialogue questions and table string.

    The merged text is built from the sections that are present and non-empty,
    in this order and separated by blank lines: ``pre_text`` (space-joined),
    the table (via ``table_to_string``), ``post_text`` (space-joined), the
    question/answer/exe_ans fields of ``qa``, and ``annotation.dialogue_break``.

    Args:
        entry: One record of the loaded JSON data.

    Returns:
        A dict with keys ``id`` (``None`` if absent), ``text`` (the merged
        text), ``possible_questions`` (the ``dialogue_break`` value, or ""
        when missing) and ``table`` (the table string, or "" when there is
        no table).
    """
    entry_id = entry.get("id", None)
    parts = []

    # Merge pre_text if available
    pre_text = entry.get("pre_text")
    if pre_text:
        parts.append(" ".join(pre_text))

    # Convert table to a string if it exists
    table = entry.get("table")
    table_str = table_to_string(table) if table else ""
    if table_str:
        parts.append(table_str)

    # Merge post_text if available
    post_text = entry.get("post_text")
    if post_text:
        parts.append(" ".join(post_text))

    # Merge QA details
    qa = entry.get("qa")
    if qa:
        labelled_fields = (
            ("question", "Question"),
            ("answer", "Answer"),
            ("exe_ans", "Execution Answer"),
        )
        qa_parts = [f"{label}: {qa[key]}" for key, label in labelled_fields if key in qa]
        # Skip the section when none of the known keys is present; appending
        # an empty string would leave a dangling blank-line separator in the text.
        if qa_parts:
            parts.append("\n".join(qa_parts))

    # Extract and merge dialogue break. `or {}` also covers an explicit
    # `"annotation": null`, which would otherwise fail on `None.get(...)`.
    annotation = entry.get("annotation") or {}
    dialogue_break = annotation.get("dialogue_break", "")
    if dialogue_break:
        parts.append(f"Dialogue Break: {dialogue_break}")

    # Combine all parts into a single text field
    full_text = "\n\n".join(parts)

    return {"id": entry_id, "text": full_text, "possible_questions": dialogue_break, "table": table_str}

In [9]:
def get_table_description(table_content: str, document_context: str) -> str:
    """
    Ask the OpenAI chat completions API for a description of a table.

    The request is sent through the module-level `client` with the
    "gpt-4o-mini" model. The prompt asks for a detailed description of the
    table followed by the table in markdown format.

    Args:
        table_content: The table rendered as text.
        document_context: Text of the document the table comes from.

    Returns:
        The content of the first returned choice, or "" when the API returns
        no text content. Errors raised by the client call are not caught here.
    """
    prompt = f"""
    Given the following table and its context from the original document,
    provide a detailed description of the table. Then, include the table in markdown format.

    Original Document Context:
    {document_context}

    Table Content:
    {table_content}

    Please provide:
    1. A comprehensive description of the table.
    2. The table in markdown format.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that describes tables and formats them in markdown."},
            {"role": "user", "content": prompt}
        ]
    )
    # `message.content` may be None; normalise it so the declared `str`
    # return type holds and callers can concatenate the result safely.
    description = response.choices[0].message.content
    return description or ""

### 3. Process each JSON entry
---

In [10]:
# Flatten every loaded entry into an id/text/possible_questions/table record.
processed_data = list(map(process_json_entry, data))

In [11]:
# One row per processed entry; the dict keys become the columns.
df = pd.DataFrame(data=processed_data)
df

Unnamed: 0,id,text,possible_questions,table
0,Single_JKHY/2009/page_28.pdf-3,"26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings on our re...","[what is the net cash from operating activities in 2009?, what about in 2008?, what is the difference?, what percentage change does this represent?]",2008 | year ended june 30 2009 2008 | year ended june 30 2009 2008 | year ended june 30 2009\nnet income | $ 103102 | $ 104222 | $ 104681\nnon-cash expenses | 74397 | 70420 | 56348\nchange in receivables | 21214 | -2913 ( 2913 ) | -28853 ( 28853 )\nchange in deferred revenue | 21943 | 5100 | 24576\nchange in other assets and liabilities | -14068 ( 14068 ) | 4172 | 17495\nnet cash from operating activities | $ 206588 | $ 181001 | $ 174247
1,Single_RSG/2008/page_114.pdf-2,"substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) .\n\n | year ended december 31 2008 ( unaudited ) | year ended december 31 2007 ( unaudited )\nrevenue | $ 9362.2 | $ 9244....","[what were revenues in 2008?, what were they in 2007?, what was the net change?, what is the percent change?]",| year ended december 31 2008 ( unaudited ) | year ended december 31 2007 ( unaudited )\nrevenue | $ 9362.2 | $ 9244.9\nincome from continuing operations available to common stockholders | 285.7 | 423.2\nbasic earnings per share | .76 | 1.10\ndiluted earnings per share | .75 | 1.09
2,Single_AAPL/2002/page_23.pdf-1,"in a new business model such as the retail segment is inherently risky , particularly in light of the significant investment involved , the current economic climate , and the fixed nature of a substantial portion of the retail segment's operating expenses . results for this segment are dependent upon a number of risks and uncertainties , some of which are discussed below under the heading ""factors that may affect future results and financial condition."" backlog in the company's experience , the actual amount of product backlog at any particular time is not a meaningful indication of its future business prospects . in particular , backlog often increases in anticipation of or immediately following new product introductions because of over- ordering by dealers anticipating shortages . backlog often is reduced once dealers and customers believe they can obtain sufficient supply . because of the foregoing , backlog cannot be considered a reliable indicator of the company's ability to a...","[what was the total of net sales in 2001?, and what was that in 2000?, what was, then, the change in the total of net sales over the year?, and how much does this change represent in relation to that total in 2000, in percentage?]",| 2002 | 2001 | 2000\nnet sales | $ 5742 | $ 5363 | $ 7983\ncost of sales | 4139 | 4128 | 5817\ngross margin | $ 1603 | $ 1235 | $ 2166\ngross margin percentage | 28% ( 28 % ) | 23% ( 23 % ) | 27% ( 27 % )
3,Single_UPS/2009/page_33.pdf-2,"( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options . shareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing . the following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 index , and the dow jones transportation average . the comparison of the total cumulative return on investment , which is the change in the quarterly stock price plus reinvested divide...","[what was the change in the performance of the united parcel service inc . from 2004 to 2009?, and how much does this change represent in relation to that performance in 2004, in percentage?, what was the performance value of the s&p 500 index in 2009?, what was, then, the change in that performance from 2004 to 2009?, and how much does this change represent in relation to that performance in 2004, in percentage?, what is, then, the difference between the percent representation of the united parcel service inc . and the s&p 500 index?]",| 12/31/04 | 12/31/05 | 12/31/06 | 12/31/07 | 12/31/08 | 12/31/09\nunited parcel service inc . | $ 100.00 | $ 89.49 | $ 91.06 | $ 87.88 | $ 70.48 | $ 75.95\ns&p 500 index | $ 100.00 | $ 104.91 | $ 121.48 | $ 128.15 | $ 80.74 | $ 102.11\ndow jones transportation average | $ 100.00 | $ 111.65 | $ 122.61 | $ 124.35 | $ 97.72 | $ 115.88
4,Double_UPS/2009/page_33.pdf,"( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options . shareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing . the following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 index , and the dow jones transportation average . the comparison of the total cumulative return on investment , which is the change in the quarterly stock price plus reinvested divide...","[what was the fluctuation of the performance price of the ups from 2004 to 2006?, and how much does this fluctuation represent in relation to that price in 2004?, and from this year to 2009, what was the fluctuation for that stock?, what is this fluctuation as a percentage of the 2004 price?, and for the s&p 500 index price, what was the fluctuation in those five years?, and what percentage does this fluctuation represent in relation to the 2004 price of this stock?, what is, then, the difference between the ups percentage and this s&p 500 index one, for this five year period?]",| 12/31/04 | 12/31/05 | 12/31/06 | 12/31/07 | 12/31/08 | 12/31/09\nunited parcel service inc . | $ 100.00 | $ 89.49 | $ 91.06 | $ 87.88 | $ 70.48 | $ 75.95\ns&p 500 index | $ 100.00 | $ 104.91 | $ 121.48 | $ 128.15 | $ 80.74 | $ 102.11\ndow jones transportation average | $ 100.00 | $ 111.65 | $ 122.61 | $ 124.35 | $ 97.72 | $ 115.88


### 4. Generate contextualised table description
---

In [12]:
def safe_get_table_description(row):
    """Best-effort wrapper around get_table_description for one row.

    Reads the "table" and "text" fields of `row`. Returns "" when the table
    is empty or whitespace-only. If get_table_description raises, the error
    is printed together with the row's "id" and "" is returned instead.
    """
    table_content = row["table"]
    document_context = row["text"]

    # Guard clause: nothing to describe for a missing or blank table.
    if not table_content or not table_content.strip():
        return ""

    try:
        description = get_table_description(table_content, document_context)
    except Exception as e:
        print(f"Error processing table description for id {row['id']}: {e}")
        return ""
    return description

In [13]:
# Row-wise apply (axis=1): safe_get_table_description is called once per row
# and its return value is stored in the new "table_description" column.
# get_table_description issues a chat-completion request, so this cell calls
# the API for every row that has a non-blank table.
df["table_description"] = df.apply(safe_get_table_description, axis=1)

In [14]:
# Display the frame, which now includes the "table_description" column.
df

Unnamed: 0,id,text,possible_questions,table,table_description
0,Single_JKHY/2009/page_28.pdf-3,"26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings on our re...","[what is the net cash from operating activities in 2009?, what about in 2008?, what is the difference?, what percentage change does this represent?]",2008 | year ended june 30 2009 2008 | year ended june 30 2009 2008 | year ended june 30 2009\nnet income | $ 103102 | $ 104222 | $ 104681\nnon-cash expenses | 74397 | 70420 | 56348\nchange in receivables | 21214 | -2913 ( 2913 ) | -28853 ( 28853 )\nchange in deferred revenue | 21943 | 5100 | 24576\nchange in other assets and liabilities | -14068 ( 14068 ) | 4172 | 17495\nnet cash from operating activities | $ 206588 | $ 181001 | $ 174247,"### Description of the Table\n\nThe table summarizes the net cash from operating activities for a company over the fiscal years ending June 30 from 2007 to 2009. It includes detailed line items that contribute to the calculation of net cash from operating activities, which are crucial for understanding the company's financial performance and cash flow situation.\n\n1. 
**Lines Item Breakdown:**\n - **Net Income:** This row shows the net income for each of the three fiscal years. It indicates a slight decrease in net income from 2007 to 2009.\n - **Non-Cash Expenses:** This represents various non-cash charges such as depreciation and amortization, which are added back to net income to determine cash flows.\n - **Change in Receivables:** This indicates the net change in accounts receivable, showing how much cash was collected as compared to how much was billed.\n - **Change in Deferred Revenue:** This row reflects changes in deferred revenue, which contributes to cash inflows ..."
1,Single_RSG/2008/page_114.pdf-2,"substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) .\n\n | year ended december 31 2008 ( unaudited ) | year ended december 31 2007 ( unaudited )\nrevenue | $ 9362.2 | $ 9244....","[what were revenues in 2008?, what were they in 2007?, what was the net change?, what is the percent change?]",| year ended december 31 2008 ( unaudited ) | year ended december 31 2007 ( unaudited )\nrevenue | $ 9362.2 | $ 9244.9\nincome from continuing operations available to common stockholders | 285.7 | 423.2\nbasic earnings per share | .76 | 1.10\ndiluted earnings per share | .75 | 1.09,"### Description of the Table\n\nThe table presents the unaudited pro forma financial information for Republic Services, Inc. for the years ended December 31, 2008, and December 31, 2007. This information reflects the operating results of Allied, following the acquisition, and assumes that the merger was completed as of January 1, 2007. \n\nThe table contains four key financial metrics for both years:\n\n1. 
**Revenue**: This item indicates total revenue generated by the company, which shows a slight increase from $9,244.9 million in 2007 to $9,362.2 million in 2008.\n\n2. **Income from Continuing Operations Available to Common Stockholders**: This represents the net income attributable to shareholders from ongoing operations, which decreased from $423.2 million in 2007 to $285.7 million in 2008.\n\n3. **Basic Earnings per Share (EPS)**: This metric provides the earnings available to each common share, which declined from $1.10 in 2007 to $0.76 in 2008.\n\n4. **Diluted Earnings per S..."
2,Single_AAPL/2002/page_23.pdf-1,"in a new business model such as the retail segment is inherently risky , particularly in light of the significant investment involved , the current economic climate , and the fixed nature of a substantial portion of the retail segment's operating expenses . results for this segment are dependent upon a number of risks and uncertainties , some of which are discussed below under the heading ""factors that may affect future results and financial condition."" backlog in the company's experience , the actual amount of product backlog at any particular time is not a meaningful indication of its future business prospects . in particular , backlog often increases in anticipation of or immediately following new product introductions because of over- ordering by dealers anticipating shortages . backlog often is reduced once dealers and customers believe they can obtain sufficient supply . because of the foregoing , backlog cannot be considered a reliable indicator of the company's ability to a...","[what was the total of net sales in 2001?, and what was that in 2000?, what was, then, the change in the total of net sales over the year?, and how much does this change represent in relation to that total in 2000, in percentage?]",| 2002 | 2001 | 2000\nnet sales | $ 5742 | $ 5363 | $ 7983\ncost of sales | 4139 | 4128 | 5817\ngross margin | $ 1603 | $ 1235 | $ 2166\ngross margin percentage | 28% ( 28 % ) | 23% ( 23 % ) | 27% ( 27 % ),"### Comprehensive Description of the Table:\nThe table presents financial data for a company over three consecutive fiscal years, specifically 2000, 2001, and 2002. The information includes four main financial metrics: net sales, cost of sales, gross margin, and gross margin percentage.\n\n1. **Net Sales**: This row shows the total revenue generated from sales of goods before any deductions like returns or allowances. In 2000, the net sales were $7,983 million, indicating a peak year. 
However, in 2001, net sales declined to $5,363 million, representing a significant decrease of 32% compared to 2000. The net sales recovered slightly to $5,742 million in 2002, but they still remained below the figures from 2000.\n\n2. **Cost of Sales**: This metric captures the expenses directly associated with the production of the goods sold. The cost of sales remained relatively stable over the three years, showing minor changes. In 2000, the cost was $5,817 million, slightly decreased to $4,128 m..."
3,Single_UPS/2009/page_33.pdf-2,"( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options . shareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing . the following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 index , and the dow jones transportation average . the comparison of the total cumulative return on investment , which is the change in the quarterly stock price plus reinvested divide...","[what was the change in the performance of the united parcel service inc . from 2004 to 2009?, and how much does this change represent in relation to that performance in 2004, in percentage?, what was the performance value of the s&p 500 index in 2009?, what was, then, the change in that performance from 2004 to 2009?, and how much does this change represent in relation to that performance in 2004, in percentage?, what is, then, the difference between the percent representation of the united parcel service inc . and the s&p 500 index?]",| 12/31/04 | 12/31/05 | 12/31/06 | 12/31/07 | 12/31/08 | 12/31/09\nunited parcel service inc . 
| $ 100.00 | $ 89.49 | $ 91.06 | $ 87.88 | $ 70.48 | $ 75.95\ns&p 500 index | $ 100.00 | $ 104.91 | $ 121.48 | $ 128.15 | $ 80.74 | $ 102.11\ndow jones transportation average | $ 100.00 | $ 111.65 | $ 122.61 | $ 124.35 | $ 97.72 | $ 115.88,"### Description of the Table\n\nThe table presents a comparative analysis of cumulative total returns for three investment entities over a five-year period from December 31, 2004, to December 31, 2009. The entities compared are United Parcel Service Inc. (UPS), the S&P 500 Index, and the Dow Jones Transportation Average. Each column in the table represents the total investment value at the end of each year for the respective date, where an initial investment of $100 is assumed at the start of the period (December 31, 2004).\n\n- **Row 1**: United Parcel Service Inc. shows a decreasing trend in total returns from $100.00 in 2004 to $75.95 in 2009, indicating a decline in value over the period.\n \n- **Row 2**: The S&P 500 Index displays a generally upward trend, increasing from $100.00 in 2004 to $102.11 in 2009. It shows a dip in 2008, but it still manages to end higher than the starting value.\n \n- **Row 3**: The Dow Jones Transportation Average also starts at $100.00 in 2004, ..."
4,Double_UPS/2009/page_33.pdf,"( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options . shareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing . the following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 index , and the dow jones transportation average . the comparison of the total cumulative return on investment , which is the change in the quarterly stock price plus reinvested divide...","[what was the fluctuation of the performance price of the ups from 2004 to 2006?, and how much does this fluctuation represent in relation to that price in 2004?, and from this year to 2009, what was the fluctuation for that stock?, what is this fluctuation as a percentage of the 2004 price?, and for the s&p 500 index price, what was the fluctuation in those five years?, and what percentage does this fluctuation represent in relation to the 2004 price of this stock?, what is, then, the difference between the ups percentage and this s&p 500 index one, for this five year period?]",| 12/31/04 | 12/31/05 | 12/31/06 | 12/31/07 | 12/31/08 | 12/31/09\nunited parcel service inc . 
| $ 100.00 | $ 89.49 | $ 91.06 | $ 87.88 | $ 70.48 | $ 75.95\ns&p 500 index | $ 100.00 | $ 104.91 | $ 121.48 | $ 128.15 | $ 80.74 | $ 102.11\ndow jones transportation average | $ 100.00 | $ 111.65 | $ 122.61 | $ 124.35 | $ 97.72 | $ 115.88,"### Description of the Table\n\nThe table provides a five-year comparison of cumulative total shareowner returns for three entities: United Parcel Service Inc. (UPS), the S&P 500 Index, and the Dow Jones Transportation Average. It illustrates the value of a hypothetical investment of $100 made on December 31, 2004, and how that investment would have changed over the subsequent years through to December 31, 2009.\n\nThe first row showcases the returns for United Parcel Service Inc., which starts at $100.00 in 2004 and experiences fluctuations in value over the following years, with a decline to $89.49 in 2005, followed by slight recoveries leading to $91.06 in 2006. However, it continues to decline in value, reaching $70.48 by 2008, and slightly recovering to $75.95 in 2009.\n\nThe second row outlines the performance of the S&P 500 index, which initially matches the $100 investment in 2004 and rises in value to $104.91 in 2005, continuing to increase through 2006 and 2007, peaking a..."


### 5. Append contextualised table field to text field
---

In [15]:
# Document text followed by a blank line and the generated table description.
# The separator is attached to the description side; the resulting strings are
# the same as concatenating left to right.
df["expanded_text"] = df["text"] + ("\n\n" + df["table_description"])

### 6. Run embedding model
---

In [None]:
# Embedding model used for the expanded document texts.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")


def _embed_or_empty(text):
    """Return the embedding of `text`, or an empty list when `text` is falsy."""
    if not text:
        return []
    return embedding_model.embed_query(text)


# embed_query is called once per row with non-empty text.
df["embedding"] = df["expanded_text"].apply(_embed_or_empty)
df

  embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")


Unnamed: 0,id,text,possible_questions,table,table_description,expanded_text,embedding
0,Single_JKHY/2009/page_28.pdf-3,"26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings on our re...","[what is the net cash from operating activities in 2009?, what about in 2008?, what is the difference?, what percentage change does this represent?]",2008 | year ended june 30 2009 2008 | year ended june 30 2009 2008 | year ended june 30 2009\nnet income | $ 103102 | $ 104222 | $ 104681\nnon-cash expenses | 74397 | 70420 | 56348\nchange in receivables | 21214 | -2913 ( 2913 ) | -28853 ( 28853 )\nchange in deferred revenue | 21943 | 5100 | 24576\nchange in other assets and liabilities | -14068 ( 14068 ) | 4172 | 17495\nnet cash from operating activities | $ 206588 | $ 181001 | $ 174247,"### Description of the Table\n\nThe table summarizes the net cash from operating activities for a company over the fiscal years ending June 30 from 2007 to 2009. It includes detailed line items that contribute to the calculation of net cash from operating activities, which are crucial for understanding the company's financial performance and cash flow situation.\n\n1. 
**Lines Item Breakdown:**\n - **Net Income:** This row shows the net income for each of the three fiscal years. It indicates a slight decrease in net income from 2007 to 2009.\n - **Non-Cash Expenses:** This represents various non-cash charges such as depreciation and amortization, which are added back to net income to determine cash flows.\n - **Change in Receivables:** This indicates the net change in accounts receivable, showing how much cash was collected as compared to how much was billed.\n - **Change in Deferred Revenue:** This row reflects changes in deferred revenue, which contributes to cash inflows ...","26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . 
liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings on our re...","[-0.04168519960464352, 0.025579553964186694, 0.05294714289388926, 0.021242906443692415, 0.052226589821835034, -0.01531837634704506, -0.02178999137975278, 0.01758677816461867, 0.010514704101177505, 0.01606561367246821, 0.00743234827116193, -0.0066917820090810105, -0.023177717841252915, -0.03133061778748526, 0.03952354783906115, 0.03955023209243008, -0.008653281850961859, -0.002760443496141524, -0.03346558157440856, -0.013837244754205759, 0.03805575744158378, 0.011875745377986179, 0.01631914201914916, 0.014210863416917335, 0.01435764264292958, 0.025192591312145577, -0.056416460910284676, -0.015651964904413094, 0.00482702329587171, -0.025632928058859774, 0.02674044192031004, -0.020509011244953728, 0.039203303271022664, 0.024138451545368398, -4.0239092665957724e-05, -0.00507387871656535, -0.026526945541617712, 0.009073602613681748, 0.041605139393956436, -0.037121711716127374, 0.018454107668717523, -0.005917857170660049, -0.007218851892469601, -0.020082018487569072, -0.06186062229222922..."
1,Single_RSG/2008/page_114.pdf-2,"substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) .\n\n | year ended december 31 2008 ( unaudited ) | year ended december 31 2007 ( unaudited )\nrevenue | $ 9362.2 | $ 9244....","[what were revenues in 2008?, what were they in 2007?, what was the net change?, what is the percent change?]",| year ended december 31 2008 ( unaudited ) | year ended december 31 2007 ( unaudited )\nrevenue | $ 9362.2 | $ 9244.9\nincome from continuing operations available to common stockholders | 285.7 | 423.2\nbasic earnings per share | .76 | 1.10\ndiluted earnings per share | .75 | 1.09,"### Description of the Table\n\nThe table presents the unaudited pro forma financial information for Republic Services, Inc. for the years ended December 31, 2008, and December 31, 2007. This information reflects the operating results of Allied, following the acquisition, and assumes that the merger was completed as of January 1, 2007. \n\nThe table contains four key financial metrics for both years:\n\n1. 
**Revenue**: This item indicates total revenue generated by the company, which shows a slight increase from $9,244.9 million in 2007 to $9,362.2 million in 2008.\n\n2. **Income from Continuing Operations Available to Common Stockholders**: This represents the net income attributable to shareholders from ongoing operations, which decreased from $423.2 million in 2007 to $285.7 million in 2008.\n\n3. **Basic Earnings per Share (EPS)**: This metric provides the earnings available to each common share, which declined from $1.10 in 2007 to $0.76 in 2008.\n\n4. **Diluted Earnings per S...","substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . 
year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) .\n\n | year ended december 31 2008 ( unaudited ) | year ended december 31 2007 ( unaudited )\nrevenue | $ 9362.2 | $ 9244....","[-0.023219507650746818, 0.0016361440392597423, 0.07101111255665597, 0.01671015504550496, 0.01776686904999899, 0.020232532576958142, 0.002194440734571393, 0.02452983197323811, -0.01286372009246029, 0.025783798240324993, 0.00010847820010808295, -0.029447069978966615, 0.02250094384132445, -0.0034801082914079316, 0.04486098975017008, -0.0031349154348865537, 0.008566420696180322, -0.026615078160556187, -0.05906321226150128, -0.0068334109581559355, 0.046946239662778534, 0.01804865932702105, 0.0009519223430076888, 0.0040401663505739566, -0.022261420708871806, 0.021979630431849743, -0.05193392290945607, -0.04418469308531714, -0.020443875750385986, -0.013293450032088287, 0.027390001888028155, -0.028615788196090234, 0.0373935520656984, 0.0005952816691707963, -0.026502363912392548, -0.005501952597718636, -0.0120113047372992, -0.00015586517421825062, 0.02102154535261992, -0.012715780429854355, -0.010327608530584322, -0.02305043348453358, -0.03232133080459165, -0.00828463041915826, -0.062388339..."
2,Single_AAPL/2002/page_23.pdf-1,"in a new business model such as the retail segment is inherently risky , particularly in light of the significant investment involved , the current economic climate , and the fixed nature of a substantial portion of the retail segment's operating expenses . results for this segment are dependent upon a number of risks and uncertainties , some of which are discussed below under the heading ""factors that may affect future results and financial condition."" backlog in the company's experience , the actual amount of product backlog at any particular time is not a meaningful indication of its future business prospects . in particular , backlog often increases in anticipation of or immediately following new product introductions because of over- ordering by dealers anticipating shortages . backlog often is reduced once dealers and customers believe they can obtain sufficient supply . because of the foregoing , backlog cannot be considered a reliable indicator of the company's ability to a...","[what was the total of net sales in 2001?, and what was that in 2000?, what was, then, the change in the total of net sales over the year?, and how much does this change represent in relation to that total in 2000, in percentage?]",| 2002 | 2001 | 2000\nnet sales | $ 5742 | $ 5363 | $ 7983\ncost of sales | 4139 | 4128 | 5817\ngross margin | $ 1603 | $ 1235 | $ 2166\ngross margin percentage | 28% ( 28 % ) | 23% ( 23 % ) | 27% ( 27 % ),"### Comprehensive Description of the Table:\nThe table presents financial data for a company over three consecutive fiscal years, specifically 2000, 2001, and 2002. The information includes four main financial metrics: net sales, cost of sales, gross margin, and gross margin percentage.\n\n1. **Net Sales**: This row shows the total revenue generated from sales of goods before any deductions like returns or allowances. In 2000, the net sales were $7,983 million, indicating a peak year. 
However, in 2001, net sales declined to $5,363 million, representing a significant decrease of 32% compared to 2000. The net sales recovered slightly to $5,742 million in 2002, but they still remained below the figures from 2000.\n\n2. **Cost of Sales**: This metric captures the expenses directly associated with the production of the goods sold. The cost of sales remained relatively stable over the three years, showing minor changes. In 2000, the cost was $5,817 million, slightly decreased to $4,128 m...","in a new business model such as the retail segment is inherently risky , particularly in light of the significant investment involved , the current economic climate , and the fixed nature of a substantial portion of the retail segment's operating expenses . results for this segment are dependent upon a number of risks and uncertainties , some of which are discussed below under the heading ""factors that may affect future results and financial condition."" backlog in the company's experience , the actual amount of product backlog at any particular time is not a meaningful indication of its future business prospects . in particular , backlog often increases in anticipation of or immediately following new product introductions because of over- ordering by dealers anticipating shortages . backlog often is reduced once dealers and customers believe they can obtain sufficient supply . 
because of the foregoing , backlog cannot be considered a reliable indicator of the company's ability to a...","[-0.03333227424071934, 0.01174629417407653, 0.05807815644901063, -0.01207961639494307, 0.03650550640272888, 0.024225896861530406, -0.01332624384791687, 0.02751912621514482, 0.007006444221232913, 0.04474524526260674, 0.04591854409941704, -0.004976508804164668, -0.038798769093743676, -0.0041098693535309904, 0.014586203183896926, 0.027052473988344546, 0.019226056097951544, -0.01199295217048292, -0.04826513432245686, -0.019012729206044687, 0.06570457839918044, 0.027652456221078554, -0.022625947966196786, 0.001459953646388708, -0.026972477102371425, 0.049545095673898, -0.0024449222898522602, -0.04666518542712324, 0.016492810534084575, -0.025612520727602367, 0.006939779683927345, -0.023532586791139624, 0.02253261826589481, 0.010979651762231854, -0.006009809365231614, -0.009033046900380244, -0.008073076818121992, -0.0019099393896166135, 0.048291799951114565, -0.02805244251358936, -0.011139646465500697, -0.009133043007846646, -0.00379321291774487, 0.01925272172660925, -0.011766293395569812..."
3,Single_UPS/2009/page_33.pdf-2,"( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options . shareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing . the following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 index , and the dow jones transportation average . the comparison of the total cumulative return on investment , which is the change in the quarterly stock price plus reinvested divide...","[what was the change in the performance of the united parcel service inc . from 2004 to 2009?, and how much does this change represent in relation to that performance in 2004, in percentage?, what was the performance value of the s&p 500 index in 2009?, what was, then, the change in that performance from 2004 to 2009?, and how much does this change represent in relation to that performance in 2004, in percentage?, what is, then, the difference between the percent representation of the united parcel service inc . and the s&p 500 index?]",| 12/31/04 | 12/31/05 | 12/31/06 | 12/31/07 | 12/31/08 | 12/31/09\nunited parcel service inc . 
| $ 100.00 | $ 89.49 | $ 91.06 | $ 87.88 | $ 70.48 | $ 75.95\ns&p 500 index | $ 100.00 | $ 104.91 | $ 121.48 | $ 128.15 | $ 80.74 | $ 102.11\ndow jones transportation average | $ 100.00 | $ 111.65 | $ 122.61 | $ 124.35 | $ 97.72 | $ 115.88,"### Description of the Table\n\nThe table presents a comparative analysis of cumulative total returns for three investment entities over a five-year period from December 31, 2004, to December 31, 2009. The entities compared are United Parcel Service Inc. (UPS), the S&P 500 Index, and the Dow Jones Transportation Average. Each column in the table represents the total investment value at the end of each year for the respective date, where an initial investment of $100 is assumed at the start of the period (December 31, 2004).\n\n- **Row 1**: United Parcel Service Inc. shows a decreasing trend in total returns from $100.00 in 2004 to $75.95 in 2009, indicating a decline in value over the period.\n \n- **Row 2**: The S&P 500 Index displays a generally upward trend, increasing from $100.00 in 2004 to $102.11 in 2009. It shows a dip in 2008, but it still manages to end higher than the starting value.\n \n- **Row 3**: The Dow Jones Transportation Average also starts at $100.00 in 2004, ...","( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options . shareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing . 
the following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 index , and the dow jones transportation average . the comparison of the total cumulative return on investment , which is the change in the quarterly stock price plus reinvested divide...","[-0.024903152828901633, -0.015268795615855328, 0.04672331120061683, 0.02142149641936492, 0.02822534632014343, 0.016544517588666624, 0.022192244102338925, 0.03085652265623608, 0.060915717682481205, 0.03144122747389823, 0.03867031741139877, -0.001210939056308401, -0.01686344901319205, -0.04512865966592531, 0.03787299350669821, 0.017899971951947976, 0.005747392933082472, -0.01586679040702595, -0.05076309806873434, 0.007787218999097439, 0.017926550036319728, 0.009401804097067772, 0.008996497157963254, 0.02663069292280671, -0.015189063225385272, 0.01742157574648244, -0.008458301659645177, -0.01280372904745703, -0.00970744554808472, 0.004936777657889535, 0.014444891764137817, -0.020637458762882435, 0.06336084929061679, 0.004059719034412419, 0.00209464231729378, 0.025554303788815756, -0.042524063276849614, 0.028517699660297106, 0.03072363409702252, -0.04619176068905299, 0.01706277998936719, -0.0007736555631709373, 0.008504811910311843, -0.05847058607434562, -0.051321228527315126, -0.02174..."
4,Double_UPS/2009/page_33.pdf,"( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options . shareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing . the following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 index , and the dow jones transportation average . the comparison of the total cumulative return on investment , which is the change in the quarterly stock price plus reinvested divide...","[what was the fluctuation of the performance price of the ups from 2004 to 2006?, and how much does this fluctuation represent in relation to that price in 2004?, and from this year to 2009, what was the fluctuation for that stock?, what is this fluctuation as a percentage of the 2004 price?, and for the s&p 500 index price, what was the fluctuation in those five years?, and what percentage does this fluctuation represent in relation to the 2004 price of this stock?, what is, then, the difference between the ups percentage and this s&p 500 index one, for this five year period?]",| 12/31/04 | 12/31/05 | 12/31/06 | 12/31/07 | 12/31/08 | 12/31/09\nunited parcel service inc . 
| $ 100.00 | $ 89.49 | $ 91.06 | $ 87.88 | $ 70.48 | $ 75.95\ns&p 500 index | $ 100.00 | $ 104.91 | $ 121.48 | $ 128.15 | $ 80.74 | $ 102.11\ndow jones transportation average | $ 100.00 | $ 111.65 | $ 122.61 | $ 124.35 | $ 97.72 | $ 115.88,"### Description of the Table\n\nThe table provides a five-year comparison of cumulative total shareowner returns for three entities: United Parcel Service Inc. (UPS), the S&P 500 Index, and the Dow Jones Transportation Average. It illustrates the value of a hypothetical investment of $100 made on December 31, 2004, and how that investment would have changed over the subsequent years through to December 31, 2009.\n\nThe first row showcases the returns for United Parcel Service Inc., which starts at $100.00 in 2004 and experiences fluctuations in value over the following years, with a decline to $89.49 in 2005, followed by slight recoveries leading to $91.06 in 2006. However, it continues to decline in value, reaching $70.48 by 2008, and slightly recovering to $75.95 in 2009.\n\nThe second row outlines the performance of the S&P 500 index, which initially matches the $100 investment in 2004 and rises in value to $104.91 in 2005, continuing to increase through 2006 and 2007, peaking a...","( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options . shareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing . 
the following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 index , and the dow jones transportation average . the comparison of the total cumulative return on investment , which is the change in the quarterly stock price plus reinvested divide...","[-0.054680626418250805, -0.005597664000194266, 0.05531484591080831, 0.026830181272808798, 0.030028845883592625, 0.003867351420239812, 0.014766250623282217, 0.038632154167177996, 0.03973514183554172, 0.03146273246016865, 0.05299857217977352, 0.00010803680841954748, -0.0141733948912351, -0.051674983997504836, 0.04056237979284681, 0.02851223737393122, -0.009175481204054729, 0.007072910146329132, -0.04930356106931637, 0.022114908152363565, 0.025727193604445073, 0.003550242372452984, 0.01910926573161759, 0.0354334858311039, -0.007321082464843227, 9.850414714112682e-05, -0.01324964304494338, 0.0006798886408818612, -0.0009814869769692181, 0.0028350236607292308, 0.020074380407097133, -0.021411754026009103, 0.04858661778102835, -0.009306460547294702, 0.012574062585843187, 0.01466973952826329, -0.03399960297981814, 0.031104258953379513, 0.027671210254692574, -0.04935871026647004, 0.01241550771270381, -0.01826823861237895, 0.01782704317250443, -0.038108233068927834, -0.04861419237960519, -0.0..."


### 7. Create ChromaDB vector store of documents
---

In [17]:
# Create documents for vector store
# Placeholder only: the following cell rebinds `documents` to the actual list of Document objects.
documents = []

In [18]:
# One Document per DataFrame row: page_content holds the row's expanded_text;
# the entry id, the table string and the table description go into the metadata.
documents = [
    Document(
        page_content=expanded_text,
        metadata={
            "id": entry_id,
            "table": table,
            "table_description": table_description,
        },
    )
    for entry_id, expanded_text, table, table_description in zip(
        df["id"], df["expanded_text"], df["table"], df["table_description"]
    )
]

In [19]:
# Echo the Document list to eyeball page_content and metadata before indexing.
documents

[Document(metadata={'id': 'Single_JKHY/2009/page_28.pdf-3', 'table': '2008 | year ended june 30 2009 2008 | year ended june 30 2009 2008 | year ended june 30 2009\nnet income | $ 103102 | $ 104222 | $ 104681\nnon-cash expenses | 74397 | 70420 | 56348\nchange in receivables | 21214 | -2913 ( 2913 ) | -28853 ( 28853 )\nchange in deferred revenue | 21943 | 5100 | 24576\nchange in other assets and liabilities | -14068 ( 14068 ) | 4172 | 17495\nnet cash from operating activities | $ 206588 | $ 181001 | $ 174247', 'table_description': "### Description of the Table\n\nThe table summarizes the net cash from operating activities for a company over the fiscal years ending June 30 from 2007 to 2009. It includes detailed line items that contribute to the calculation of net cash from operating activities, which are crucial for understanding the company's financial performance and cash flow situation.\n\n1. **Lines Item Breakdown:**\n   - **Net Income:** This row shows the net income for each of the

In [20]:
# Initialise ChromaDB vector store
# Relative directory name that is handed to Chroma as `persist_directory` below.
persist_directory = "vectorstore"

In [21]:
# Build the Chroma index from `documents` using `embedding_model` and store it
# under `persist_directory`.
# Stable ids (the entry id kept in each document's metadata) are passed explicitly:
# without them, every re-run of this cell adds a fresh copy of all documents to the
# collection already on disk, and the retriever starts returning duplicates.
# NOTE(review): assumes metadata["id"] is unique per document - confirm for the full
# dataset. Copies left in an existing directory by earlier runs are not removed.
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    ids=[str(doc.metadata["id"]) for doc in documents],
    persist_directory=persist_directory
)
# NOTE(review): the recorded output shows a warning pointing at this call - presumably
# the deprecation of manual persist() on newer Chroma versions; confirm and drop if so.
vectorstore.persist()

  vectorstore.persist()


### 8. Create the Prompt Template
---

In [22]:
# Prompt text for the financial QA chain. It has two placeholders, {context} and
# {question}, and instructs the model to say it does not know rather than invent an answer.
FIN_PROMPT_TEMPLATE = """
You are a financial analyst assistant. Use the following pieces of context to answer the question about financial data in tables. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}

Question: {question}

Provide a detailed answer with numerical calculations when applicable:
"""

In [23]:
# Wrap the raw template string; its two placeholders are declared explicitly.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=FIN_PROMPT_TEMPLATE,
)

### 9. Query the vector store
---

In [24]:
# Chat model that writes the answers (temperature 0)
llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")

# Retrieval QA chain: the retriever is asked for k=3 documents from the vector store,
# the "stuff" chain type is used with PROMPT, and the retrieved documents are
# returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
    chain_type="stuff",
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
)

  llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)


In [25]:
def query_financial_data(question: str) -> Dict[str, Any]:
    """
    Ask the module-level QA chain a question about the financial data.

    Args:
        question (str): The question, passed to the chain under the "query" key

    Returns:
        Dict: Two entries -
            "answer": the chain's "result" value
            "source_documents": the page_content of each returned source document
    """
    chain_output = qa_chain({"query": question})
    answer_text = chain_output["result"]

    # Keep only the text of each source document, not the Document objects.
    retrieved_texts = []
    for source_doc in chain_output["source_documents"]:
        retrieved_texts.append(source_doc.page_content)

    return {"answer": answer_text, "source_documents": retrieved_texts}

In [32]:
# Sample question to run through the QA chain
question = "What was the net cash from operating activities in 2008?"

In [33]:
# Run the query; `result` is a dict with "answer" and "source_documents" keys
result = query_financial_data(question)

In [34]:
# Show only the generated answer; the retrieved texts stay in result["source_documents"]
print(result["answer"])

The net cash from operating activities in 2008 was $181,001. 

This figure is taken directly from the provided table summarizing net cash from operating activities for the fiscal years ending June 30 from 2007 to 2009. Here is the relevant portion of the table for clarity:

```markdown
| Line Item                           | 2009         | 2008         | 2007         |
|-------------------------------------|--------------|--------------|--------------|
| **Net Cash from Operating Activities** | **$ 206,588** | **$ 181,001** | **$ 174,247** |
```

Thus, the net cash from operating activities for the fiscal year ended June 30, 2008, was **$181,001**.
