In [1]:
import json
import os
import pandas as pd

from dotenv import load_dotenv
from typing import List, Dict, Any
from openai import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.chains import RetrievalQA

In [2]:
# Display tweaks so long text cells are shown (almost) in full when frames are rendered.
# NOTE(review): -1 is an unusual value for display.width (pandas documents None for
# auto-detection in a terminal) — kept as-is, confirm it is intentional.
pd.set_option('display.width', -1)
# Use the fully-qualified option name: abbreviated patterns such as 'max_colwidth'
# are resolved by pattern matching and can break if a similarly named option is added.
pd.set_option('display.max_colwidth', 1000)

In [3]:
# Pull settings from a local .env file; override=True is passed so .env values
# take precedence over variables already present in the environment.
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Cheap sanity check on the key's shape only (prefix and minimum length);
# it does not contact the API.
print(
    "API key looks good so far"
    if api_key and api_key.startswith('sk-proj-') and len(api_key)>10
    else "Issue detected with API key"
)

# Client built with default settings (no explicit api_key argument).
openai = OpenAI()

API key looks good so far


In [4]:
# Client referenced by get_table_description. The key is read with os.environ[...],
# so a missing OPENAI_API_KEY raises KeyError here rather than failing later.
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

### 1. Load JSON data
---

In [5]:
# Load JSON data.
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode a UTF-8 JSON file.
with open('./data/convfinqatrain.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)
# Work on the first 10 entries only.
data = json_data[:10]

### 2. Helper functions for processing financial documents stored as JSON
---

In [6]:
def table_to_string(table):
    """Render a table as plain text.

    Each row's cells are converted with ``str`` and joined by " | ";
    the resulting row strings are joined by newlines. An empty table
    yields an empty string.
    """
    rendered_rows = []
    for row in table:
        rendered_rows.append(" | ".join(map(str, row)))
    return "\n".join(rendered_rows)

In [7]:
def process_json_entry(entry: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten one JSON entry into an id, a merged text blob and a table string.

    The merged text is built from these sections, in order, each included only
    when present and non-empty, separated by blank lines:
    ``pre_text`` (space-joined), the table rendered by ``table_to_string``,
    ``post_text`` (space-joined), the ``qa`` fields (``question``, ``answer``,
    ``exe_ans``) and ``annotation.dialogue_break``.

    Args:
        entry: One record of the loaded JSON data.

    Returns:
        A dict with keys ``id`` (``None`` if absent), ``text`` (the merged text),
        ``possible_questions`` (the raw ``dialogue_break`` value, or ``""`` when
        missing) and ``table`` (the table string, or ``""`` when there is no table).
    """
    entry_id = entry.get("id")
    parts = []

    # Merge pre_text if available
    pre_text = entry.get("pre_text")
    if pre_text:
        parts.append(" ".join(pre_text))

    # Convert table to a string if it exists
    table = entry.get("table")
    table_str = table_to_string(table) if table else ""
    if table_str:
        parts.append(table_str)

    # Merge post_text if available
    post_text = entry.get("post_text")
    if post_text:
        parts.append(" ".join(post_text))

    # Merge QA details.
    # NOTE(review): only the "qa" key is read; entries that keep their QA under
    # a different key contribute no QA text — confirm against the data schema.
    qa = entry.get("qa")
    if qa:
        labelled_fields = (
            ("question", "Question"),
            ("answer", "Answer"),
            ("exe_ans", "Execution Answer"),
        )
        qa_parts = [f"{label}: {qa[key]}" for key, label in labelled_fields if key in qa]
        # Skip the section when none of the known fields exist; appending an
        # empty string would leave a dangling blank section in the merged text.
        if qa_parts:
            parts.append("\n".join(qa_parts))

    # Extract and merge dialogue break. `or {}` also covers an explicit
    # null annotation, which would otherwise fail on `.get`.
    dialogue_break = (entry.get("annotation") or {}).get("dialogue_break", "")
    if dialogue_break:
        parts.append(f"Dialogue Break: {dialogue_break}")

    # Combine all parts into a single text field
    full_text = "\n\n".join(parts)

    return {"id": entry_id, "text": full_text, "possible_questions": dialogue_break, "table": table_str}

In [8]:
def get_table_description(table_content: str, document_context: str, model: str = "gpt-4o-mini") -> str:
    """
    Given the table content and the document context, ask an OpenAI chat model
    for a detailed description of the table plus the table in markdown format.

    Args:
        table_content: Text rendering of the table to describe.
        document_context: Text of the originating document, inserted into the
            prompt as context.
        model: Name of the chat model to request. Defaults to "gpt-4o-mini".

    Returns:
        The message content of the first returned choice, or "" when that
        message carries no text content.

    Raises:
        Nothing is caught here: any exception raised by the module-level
        ``client`` during the request propagates to the caller.
    """
    prompt = f"""
    Given the following table and its context from the original document,
    provide a detailed description of the table. Then, include the table in markdown format.

    Original Document Context:
    {document_context}

    Table Content:
    {table_content}

    Please provide:
    1. A comprehensive description of the table.
    2. The table in markdown format.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that describes tables and formats them in markdown."},
            {"role": "user", "content": prompt}
        ]
    )
    # The message content is optional in the API response; normalise a missing
    # value to "" so the declared `str` return type always holds.
    return response.choices[0].message.content or ""

### 3. Process each JSON entry
---

In [9]:
# Run every loaded entry through process_json_entry, keeping the input order.
processed_data = list(map(process_json_entry, data))

In [10]:
# One row per processed entry; the columns are the keys of the dicts returned
# by process_json_entry (id, text, possible_questions, table).
df = pd.DataFrame(processed_data)
df

Unnamed: 0,id,text,possible_questions,table
0,Single_JKHY/2009/page_28.pdf-3,"26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings on our re...","[what is the net cash from operating activities in 2009?, what about in 2008?, what is the difference?, what percentage change does this represent?]",2008 | year ended june 30 2009 2008 | year ended june 30 2009 2008 | year ended june 30 2009\nnet income | $ 103102 | $ 104222 | $ 104681\nnon-cash expenses | 74397 | 70420 | 56348\nchange in receivables | 21214 | -2913 ( 2913 ) | -28853 ( 28853 )\nchange in deferred revenue | 21943 | 5100 | 24576\nchange in other assets and liabilities | -14068 ( 14068 ) | 4172 | 17495\nnet cash from operating activities | $ 206588 | $ 181001 | $ 174247
1,Single_RSG/2008/page_114.pdf-2,"substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) .\n\n | year ended december 31 2008 ( unaudited ) | year ended december 31 2007 ( unaudited )\nrevenue | $ 9362.2 | $ 9244....","[what were revenues in 2008?, what were they in 2007?, what was the net change?, what is the percent change?]",| year ended december 31 2008 ( unaudited ) | year ended december 31 2007 ( unaudited )\nrevenue | $ 9362.2 | $ 9244.9\nincome from continuing operations available to common stockholders | 285.7 | 423.2\nbasic earnings per share | .76 | 1.10\ndiluted earnings per share | .75 | 1.09
2,Single_AAPL/2002/page_23.pdf-1,"in a new business model such as the retail segment is inherently risky , particularly in light of the significant investment involved , the current economic climate , and the fixed nature of a substantial portion of the retail segment's operating expenses . results for this segment are dependent upon a number of risks and uncertainties , some of which are discussed below under the heading ""factors that may affect future results and financial condition."" backlog in the company's experience , the actual amount of product backlog at any particular time is not a meaningful indication of its future business prospects . in particular , backlog often increases in anticipation of or immediately following new product introductions because of over- ordering by dealers anticipating shortages . backlog often is reduced once dealers and customers believe they can obtain sufficient supply . because of the foregoing , backlog cannot be considered a reliable indicator of the company's ability to a...","[what was the total of net sales in 2001?, and what was that in 2000?, what was, then, the change in the total of net sales over the year?, and how much does this change represent in relation to that total in 2000, in percentage?]",| 2002 | 2001 | 2000\nnet sales | $ 5742 | $ 5363 | $ 7983\ncost of sales | 4139 | 4128 | 5817\ngross margin | $ 1603 | $ 1235 | $ 2166\ngross margin percentage | 28% ( 28 % ) | 23% ( 23 % ) | 27% ( 27 % )
3,Single_UPS/2009/page_33.pdf-2,"( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options . shareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing . the following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 index , and the dow jones transportation average . the comparison of the total cumulative return on investment , which is the change in the quarterly stock price plus reinvested divide...","[what was the change in the performance of the united parcel service inc . from 2004 to 2009?, and how much does this change represent in relation to that performance in 2004, in percentage?, what was the performance value of the s&p 500 index in 2009?, what was, then, the change in that performance from 2004 to 2009?, and how much does this change represent in relation to that performance in 2004, in percentage?, what is, then, the difference between the percent representation of the united parcel service inc . and the s&p 500 index?]",| 12/31/04 | 12/31/05 | 12/31/06 | 12/31/07 | 12/31/08 | 12/31/09\nunited parcel service inc . | $ 100.00 | $ 89.49 | $ 91.06 | $ 87.88 | $ 70.48 | $ 75.95\ns&p 500 index | $ 100.00 | $ 104.91 | $ 121.48 | $ 128.15 | $ 80.74 | $ 102.11\ndow jones transportation average | $ 100.00 | $ 111.65 | $ 122.61 | $ 124.35 | $ 97.72 | $ 115.88
4,Double_UPS/2009/page_33.pdf,"( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options . shareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing . the following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 index , and the dow jones transportation average . the comparison of the total cumulative return on investment , which is the change in the quarterly stock price plus reinvested divide...","[what was the fluctuation of the performance price of the ups from 2004 to 2006?, and how much does this fluctuation represent in relation to that price in 2004?, and from this year to 2009, what was the fluctuation for that stock?, what is this fluctuation as a percentage of the 2004 price?, and for the s&p 500 index price, what was the fluctuation in those five years?, and what percentage does this fluctuation represent in relation to the 2004 price of this stock?, what is, then, the difference between the ups percentage and this s&p 500 index one, for this five year period?]",| 12/31/04 | 12/31/05 | 12/31/06 | 12/31/07 | 12/31/08 | 12/31/09\nunited parcel service inc . | $ 100.00 | $ 89.49 | $ 91.06 | $ 87.88 | $ 70.48 | $ 75.95\ns&p 500 index | $ 100.00 | $ 104.91 | $ 121.48 | $ 128.15 | $ 80.74 | $ 102.11\ndow jones transportation average | $ 100.00 | $ 111.65 | $ 122.61 | $ 124.35 | $ 97.72 | $ 115.88
5,Single_CE/2010/page_134.pdf-2,"tax returns for 2001 and beyond are open for examination under statute . currently , unrecognized tax benefits are not expected to change significantly over the next 12 months . 19 . stock-based and other management compensation plans in april 2009 , the company approved a global incentive plan which replaces the company 2019s 2004 stock incentive plan . the 2009 global incentive plan ( 201cgip 201d ) enables the compensation committee of the board of directors to award incentive and nonqualified stock options , stock appreciation rights , shares of series a common stock , restricted stock , restricted stock units ( 201crsus 201d ) and incentive bonuses ( which may be paid in cash or stock or a combination thereof ) , any of which may be performance-based , with vesting and other award provisions that provide effective incentive to company employees ( including officers ) , non-management directors and other service providers . under the 2009 gip , the company no longer can grant r...","[how many shares are subject to outstanding awards is under the 2009 global incentive plan?, what about under the 2004 stock incentive plan?, how many total shares are subject to outstanding awards?, what about under the 2004 stock incentive plan?, what proportion does this represent?]",| shares available for awards | shares subject to outstanding awards\n2009 global incentive plan | 2322450 | 2530454\n2004 stock incentive plan | - | 5923147
6,Single_JPM/2013/page_104.pdf-2,"management 2019s discussion and analysis 110 jpmorgan chase & co./2013 annual report 2012 compared with 2011 net loss was $ 2.0 billion , compared with a net income of $ 919 million in the prior year . private equity reported net income of $ 292 million , compared with net income of $ 391 million in the prior year . net revenue was $ 601 million , compared with $ 836 million in the prior year , due to lower unrealized and realized gains on private investments , partially offset by higher unrealized gains on public securities . noninterest expense was $ 145 million , down from $ 238 million in the prior year . treasury and cio reported a net loss of $ 2.1 billion , compared with net income of $ 1.3 billion in the prior year . net revenue was a loss of $ 3.1 billion , compared with net revenue of $ 3.2 billion in the prior year . the current year loss reflected $ 5.8 billion of losses incurred by cio from the synthetic credit portfolio for the six months ended june 30 , 2012 , and $ ...","[what was the net change in value of litigation reserves during 2012?, what was the value of litigation reserves at the start of 2012?, what was the percent change?]",as of or for the year ended december 31 ( in millions ) | 2013 | 2012 | 2011\nsecurities gains | $ 659 | $ 2028 | $ 1385\ninvestment securities portfolio ( average ) | 353712 | 358029 | 330885\ninvestment securities portfolio ( period 2013end ) ( a ) | 347562 | 365421 | 355605\nmortgage loans ( average ) | 5145 | 10241 | 13006\nmortgage loans ( period-end ) | 3779 | 7037 | 13375
7,Double_MAS/2012/page_92.pdf,"masco corporation notes to consolidated financial statements ( continued ) t . other commitments and contingencies litigation . we are subject to claims , charges , litigation and other proceedings in the ordinary course of our business , including those arising from or related to contractual matters , intellectual property , personal injury , environmental matters , product liability , construction defect , insurance coverage , personnel and employment disputes and other matters , including class actions . we believe we have adequate defenses in these matters and that the outcome of these matters is not likely to have a material adverse effect on us . however , there is no assurance that we will prevail in these matters , and we could in the future incur judgments , enter into settlements of claims or revise our expectations regarding the outcome of these matters , which could materially impact our results of operations . in july 2012 , the company reached a settlement agreement r...","[what was the difference in the company 2019s warranty liability between 2011 and 2012?, and the percentage change of this value?]",| 2012 | 2011\nbalance at january 1 | $ 102 | $ 107\naccruals for warranties issued during the year | 42 | 28\naccruals related to pre-existing warranties | 16 | 8\nsettlements made ( in cash or kind ) during the year | -38 ( 38 ) | -38 ( 38 )\nother net ( including currency translation ) | -4 ( 4 ) | -3 ( 3 )\nbalance at december 31 | $ 118 | $ 102
8,Single_HIG/2004/page_122.pdf-2,the following table identifies the company 2019s aggregate contractual obligations due by payment period : payments due by period .\n\n | total | less than 1 year | 1-3 years | 3-5 years | more than 5 years\nproperty and casualty obligations [1] | $ 21885 | $ 5777 | $ 6150 | $ 3016 | $ 6942\nlife annuity and disability obligations [2] | 281998 | 18037 | 37318 | 40255 | 186388\nlong-term debt obligations [3] | 9093 | 536 | 1288 | 1613 | 5656\noperating lease obligations | 723 | 175 | 285 | 162 | 101\npurchase obligations [4] [5] | 1764 | 1614 | 120 | 14 | 16\nother long-term liabilities reflected onthe balance sheet [6] [7] | 1642 | 1590 | 2014 | 52 | 2014\ntotal | $ 317105 | $ 27729 | $ 45161 | $ 45112 | $ 199103\n\n[1] the following points are significant to understanding the cash flows estimated for obligations under property and casualty contracts : reserves for property & casualty unpaid claim and claim adjustment expenses include case reserves for reported claims and reserves ...,"[what is the value of obligations due within 1 year?, what is the amount due between 1-3 years?, what is the sum?, what is the sum divided by total obligations due?]",| total | less than 1 year | 1-3 years | 3-5 years | more than 5 years\nproperty and casualty obligations [1] | $ 21885 | $ 5777 | $ 6150 | $ 3016 | $ 6942\nlife annuity and disability obligations [2] | 281998 | 18037 | 37318 | 40255 | 186388\nlong-term debt obligations [3] | 9093 | 536 | 1288 | 1613 | 5656\noperating lease obligations | 723 | 175 | 285 | 162 | 101\npurchase obligations [4] [5] | 1764 | 1614 | 120 | 14 | 16\nother long-term liabilities reflected onthe balance sheet [6] [7] | 1642 | 1590 | 2014 | 52 | 2014\ntotal | $ 317105 | $ 27729 | $ 45161 | $ 45112 | $ 199103
9,Single_SLG/2013/page_133.pdf-4,"during the years ended december 31 , 2013 , 2012 , and 2011 , we recognized approximately $ 6.5 million , $ 5.1 million and $ 4.7 million of compensation expense , respectively , for these options . as of december 31 , 2013 , there was approximately $ 20.3 million of total unrecognized compensation cost related to unvested stock options , which is expected to be recognized over a weighted average period of three years . stock-based compensation effective january 1 , 1999 , we implemented a deferred compensation plan , or the deferred plan , covering certain of our employees , including our executives . the shares issued under the deferred plan were granted to certain employees , including our executives and vesting will occur annually upon the completion of a service period or our meeting established financial performance criteria . annual vesting occurs at rates ranging from 15% ( 15 % ) to 35% ( 35 % ) once performance criteria are reached . a summary of our restricted stock as o...","[what was the total, in millions, capitalized to assets associated with compensation expense related to long-term compensation plans , restricted stock and stock options in the year of 2013?, and what was it in 2012, also in millions?, what was, then, in millions, the total sum that was capitalized in those two years?, including the year of 2011, what would then be the total sum capitalized in the three years, in millions?]",| 2013 | 2012 | 2011\nbalance at beginning of year | 2804901 | 2912456 | 2728290\ngranted | 192563 | 92729 | 185333\ncancelled | -3267 ( 3267 ) | -200284 ( 200284 ) | -1167 ( 1167 )\nbalance at end of year | 2994197 | 2804901 | 2912456\nvested during the year | 21074 | 408800 | 66299\ncompensation expense recorded | $ 6713155 | $ 6930381 | $ 17365401\nweighted average fair value of restricted stock granted during the year | $ 17386949 | $ 7023942 | $ 21768084


### 4. Generate contextualised table description
---

In [11]:
def safe_get_table_description(row):
    """Return a generated description for one row's table, or "" when unavailable.

    Written to be used with ``df.apply(..., axis=1)``.

    Args:
        row: Mapping-like row providing the keys ``"table"`` and ``"text"``;
            ``"id"`` is read only when reporting a failure.

    Returns:
        Whatever ``get_table_description(table, text)`` returns, or an empty
        string when the table value is not a string (e.g. ``None`` or NaN), is
        blank, or when the description call raises.
    """
    table_content = row["table"]
    document_context = row["text"]

    # A missing cell may be None or NaN; NaN is a truthy float, so a plain
    # truthiness test would go on to call .strip() on it and raise. Check the
    # type explicitly before looking at the content.
    if not isinstance(table_content, str) or not table_content.strip():
        return ""

    try:
        return get_table_description(table_content, document_context)
    except Exception as e:
        # Best-effort: report the failing row and fall back to "" so a single
        # failure does not abort the surrounding apply.
        print(f"Error processing table description for id {row['id']}: {e}")
        return ""

In [12]:
# Build one description per row (row-wise apply), then attach the resulting
# Series to the frame as a new column.
table_descriptions = df.apply(safe_get_table_description, axis=1)
df["table_description"] = table_descriptions

In [13]:
# Preview the first five rows, now including the table_description column.
df.head(n=5)

Unnamed: 0,id,text,possible_questions,table,table_description
0,Single_JKHY/2009/page_28.pdf-3,"26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings on our re...","[what is the net cash from operating activities in 2009?, what about in 2008?, what is the difference?, what percentage change does this represent?]",2008 | year ended june 30 2009 2008 | year ended june 30 2009 2008 | year ended june 30 2009\nnet income | $ 103102 | $ 104222 | $ 104681\nnon-cash expenses | 74397 | 70420 | 56348\nchange in receivables | 21214 | -2913 ( 2913 ) | -28853 ( 28853 )\nchange in deferred revenue | 21943 | 5100 | 24576\nchange in other assets and liabilities | -14068 ( 14068 ) | 4172 | 17495\nnet cash from operating activities | $ 206588 | $ 181001 | $ 174247,"### Table Description\n\nThe table summarizes the net cash from operating activities and related components across three fiscal years ending on June 30: 2009, 2008, and 2007. The key categories included in the table are as follows:\n\n- **Net Income**: This row displays the net income figures for each of the three years. 
It shows a slight decrease from 2007 to 2008 and a further decrease in 2009.\n \n- **Non-Cash Expenses**: This component indicates the non-cash expenses incurred during each fiscal year. There is a notable increase in non-cash expenses from 2007 to 2008, but a smaller increase from 2008 to 2009.\n\n- **Change in Receivables**: This line shows the change in receivables for the years. In 2008, there was a negative change indicating an increase in receivables, while 2009 shows a recovery with a positive change in receivables.\n\n- **Change in Deferred Revenue**: This row represents changes in deferred revenue, which is an important metric reflecting cash already coll..."
1,Single_RSG/2008/page_114.pdf-2,"substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) .\n\n | year ended december 31 2008 ( unaudited ) | year ended december 31 2007 ( unaudited )\nrevenue | $ 9362.2 | $ 9244....","[what were revenues in 2008?, what were they in 2007?, what was the net change?, what is the percent change?]",| year ended december 31 2008 ( unaudited ) | year ended december 31 2007 ( unaudited )\nrevenue | $ 9362.2 | $ 9244.9\nincome from continuing operations available to common stockholders | 285.7 | 423.2\nbasic earnings per share | .76 | 1.10\ndiluted earnings per share | .75 | 1.09,"### Comprehensive Description of the Table\n\nThe table presents unaudited pro forma financial information for a company for the years ended December 31, 2008, and December 31, 2007. This information reflects the financial performance of the company, including the impact of a merger that was completed during this period. 
All values are represented in millions of dollars, except for share and per share amounts.\n\n- **Revenue**: The table shows total revenue of $9,362.2 million for 2008, representing a slight increase from $9,244.9 million in 2007.\n \n- **Income from Continuing Operations**: This metric indicates income available to common stockholders. For 2008, it is recorded at $285.7 million, a decrease from $423.2 million in 2007.\n\n- **Basic Earnings Per Share (EPS)**: For 2008, the basic EPS is $0.76, down from $1.10 in 2007. This indicates a decline in profitability per share for shareholders.\n\n- **Diluted Earnings Per Share (EPS)**: The diluted EPS for 2008 is $0.75, a..."
2,Single_AAPL/2002/page_23.pdf-1,"in a new business model such as the retail segment is inherently risky , particularly in light of the significant investment involved , the current economic climate , and the fixed nature of a substantial portion of the retail segment's operating expenses . results for this segment are dependent upon a number of risks and uncertainties , some of which are discussed below under the heading ""factors that may affect future results and financial condition."" backlog in the company's experience , the actual amount of product backlog at any particular time is not a meaningful indication of its future business prospects . in particular , backlog often increases in anticipation of or immediately following new product introductions because of over- ordering by dealers anticipating shortages . backlog often is reduced once dealers and customers believe they can obtain sufficient supply . because of the foregoing , backlog cannot be considered a reliable indicator of the company's ability to a...","[what was the total of net sales in 2001?, and what was that in 2000?, what was, then, the change in the total of net sales over the year?, and how much does this change represent in relation to that total in 2000, in percentage?]",| 2002 | 2001 | 2000\nnet sales | $ 5742 | $ 5363 | $ 7983\ncost of sales | 4139 | 4128 | 5817\ngross margin | $ 1603 | $ 1235 | $ 2166\ngross margin percentage | 28% ( 28 % ) | 23% ( 23 % ) | 27% ( 27 % ),"### Table Description\n\nThe table presents the financial performance metrics of a company for three consecutive fiscal years, specifically focusing on net sales, cost of sales, gross margin, and gross margin percentage from the years 2000 to 2002. The data is structured into four rows, each representing a different financial metric, and four columns, one for each fiscal year. \n\n1. **Net Sales**: This row indicates the total sales revenue generated by the company. 
In 2000, the net sales were $7,983 million, which decreased to $5,363 million in 2001, representing a significant reduction. The sales increased slightly to $5,742 million in 2002.\n \n2. **Cost of Sales**: This row reflects the costs directly associated with the production of the goods sold by the company. The cost increased marginally from $5,817 million in 2000 to $4,128 million in 2001, and then again to $4,139 million in 2002. The data shows a relatively stable cost of sales compared to the variation in net sales..."
3,Single_UPS/2009/page_33.pdf-2,"( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options . shareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing . the following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 index , and the dow jones transportation average . the comparison of the total cumulative return on investment , which is the change in the quarterly stock price plus reinvested divide...","[what was the change in the performance of the united parcel service inc . from 2004 to 2009?, and how much does this change represent in relation to that performance in 2004, in percentage?, what was the performance value of the s&p 500 index in 2009?, what was, then, the change in that performance from 2004 to 2009?, and how much does this change represent in relation to that performance in 2004, in percentage?, what is, then, the difference between the percent representation of the united parcel service inc . and the s&p 500 index?]",| 12/31/04 | 12/31/05 | 12/31/06 | 12/31/07 | 12/31/08 | 12/31/09\nunited parcel service inc . 
| $ 100.00 | $ 89.49 | $ 91.06 | $ 87.88 | $ 70.48 | $ 75.95\ns&p 500 index | $ 100.00 | $ 104.91 | $ 121.48 | $ 128.15 | $ 80.74 | $ 102.11\ndow jones transportation average | $ 100.00 | $ 111.65 | $ 122.61 | $ 124.35 | $ 97.72 | $ 115.88,"### Table Description\n\nThe table presented provides a comparative summary of the cumulative total returns for three investment portfolios over a five-year period ending December 31, 2009. The table includes the performance data for:\n\n1. **United Parcel Service Inc. (UPS)**: A company that has its shares publicly traded.\n2. **S&P 500 Index**: A stock market index that measures the stock performance of 500 large companies listed on stock exchanges in the United States.\n3. **Dow Jones Transportation Average**: An index that represents the transportation sector and comprises 20 transportation-related stocks.\n\nEach row in the table displays the value of an investment of $100 made on December 31, 2004, for each of these entities across six fiscal checkpoints: the initial investment date and each subsequent year until December 31, 2009. \n\nThe values reveal the fluctuations in stock value over the years, illustrating performance differences among the three investment options. For..."
4,Double_UPS/2009/page_33.pdf,"( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options . shareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing . the following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 index , and the dow jones transportation average . the comparison of the total cumulative return on investment , which is the change in the quarterly stock price plus reinvested divide...","[what was the fluctuation of the performance price of the ups from 2004 to 2006?, and how much does this fluctuation represent in relation to that price in 2004?, and from this year to 2009, what was the fluctuation for that stock?, what is this fluctuation as a percentage of the 2004 price?, and for the s&p 500 index price, what was the fluctuation in those five years?, and what percentage does this fluctuation represent in relation to the 2004 price of this stock?, what is, then, the difference between the ups percentage and this s&p 500 index one, for this five year period?]",| 12/31/04 | 12/31/05 | 12/31/06 | 12/31/07 | 12/31/08 | 12/31/09\nunited parcel service inc . 
| $ 100.00 | $ 89.49 | $ 91.06 | $ 87.88 | $ 70.48 | $ 75.95\ns&p 500 index | $ 100.00 | $ 104.91 | $ 121.48 | $ 128.15 | $ 80.74 | $ 102.11\ndow jones transportation average | $ 100.00 | $ 111.65 | $ 122.61 | $ 124.35 | $ 97.72 | $ 115.88,"### Description of the Table\n\nThe provided table presents a comparative analysis of the cumulative total returns for United Parcel Service Inc. (UPS), the S&P 500 index, and the Dow Jones Transportation Average over a span of five years, from December 31, 2004, to December 31, 2009. The table shows how an initial investment of $100 in each of these three entities would have performed by the end of each year. Each column represents the cumulative return at the end of the respective year, with the first column indicating the starting value of the investment made on December 31, 2004.\n\n- **Rows**: \n - The first row lists the performance of United Parcel Service Inc. (UPS).\n - The second row represents the performance of the S&P 500 index.\n - The third row shows the performance of the Dow Jones Transportation Average.\n\n- **Columns**: \n - The first column, marked ""12/31/04,"" indicates the starting investment value of $100 for all three entities.\n - The subsequent columns..."


### 5. Append the contextualised table description to the text field
---

In [14]:
# Concatenate each row's document text and its table description, separated by
# a blank line, into a single string column.
df["expanded_text"] = df["text"].add("\n\n").add(df["table_description"])

### 6. Run embedding model
---

In [15]:
# Embed each row's expanded text with the text-embedding-3-small model.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Rows without usable text get an empty list instead of an API call. The check
# is on type and stripped content: a plain truthiness test lets NaN (a truthy
# float) and whitespace-only strings such as a bare "\n\n" separator through.
df["embedding"] = df["expanded_text"].apply(
    lambda x: embedding_model.embed_query(x) if isinstance(x, str) and x.strip() else []
)
df.head()

  embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")


Unnamed: 0,id,text,possible_questions,table,table_description,expanded_text,embedding
0,Single_JKHY/2009/page_28.pdf-3,"26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings on our re...","[what is the net cash from operating activities in 2009?, what about in 2008?, what is the difference?, what percentage change does this represent?]",2008 | year ended june 30 2009 2008 | year ended june 30 2009 2008 | year ended june 30 2009\nnet income | $ 103102 | $ 104222 | $ 104681\nnon-cash expenses | 74397 | 70420 | 56348\nchange in receivables | 21214 | -2913 ( 2913 ) | -28853 ( 28853 )\nchange in deferred revenue | 21943 | 5100 | 24576\nchange in other assets and liabilities | -14068 ( 14068 ) | 4172 | 17495\nnet cash from operating activities | $ 206588 | $ 181001 | $ 174247,"### Table Description\n\nThe table summarizes the net cash from operating activities and related components across three fiscal years ending on June 30: 2009, 2008, and 2007. The key categories included in the table are as follows:\n\n- **Net Income**: This row displays the net income figures for each of the three years. 
It shows a slight decrease from 2007 to 2008 and a further decrease in 2009.\n \n- **Non-Cash Expenses**: This component indicates the non-cash expenses incurred during each fiscal year. There is a notable increase in non-cash expenses from 2007 to 2008, but a smaller increase from 2008 to 2009.\n\n- **Change in Receivables**: This line shows the change in receivables for the years. In 2008, there was a negative change indicating an increase in receivables, while 2009 shows a recovery with a positive change in receivables.\n\n- **Change in Deferred Revenue**: This row represents changes in deferred revenue, which is an important metric reflecting cash already coll...","26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . 
liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings on our re...","[-0.037696415615861276, 0.025408706990644025, 0.05983810059034961, 0.015620865723410093, 0.05277498335232198, -0.009040526574879134, -0.022313633207552544, 0.015488597421981031, 0.004771572780758, 0.0072945872311898, 0.003541479067583912, -0.016507062039133144, -0.02444314894900544, -0.028622822431286186, 0.03912491178117896, 0.03560657980604352, -0.003977963903506246, -0.000416231275549074, -0.04211417055059814, -0.015713454093204012, 0.03989206830199657, 0.019231786068339442, 0.018729166150379957, 0.0072218397585360776, 0.012995343385936902, 0.019707950835896923, -0.0621131131397552, -0.026308129950245455, 0.0010994788586451505, -0.01937728101364689, 0.0353420432031854, -0.0219565082349005, 0.035738846244827344, 0.01915242434242391, -0.0013648417774996915, 0.00016957187608254687, -0.029098987198843667, 0.014311411215643093, 0.037696415615861276, -0.04158509734488189, 0.01838526968425154, -0.002799124133852854, -0.006977143587156836, -0.02696947145739077, -0.07084280613291138, -0...."
1,Single_RSG/2008/page_114.pdf-2,"substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) .\n\n | year ended december 31 2008 ( unaudited ) | year ended december 31 2007 ( unaudited )\nrevenue | $ 9362.2 | $ 9244....","[what were revenues in 2008?, what were they in 2007?, what was the net change?, what is the percent change?]",| year ended december 31 2008 ( unaudited ) | year ended december 31 2007 ( unaudited )\nrevenue | $ 9362.2 | $ 9244.9\nincome from continuing operations available to common stockholders | 285.7 | 423.2\nbasic earnings per share | .76 | 1.10\ndiluted earnings per share | .75 | 1.09,"### Comprehensive Description of the Table\n\nThe table presents unaudited pro forma financial information for a company for the years ended December 31, 2008, and December 31, 2007. This information reflects the financial performance of the company, including the impact of a merger that was completed during this period. 
All values are represented in millions of dollars, except for share and per share amounts.\n\n- **Revenue**: The table shows total revenue of $9,362.2 million for 2008, representing a slight increase from $9,244.9 million in 2007.\n \n- **Income from Continuing Operations**: This metric indicates income available to common stockholders. For 2008, it is recorded at $285.7 million, a decrease from $423.2 million in 2007.\n\n- **Basic Earnings Per Share (EPS)**: For 2008, the basic EPS is $0.76, down from $1.10 in 2007. This indicates a decline in profitability per share for shareholders.\n\n- **Diluted Earnings Per Share (EPS)**: The diluted EPS for 2008 is $0.75, a...","substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . 
year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) .\n\n | year ended december 31 2008 ( unaudited ) | year ended december 31 2007 ( unaudited )\nrevenue | $ 9362.2 | $ 9244....","[-0.020952606811662527, -0.0020229623417083948, 0.0729100637967805, 0.025026338459166707, 0.012450602978310112, 0.02010449231152134, 0.00047880251620786347, 0.028474411319228815, -0.01683716495085791, 0.017963350023195573, 0.0031717408343015377, -0.026110813318203932, 0.018005060236496014, -0.005418897202314308, 0.03667748761108936, 6.544431090990021e-06, -0.0017457609263145492, -0.028265858390081455, -0.052666538061806094, -0.015196548947427734, 0.05110934434762196, 0.023288397382938728, -0.0013182276311100478, -0.0011174954833718456, -0.010281654191558248, 0.02704234824494599, -0.056225840640862924, -0.03481441775760524, -0.024637040030620675, -0.011400887872120033, 0.03270108289902816, -0.027528971746289828, 0.040403631043347946, 0.015307776803777292, -0.02708405845824643, -0.00740362525944085, -0.016725935231863186, -0.0019586584753218303, 0.014417951159013082, -0.02474826788697023, -0.017518436735152177, -0.020034974668472223, -0.031449765324144015, -0.0065902691151629314, -0...."
2,Single_AAPL/2002/page_23.pdf-1,"in a new business model such as the retail segment is inherently risky , particularly in light of the significant investment involved , the current economic climate , and the fixed nature of a substantial portion of the retail segment's operating expenses . results for this segment are dependent upon a number of risks and uncertainties , some of which are discussed below under the heading ""factors that may affect future results and financial condition."" backlog in the company's experience , the actual amount of product backlog at any particular time is not a meaningful indication of its future business prospects . in particular , backlog often increases in anticipation of or immediately following new product introductions because of over- ordering by dealers anticipating shortages . backlog often is reduced once dealers and customers believe they can obtain sufficient supply . because of the foregoing , backlog cannot be considered a reliable indicator of the company's ability to a...","[what was the total of net sales in 2001?, and what was that in 2000?, what was, then, the change in the total of net sales over the year?, and how much does this change represent in relation to that total in 2000, in percentage?]",| 2002 | 2001 | 2000\nnet sales | $ 5742 | $ 5363 | $ 7983\ncost of sales | 4139 | 4128 | 5817\ngross margin | $ 1603 | $ 1235 | $ 2166\ngross margin percentage | 28% ( 28 % ) | 23% ( 23 % ) | 27% ( 27 % ),"### Table Description\n\nThe table presents the financial performance metrics of a company for three consecutive fiscal years, specifically focusing on net sales, cost of sales, gross margin, and gross margin percentage from the years 2000 to 2002. The data is structured into four rows, each representing a different financial metric, and four columns, one for each fiscal year. \n\n1. **Net Sales**: This row indicates the total sales revenue generated by the company. 
In 2000, the net sales were $7,983 million, which decreased to $5,363 million in 2001, representing a significant reduction. The sales increased slightly to $5,742 million in 2002.\n \n2. **Cost of Sales**: This row reflects the costs directly associated with the production of the goods sold by the company. The cost increased marginally from $5,817 million in 2000 to $4,128 million in 2001, and then again to $4,139 million in 2002. The data shows a relatively stable cost of sales compared to the variation in net sales...","in a new business model such as the retail segment is inherently risky , particularly in light of the significant investment involved , the current economic climate , and the fixed nature of a substantial portion of the retail segment's operating expenses . results for this segment are dependent upon a number of risks and uncertainties , some of which are discussed below under the heading ""factors that may affect future results and financial condition."" backlog in the company's experience , the actual amount of product backlog at any particular time is not a meaningful indication of its future business prospects . in particular , backlog often increases in anticipation of or immediately following new product introductions because of over- ordering by dealers anticipating shortages . backlog often is reduced once dealers and customers believe they can obtain sufficient supply . 
because of the foregoing , backlog cannot be considered a reliable indicator of the company's ability to a...","[-0.03980907336017688, 0.011629993766529003, 0.06506891402724953, -0.014884781689955918, 0.03997013628261926, 0.022307039669628582, -0.02068300126307622, 0.019327399818831074, 0.0042949774270771655, 0.04021172880363771, 0.052345040011847636, -0.001942806251554309, -0.0320244275302991, -0.004902314284784392, 0.013327852523980362, 0.021434623267420343, 0.016173274824946814, -0.01091863842411803, -0.049472774200914976, -0.02581013122439614, 0.06168661500770098, 0.029715876359979417, -0.01621354148687996, 0.0048184281997246635, -0.022790224711665485, 0.04214446897178532, -0.0025652425254098694, -0.04294977613341676, 0.009126827847446211, -0.023098926473260742, 0.010401898761792722, -0.024682700080525064, 0.01958241456049391, 0.0032698872231599293, -0.0004836044282243819, -0.015421653751925245, -0.011321292936256302, -0.0017137967131440075, 0.047191067471884056, -0.027219421551218814, -0.015596137963689447, -0.008032950392540985, -0.007220931189264803, 0.020360879143481687, -0.011885008..."
3,Single_UPS/2009/page_33.pdf-2,"( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options . shareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing . the following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 index , and the dow jones transportation average . the comparison of the total cumulative return on investment , which is the change in the quarterly stock price plus reinvested divide...","[what was the change in the performance of the united parcel service inc . from 2004 to 2009?, and how much does this change represent in relation to that performance in 2004, in percentage?, what was the performance value of the s&p 500 index in 2009?, what was, then, the change in that performance from 2004 to 2009?, and how much does this change represent in relation to that performance in 2004, in percentage?, what is, then, the difference between the percent representation of the united parcel service inc . and the s&p 500 index?]",| 12/31/04 | 12/31/05 | 12/31/06 | 12/31/07 | 12/31/08 | 12/31/09\nunited parcel service inc . 
| $ 100.00 | $ 89.49 | $ 91.06 | $ 87.88 | $ 70.48 | $ 75.95\ns&p 500 index | $ 100.00 | $ 104.91 | $ 121.48 | $ 128.15 | $ 80.74 | $ 102.11\ndow jones transportation average | $ 100.00 | $ 111.65 | $ 122.61 | $ 124.35 | $ 97.72 | $ 115.88,"### Table Description\n\nThe table presented provides a comparative summary of the cumulative total returns for three investment portfolios over a five-year period ending December 31, 2009. The table includes the performance data for:\n\n1. **United Parcel Service Inc. (UPS)**: A company that has its shares publicly traded.\n2. **S&P 500 Index**: A stock market index that measures the stock performance of 500 large companies listed on stock exchanges in the United States.\n3. **Dow Jones Transportation Average**: An index that represents the transportation sector and comprises 20 transportation-related stocks.\n\nEach row in the table displays the value of an investment of $100 made on December 31, 2004, for each of these entities across six fiscal checkpoints: the initial investment date and each subsequent year until December 31, 2009. \n\nThe values reveal the fluctuations in stock value over the years, illustrating performance differences among the three investment options. For...","( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options . shareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing . 
the following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 index , and the dow jones transportation average . the comparison of the total cumulative return on investment , which is the change in the quarterly stock price plus reinvested divide...","[-0.02344739683898556, -0.018569079163324363, 0.047838983354646494, 0.01776914016985077, 0.027486432732440818, 0.015343095284490117, 0.025598053379090074, 0.031263197027077444, 0.056808792733642706, 0.02950595161049097, 0.04041659615271769, -0.002680123872760353, -0.01438579072402228, -0.04146569744915259, 0.03889540001794511, 0.014097288426296197, 0.003671851743554576, -0.015684052460773437, -0.05465813805618324, -0.00047947172764031614, 0.012903936446659518, 0.014136629352383495, 0.007291248608551911, 0.021926200704212976, -0.016077465446936525, 0.017874049926965254, -0.012084326990142277, -0.021939313725360392, -0.00852394151427589, 0.003658738023915266, 0.009494360027213667, -0.021322968203820927, 0.05948399805931962, 0.0020801696276551594, -0.0014695603543757962, 0.023788354015268885, -0.041229651892628796, 0.02919122233914753, 0.0329155270986143, -0.04479659108510132, 0.012149894889846934, -0.0018572358117103112, 0.012425284166425599, -0.05376640046409754, -0.0511174211807154..."
4,Double_UPS/2009/page_33.pdf,"( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options . shareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing . the following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 index , and the dow jones transportation average . the comparison of the total cumulative return on investment , which is the change in the quarterly stock price plus reinvested divide...","[what was the fluctuation of the performance price of the ups from 2004 to 2006?, and how much does this fluctuation represent in relation to that price in 2004?, and from this year to 2009, what was the fluctuation for that stock?, what is this fluctuation as a percentage of the 2004 price?, and for the s&p 500 index price, what was the fluctuation in those five years?, and what percentage does this fluctuation represent in relation to the 2004 price of this stock?, what is, then, the difference between the ups percentage and this s&p 500 index one, for this five year period?]",| 12/31/04 | 12/31/05 | 12/31/06 | 12/31/07 | 12/31/08 | 12/31/09\nunited parcel service inc . 
| $ 100.00 | $ 89.49 | $ 91.06 | $ 87.88 | $ 70.48 | $ 75.95\ns&p 500 index | $ 100.00 | $ 104.91 | $ 121.48 | $ 128.15 | $ 80.74 | $ 102.11\ndow jones transportation average | $ 100.00 | $ 111.65 | $ 122.61 | $ 124.35 | $ 97.72 | $ 115.88,"### Description of the Table\n\nThe provided table presents a comparative analysis of the cumulative total returns for United Parcel Service Inc. (UPS), the S&P 500 index, and the Dow Jones Transportation Average over a span of five years, from December 31, 2004, to December 31, 2009. The table shows how an initial investment of $100 in each of these three entities would have performed by the end of each year. Each column represents the cumulative return at the end of the respective year, with the first column indicating the starting value of the investment made on December 31, 2004.\n\n- **Rows**: \n - The first row lists the performance of United Parcel Service Inc. (UPS).\n - The second row represents the performance of the S&P 500 index.\n - The third row shows the performance of the Dow Jones Transportation Average.\n\n- **Columns**: \n - The first column, marked ""12/31/04,"" indicates the starting investment value of $100 for all three entities.\n - The subsequent columns...","( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options . shareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing . 
the following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 index , and the dow jones transportation average . the comparison of the total cumulative return on investment , which is the change in the quarterly stock price plus reinvested divide...","[-0.04435098126412862, -0.011345599663337482, 0.05537922251882479, 0.028641689279304015, 0.03533268311675202, 0.010532366005434092, 0.020456461029147083, 0.03713105208899894, 0.04736589592510799, 0.04014596674997831, 0.05532632712241149, -0.003137230142911013, -0.018142381499644387, -0.05265521949960264, 0.04141540783512406, 0.023273024472834542, -0.012925784904266473, 0.003963686951425803, -0.0543213555687517, 0.013937368848139207, 0.02053580039847677, 0.0011611719265578347, 0.01627789421320342, 0.027292910687473703, -0.004512454337472203, -0.005345522837708014, -0.01244974589432066, 0.00823151032503472, 0.003871123965823434, -0.007200092004490847, 0.01349438806396786, -0.023021781584419594, 0.03982860927265956, 0.005550484116906456, 0.01967628466569559, 0.01853908020893795, -0.04818573917925663, 0.033560759980066635, 0.03334918584499396, -0.05257587826762783, 0.013606785659072008, -0.012337348299216512, 0.014426630775865779, -0.04773614879884003, -0.059293321665927615, -0.0116761..."


### 7. Create FAISS Vector Store of documents
---

In [16]:
# Wrap every DataFrame row in a LangChain Document: the expanded text becomes
# the page content, while the id, raw table string and table description are
# carried along as metadata.
documents = [
    Document(
        page_content=record["expanded_text"],
        metadata={
            "id": record["id"],
            "table": record["table"],
            "table_description": record["table_description"],
        },
    )
    for _, record in df.iterrows()
]

In [17]:
# Preview only the first couple of documents; echoing the whole list floods the
# notebook output with every full-length text, table and description.
documents[:2]

[Document(metadata={'id': 'Single_JKHY/2009/page_28.pdf-3', 'table': '2008 | year ended june 30 2009 2008 | year ended june 30 2009 2008 | year ended june 30 2009\nnet income | $ 103102 | $ 104222 | $ 104681\nnon-cash expenses | 74397 | 70420 | 56348\nchange in receivables | 21214 | -2913 ( 2913 ) | -28853 ( 28853 )\nchange in deferred revenue | 21943 | 5100 | 24576\nchange in other assets and liabilities | -14068 ( 14068 ) | 4172 | 17495\nnet cash from operating activities | $ 206588 | $ 181001 | $ 174247', 'table_description': "### Table Description\n\nThe table summarizes the net cash from operating activities and related components across three fiscal years ending on June 30: 2009, 2008, and 2007. The key categories included in the table are as follows:\n\n- **Net Income**: This row displays the net income figures for each of the three years. It shows a slight decrease from 2007 to 2008 and a further decrease in 2009.\n  \n- **Non-Cash Expenses**: This component indicates the non-c

In [18]:
# Build the FAISS vector store from the documents, using `embedding_model`
# to produce the vectors that get indexed.
vectorstore = FAISS.from_documents(
    documents=documents,
    embedding=embedding_model,
)

### 9. Create the retrieval QA chain using GPT-4o mini
---

In [19]:
# Retrieval QA chain: fetch the 5 nearest documents from the FAISS store and
# "stuff" them all into a single prompt for the chat model.
qabot_contextualized = RetrievalQA.from_chain_type(
    chain_type="stuff",
    # temperature=0 keeps answers to numeric questions as repeatable as
    # possible between runs.
    llm=ChatOpenAI(model="gpt-4o-mini", temperature=0),
    # Only `k` is passed: FAISS similarity search has no `index` option, so an
    # extra `index=...` entry would be silently ignored.
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    # Keep the retrieved documents in the response so answers can be traced
    # back to their sources.
    return_source_documents=True,
)

  llm=ChatOpenAI(model="gpt-4o-mini"),


### 10. Ask a question based on table numbers
---

In [20]:
# Ask a question whose answer must be computed from the table figures.
query = "What was the change in net sales from 2008 to 2009?"
print(f"\nQuery: {query}")

# The chain response is a mapping; the generated answer is under "result".
qa_output = qabot_contextualized.invoke({"query": query})
result = qa_output["result"]
print(f"Answer: {result}")


Query: What was the change in net sales from 2008 to 2009?


: 