In [1]:
import os
import json
import pandas as pd
import ast
import numpy as np
import faiss
import openai
 
from typing import List
from IPython.display import Markdown, display, update_display
from dotenv import load_dotenv
from convfinqaloader import convfinqadfloader
from transformers import pipeline
from openai import OpenAI

In [2]:
pd.set_option('display.width', -1)
pd.set_option('max_colwidth', 1000)

In [3]:
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
    
openai = OpenAI()

API key looks good so far


In [4]:
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

### 1. Load and Flattion ConvFinQA JSON data
---

In [5]:
df = convfinqadfloader("data/convfinqatrain.json")

### 2. Combine relevant text fields for retrieval
---

In [6]:
def create_combined_text(row):
    """
    Combine key text fields to form a context string.
    Uses 'pre_text', 'dialogue_text', 'post_text', and 'execution_answer'.
    """
    texts = []
    if pd.notnull(row.get('pre_text')):
        texts.append("Pre-Text: " + row['pre_text'])
    if pd.notnull(row.get('dialogue_text')):
        texts.append("Dialogue: " + row['dialogue_text'])
    if pd.notnull(row.get('post_text')):
        texts.append("Post-Text: " + row['post_text'])
    if pd.notnull(row.get('execution_answer')):
        texts.append("Execution Answer: " + str(row['execution_answer']))
    return " | ".join(texts)

In [7]:
# Create a new column 'combined_text'
df['combined_text'] = df.apply(create_combined_text, axis=1)

### 3. Chunking Documents
---

In [8]:
def chunk_text(text, chunk_size=200, chunk_overlap=100):
    """
    Split a text into chunks of words with a specified chunk size and overlap.
    chunk_size and chunk_overlap are measured in words.
    """
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        chunk = " ".join(words[start: start + chunk_size])
        chunks.append(chunk)
        if start + chunk_size >= len(words):
            break
        start += (chunk_size - chunk_overlap)
    return chunks


In [27]:
# Only use the first 100 documents in the dataframe
subset_df = df.iloc[:500].copy()

In [48]:
subset_df.head(10)

Unnamed: 0,id,pre_text,post_text,filename,table,dialogue_text,turn_program,qa_split,execution_answer,turn_index,question,answer,explanation,program,combined_text
0,Single_JKHY/2009/page_28.pdf-3,"26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings on our re...","year ended june 30 , cash provided by operations increased $ 25587 to $ 206588 for the fiscal year ended june 30 , 2009 as compared to $ 181001 for the fiscal year ended june 30 , 2008 . this increase is primarily attributable to a decrease in receivables compared to the same period a year ago of $ 21214 . this decrease is largely the result of fiscal 2010 annual software maintenance billings being provided to customers earlier than in the prior year , which allowed more cash to be collected before the end of the fiscal year than in previous years . further , we collected more cash overall related to revenues that will be recognized in subsequent periods in the current year than in fiscal 2008 . cash used in investing activities for the fiscal year ended june 2009 was $ 59227 and includes $ 3027 in contingent consideration paid on prior years 2019 acquisitions . cash used in investing activities for the fiscal year ended june 2008 was $ 102148 and includes payments for acquisitions...",JKHY/2009/page_28.pdf,"[['2008', 'year ended june 30 2009 2008', 'year ended june 30 2009 2008', 'year ended june 30 2009'], ['net income', '$ 103102', '$ 104222', '$ 104681'], ['non-cash expenses', '74397', '70420', '56348'], ['change in receivables', '21214', '-2913 ( 2913 )', '-28853 ( 28853 )'], ['change in deferred revenue', '21943', '5100', '24576'], ['change in other assets and liabilities', '-14068 ( 14068 )', '4172', '17495'], ['net cash from operating activities', '$ 206588', '$ 181001', '$ 174247']]",what is the net cash from operating activities in 2009?,206588,0.0,206588.0,0,,,,,"Pre-Text: 26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings..."
1,Single_JKHY/2009/page_28.pdf-3,"26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings on our re...","year ended june 30 , cash provided by operations increased $ 25587 to $ 206588 for the fiscal year ended june 30 , 2009 as compared to $ 181001 for the fiscal year ended june 30 , 2008 . this increase is primarily attributable to a decrease in receivables compared to the same period a year ago of $ 21214 . this decrease is largely the result of fiscal 2010 annual software maintenance billings being provided to customers earlier than in the prior year , which allowed more cash to be collected before the end of the fiscal year than in previous years . further , we collected more cash overall related to revenues that will be recognized in subsequent periods in the current year than in fiscal 2008 . cash used in investing activities for the fiscal year ended june 2009 was $ 59227 and includes $ 3027 in contingent consideration paid on prior years 2019 acquisitions . cash used in investing activities for the fiscal year ended june 2008 was $ 102148 and includes payments for acquisitions...",JKHY/2009/page_28.pdf,"[['2008', 'year ended june 30 2009 2008', 'year ended june 30 2009 2008', 'year ended june 30 2009'], ['net income', '$ 103102', '$ 104222', '$ 104681'], ['non-cash expenses', '74397', '70420', '56348'], ['change in receivables', '21214', '-2913 ( 2913 )', '-28853 ( 28853 )'], ['change in deferred revenue', '21943', '5100', '24576'], ['change in other assets and liabilities', '-14068 ( 14068 )', '4172', '17495'], ['net cash from operating activities', '$ 206588', '$ 181001', '$ 174247']]",what about in 2008?,181001,0.0,181001.0,1,,,,,"Pre-Text: 26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings..."
2,Single_JKHY/2009/page_28.pdf-3,"26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings on our re...","year ended june 30 , cash provided by operations increased $ 25587 to $ 206588 for the fiscal year ended june 30 , 2009 as compared to $ 181001 for the fiscal year ended june 30 , 2008 . this increase is primarily attributable to a decrease in receivables compared to the same period a year ago of $ 21214 . this decrease is largely the result of fiscal 2010 annual software maintenance billings being provided to customers earlier than in the prior year , which allowed more cash to be collected before the end of the fiscal year than in previous years . further , we collected more cash overall related to revenues that will be recognized in subsequent periods in the current year than in fiscal 2008 . cash used in investing activities for the fiscal year ended june 2009 was $ 59227 and includes $ 3027 in contingent consideration paid on prior years 2019 acquisitions . cash used in investing activities for the fiscal year ended june 2008 was $ 102148 and includes payments for acquisitions...",JKHY/2009/page_28.pdf,"[['2008', 'year ended june 30 2009 2008', 'year ended june 30 2009 2008', 'year ended june 30 2009'], ['net income', '$ 103102', '$ 104222', '$ 104681'], ['non-cash expenses', '74397', '70420', '56348'], ['change in receivables', '21214', '-2913 ( 2913 )', '-28853 ( 28853 )'], ['change in deferred revenue', '21943', '5100', '24576'], ['change in other assets and liabilities', '-14068 ( 14068 )', '4172', '17495'], ['net cash from operating activities', '$ 206588', '$ 181001', '$ 174247']]",what is the difference?,"subtract(206588, 181001)",0.0,25587.0,2,,,,,"Pre-Text: 26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings..."
3,Single_JKHY/2009/page_28.pdf-3,"26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings on our re...","year ended june 30 , cash provided by operations increased $ 25587 to $ 206588 for the fiscal year ended june 30 , 2009 as compared to $ 181001 for the fiscal year ended june 30 , 2008 . this increase is primarily attributable to a decrease in receivables compared to the same period a year ago of $ 21214 . this decrease is largely the result of fiscal 2010 annual software maintenance billings being provided to customers earlier than in the prior year , which allowed more cash to be collected before the end of the fiscal year than in previous years . further , we collected more cash overall related to revenues that will be recognized in subsequent periods in the current year than in fiscal 2008 . cash used in investing activities for the fiscal year ended june 2009 was $ 59227 and includes $ 3027 in contingent consideration paid on prior years 2019 acquisitions . cash used in investing activities for the fiscal year ended june 2008 was $ 102148 and includes payments for acquisitions...",JKHY/2009/page_28.pdf,"[['2008', 'year ended june 30 2009 2008', 'year ended june 30 2009 2008', 'year ended june 30 2009'], ['net income', '$ 103102', '$ 104222', '$ 104681'], ['non-cash expenses', '74397', '70420', '56348'], ['change in receivables', '21214', '-2913 ( 2913 )', '-28853 ( 28853 )'], ['change in deferred revenue', '21943', '5100', '24576'], ['change in other assets and liabilities', '-14068 ( 14068 )', '4172', '17495'], ['net cash from operating activities', '$ 206588', '$ 181001', '$ 174247']]",what percentage change does this represent?,"subtract(206588, 181001), divide(#0, 181001)",0.0,0.14136,3,,,,,"Pre-Text: 26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings..."
4,Single_JKHY/2009/page_28.pdf-3,"26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings on our re...","year ended june 30 , cash provided by operations increased $ 25587 to $ 206588 for the fiscal year ended june 30 , 2009 as compared to $ 181001 for the fiscal year ended june 30 , 2008 . this increase is primarily attributable to a decrease in receivables compared to the same period a year ago of $ 21214 . this decrease is largely the result of fiscal 2010 annual software maintenance billings being provided to customers earlier than in the prior year , which allowed more cash to be collected before the end of the fiscal year than in previous years . further , we collected more cash overall related to revenues that will be recognized in subsequent periods in the current year than in fiscal 2008 . cash used in investing activities for the fiscal year ended june 2009 was $ 59227 and includes $ 3027 in contingent consideration paid on prior years 2019 acquisitions . cash used in investing activities for the fiscal year ended june 2008 was $ 102148 and includes payments for acquisitions...",JKHY/2009/page_28.pdf,"[['2008', 'year ended june 30 2009 2008', 'year ended june 30 2009 2008', 'year ended june 30 2009'], ['net income', '$ 103102', '$ 104222', '$ 104681'], ['non-cash expenses', '74397', '70420', '56348'], ['change in receivables', '21214', '-2913 ( 2913 )', '-28853 ( 28853 )'], ['change in deferred revenue', '21943', '5100', '24576'], ['change in other assets and liabilities', '-14068 ( 14068 )', '4172', '17495'], ['net cash from operating activities', '$ 206588', '$ 181001', '$ 174247']]",,,,0.14136,0,what was the percentage change in the net cash from operating activities from 2008 to 2009,14.1%,,"subtract(206588, 181001), divide(#0, 181001)","Pre-Text: 26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year . support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support . gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins . liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations and short-term borrowings..."
5,Single_RSG/2008/page_114.pdf-2,"substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) .","the above unaudited pro forma financial information includes adjustments for amortization of identifiable intangible assets , accretion of discounts to fair value associated with debt , environmental , self-insurance and other liabilities , accretion of capping , closure and post-closure obligations and amortization of the related assets , and provision for income taxes . assets held for sale as a condition of the merger with allied in december 2008 , we reached a settlement with the doj requiring us to divest of certain operations serving fifteen metropolitan areas including los angeles , ca ; san francisco , ca ; denver , co ; atlanta , ga ; northwestern indiana ; lexington , ky ; flint , mi ; cape girardeau , mo ; charlotte , nc ; cleveland , oh ; philadelphia , pa ; greenville-spartanburg , sc ; and fort worth , houston and lubbock , tx . the settlement requires us to divest 87 commercial waste collection routes , nine landfills and ten transfer stations , together with ancilla...",RSG/2008/page_114.pdf,"[['', 'year ended december 31 2008 ( unaudited )', 'year ended december 31 2007 ( unaudited )'], ['revenue', '$ 9362.2', '$ 9244.9'], ['income from continuing operations available to common stockholders', '285.7', '423.2'], ['basic earnings per share', '.76', '1.10'], ['diluted earnings per share', '.75', '1.09']]",what were revenues in 2008?,9362.2,0.0,9362.2,0,,,,,"Pre-Text: substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) . | Dialogue: what were revenues in 2008? | Post-Text: the above unaudited pro forma financial information includ..."
6,Single_RSG/2008/page_114.pdf-2,"substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) .","the above unaudited pro forma financial information includes adjustments for amortization of identifiable intangible assets , accretion of discounts to fair value associated with debt , environmental , self-insurance and other liabilities , accretion of capping , closure and post-closure obligations and amortization of the related assets , and provision for income taxes . assets held for sale as a condition of the merger with allied in december 2008 , we reached a settlement with the doj requiring us to divest of certain operations serving fifteen metropolitan areas including los angeles , ca ; san francisco , ca ; denver , co ; atlanta , ga ; northwestern indiana ; lexington , ky ; flint , mi ; cape girardeau , mo ; charlotte , nc ; cleveland , oh ; philadelphia , pa ; greenville-spartanburg , sc ; and fort worth , houston and lubbock , tx . the settlement requires us to divest 87 commercial waste collection routes , nine landfills and ten transfer stations , together with ancilla...",RSG/2008/page_114.pdf,"[['', 'year ended december 31 2008 ( unaudited )', 'year ended december 31 2007 ( unaudited )'], ['revenue', '$ 9362.2', '$ 9244.9'], ['income from continuing operations available to common stockholders', '285.7', '423.2'], ['basic earnings per share', '.76', '1.10'], ['diluted earnings per share', '.75', '1.09']]",what were they in 2007?,9244.9,0.0,9244.9,1,,,,,"Pre-Text: substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) . | Dialogue: what were they in 2007? | Post-Text: the above unaudited pro forma financial information includes a..."
7,Single_RSG/2008/page_114.pdf-2,"substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) .","the above unaudited pro forma financial information includes adjustments for amortization of identifiable intangible assets , accretion of discounts to fair value associated with debt , environmental , self-insurance and other liabilities , accretion of capping , closure and post-closure obligations and amortization of the related assets , and provision for income taxes . assets held for sale as a condition of the merger with allied in december 2008 , we reached a settlement with the doj requiring us to divest of certain operations serving fifteen metropolitan areas including los angeles , ca ; san francisco , ca ; denver , co ; atlanta , ga ; northwestern indiana ; lexington , ky ; flint , mi ; cape girardeau , mo ; charlotte , nc ; cleveland , oh ; philadelphia , pa ; greenville-spartanburg , sc ; and fort worth , houston and lubbock , tx . the settlement requires us to divest 87 commercial waste collection routes , nine landfills and ten transfer stations , together with ancilla...",RSG/2008/page_114.pdf,"[['', 'year ended december 31 2008 ( unaudited )', 'year ended december 31 2007 ( unaudited )'], ['revenue', '$ 9362.2', '$ 9244.9'], ['income from continuing operations available to common stockholders', '285.7', '423.2'], ['basic earnings per share', '.76', '1.10'], ['diluted earnings per share', '.75', '1.09']]",what was the net change?,"subtract(9362.2, 9244.9)",0.0,117.3,2,,,,,"Pre-Text: substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) . | Dialogue: what was the net change? | Post-Text: the above unaudited pro forma financial information includes ..."
8,Single_RSG/2008/page_114.pdf-2,"substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) .","the above unaudited pro forma financial information includes adjustments for amortization of identifiable intangible assets , accretion of discounts to fair value associated with debt , environmental , self-insurance and other liabilities , accretion of capping , closure and post-closure obligations and amortization of the related assets , and provision for income taxes . assets held for sale as a condition of the merger with allied in december 2008 , we reached a settlement with the doj requiring us to divest of certain operations serving fifteen metropolitan areas including los angeles , ca ; san francisco , ca ; denver , co ; atlanta , ga ; northwestern indiana ; lexington , ky ; flint , mi ; cape girardeau , mo ; charlotte , nc ; cleveland , oh ; philadelphia , pa ; greenville-spartanburg , sc ; and fort worth , houston and lubbock , tx . the settlement requires us to divest 87 commercial waste collection routes , nine landfills and ten transfer stations , together with ancilla...",RSG/2008/page_114.pdf,"[['', 'year ended december 31 2008 ( unaudited )', 'year ended december 31 2007 ( unaudited )'], ['revenue', '$ 9362.2', '$ 9244.9'], ['income from continuing operations available to common stockholders', '285.7', '423.2'], ['basic earnings per share', '.76', '1.10'], ['diluted earnings per share', '.75', '1.09']]",what is the percent change?,"subtract(9362.2, 9244.9), divide(#0, 9244.9)",0.0,0.01269,3,,,,,"Pre-Text: substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) . | Dialogue: what is the percent change? | Post-Text: the above unaudited pro forma financial information includ..."
9,Single_RSG/2008/page_114.pdf-2,"substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) .","the above unaudited pro forma financial information includes adjustments for amortization of identifiable intangible assets , accretion of discounts to fair value associated with debt , environmental , self-insurance and other liabilities , accretion of capping , closure and post-closure obligations and amortization of the related assets , and provision for income taxes . assets held for sale as a condition of the merger with allied in december 2008 , we reached a settlement with the doj requiring us to divest of certain operations serving fifteen metropolitan areas including los angeles , ca ; san francisco , ca ; denver , co ; atlanta , ga ; northwestern indiana ; lexington , ky ; flint , mi ; cape girardeau , mo ; charlotte , nc ; cleveland , oh ; philadelphia , pa ; greenville-spartanburg , sc ; and fort worth , houston and lubbock , tx . the settlement requires us to divest 87 commercial waste collection routes , nine landfills and ten transfer stations , together with ancilla...",RSG/2008/page_114.pdf,"[['', 'year ended december 31 2008 ( unaudited )', 'year ended december 31 2007 ( unaudited )'], ['revenue', '$ 9362.2', '$ 9244.9'], ['income from continuing operations available to common stockholders', '285.7', '423.2'], ['basic earnings per share', '.76', '1.10'], ['diluted earnings per share', '.75', '1.09']]",,,,0.01269,0,what was the percent of the growth in the revenues from 2007 to 2008,1.3%,the percent growth of the revenue is the difference between the 2 divide by the oldest amount,"subtract(9362.2, 9244.9), divide(#0, 9244.9)","Pre-Text: substantially all of the goodwill and other intangible assets recorded related to the acquisition of allied are not deductible for tax purposes . pro forma information the consolidated financial statements presented for republic include the operating results of allied from the date of the acquisition . the following pro forma information is presented assuming the merger had been completed as of january 1 , 2007 . the unaudited pro forma information presented below has been prepared for illustrative purposes and is not intended to be indicative of the results of operations that would have actually occurred had the acquisition been consummated at the beginning of the periods presented or of future results of the combined operations ( in millions , except share and per share amounts ) . year ended december 31 , year ended december 31 , ( unaudited ) ( unaudited ) . | Post-Text: the above unaudited pro forma financial information includes adjustments for amortization of ident..."


In [28]:
# Create chunks for each document
chunk_records = []
for idx, row in subset_df.iterrows():
    chunks = chunk_text(row['combined_text'], chunk_size=200, chunk_overlap=100)
    for i, chunk in enumerate(chunks):
        chunk_records.append({
            "doc_id": row['id'],
            "chunk_text": chunk,
            "turn_index": row.get('turn_index', None)
            # You can add more metadata here if needed.
        })

In [29]:
chunk_df = pd.DataFrame(chunk_records)
print(f"Created {len(chunk_df)} chunks from {len(subset_df)} documents.")

Created 3002 chunks from 500 documents.


### 4. Calculate Embeddings and build a FAISS index
---

In [30]:
def get_embedding(text, model="text-embedding-ada-002"):
    """
    Get the embedding of a text string using OpenAI's API.
    """
    response = client.embeddings.create(input=[text], model=model)
    return np.array(response.data[0].embedding, dtype=np.float32)

In [31]:
def compute_embeddings(texts, engine="text-embedding-ada-002"):
    """
    Compute embeddings for a list of texts.
    """
    embeddings = []
    for text in texts:
        emb = get_embedding(text, model=engine)
        embeddings.append(emb)
    return np.vstack(embeddings)

Creating embeddings on chunked documents ~ 20mins

In [32]:
# Compute embeddings for all chunks
chunk_texts = chunk_df['chunk_text'].tolist()
print("Computing embeddings for {} chunks...".format(len(chunk_texts)))
document_embeddings = compute_embeddings(chunk_texts)
embedding_dim = document_embeddings.shape[1]

Computing embeddings for 3002 chunks...


In [34]:
print("Done!")

Done!


In [35]:
# Normalize embeddings for cosine similarity (using inner product search)
faiss.normalize_L2(document_embeddings)

In [36]:
# Build a Faiss index
index = faiss.IndexFlatIP(embedding_dim)
index.add(document_embeddings)
print("Faiss index built with {} vectors.".format(index.ntotal))

Faiss index built with 3002 vectors.


In [37]:
def query_dataset(query, top_n=3, engine="text-embedding-ada-002"):
    """
    Given a query string, compute its embedding and retrieve the top_n similar documents using Faiss.
    """
    query_embedding = get_embedding(query, model=engine).reshape(1, -1)
    faiss.normalize_L2(query_embedding)
    distances, indices = index.search(query_embedding, top_n)
    results = df.iloc[indices[0]].copy()
    results['score'] = distances[0]
    return results

### 4. Query the model using gpt-4o-mini
---

In [43]:
def generate_answer(query, context_docs, max_tokens=200):
    """
    Generate an answer by combining the query with retrieved context using OpenAI's GPT40-mini model.
    A system prompt is added so the model acts as a friendly financial analyst bot that does not make up answers
    and only uses the data it has access to in order to answer numerical questions.
    
    Parameters:
      query: The question string.
      context_docs: A list of context strings retrieved from the dataset.
      max_tokens: Maximum number of tokens for the generated answer.
    
    Returns:
      str: The generated answer.
    """
    # Combine the retrieved context documents into a single string.
    context = "\n".join(context_docs)
    
    # Create a messages list with a system prompt and the user's prompt.
    messages = [
         {
             "role": "system",
             "content": (
                 "You are a financial analyst bot who is extremely knowlegable about financial valuations, technical analysis and quantitative finance."
                 "Your job is to read through the documents provided and use the data to answer finance questions and financial calculation questions."
             )
         },
         {
             "role": "user",
             "content": f"Question: {query}\nContext: {context}\nAnswer:"
         }
    ]
    
    # Call the Chat Completion API with the messages.
    response = openai.chat.completions.create(
         model='gpt-4o-mini',
         messages=messages,
         max_tokens=max_tokens,
         temperature=0.1,
         top_p=1.0,
         n=3,
         stop=["\n"]
    )
    
    # Extract the generated answer.
    answer = response.choices[0].message.content.strip()
    return answer

### 5. Execute the RAG pipeline
---

In [44]:
query = "what is the net cash from operating activities in 2009?"
retrieved_results = query_dataset(query, top_n=3)
print("\nTop retrieved examples for query:", query)
context_docs = []
for i, row in retrieved_results.iterrows():
    snippet = row['combined_text'][:200] + "..." if len(row['combined_text']) > 200 else row['combined_text']
    print(f"ID: {row['id']}, Turn index: {row.get('turn_index')}, Score: {row['score']:.3f}")
    print("Context snippet:", snippet)
    print("----------")
    context_docs.append(snippet)
    
generated_answer = generate_answer(query, context_docs)
print("\nGenerated Answer:")
print(generated_answer)


Top retrieved examples for query: what is the net cash from operating activities in 2009?
ID: Single_JKHY/2009/page_28.pdf-3, Turn index: 2, Score: 0.906
Context snippet: Pre-Text: 26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segmen...
----------
ID: Single_IPG/2016/page_46.pdf-1, Turn index: 0, Score: 0.893
Context snippet: Pre-Text: item 7a . quantitative and qualitative disclosures about market risk ( amounts in millions ) in the normal course of business , we are exposed to market risks related to interest rates , for...
----------
ID: Single_IPG/2016/page_46.pdf-1, Turn index: 1, Score: 0.891
Context snippet: Pre-Text: item 7a . quantitative and qualitative disclosures about market risk ( amounts in millions ) in the normal course of business , we are exposed to market risks related to interest rates , for...
----------

Generated Answer:
To p

### 6. Execute the RAG Pipeline by using Hugging Face Transformers
---

In [55]:
generator = pipeline("text2text-generation", model="t5-small")

def generate_answer(query, context_docs, max_length=200):
    """
    Generate an answer by combining the query with retrieved context.
    """
    context = "\n".join(context_docs)
    prompt = f"question: {query}\ncontext: {context}\nanswer:"

    # prompt = f"""
    # You are a helpful financial analyst bot. Answer the question strictly based on the given context.
    
    # Question: {query}
    # Context: {context}
    # Answer:
    # """

    result = generator(prompt, max_length=max_length, do_sample=False)
    return result[0]['generated_text']

Device set to use mps:0


In [56]:
query = "what is the net cash from operating activities in 2009?"
retrieved_results = query_dataset(query, top_n=3)
print("\nTop retrieved examples for query:", query)
context_docs = []
for i, row in retrieved_results.iterrows():
    snippet = row['combined_text'][:200] + "..." if len(row['combined_text']) > 200 else row['combined_text']
    print(f"ID: {row['id']}, Turn index: {row.get('turn_index')}, Score: {row['score']:.3f}")
    print("Context snippet:", snippet)
    print("----------")
    context_docs.append(snippet)
    
generated_answer = generate_answer(query, context_docs)
print("\nGenerated Answer:")
print(generated_answer)


Top retrieved examples for query: what is the net cash from operating activities in 2009?
ID: Single_JKHY/2009/page_28.pdf-3, Turn index: 2, Score: 0.906
Context snippet: Pre-Text: 26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segmen...
----------
ID: Single_IPG/2016/page_46.pdf-1, Turn index: 0, Score: 0.893
Context snippet: Pre-Text: item 7a . quantitative and qualitative disclosures about market risk ( amounts in millions ) in the normal course of business , we are exposed to market risks related to interest rates , for...
----------
ID: Single_IPG/2016/page_46.pdf-1, Turn index: 1, Score: 0.891
Context snippet: Pre-Text: item 7a . quantitative and qualitative disclosures about market risk ( amounts in millions ) in the normal course of business , we are exposed to market risks related to interest rates , for...
----------

Generated Answer:
14%
