In [10]:
import os
from dotenv import load_dotenv
# Populate the process environment from the local 'key.env' file, then look up
# the OpenAI API key by its variable name. key_string is None when the
# variable is not defined.
load_dotenv('key.env')

key_string = os.environ.get('open_ai_API_Key')

In [11]:
import re
from langchain.document_loaders import PyPDFLoader
from langchain.chains import SequentialChain
from langchain.prompts import PromptTemplate

In [12]:
# Load every filing PDF with its own PyPDFLoader, one after another in the
# listed order, then expose the results under the numbered loader_N /
# document_N names.
filing_paths = (
    "10-Q4-2024-As-Filed.pdf",
    "10-k-2022.pdf",
    "10k-2020.pdf",
    "10k-2018.pdf",
    "10k-2016.pdf",
)

filing_loaders = []
filing_documents = []
for pdf_path in filing_paths:
    pdf_loader = PyPDFLoader(pdf_path)
    filing_loaders.append(pdf_loader)
    filing_documents.append(pdf_loader.load())

loader_1, loader_2, loader_3, loader_4, loader_5 = filing_loaders
document_1, document_2, document_3, document_4, document_5 = filing_documents

In [13]:
def page_finder(document):
    """Locate the pages that open the "Item 8" and the following "Item 9" sections.

    The pages are scanned once, in order. The first page whose text begins
    with "Item 8" marks the start; the first *later* page whose text begins
    with "Item 9" marks the end. Restricting the second search to pages after
    the start guarantees the end marker never precedes the start marker.
    Leading whitespace in the page text is ignored when matching.

    Args:
        document: Iterable of page objects exposing ``page_content`` (str)
            and ``metadata['page']``.

    Returns:
        Tuple ``(start_page, end_page)`` holding the ``metadata['page']``
        values of the two matching pages.

    Raises:
        ValueError: If no page begins with "Item 8", or no page after it
            begins with "Item 9".
    """
    def _begins_with(page, marker):
        # Extracted PDF text may carry leading blanks/newlines before the heading.
        return page.page_content.lstrip().startswith(marker)

    # One shared iterator: the second search resumes right after the page
    # matched by the first search instead of restarting from the beginning.
    pages = iter(document)

    start_page = next(
        (page.metadata['page'] for page in pages if _begins_with(page, "Item 8")),
        None,
    )

    end_page = None
    if start_page is not None:
        end_page = next(
            (page.metadata['page'] for page in pages if _begins_with(page, "Item 9")),
            None,
        )

    if start_page is None or end_page is None:
        raise ValueError("Could not find 'Item 8' or 'Item 9' in the document")

    return start_page, end_page


def page_executor(document):
    """Return the pages of ``document`` between the Item 8 and Item 9 markers.

    The boundaries come from ``page_finder``. A page is kept when its
    ``metadata['page']`` value is at least the start page and strictly less
    than the end page, so the page that opens "Item 9" is excluded.

    Args:
        document: Iterable of page objects exposing ``metadata['page']``.

    Returns:
        List of the selected page objects, in their original order.
    """
    first_page, stop_page = page_finder(document)

    selected_pages = []
    for page in document:
        page_number = page.metadata['page']
        # Half-open interval: include first_page, exclude stop_page.
        if first_page <= page_number < stop_page:
            selected_pages.append(page)

    return selected_pages

In [14]:
# Keep only the pages from the "Item 8" page up to (not including) the
# "Item 9" page of the filing loaded from "10-Q4-2024-As-Filed.pdf".
extract_2024 = page_executor(document_1)

In [15]:
# Apply the same Item 8 -> Item 9 page extraction to the remaining filings.
# NOTE(review): from document_3 onward the year in the variable name does not
# match the year in the loaded file name (document_3 <- "10k-2020.pdf",
# document_4 <- "10k-2018.pdf", document_5 <- "10k-2016.pdf") — confirm which
# is intended before relying on these names.
extract_2022 = page_executor(document_2)
extract_2018 = page_executor(document_3)
extract_2016 = page_executor(document_4)
extract_2014 = page_executor(document_5)

In [35]:
from langchain.chains import SequentialChain
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain
from langchain.agents import AgentExecutor, Tool
from langchain.agents import ZeroShotAgent

# Prompt for step 1: ask the model to pull the four financial statements out
# of the filing text and return them as JSON tables. (This is an ordinary
# prompt template with a single {document} slot, not a system message.)
extract_prompt = PromptTemplate(
    input_variables=["document"],
    template="""
    The following is a section of a 10-K filing:
    {document}

    Extract the balance sheet, cashflow, STATEMENTS OF OPERATIONS and shareholder's equity as a structured table. Include column names and all rows. Prepare different tables for balance sheet, cashflow, STATEMENTS OF OPERATIONS and shareholder's equity
    Provide the table as a JSON object.

    Structure the table as follows:
    Item, Category, Subcategory, <Date-1>,<Date-2>,<Date-3>
    """
)

# temperature=0: the task is to copy figures out of a filing, so sampling
# randomness only adds run-to-run variation without any benefit.
llm = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=key_string)

# Step 1 chain: filing text -> JSON tables. The output is published under the
# "balance_sheet" key, which is the input variable of the second prompt.
extract_chain = LLMChain(
    llm=llm,
    prompt=extract_prompt,
    output_key="balance_sheet",
)

# Prompt for step 2: turn the JSON tables from step 1 into TSV text.
convert_prompt = PromptTemplate(
    input_variables=["balance_sheet"],
    template="""
    Convert the following balance sheet, cashflow, STATEMENTS OF OPERATIONS and shareholder's equity JSON into a TSV format. Return only the TSV data. Separate the 4 tables with headers

    JSON:
    {balance_sheet}
    """
)

# Step 2 chain: JSON tables -> TSV text. The key is named "csv_output" even
# though the content is tab-separated; the name is kept because it is the
# pipeline's declared output variable.
convert_chain = LLMChain(
    llm=llm,
    prompt=convert_prompt,
    output_key="csv_output",
)

# Run the two steps back to back: "document" in, "csv_output" out.
sequential_chain = SequentialChain(
    chains=[extract_chain, convert_chain],
    input_variables=["document"],
    output_variables=["csv_output"],
)

In [36]:
# Feed the chain the plain text of the extracted pages. Passing the list of
# Document objects itself would have the whole list stringified into the
# prompt, embedding each object's repr (metadata, escaped newlines) instead
# of the readable page text.
filing_text = "\n\n".join(page.page_content for page in extract_2024)
result = sequential_chain.run(document=filing_text)

In [37]:
# print() renders the tab and newline characters of the returned text; a bare
# `result` expression would display the escaped string repr instead.
print(result)

```
### Consolidated Balance Sheet
Item	Category	Subcategory	September 28, 2024	September 30, 2023
Assets	Current Assets	Cash and cash equivalents	29943	29965
Assets	Current Assets	Marketable securities	35228	31590
Assets	Current Assets	Accounts receivable, net	33410	29508
Assets	Current Assets	Vendor non-trade receivables	32833	31477
Assets	Current Assets	Inventories	7286	6331
Assets	Current Assets	Other current assets	14287	14695
Assets	Total Current Assets		152987	143566
Assets	Non-current Assets	Marketable securities	91479	100544
Assets	Non-current Assets	Property, plant and equipment, net	45680	43715
Assets	Non-current Assets	Other non-current assets	74834	64758
Assets	Total Non-current Assets		211993	209017
Assets	Total Assets		364980	352583
Liabilities and Shareholders’ Equity	Current Liabilities	Accounts payable	68960	62611
Liabilities and Shareholders’ Equity	Current Liabilities	Other current liabilities	78304	58829
Liabilities and Shareholders’ Equity	Current Liabilities	Defe

In [38]:
# Write the chain's TSV text to disk and report the path that was actually
# written, so the message cannot drift from the real file name.
# NOTE(review): `result` was produced from extract_2024 while the file name
# says "22" — confirm the intended file name.
output_path = 'balance_sheet_22.tsv'

# utf-8 is requested explicitly because the model output can contain
# non-ASCII characters (e.g. the curly apostrophe in "Shareholders’ Equity"),
# which the platform default encoding is not guaranteed to handle.
with open(output_path, 'w', encoding='utf-8') as file:
    file.write(result)

print(f"The balance sheet data has been saved to '{output_path}'.")

The balance sheet data has been saved to 'balance_sheet_3.tsv'.
