In [14]:
import os
from typing import Optional, Type


def find_report_path(directory:str,search_term:str) -> Optional[str]:
    """Return the path of the first file in ``directory`` whose name contains ``search_term``.

    The match is a case-insensitive substring test on the bare entry name.
    Entries are examined in sorted name order so the result is deterministic
    when several names match (``os.listdir`` returns entries in arbitrary order).

    Args:
        directory: Directory to scan; sub-directories are not descended into.
        search_term: Text to look for inside each entry name.

    Returns:
        ``os.path.join(directory, filename)`` for the first matching entry,
        or ``None`` when nothing matches.

    Raises:
        OSError: Propagated from ``os.listdir`` (for example
            ``FileNotFoundError`` when ``directory`` does not exist).
    """
    # Lower-case the needle once instead of on every iteration.
    needle = search_term.lower()
    for filename in sorted(os.listdir(directory)):
        if needle in filename.lower():
            return os.path.join(directory, filename)
    return None


In [15]:

# Directory holding the originally reported statement workbooks.
report_directory:str = "original_reports"

# Locate each statement by a keyword expected to appear in its file name.
# NOTE(review): find_report_path does a substring match on every entry in the
# directory, so any other file there whose name contains one of these keywords
# could be returned instead - confirm the directory holds only the workbooks.
balance_sheet_path:str = find_report_path(report_directory, "balance")
income_statement_path:str = find_report_path(report_directory, "income")
cash_flow_statement_path:str = find_report_path(report_directory, "cash")

# Fail early, naming every statement that could not be located.
missing_reports = [
    label
    for label, located_path in (
        ("balance sheet", balance_sheet_path),
        ("income statement", income_statement_path),
        ("cash flow statement", cash_flow_statement_path),
    )
    if located_path is None
]
if missing_reports:
    raise FileNotFoundError(
        f"file not found in the {report_directory} directory (missing: {', '.join(missing_reports)})"
    )

print(f"balance sheet path: {balance_sheet_path}")
print(f"income statement path: {income_statement_path}")
print(f"cash flow statement path: {cash_flow_statement_path}")

balance sheet path: original_reports\Balance Sheet_Annual_As Originally Reported.xls
income statement path: original_reports\Income Statement_Annual_As Originally Reported (1).xls
cash flow statement path: original_reports\Cash Flow_Annual_As Originally Reported.xls


In [16]:
import pandas as pd

# Parse each located workbook into its own DataFrame, relying on
# pd.read_excel's default arguments. Files are read in the order
# balance sheet -> income statement -> cash flow statement.
balance_frame: pd.DataFrame
income_frame: pd.DataFrame
cash_flow_frame: pd.DataFrame
balance_frame, income_frame, cash_flow_frame = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        balance_sheet_path,
        income_statement_path,
        cash_flow_statement_path,
    )
)

In [17]:
from openai import OpenAI
import os
import dotenv
from financial_statement_schema.balance_sheet import BalanceSheets
from financial_statement_schema.income_statement import IncomeStatements
from financial_statement_schema.cashflow_statements import CashflowStatements

from typing import TypeVar
from pydantic import BaseModel
# Load environment variables via python-dotenv before reading the API key
# below; this must run first so OPENAI_API_KEY can come from a local .env file
# rather than being hard-coded in the notebook.
dotenv.load_dotenv()
# os.getenv returns None when the variable is unset; the value is passed to the
# client as-is, with no validation here.
api_key = os.getenv("OPENAI_API_KEY")
# Single shared OpenAI client used by generate_report.
client:OpenAI = OpenAI(api_key=api_key)

# Generic placeholder for "some pydantic model"; bound to BaseModel so that
# generate_report's return type follows the schema class it is given.
T = TypeVar("T",bound=BaseModel)

def generate_report(original_df:pd.DataFrame,output_path:str, schema: Type[T]) -> T:
    """Extract structured statement data from a DataFrame and save it as JSON.

    The frame is serialised to CSV (without the index) and sent to the model
    through the module-level ``client``, asking for output parsed into
    ``schema``. The parsed model is written to ``output_path`` as JSON and
    returned.

    Args:
        original_df: Raw statement table as loaded from the workbook.
        output_path: Destination file for the JSON dump; overwritten if present.
        schema: The pydantic model *class* the response must be parsed into.

    Returns:
        The parsed ``schema`` instance.

    Raises:
        ValueError: If the response contains no parsed output. This is checked
            before the file is opened, so a failed extraction does not leave a
            truncated, empty file at ``output_path``.
    """
    response = client.responses.parse(
        model= "gpt-4.1-nano",
        text_format=schema,
        input= [
            {"role": "system","content": "you are a financial data extraction assistant. Extract the relevant financial data from the provided financial statement and format it according to the provided schema."},
            {"role":"user","content": original_df.to_csv(sep=",",index=False)}
        ]
    )
    report: Optional[T] = response.output_parsed
    if report is None:
        raise ValueError(
            f"model returned no parsed {schema.__name__} output; nothing written to {output_path}"
        )
    # Explicit UTF-8 keeps the JSON portable; the platform default encoding
    # may be unable to represent non-ASCII characters in the dump.
    with open(output_path,"w",encoding="utf-8") as f:
        f.write(report.model_dump_json())
    return report


# Run the extraction once per statement type: build the JSON output path, call
# generate_report (one model request plus one file write each), then print the
# parsed result.
# NOTE(review): the JSON files are written into report_directory, the same
# directory find_report_path scans, and their names contain "balance", "cash"
# and "income". A later re-run of the lookup cell may therefore match these
# JSON files instead of the source workbooks - consider a separate output
# directory.
balance_report_path:str = os.path.join(report_directory, "balance_statements.json")
balance_report: BalanceSheets = generate_report(balance_frame, balance_report_path, BalanceSheets)
print(balance_report)
cashflow_report_path:str = os.path.join(report_directory, "cashflow_statements.json")
cashflow_report: CashflowStatements = generate_report(cash_flow_frame, cashflow_report_path, CashflowStatements)
print(cashflow_report)
income_report_path:str = os.path.join(report_directory, "income_statements.json")
income_report:IncomeStatements = generate_report(income_frame, income_report_path, IncomeStatements)
print(income_report)

balance_sheets=[BalanceSheet(total_asset=428822445000.0, total_liability=344896366000.0, totalDebt=127364296000.0, totalEquity=83926079000.0, current_assets=361960347000.0, current_liabilities=313699489000.0, shareholderEquity=71565777000.0, inventory=95053647000.0, account_receivable=128988137000.0, account_payable=114721606000.0, date='2020'), BalanceSheet(total_asset=497297806000.0, total_liability=404813027000.0, totalDebt=174791705000.0, totalEquity=92484779000.0, current_assets=421786551000.0, current_liabilities=365583359000.0, shareholderEquity=77916938000.0, inventory=161378122000.0, account_receivable=161933944000.0, account_payable=168384068000.0, date='2021'), BalanceSheet(total_asset=432907774000.0, total_liability=314255218000.0, totalDebt=140186227000.0, totalEquity=118652556000.0, current_assets=342985667000.0, current_liabilities=286725416000.0, shareholderEquity=96382149000.0, inventory=156889151000.0, account_receivable=101093154000.0, account_payable=108849916000.0,