In [55]:
import os
from typing import Optional


def find_report_path(directory: str, search_term: str) -> Optional[str]:
    """Return the path of the first file in ``directory`` whose name contains ``search_term``.

    The match is a case-insensitive substring test on the file name only;
    the extension is not inspected, so any file whose name contains the
    term is a candidate.

    Args:
        directory: Directory to scan (not recursive).
        search_term: Text that must appear somewhere in the file name.

    Returns:
        ``os.path.join(directory, filename)`` for the first matching regular
        file in sorted name order, or ``None`` when nothing matches.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. ``FileNotFoundError``
            when ``directory`` does not exist).
    """
    needle = search_term.lower()
    # os.listdir returns entries in arbitrary order; sorting makes the
    # selected file reproducible when several names contain the term.
    for filename in sorted(os.listdir(directory)):
        if needle not in filename.lower():
            continue
        candidate = os.path.join(directory, filename)
        # Skip sub-directories: callers expect a readable report file.
        if os.path.isfile(candidate):
            return candidate
    return None


In [56]:

report_directory: str = "original_reports"

# Locate each source statement by a keyword expected in its file name.
balance_sheet_path: str = find_report_path(report_directory, "balance")
income_statement_path: str = find_report_path(report_directory, "income")
cash_flow_statement_path: str = find_report_path(report_directory, "cash")

# find_report_path returns None when nothing matches. Check by identity
# (`is None`) and report exactly which statements are missing, so the failure
# is actionable instead of a generic "file not found".
missing_reports = [
    label
    for label, path in (
        ("balance sheet", balance_sheet_path),
        ("income statement", income_statement_path),
        ("cash flow statement", cash_flow_statement_path),
    )
    if path is None
]
if missing_reports:
    raise FileNotFoundError(
        f"file not found in the {report_directory} directory: {', '.join(missing_reports)}"
    )

print(f"balance sheet path: {balance_sheet_path}")
print(f"income statement path: {income_statement_path}")
print(f"cash flow statement path: {cash_flow_statement_path}")

balance sheet path: original_reports\Balance Sheet_Annual_Restated.xls
income statement path: original_reports\Income Statement_Annual_Restated.xls
cash flow statement path: original_reports\Cash Flow_Annual_Restated.xls


In [57]:
import pandas as pd

# Load each located statement file into its own DataFrame using
# pd.read_excel with default arguments. The paths come from the lookup cell
# above, which raises before this point if any of them is missing.
balance_frame: pd.DataFrame = pd.read_excel(balance_sheet_path)
income_frame: pd.DataFrame = pd.read_excel(income_statement_path)
cash_flow_frame: pd.DataFrame = pd.read_excel(cash_flow_statement_path)

In [58]:
# Bare last expression: display the balance sheet frame for visual inspection.
balance_frame

Unnamed: 0,3227_balance-sheet_Annual_Restated,2020,2021,2022,2023,2024
0,Total Assets,1.188062e+10,1.351862e+10,1.128365e+10,1.217586e+10,1.424044e+10
1,Total Current Assets,7.387895e+09,7.863602e+09,6.239764e+09,6.718669e+09,8.191824e+09
2,"Cash, Cash Equivalents and Short Term ...",5.062153e+09,5.745379e+09,4.529492e+09,4.892953e+09,5.980679e+09
3,Cash and Cash Equivalents,4.292172e+09,4.689767e+09,3.862605e+09,4.039696e+09,4.897832e+09
4,Short Term Investments,7.699810e+08,1.055612e+09,6.668870e+08,8.532570e+08,1.082847e+09
...,...,...,...,...,...,...
106,Total Contractual Obligations due Beyond,,1.549460e+08,1.499200e+08,1.436350e+08,1.375620e+08
107,Total Contractual Obligations - Interests ...,,-4.832300e+07,-4.275600e+07,-4.270600e+07,-3.953700e+07
108,Total Contractual Obligations due in year 2,,,,,
109,Total Contractual Obligations due in year 4,,,,,


In [59]:
# Bare last expression: display the income statement frame for visual inspection.
income_frame

Unnamed: 0,3227_income-statement_Annual_Restated,2020,2021,2022,2023,2024,TTM
0,Gross Profit,4718360000.0,5022642000.0,2820222000.0,3366301000.0,5187044000.0,5655475000.0
1,Total Revenue,8148017000.0,8800642000.0,5225907000.0,5844521000.0,8362273000.0,9143383000.0
2,Business Revenue,8148017000.0,8800642000.0,5225907000.0,5844521000.0,8362273000.0,9143383000.0
3,Cost of Revenue,-3429657000.0,-3778000000.0,-2405685000.0,-2478220000.0,-3175229000.0,-3487908000.0
4,Cost of Goods and Services,-3429657000.0,-3778000000.0,-2405685000.0,-2478220000.0,-3175229000.0,-3487908000.0
5,Operating Income/Expenses,-2869355000.0,-3197841000.0,-2811302000.0,-2652199000.0,-3428387000.0,-3643531000.0
6,"Selling, General and Administrative Expenses",-946378000.0,-1027567000.0,-903388000.0,-857260000.0,-1037499000.0,-978732000.0
7,General and Administrative Expenses,-573174000.0,-618914000.0,-575340000.0,-541774000.0,-651944000.0,-598085000.0
8,Selling and Marketing Expenses,-373204000.0,-408653000.0,-328048000.0,-315486000.0,-385555000.0,-380647000.0
9,Research and Development Expenses,-1922977000.0,-2170274000.0,-1911664000.0,-1805940000.0,-2423283000.0,-2664799000.0


In [60]:
# Bare last expression: display the cash flow frame for visual inspection.
cash_flow_frame

Unnamed: 0,3227_cash-flow_Annual_Restated,2020,2021,2022,2023,2024,TTM
0,"Cash Flow from Operating Activities, Indirect",1.656987e+09,2.763452e+09,2.954030e+08,1.290750e+09,2.509485e+09,2.767672e+09
1,Net Cash Flow from Continuing Operating Ac...,1.656987e+09,2.763452e+09,2.954030e+08,1.290750e+09,2.509485e+09,2.767672e+09
2,Cash Generated from Operating Activities,1.837774e+09,3.066733e+09,2.762290e+08,1.153979e+09,2.392490e+09,2.801770e+09
3,Income/Loss before Non-Cash Adjust...,1.776728e+09,1.925734e+09,1.094749e+09,8.853050e+08,2.047031e+09,2.161137e+09
4,Total Adjustments for Non-Cash Items,4.702310e+08,5.050760e+08,-4.294420e+08,3.566570e+08,3.627810e+08,4.659510e+08
...,...,...,...,...,...,...,...
83,Effect of Exchange Rate Changes,-7.061000e+07,-4.038300e+07,5.917000e+07,-1.053200e+07,8.476000e+07,-5.181500e+07
84,"Cash and Cash Equivalents, Beginning of Pe...",4.719575e+09,4.292172e+09,4.689767e+09,3.862605e+09,4.039696e+09,4.263402e+09
85,Cash Flow Supplemental Section,,,,,,
86,"Change in Cash as Reported, Supplemental",-4.274030e+08,3.975950e+08,-8.271620e+08,1.770910e+08,8.581360e+08,9.602710e+08


In [61]:
from openai import OpenAI
import os
import dotenv
from financial_statement_schema.balance_sheet import BalanceSheets
from financial_statement_schema.income_statement import IncomeStatements
from financial_statement_schema.cashflow_statements import CashflowStatements

from typing import TypeVar
from pydantic import BaseModel
# Load environment variables via python-dotenv before reading the key
# (presumably from a local .env file; confirm where it lives).
dotenv.load_dotenv()
# The key is read from the environment rather than hardcoded; os.getenv
# returns None when OPENAI_API_KEY is unset.
api_key = os.getenv("OPENAI_API_KEY")
# Shared module-level client; generate_report below calls it directly.
client:OpenAI = OpenAI(api_key=api_key)

# Type variable bounded by pydantic's BaseModel, used in generate_report's
# signature to tie the returned report type to the schema argument.
T = TypeVar("T",bound=BaseModel)

def generate_report(original_df: pd.DataFrame, output_path: str, schema: "type[T]") -> T:
    """Extract a structured report from a statement DataFrame and save it as JSON.

    The frame is serialized to CSV (without the index) and sent to the model
    through the module-level ``client``; the response is parsed into
    ``schema`` and the resulting model is written to ``output_path``.

    Args:
        original_df: Raw financial statement table to extract from.
        output_path: Destination file for the JSON dump; overwritten if present.
        schema: The pydantic model *class* the response must be parsed into.

    Returns:
        The parsed ``schema`` instance.

    Raises:
        ValueError: If the response carries no parsed output (for example the
            model refused or returned nothing parseable).
    """
    response = client.responses.parse(
        model="gpt-4.1-nano",
        text_format=schema,
        input=[
            {"role": "system", "content": "you are a financial data extraction assistant. Extract the relevant financial data from the provided financial statement and format it according to the provided schema."},
            {"role": "user", "content": original_df.to_csv(sep=",", index=False)},
        ],
    )
    report = response.output_parsed
    if report is None:
        # Fail with a clear message instead of an AttributeError on None below.
        raise ValueError(
            f"model returned no parsed output for schema {getattr(schema, '__name__', schema)!r}"
        )
    # Serialize before opening the file so a serialization failure cannot
    # leave a truncated/empty file behind.
    serialized = report.model_dump_json()
    # Explicit UTF-8: JSON should not depend on the platform's locale encoding.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(serialized)
    return report


# Generate one structured JSON report per statement. Each call sends the frame
# to the model through generate_report and writes the result to disk.
# NOTE(review): the JSON files are written into report_directory, the same
# directory find_report_path scans with a substring match on "balance",
# "income" and "cash". These output names contain those terms, so on a re-run
# the lookup may pick a JSON file instead of the spreadsheet, depending on
# directory listing order. Consider a separate output directory.
balance_report_path:str = os.path.join(report_directory, "balance_statements.json")
balance_report: BalanceSheets = generate_report(balance_frame, balance_report_path, BalanceSheets)
print(balance_report)
cashflow_report_path:str = os.path.join(report_directory, "cashflow_statements.json")
cashflow_report: CashflowStatements = generate_report(cash_flow_frame, cashflow_report_path, CashflowStatements)
print(cashflow_report)
income_report_path:str = os.path.join(report_directory, "income_statements.json")
income_report:IncomeStatements = generate_report(income_frame, income_report_path, IncomeStatements)
print(income_report)

balance_sheets=[BalanceSheet(total_asset=11880621000.0, total_liability=2943135000.0, totalDebt=436553000.0, totalEquity=8937486000.0, current_assets=7387895000.0, current_liabilities=2381035000.0, shareholderEquity=8937486000.0, inventory=769456000.0, account_receivable=1491508000.0, account_payable=1908538000.0, date='2020'), BalanceSheet(total_asset=13518620000.0, total_liability=3322800000.0, totalDebt=447068000.0, totalEquity=10195820000.0, current_assets=7863602000.0, current_liabilities=2806521000.0, shareholderEquity=10195820000.0, inventory=1046335000.0, account_receivable=1015372000.0, account_payable=2326171000.0, date='2021'), BalanceSheet(total_asset=11283652000.0, total_liability=2150857000.0, totalDebt=22800000.0, totalEquity=9132795000.0, current_assets=6239764000.0, current_liabilities=1661447000.0, shareholderEquity=9132795000.0, inventory=988022000.0, account_receivable=637041000.0, account_payable=1605318000.0, date='2022'), BalanceSheet(total_asset=12175855000.0, t