In [142]:
# Packages
import os
import cv2
import numpy as np
import pandas as pd
import base64

from pdf2image import convert_from_path
from openai import OpenAI
from dotenv import load_dotenv
from IPython.display import Markdown, display

# For api keys
load_dotenv()

True

In [155]:
def pdf_to_images(pdf_path, dpi=300):
    """Render the pages of a PDF file as images.

    Args:
        pdf_path: Path of the PDF file to rasterise.
        dpi: Rendering resolution forwarded to ``convert_from_path``.
            Defaults to 300, the value that used to be hard-coded here.

    Returns:
        The result of ``convert_from_path`` (pdf2image documents this as a
        list of PIL images, one per page).
    """
    return convert_from_path(pdf_path, dpi=dpi)

# Preprocess image for better OCR
def preprocess_image(image, pdf_path, i, output_root="./data"):
    """Binarise one rendered page and save it as a PNG.

    The page is converted to grayscale, adaptively thresholded, and written to
    ``<output_root>/<pdf base name without extension>/preprocessed_page_<i+1>.png``.
    The per-PDF folder is created if it does not exist yet.

    Args:
        image: Page image; converted with ``np.array`` and interpreted as RGB.
        pdf_path: File name or path of the source PDF. Only its base name
            (minus extension) is used, so passing a full path no longer nests
            the directory components under ``output_root``.
        i: Zero-based page index; the file name uses ``i + 1``.
        output_root: Directory under which the per-PDF folder is created.

    Returns:
        The path of the PNG file that was written.

    Raises:
        OSError: If ``cv2.imwrite`` reports that the file was not written.
    """
    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
    output_dir = os.path.join(output_root, pdf_stem)
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"preprocessed_page_{i+1}.png")

    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    # 11 = neighbourhood block size, 2 = constant subtracted from the weighted mean.
    thresholded = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )

    # cv2.imwrite signals failure through its return value instead of raising.
    if not cv2.imwrite(output_path, thresholded):
        raise OSError(f"Could not write preprocessed image to {output_path}")
    return output_path

# Folder that is scanned for source PDFs.
folder_path = "./data/"
# Sorted because os.listdir returns entries in arbitrary order; the extension
# check is case-insensitive so files such as "STATEMENT.PDF" are not skipped.
pdf_files = sorted(
    file for file in os.listdir(folder_path) if file.lower().endswith(".pdf")
)

for pdf_path in pdf_files:  # note: a bare file name, not a full path
    images = pdf_to_images(os.path.join(folder_path, pdf_path))

    # Preprocess and save each page image
    for i, image in enumerate(images):
        preprocess_image(image, pdf_path, i)

In [145]:
# Shared API client. The key is read from the OPENAI_API_KEY environment
# variable; a missing variable raises KeyError here rather than at request time.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def _encode_image(image_path):
    """Return the bytes of ``image_path`` as a base64-encoded string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


def ask_gpt_to_parse_financials(image_dir=".", max_pages=10, model="gpt-4o"):
    """Send preprocessed statement pages to the chat model and return its report.

    Looks for ``preprocessed_page_1.png`` .. ``preprocessed_page_<max_pages>.png``
    inside ``image_dir``, attaches every page that can be read as a separate
    user message (prompt text + base64 PNG), and asks the model for a loan
    assessment. Uses the module-level ``client``.

    Args:
        image_dir: Folder containing the preprocessed page PNGs. Defaults to
            the current working directory, matching the previous behaviour.
        max_pages: Highest page number to look for (inclusive).
        model: Name of the chat model to query.

    Returns:
        The text content of the model's first choice.

    Raises:
        FileNotFoundError: If none of the expected page images could be read;
            sending the prompt without any statement data would only produce
            an ungrounded report.
    """
    # Pages that cannot be read are reported and skipped (statements may have
    # fewer than ``max_pages`` pages).
    base64_images_list = []
    for page in range(1, max_pages + 1):
        png_file = os.path.join(image_dir, f"preprocessed_page_{page}.png")
        try:
            base64_images_list.append(_encode_image(png_file))
        except OSError as e:
            print(f"Error encoding {png_file}: {e}")

    if not base64_images_list:
        raise FileNotFoundError(
            f"No preprocessed_page_<n>.png images could be read from {image_dir!r}"
        )

    messages = [
        {"role": "system", "content": "You are a financial analyst."}
    ]

    # One user message per page; parts are numbered from 1 in the order sent.
    # The prompt's whitespace is kept verbatim so the request text is unchanged.
    for part, base64_image in enumerate(base64_images_list, start=1):
        messages.append(
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""
                            This is part {part} of the data for the bank statement.
                            Go through the financials and write a report on whether we should give them a business loan.
                            Find insights on the total monthly deposits and withdrawals. 
                            Also insights on their big regular bills like rent, employee salary, utilities, etc. 
                            Also catch if they have any other outstanding loans! But overall just use common sense on if you would give them a loan or not based on how they spend their money.
                            """
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            # The pages are PNG files, so declare them as such.
                            "url": f"data:image/png;base64,{base64_image}"
                        }
                    }
                ]
            }
        )

    # Send the messages to the model
    response = client.chat.completions.create(
        model=model,
        messages=messages
    )

    return response.choices[0].message.content

# Run the analysis (page images are looked up relative to the current working
# directory) and render the model's reply as Markdown in the notebook.
response = ask_gpt_to_parse_financials()
display(Markdown(response))

### Financial Report for Loan Consideration

#### Overview

This report evaluates the financial health of Bethlehem Home Health Care Agency for the month of October 2017 to assess their eligibility for a business loan. Key aspects reviewed include total monthly deposits and withdrawals, regular expenses, and any indications of outstanding loans.

#### Total Monthly Deposits and Withdrawals

- **Total Deposits:** $87,684.11
- **Total Withdrawals:** $47,024.45

The business shows a positive cash flow with deposits significantly exceeding withdrawals. This suggests a stable income stream capable of supporting additional loan repayments.

#### Regular and Significant Expenses

- **Rent & Utilities:** No specific rent or utilities cheques were evident from the statement, but could be included in other expenses.
- **Employee Salaries:** Regular checks, such as the $3,777.34 on Oct 16, likely indicate salary-related expenses.
- **Regular Vendor Payments:**
  - Payments to Lawrence S. Bracitsky CPA, INC of $1,146.08 on Oct 2 and Oct 10 show recurring costs.
  - Checks written to various consultants and service providers such as $2,500 on Oct 11 for "Activities Therapy Plan Payment."

#### Outstanding Loans 

- There is no explicit mention of existing loans in the known transactions. However, $12,000 in transfers on Oct 12 and Oct 16 suggest internal fund movements or possibly payments to another account.

#### Additional Observations

- The account maintains a healthy daily balance, often above $20,000.
- There are regular small charges ($24.95) for service fees, indicating proper financial management with regular banking interactions.
- The company seems capable of handling multiple monthly expenses while preserving a positive balance.

#### Conclusion

Considering the consistent positive cash flow, ability to maintain a substantial ending balance, and lack of evidence for outstanding loans, the Bethlehem Home Health Care Agency appears to be a suitable candidate for a business loan. The financial management indicates prudent spending patterns, with significant attention to maintaining liquidity. Granting the loan would likely pose minimal risk to the lender, provided the company continues its current financial trajectory.

In [None]:
# TODO: put images in data/<pdf file name>/images
# for each images folder, get the response from gpt and store the responses in a list
# put the 4 different results in 4 text boxes
# save .ipynb as a pdf as well