In [None]:
import pdfplumber
import re
import json

def parse_transaction_line(line):
    """Parse one statement line that carries both an amount and a balance.

    The line (after stripping surrounding whitespace) must consist of a
    ``NN/NN`` date token, a free-text description, and two trailing money
    figures (optional leading ``-``, digits/commas, exactly two decimals).

    Returns a dict with the keys ``date``, ``description``, ``amount`` and
    ``balance`` (all values are strings, captured as written), or ``None``
    when the line does not have that shape.
    """
    # Date, lazy description, then the last two money-looking tokens.
    pattern = r'^(\d{2}/\d{2})\s+(.+?)\s+(-?[\d,]+\.\d{2})\s+(-?[\d,]+\.\d{2})$'
    found = re.match(pattern, line.strip())

    # Guard clause: anything that is not a transaction row is ignored.
    if not found:
        return None

    txn_date, raw_description, txn_amount, running_balance = found.groups()
    record = {}
    record['date'] = txn_date
    record['description'] = raw_description.strip()
    record['amount'] = txn_amount
    record['balance'] = running_balance
    return record

def parse_Credit_Card_transaction_line(line):
    """Parse one statement line that ends in a single amount (no balance).

    The line (after stripping surrounding whitespace) must consist of a
    ``NN/NN`` date token, a free-text description, and one trailing money
    figure (optional leading ``-``, digits/commas, exactly two decimals).

    Returns a dict with the keys ``date``, ``description`` and ``amount``
    (all values are strings, captured as written), or ``None`` when the line
    does not have that shape.
    """
    # Date, lazy description, then the final money-looking token.
    pattern = r'^(\d{2}/\d{2})\s+(.+?)\s+(-?[\d,]+\.\d{2})$'
    found = re.match(pattern, line.strip())

    # Guard clause: anything that is not a transaction row is ignored.
    if not found:
        return None

    record = dict(zip(('date', 'description', 'amount'), found.groups()))
    record['description'] = record['description'].strip()
    return record

# 
def extract_transactions_CA_from_pdf(pdf_path):
    """Collect amount-plus-balance transaction rows from every page of a PDF.

    Each page's text is split on newlines and every line is offered to
    ``parse_transaction_line``; lines it rejects (returns ``None`` for) are
    skipped.

    Args:
        pdf_path: Path (or whatever ``pdfplumber.open`` accepts) of the
            statement PDF.

    Returns:
        A list of the dicts produced by ``parse_transaction_line``, in page
        and line order. Empty when nothing matched.
    """
    transactions = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # A page without a text layer (e.g. a scanned image) may yield
            # None instead of a string depending on the pdfplumber version;
            # treat that as "no lines" rather than crashing on None.split.
            text = page.extract_text() or ""
            for line in text.split("\n"):
                parsed = parse_transaction_line(line)
                if parsed:
                    transactions.append(parsed)
    return transactions

def extract_transactions_CreditCard_from_pdf(pdf_path):
    """Collect single-amount transaction rows from every page of a PDF.

    Each page's text is split on newlines and every line is offered to
    ``parse_Credit_Card_transaction_line``; lines it rejects (returns
    ``None`` for) are skipped.

    Args:
        pdf_path: Path (or whatever ``pdfplumber.open`` accepts) of the
            statement PDF.

    Returns:
        A list of the dicts produced by
        ``parse_Credit_Card_transaction_line``, in page and line order.
        Empty when nothing matched.
    """
    transactions = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # A page without a text layer (e.g. a scanned image) may yield
            # None instead of a string depending on the pdfplumber version;
            # treat that as "no lines" rather than crashing on None.split.
            text = page.extract_text() or ""
            for line in text.split("\n"):
                parsed = parse_Credit_Card_transaction_line(line)
                if parsed:
                    transactions.append(parsed)
    return transactions
# print(transactions, len(transactions))  # check first 10 extracted lines


In [None]:
# Load environment variables from a file called .env
from dotenv import load_dotenv
from openai import OpenAI
import os

# override=True lets values from .env replace variables already set in the
# process environment.
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Check the key.
# The whitespace test runs before the prefix test: a key with stray leading
# spaces/tabs would otherwise fail startswith("sk-proj-") and be reported as
# having the wrong prefix, hiding the real problem.

if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")

In [49]:
import pandas as pd

# ---------- STEP 3: Build prompts ----------

def build_prompts(transactions):
    """Build the (system_prompt, user_prompt) pair for the spending analysis.

    Args:
        transactions: Iterable of mappings, each providing the keys
            ``date``, ``description`` and ``amount``.

    Returns:
        A 2-tuple ``(system_prompt, user_prompt)``. The system prompt is a
        fixed instruction text describing the required JSON output; the user
        prompt lists one bullet line per transaction between a fixed intro
        and a fixed closing request.
    """
    # Fixed instructions: task description plus the JSON schema to return.
    instructions = """
You are a personal financial assistant.
Your job is to analyze bank transactions, categorize each expense into categories such as:
Food, Clothing, Rent, Utilities, Entertainment, Travel, Health, Miscellaneous, and Others.

Your responsibilities:

Categorize all transactions and compute total spending per category.

Identify the top 5 categories by total spending.

Detect high-frequency purchases, even if individual amounts are small (e.g., $4 coffee bought 40 times).

For these, group transactions by merchant/description and count frequency.

Highlight the top 5 frequent purchases, with both frequency and total spend.

Provide a practical summary of spending habits, covering both biggest expenses and frequent small purchases.

Suggest 2–3 actionable recommendations to reduce spending, targeting both:

Big categories (e.g., Rent, Travel, Entertainment).

Small but frequent “habit expenses” (e.g., coffee, fast food, subscriptions).

The output should be a valid JSON object with this structure:
{
  "summary": {
      "Food": <amount>,
      "Clothing": <amount>,
      "Rent": <amount>,
      "Utilities": <amount>,
      "Entertainment": <amount>,
      "Travel": <amount>,
      "Health": <amount>,
      "Miscellaneous": <amount>,
      "Others": <amount>
  },
  "total_expenses": <total>,
  "top_5_categories": [ {"category": <name>, "amount": <amount>} ],
  "top_5_frequent_purchases": [ {"item": <merchant/description>, "count": <frequency>, "total": <amount>} ],
  "insights": "<short paragraph summary of spending, including both big categories and frequent small habits>",
  "recommendations": [ "<tip1>", "<tip2>", "<tip3>" ]
}

"""

    intro = "Here are my bank account transactions for the past few months:\n\n"

    # One bullet per transaction, each already terminated by a newline.
    bullet_lines = [
        f"- Date: {entry['date']}, Description: {entry['description']}, Amount: {entry['amount']}\n"
        for entry in transactions
    ]

    closing = """
Please analyze these transactions according to the instructions in the system prompt.
"""

    request = intro + "".join(bullet_lines) + closing
    return instructions, request





In [None]:
# ---------- STEP 4: Call OpenAI ----------
def analyze_transactions(pdf_path):
    """Extract transactions from a statement PDF and have the model analyze them.

    The PDF is parsed with ``extract_transactions_CreditCard_from_pdf``, the
    prompts are built with ``build_prompts``, and a single chat-completion
    request is sent to ``gpt-4o-mini`` in JSON mode.

    Args:
        pdf_path: Path of the statement PDF to analyze.

    Returns:
        The model's reply decoded with ``json.loads``.

    Raises:
        ValueError: If the reply carries no message content, or (as
            ``json.JSONDecodeError``, a ``ValueError`` subclass) if the
            content is not valid JSON.
    """
    transactions = extract_transactions_CreditCard_from_pdf(pdf_path)
    system_prompt, user_prompt = build_prompts(transactions)

    client = OpenAI()  # assumes OPENAI_API_KEY is set in env

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        response_format={"type": "json_object"},  # ask for a JSON object reply
    )

    choice = response.choices[0]
    result = choice.message.content
    # The SDK types message content as optional; json.loads(None) would fail
    # with an opaque TypeError, so report the situation explicitly instead.
    if result is None:
        raise ValueError(
            "The model returned no message content to parse as JSON "
            f"(finish_reason={choice.finish_reason!r})"
        )
    return json.loads(result)

# ---------- MAIN ----------
if __name__ == "__main__":
    cc_pdf_file = "cc_statement.pdf"

    # Extracted rows are kept in a variable so they can be inspected;
    # analyze_transactions() below extracts from the PDF again on its own.
    cc_transactions = extract_transactions_CreditCard_from_pdf(cc_pdf_file)

    # To debug in case of failures, uncomment:
    # print(cc_transactions, len(cc_transactions))
    # system_prompt, user_prompt = build_prompts(cc_transactions)
    # print(system_prompt, user_prompt)

    # Run the model-backed analysis and print the report between banners.
    analysis = analyze_transactions(cc_pdf_file)
    print(
        "=========================================",
        "=== Top 5 Spending Habits & Insights ====",
        "=========================================",
        json.dumps(analysis, indent=2),
        "=========================================",
        "=========================================",
        sep="\n",
    )
