# Electric Bill Parser Development

In [37]:
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Optional
import csv
import pdfplumber
import re
import os

@dataclass
class BillLineItem:
    bill_date: str
    category: str
    description: str
    quantity: Optional[float] = None
    unit: Optional[str] = None
    rate: Optional[float] = None
    amount: float = 0.0
    
    def to_csv_row(self):
        return [self.bill_date, self.category, self.description, 
                self.quantity or '', self.unit or '', self.rate or '', self.amount]

def extract_bill_date(pages):
    """Extract bill date from PDF text"""
    all_text = ' '.join(pages)
    # Try common date patterns
    patterns = [
        r'\d{12} (\d{2}/\d{2}/\d{2})'
    ]
    
    for pattern in patterns:
        matches = re.findall(pattern, all_text)
        if matches:
            return matches[0]
    
    # Fallback: extract from filename if available
    filename_match = re.search(r'(\d{4})_(\d{2})_(\d{2})', file)
    if filename_match:
        raise Exception("warning: we matched filename match instead of date pattern.  fix this code")
        year, month, day = filename_match.groups()
        return f"{month}/{day}/{year}"
    
    return "Unknown"

def parse_pages(pages):
    """Unified function to extract all line items from electric bill"""
    bill_date = extract_bill_date(pages)
    line_items = []
    all_text = ' '.join(pages)
    
    # Delivery usage charges (Time of Use)
    delivery_matches = re.findall(r'(Onpeak|Midpeak|Offpeak|Superoffpeak) (\d{1,3}(?:,\d{3})*)kWhx\$(\d+\.?\d+) \$(\d+\.\d+)', all_text)
    for tou, kwh, rate, amount in delivery_matches:
        line_items.append(BillLineItem(
            bill_date=bill_date,
            category="Delivery",
            description=f"{tou} Usage",
            quantity=float(kwh.replace(',', '')),
            unit="kWh",
            rate=float(rate),
            amount=float(amount)
        ))
    
    # Basic charge
    basic_matches = re.findall(r'(Basiccharge) (\d+)daysx\$(\d\.?\d+) \$(\d.?\d+)', all_text)
    for _, days, rate, amount in basic_matches:
        line_items.append(BillLineItem(
            bill_date=bill_date,
            category="Delivery Surcharge",
            description="Basic Charge",
            quantity=float(days),
            unit="days",
            rate=float(rate),
            amount=float(amount)
        ))
    
    # Delivery surcharges
    surcharge_matches = re.findall(r'(Baselinecredit|PCIA|CCAwildfirefundcharge|CTC|Fixedrecoverycharge) (\d{1,3}(?:,\d{3})*)kWhx(-?\$\d.?\d+) (-?\$\d+.?\d+)', all_text)
    for charge_type, kwh, rate, amount in surcharge_matches:
        line_items.append(BillLineItem(
            bill_date=bill_date,
            category="Delivery Surcharge",
            description=charge_type.replace('charge', ' charge').replace('credit', ' credit'),
            quantity=float(kwh.replace(',', '')),
            unit="kWh",
            rate=float(rate.replace('$', '')),
            amount=float(amount.replace('$', ''))
        ))

    # Generation municipal surcharge
    municipal_matches = re.findall(r'(GenerationMunicipalSurcharge) \$(\d+.?\d+)', all_text)
    for charge_type, amount in municipal_matches:
        line_items.append(BillLineItem(
            bill_date=bill_date,
            category="Generation Surcharge",
            description="Municipal Surcharge",
            amount=float(amount)
        ))
         
    # Generation CA Climate Credit
    municipal_matches = re.findall(r'(CAClimateCredit) (-\$\d+.?\d+)', all_text)
    for charge_type, amount in municipal_matches:
        line_items.append(BillLineItem(
            bill_date=bill_date,
            category="Generation Surcharge",
            description="CA Climate Credit",
            amount=float(amount.replace('$', ''))
        ))

    # Generation usage charges
    gen_matches = re.findall(r'100%GreenPower-(.*) (\d+.?\d+)kWh@(\d+.\d+) \$(\d+.\d+)', all_text)    
    for tou, kwh, rate, amount in gen_matches:
        line_items.append(BillLineItem(
            bill_date=bill_date,
            category="Generation",
            description=f"Green Power {tou}",
            quantity=float(kwh),
            unit="kWh",
            rate=float(rate),
            amount=float(amount)
        ))
         
    # Energy surcharge
    energy_matches = re.findall(r'(EnergySurcharge) \$(\d+.?\d+)', all_text)
    for charge_type, amount in energy_matches:
        line_items.append(BillLineItem(
            bill_date=bill_date,
            category="Generation Surcharge",
            description="Energy Surcharge",
            amount=float(amount)
        ))
         
    return line_items
         

def get_pages(file):
    pages = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            pages.append(text)

    return pages

def parse_file(file):
    pages = get_pages(file)
    line_items = parse_pages(pages)
    return line_items

def write_to_csv(line_items, filename="electric_bill.csv"):
    headers = ['bill_date', 'category', 'description', 'quantity', 'unit', 'rate', 'amount']
    
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)
        for item in line_items:
            writer.writerow(item.to_csv_row())
    
def parse_files(files, dest_dir):
    actual_totals = {}
    all_line_items = []
    for src_file in files:
        dest_file = Path(dest_dir) / Path(Path(src_file).stem).with_suffix('.csv')
        line_items = parse_file(src_file)
        all_line_items.extend(line_items)
        write_to_csv(line_items, dest_file)
        amount = f'{sum(item.amount for item in line_items):.2f}'
        bill_date = Path(src_file).stem
        actual_totals[bill_date] = amount
        print(f"Wrote {dest_file}, {len(line_items)} lines, Total amount: ${amount}")
    dest_file = Path(dest_dir) / Path("electric_bills.csv")
    write_to_csv(all_line_items, dest_file)
    print(f"Wrote {dest_file}, {len(all_line_items)} lines, Total amount: ${sum(item.amount for item in all_line_items):.2f}")
    return actual_totals

def test_totals(expected, actual):
    for k, v in actual.items():
        assert expected[k] == v, f"For {k} expected {expected[k]} but got {v}"

In [56]:
EXPECTED_TOTALS = {
    '2024_01_08': '670.57',
    '2024_02_06': '700.36',
    '2024_03_08': '759.51',
    '2024_04_08': '597.39',
    '2024_05_07': '714.01',
    '2024_06_06': '613.85',
    '2024_07_08': '565.10',
    '2024_08_07': '654.61',
    '2024_09_06': '739.13',
    '2024_10_07': '668.68',
    '2024_11_05': '554.15',
    '2024_12_06': '776.72',
    '2025_01_07': '774.89',
    '2025_02_05': '969.10',
    '2025_03_07': '894.63',
    '2025_04_07': '754.42',
    '2025_05_07': '808.63',
    '2025_06_06': '630.18',
    '2025_07_09': '627.18',
}

dest_dir = "/Users/ericmelz/Data/Bills/Electric/CSV"
#years = ['2024', '2025']
years = ['2023']
files = []

for year in years:
    src_dir = Path("/Users/ericmelz/Data/Bills/Electric/PDF") / year
    for file in os.listdir(src_dir):
        files.append(Path(src_dir) / file)


In [54]:
actual_totals = parse_files(files, dest_dir)

Wrote /Users/ericmelz/Data/Bills/Electric/CSV/2024_09_06.csv, 14 lines, Total amount: $739.13
Wrote /Users/ericmelz/Data/Bills/Electric/CSV/2024_05_07.csv, 14 lines, Total amount: $714.01
Wrote /Users/ericmelz/Data/Bills/Electric/CSV/2024_04_08.csv, 15 lines, Total amount: $597.39
Wrote /Users/ericmelz/Data/Bills/Electric/CSV/2024_11_05.csv, 14 lines, Total amount: $554.15
Wrote /Users/ericmelz/Data/Bills/Electric/CSV/2024_03_08.csv, 22 lines, Total amount: $759.51
Wrote /Users/ericmelz/Data/Bills/Electric/CSV/2024_06_06.csv, 21 lines, Total amount: $613.85
Wrote /Users/ericmelz/Data/Bills/Electric/CSV/2024_08_07.csv, 14 lines, Total amount: $654.61
Wrote /Users/ericmelz/Data/Bills/Electric/CSV/2024_01_08.csv, 22 lines, Total amount: $670.57
Wrote /Users/ericmelz/Data/Bills/Electric/CSV/2024_12_06.csv, 14 lines, Total amount: $776.72
Wrote /Users/ericmelz/Data/Bills/Electric/CSV/2024_07_08.csv, 16 lines, Total amount: $565.10
Wrote /Users/ericmelz/Data/Bills/Electric/CSV/2024_02_06.csv

In [55]:
test_totals(EXPECTED_TOTALS, actual_totals)

# Scratchpad for testing

In [5]:
pages = get_pages(files[9])

In [6]:
page4 = pages[4]
page5 = pages[5]

In [7]:
page5

"MELZ,ERIC/ Page6of6\nService account 8001779471 POD-ID SUPPLY/GENERATION\nService address 300SREEVESDR 101760940001603559\nCLEANPOWERALLIANCE\nBEVERLYHILLS,CA90212\nsupplies your electricity\nRotating outage GroupA037\nDetails of your new charges SUM2\nCLEAN POWERALLIANCE\nYourrate:TOU-D-5\nServiceAccount:8001779471\nBillingperiod:06/04/24to07/02/24(29days)\nGeneration Charges\n100%GreenPower-On-Peak- 114kWh@0.4539 $51.74\nSummer\n100%GreenPower-Mid-Peak- 52.4kWh@0.26407 $13.84\nSummer\n100%GreenPower-Off-Peak- 1140.4kWh@0.1161 $132.40\nSummer\n100%GreenPower-On-Peak- 11.2kWh@0.4824 $5.40\nSummer\n100%GreenPower-Mid-Peak- 0kWh@0.26941 $0.00\nSummer\n100%GreenPower-Off-Peak- 88kWh@0.11171 $9.83\nSummer\nEnergySurcharge $0.42\nSub-TotalofCPAGenerationCharges $213.63\nYour NewCharges $213.63\nRate Identification Number - RIN SUM2\nUSCA-XXCP-0110-0000\nThisRINmayhelpprogramsmartdevices.Learnmoreatenergy.ca.gov.\nThings you should know\nCPA to implement new rates beginning July 1,2024\nCPA

In [8]:
re.findall(r'100%GreenPower-(.*) (\d+.?\d+)kWh@(\d+.\d+) \$(\d+.\d+)', page5)

[('On-Peak-', '114', '0.4539', '51.74'),
 ('Mid-Peak-', '52.4', '0.26407', '13.84'),
 ('Off-Peak-', '1140.4', '0.1161', '132.40'),
 ('On-Peak-', '11.2', '0.4824', '5.40'),
 ('Off-Peak-', '88', '0.11171', '9.83')]