# Electric Bill Parser Development

In [39]:
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Optional
import csv
import pdfplumber
import re
import os

@dataclass
class BillLineItem:
    """One parsed charge or credit line from an electric bill.

    ``quantity``, ``unit`` and ``rate`` are optional because some line items
    (flat surcharges and credits) only carry an amount.
    """

    # Bill date string shared by every line item of the same bill.
    bill_date: str
    # Grouping label such as "Delivery" or "Generation Surcharge".
    category: str
    # Human-readable name of the charge.
    description: str
    # Number of units billed (None for flat charges).
    quantity: Optional[float] = None
    # Unit of ``quantity``, e.g. "kWh" or "days".
    unit: Optional[str] = None
    # Price per unit (None for flat charges).
    rate: Optional[float] = None
    # Charge amount; negative for credits.
    amount: float = 0.0

    def to_csv_row(self):
        """Return the item as a list of CSV cell values.

        Missing (``None``) optional fields become empty strings.  A genuine
        zero quantity or rate is kept as ``0.0`` rather than being blanked,
        which is why this tests ``is None`` instead of truthiness.
        """
        def blank_if_missing(value):
            return '' if value is None else value

        return [
            self.bill_date,
            self.category,
            self.description,
            blank_if_missing(self.quantity),
            blank_if_missing(self.unit),
            blank_if_missing(self.rate),
            self.amount,
        ]

def extract_bill_date(pages, file=None):
    """Return the bill date found in the extracted PDF text.

    The date is the first ``dd/dd/dd``-shaped token that directly follows a
    12-digit number and a space in the concatenated page text.

    Args:
        pages: Iterable of per-page text strings.  ``None`` or empty entries
            are treated as empty text.
        file: Optional path or name of the source PDF.  When no date is found
            in the text and the file name contains ``YYYY_MM_DD``, that date
            is used instead and a warning is emitted.  Previously this
            fallback read a global named ``file`` and then raised
            unconditionally, so it could never return a value.

    Returns:
        The date string as printed in the text, or ``MM/DD/YYYY`` built from
        the file name, or ``"Unknown"`` when neither source yields a date.
    """
    import warnings

    all_text = ' '.join(page or '' for page in pages)

    # Try common date patterns
    patterns = [
        r'\d{12} (\d{2}/\d{2}/\d{2})'
    ]
    for pattern in patterns:
        match = re.search(pattern, all_text)
        if match:
            return match.group(1)

    # Fallback: extract from filename if available
    if file is not None:
        filename_match = re.search(r'(\d{4})_(\d{2})_(\d{2})', Path(file).name)
        if filename_match:
            # Keep the condition visible so the text patterns can be fixed,
            # without aborting the whole parse.
            warnings.warn(
                "warning: we matched filename match instead of date pattern.  fix this code",
                stacklevel=2,
            )
            year, month, day = filename_match.groups()
            # NOTE(review): this yields a 4-digit year while the text pattern
            # captures a 2-digit one - confirm which format the CSV should use.
            return f"{month}/{day}/{year}"

    return "Unknown"

# Number as printed on the bill: digits with optional thousands separators
# and an optional fractional part, e.g. "762", "1,935", "0.30272".
_BILL_NUMBER = r'(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?'


def _bill_float(text):
    """Convert bill text such as '1,935' or '-$23.74' to a float."""
    return float(text.replace('$', '').replace(',', ''))


def parse_pages(pages):
    """Extract all recognised line items from the text of an electric bill.

    The page texts are joined with spaces and scanned with one regular
    expression per kind of charge.  Items are returned grouped by kind, in
    this order: delivery usage, basic charge, delivery surcharges, generation
    municipal surcharge, CA climate credit, generation usage, energy
    surcharge.

    Numbers are matched with a literal decimal point and optional thousands
    separators.  The earlier patterns used an unescaped ``.``, which could
    capture a comma and make ``float()`` fail, and limited some amounts to a
    single digit before the decimal point.

    Args:
        pages: Iterable of per-page text strings.  ``None`` or empty entries
            are treated as empty text.

    Returns:
        A list of ``BillLineItem`` objects, all carrying the bill date
        returned by ``extract_bill_date``.
    """
    # Normalise first so both the date lookup and the join below are safe.
    pages = [page or '' for page in pages]
    bill_date = extract_bill_date(pages)
    all_text = ' '.join(pages)
    num = _BILL_NUMBER
    line_items = []

    def add(category, description, amount, quantity=None, unit=None, rate=None):
        line_items.append(BillLineItem(
            bill_date=bill_date,
            category=category,
            description=description,
            quantity=quantity,
            unit=unit,
            rate=rate,
            amount=amount
        ))

    # Delivery usage charges (Time of Use)
    delivery_matches = re.findall(
        rf'(Onpeak|Midpeak|Offpeak|Superoffpeak) ({num})kWhx\$({num}) \$({num})',
        all_text)
    for tou, kwh, rate, amount in delivery_matches:
        add("Delivery", f"{tou} Usage", _bill_float(amount),
            quantity=_bill_float(kwh), unit="kWh", rate=_bill_float(rate))

    # Basic charge
    basic_matches = re.findall(
        rf'Basiccharge (\d+)daysx\$({num}) \$({num})', all_text)
    for days, rate, amount in basic_matches:
        add("Delivery Surcharge", "Basic Charge", _bill_float(amount),
            quantity=float(days), unit="days", rate=_bill_float(rate))

    # Delivery surcharges (rate and amount may be negative, sign precedes "$")
    surcharge_matches = re.findall(
        r'(Baselinecredit|PCIA|CCAwildfirefundcharge|CTC|Fixedrecoverycharge) '
        rf'({num})kWhx(-?\${num}) (-?\${num})',
        all_text)
    for charge_type, kwh, rate, amount in surcharge_matches:
        # The PDF text has no spaces inside names; re-insert one before the
        # trailing "charge"/"credit" for readability.
        description = charge_type.replace('charge', ' charge').replace('credit', ' credit')
        add("Delivery Surcharge", description, _bill_float(amount),
            quantity=_bill_float(kwh), unit="kWh", rate=_bill_float(rate))

    # Generation municipal surcharge
    municipal_matches = re.findall(
        rf'GenerationMunicipalSurcharge \$({num})', all_text)
    for amount in municipal_matches:
        add("Generation Surcharge", "Municipal Surcharge", _bill_float(amount))

    # Generation CA Climate Credit (only matched when printed as a negative)
    climate_matches = re.findall(rf'CAClimateCredit (-\${num})', all_text)
    for amount in climate_matches:
        add("Generation Surcharge", "CA Climate Credit", _bill_float(amount))

    # Generation usage charges
    gen_matches = re.findall(
        rf'100%GreenPower-(.*) ({num})kWh@({num}) \$({num})', all_text)
    for tou, kwh, rate, amount in gen_matches:
        add("Generation", f"Green Power {tou}", _bill_float(amount),
            quantity=_bill_float(kwh), unit="kWh", rate=_bill_float(rate))

    # Energy surcharge
    energy_matches = re.findall(rf'EnergySurcharge \$({num})', all_text)
    for amount in energy_matches:
        add("Generation Surcharge", "Energy Surcharge", _bill_float(amount))

    return line_items
         

def get_pages(file):
    """Return the extracted text of every page of a PDF, in page order.

    Args:
        file: Path (or anything ``pdfplumber.open`` accepts) of the PDF.

    Returns:
        A list with one string per page.
    """
    with pdfplumber.open(file) as pdf:
        # NOTE(review): some pdfplumber versions appear to return None for a
        # page without text; normalise to '' so callers can safely join the
        # pages - confirm against the installed version.
        return [page.extract_text() or '' for page in pdf.pages]

def parse_file(file):
    """Read the PDF at ``file`` and return the line items parsed from it."""
    return parse_pages(get_pages(file))

def write_to_csv(line_items, filename="electric_bill.csv"):
    """Write line items to ``filename`` as CSV, header row first.

    Each item supplies its own row through ``to_csv_row()``.  An existing
    file is overwritten.
    """
    column_names = ['bill_date', 'category', 'description', 'quantity', 'unit', 'rate', 'amount']

    with open(filename, 'w', newline='') as out:
        sheet = csv.writer(out)
        sheet.writerow(column_names)
        sheet.writerows(entry.to_csv_row() for entry in line_items)
    
def parse_files(files, dest_dir):
    """Parse several bill PDFs and write per-bill and combined CSV files.

    For every source file a CSV named ``<source stem>.csv`` is written into
    ``dest_dir``; afterwards all line items together are written to
    ``electric_bills.csv`` in the same directory.  A one-line summary is
    printed for each file written.

    Args:
        files: Iterable of PDF paths to parse.
        dest_dir: Directory that receives the CSV files.
    """
    dest_root = Path(dest_dir)

    def write_and_report(line_items, dest_file):
        write_to_csv(line_items, dest_file)
        print(f"Wrote {dest_file}, {len(line_items)} lines, Total amount: ${sum(item.amount for item in line_items):.2f}")

    all_line_items = []
    for src_file in files:
        # Append ".csv" to the stem instead of calling with_suffix(): the
        # latter replaces everything after a dot inside the stem, so
        # "bill.v1.pdf" and "bill.v2.pdf" would both map to "bill.csv".
        dest_file = dest_root / f"{Path(src_file).stem}.csv"
        line_items = parse_file(src_file)
        all_line_items.extend(line_items)
        write_and_report(line_items, dest_file)

    write_and_report(all_line_items, dest_root / "electric_bills.csv")
            
        

In [40]:
# Collect the bill PDFs found directly inside the source directory.
src_dir = "/Users/ericmelz/Data/Bills/Electric/PDF/2025" 
files = []
# os.listdir() returns entries in arbitrary order; sort so the combined CSV
# is built in a stable order.
for file in sorted(os.listdir(src_dir)):
    # Skip entries that are not named *.pdf (e.g. hidden files), which the
    # PDF reader cannot open.
    if file.lower().endswith(".pdf"):
        files.append(Path(src_dir) / file)
# Directory that receives the generated CSV files.
dest_dir = "/Users/ericmelz/Data/Bills/Electric/CSV"

In [41]:
# Override the file list with a single bill.
files = [ Path(src_dir) / "2025_04_07.pdf" ]

In [42]:
# Parse the selected bills and write the per-bill and combined CSVs to dest_dir.
parse_files(files, dest_dir)

Wrote /Users/ericmelz/Data/Bills/Electric/CSV/2025_04_07.csv, 15 lines, Total amount: $754.42
Wrote /Users/ericmelz/Data/Bills/Electric/CSV/electric_bills.csv, 15 lines, Total amount: $754.42


In [36]:
# Extract the raw page text of the first selected bill for inspection.
pages = get_pages(files[0])

In [37]:
# Indexes are zero-based: pages[4] is the sheet whose text reads "Page5of8".
page4 = pages[4]
page5 = pages[5]

In [38]:
page4

'MELZ,ERIC/ Page5of8\nDetails of your new charges\n(continued)\nOffpeak 899kWhx$0.30272 $272.15\nSuperoffpeak 762kWhx$0.27638 $210.60\nCAClimateCredit -$56.00\nCCAcost responsibility surcharge Your Delivery charges include:\nPCIA 1,935kWhx-$0.01227 -$23.74 $47.14transmissioncharges\nCCAwildfirefundcharge 1,935kWhx$0.00595 $11.51 ·$410.80distributioncharges\nCTC 1,935kWhx-$0.00058 -$1.12 ·-$0.02nucleardecommissioning\n·charges\nOther charges or credits $68.61publicpurposeprograms\nFixedrecoverycharge 1,935kWhx$0.00198 $3.83 ·charge\nGenerationMunicipalSurcharge $1.92 $18.97newsystemgeneration\n·charge\nSubtotalofyournewcharges $483.45\nYour new charges $483.45\nYour overall energy charges include:\n$4.69franchisefees\n·\nAdditional information:\nServicevoltage:240volts\n·GenerationMunicipalSurcharge\n·(GMS)factor:0.009261\n2018VintageCRS\n·\nRate Identification Number - RIN SUM1\nUSCA-SCXX-0500-0000\nInthefuture,youmightusetheRateIdentificationNumber(RIN)toprogramsmartdeviceslikesmartth