In [None]:
from dataclasses import dataclass
from typing import Optional
import csv
from datetime import datetime

@dataclass
class BillLineItem:
    """A single charge line from a bill, ready to be written to CSV.

    ``quantity``, ``unit`` and ``rate`` are optional; ``None`` means the value
    is absent and is emitted as an empty CSV cell by :meth:`to_csv_row`.
    """

    bill_date: str
    category: str
    description: str
    quantity: Optional[float] = None  # None -> empty cell in the CSV row
    unit: Optional[str] = None        # None -> empty cell in the CSV row
    rate: Optional[float] = None      # None -> empty cell in the CSV row
    amount: float = 0.0

    def to_csv_row(self):
        """Return the item as a list of seven CSV cell values.

        Column order: bill_date, category, description, quantity, unit, rate,
        amount. Only ``None`` is blanked out; a genuine zero quantity or rate
        is preserved instead of being mistaken for a missing value.
        """
        def blank_if_missing(value):
            # Explicit None check: `value or ''` would also erase 0 and 0.0.
            return '' if value is None else value

        return [
            self.bill_date,
            self.category,
            self.description,
            blank_if_missing(self.quantity),
            blank_if_missing(self.unit),
            blank_if_missing(self.rate),
            self.amount,
        ]

# Electric Bill Parser Development

In [1]:
import pdfplumber
import re

In [2]:
# PDF bill that the pdfplumber cell below opens.
# NOTE(review): absolute, machine-specific path - consider deriving it from a configurable data directory.
file = "/Users/ericmelz/Data/Bills/Electric/2025/2025_07_09.pdf"

In [3]:
# Holds the extracted text of each PDF page, in page order.
pages = []

In [4]:
# Rebuild `pages` from scratch on every run: appending to a list created in
# another cell would duplicate the pages (and double every extracted charge)
# whenever this cell is re-executed on its own.
with pdfplumber.open(file) as pdf:
    # `or ""` keeps every entry a string so the regex helpers never receive
    # None for a page without extractable text (TODO confirm whether the
    # installed pdfplumber version can return None here).
    pages = [page.extract_text() or "" for page in pdf.pages]

In [5]:
# Sanity check: number of pages whose text was extracted.
len(pages)

6

In [6]:
# Direct handles on the pages at zero-based indices 3, 4 and 5, used for
# ad-hoc regex experiments in later cells.
page3, page4, page5 = pages[3], pages[4], pages[5]

In [7]:
def extract_delivery_usage_charges(pages):
    """Extract the time-of-use delivery usage charges from the bill text.

    Matches entries shaped like ``Onpeak 156kWhx$0.34124 $53.23`` where the
    label is one of ``Onpeak``, ``Midpeak`` or ``Offpeak``.

    Args:
        pages: Iterable of page texts, one string per page. Entries that are
            ``None`` or empty are skipped.

    Returns:
        A list of ``[tou, kwh, rate, amount]`` rows (all strings) in the order
        found. Thousands separators are stripped from ``kwh`` and ``amount``
        so both can be passed straight to ``float``.
    """
    # The decimal points are escaped: a bare `.` matches any character, which
    # truncated "$1,403.51" to "1,403" and accepted garbage separators.
    pattern = re.compile(
        r'(Onpeak|Midpeak|Offpeak) (\d{1,3}(?:,\d{3})*)kWhx'
        r'\$(\d+\.\d+) \$(\d[\d,]*\.\d+)'
    )
    data_rows = []
    for page in pages:
        if not page:
            continue
        for tou, kwh, rate, amount in pattern.findall(page):
            data_rows.append([tou, kwh.replace(',', ''), rate, amount.replace(',', '')])
    return data_rows

In [8]:
# Run the delivery-usage extractor over every page and display the rows.
delivery_usage_charges = extract_delivery_usage_charges(pages)
delivery_usage_charges

[['Onpeak', '156', '0.34124', '53.23'],
 ['Midpeak', '53', '0.34124', '18.09'],
 ['Offpeak', '1374', '0.29399', '403.94']]

In [9]:
def extract_basic_charge(pages):
    """Extract the per-day basic charge entries from the bill text.

    Matches entries shaped like ``Basiccharge 29daysx$0.03100 $0.90``.

    Args:
        pages: Iterable of page texts, one string per page. Entries that are
            ``None`` or empty are skipped.

    Returns:
        A list of ``[chargetype, days, rate, amount]`` rows (all strings) in
        the order found. Thousands separators are stripped from ``amount``.
    """
    # Rate and amount accept any number of integer digits and require a real
    # decimal point; the previous single-digit amount pattern silently
    # dropped charges of $10.00 or more.
    pattern = re.compile(
        r'(Basiccharge) (\d+)daysx\$(\d+\.\d+) \$(\d[\d,]*\.\d+)'
    )
    data_rows = []
    for page in pages:
        if not page:
            continue
        for chargetype, days, rate, amount in pattern.findall(page):
            data_rows.append([chargetype, days, rate, amount.replace(',', '')])
    return data_rows

In [10]:
# Run the basic-charge extractor over every page and display the rows.
basic_charge = extract_basic_charge(pages)
basic_charge

[['Basiccharge', '29', '0.03100', '0.90']]

In [11]:
def extract_delivery_surcharges(pages):
    """Extract the per-kWh delivery surcharges and credits from the bill text.

    Matches entries shaped like ``PCIA 1,583kWhx-$0.01227 -$19.42`` for the
    labels ``Baselinecredit``, ``PCIA``, ``CCAwildfirefundcharge``, ``CTC``
    and ``Fixedrecoverycharge``. A leading ``-`` on rate or amount is kept.

    Args:
        pages: Iterable of page texts, one string per page. Entries that are
            ``None`` or empty are skipped.

    Returns:
        A list of ``[chargetype, kwh, rate, amount]`` rows (all strings) in
        the order found, with ``$`` removed from rate and amount and
        thousands separators removed from ``kwh`` and ``amount``.
    """
    # Decimal points are escaped and the rate may have several integer
    # digits; the old `\d.\d+` form allowed exactly one digit before an
    # any-character wildcard.
    pattern = re.compile(
        r'(Baselinecredit|PCIA|CCAwildfirefundcharge|CTC|Fixedrecoverycharge) '
        r'(\d{1,3}(?:,\d{3})*)kWhx(-?\$\d+\.\d+) (-?\$\d[\d,]*\.\d+)'
    )
    data_rows = []
    for page in pages:
        if not page:
            continue
        for chargetype, kwh, rate, amount in pattern.findall(page):
            data_rows.append([
                chargetype,
                kwh.replace(',', ''),
                rate.replace('$', ''),
                amount.replace('$', '').replace(',', ''),
            ])
    return data_rows

In [12]:
# Run the delivery-surcharge extractor over every page and display the rows.
delivery_surcharges = extract_delivery_surcharges(pages)
delivery_surcharges

[['Baselinecredit', '490', '-0.09250', '-45.33'],
 ['PCIA', '1583', '-0.01227', '-19.42'],
 ['CCAwildfirefundcharge', '1583', '0.00595', '9.42'],
 ['CTC', '1583', '-0.00058', '-0.92'],
 ['Fixedrecoverycharge', '1583', '0.00198', '3.13']]

In [13]:
# Scratch check of the surcharge pattern on a single page. The dot is escaped
# so it only matches a literal decimal point, not any character.
re.findall(r'(GenerationMunicipalSurcharge) \$(\d+\.\d+)', page4)

[('GenerationMunicipalSurcharge', '1.43')]

In [14]:
def extract_generation_municipal_surcharges(pages):
    """Extract the generation municipal surcharge entries from the bill text.

    Matches entries shaped like ``GenerationMunicipalSurcharge $1.43``.

    Args:
        pages: Iterable of page texts, one string per page. Entries that are
            ``None`` or empty are skipped.

    Returns:
        A list of ``[chargetype, amount]`` rows (both strings) in the order
        found. Thousands separators are stripped from ``amount``.
    """
    # Escaped decimal point plus optional thousands separators: with a bare
    # `.` an amount such as "$1,143.20" was captured as the wrong "1,143".
    pattern = re.compile(r'(GenerationMunicipalSurcharge) \$(\d[\d,]*\.\d+)')
    data_rows = []
    for page in pages:
        if not page:
            continue
        for chargetype, amount in pattern.findall(page):
            data_rows.append([chargetype, amount.replace(',', '')])
    return data_rows

In [15]:
# Run the municipal-surcharge extractor over every page and display the rows.
generation_municipal_surcharge = extract_generation_municipal_surcharges(pages)
generation_municipal_surcharge

[['GenerationMunicipalSurcharge', '1.43']]

In [16]:
def extract_generation_usage_charges(pages):
    """Extract the time-of-use generation usage charges from the bill text.

    Matches entries shaped like
    ``100%GreenPower-On-Peak- 138.4kWh@0.47105 $65.19``. The kWh figure may be
    a whole number (``1276kWh``, ``53kWh``) or a decimal (``138.4kWh``).

    Args:
        pages: Iterable of page texts, one string per page. Entries that are
            ``None`` or empty are skipped.

    Returns:
        A list of ``[tou, kwh, rate, amount]`` rows (all strings) in the order
        found, where ``tou`` is the label ending in ``-Peak``. Thousands
        separators are stripped from ``kwh`` and ``amount``.
    """
    # - The label is matched lazily so two entries on one line stay separate.
    # - The fractional part of kWh is optional; previously whole numbers only
    #   matched by accident through an unescaped `.`, and only when they had
    #   three or more digits.
    # - Decimal points in rate and amount are escaped.
    pattern = re.compile(
        r'100%GreenPower-(.*?-Peak)- (\d[\d,]*(?:\.\d+)?)kWh@'
        r'(\d+\.\d+) \$(\d[\d,]*\.\d+)'
    )
    data_rows = []
    for page in pages:
        if not page:
            continue
        for tou, kwh, rate, amount in pattern.findall(page):
            data_rows.append([tou, kwh.replace(',', ''), rate, amount.replace(',', '')])
    return data_rows

In [17]:
# Run the generation-usage extractor over every page and display the rows.
generation_usage_charges = extract_generation_usage_charges(pages)
generation_usage_charges

[['On-Peak', '138.4', '0.47105', '65.19'],
 ['Mid-Peak', '53.2', '0.24747', '13.17'],
 ['Off-Peak', '1276', '0.08431', '107.58'],
 ['On-Peak', '17.2', '0.46098', '7.93'],
 ['Off-Peak', '97.6', '0.08578', '8.37']]

In [18]:
# Scratch check of the surcharge pattern on a single page. The dot is escaped
# so it only matches a literal decimal point, not any character.
re.findall(r'(EnergySurcharge) \$(\d+\.\d+)', page5)

[('EnergySurcharge', '0.47')]

In [19]:
def extract_energy_surcharge(pages):
    """Extract the energy surcharge entries from the bill text.

    Matches entries shaped like ``EnergySurcharge $0.47``.

    Args:
        pages: Iterable of page texts, one string per page. Entries that are
            ``None`` or empty are skipped.

    Returns:
        A list of ``[charge, amount]`` rows (both strings) in the order found.
        Thousands separators are stripped from ``amount``.
    """
    # Escaped decimal point plus optional thousands separators, so large
    # amounts are captured whole instead of being cut at the comma.
    pattern = re.compile(r'(EnergySurcharge) \$(\d[\d,]*\.\d+)')
    data_rows = []
    for page in pages:
        if not page:
            continue
        for charge, amount in pattern.findall(page):
            data_rows.append([charge, amount.replace(',', '')])
    return data_rows

In [20]:
# Run the energy-surcharge extractor over every page and display the rows.
extract_energy_surcharge(pages)

[['EnergySurcharge', '0.47']]

In [21]:
def compute_total(pages):
    """Return the sum of every extracted charge amount on the bill.

    Runs each of the six extractor functions on ``pages`` and adds up the
    last element (the amount string) of every row they return.

    Args:
        pages: Iterable of page texts, passed unchanged to each extractor.

    Returns:
        The total as a ``float``.
    """
    # Function-scope import keeps this cell self-contained.
    from decimal import Decimal

    # Every extractor is called here and its own result is summed. The
    # earlier version stored one result under a misspelled local name and
    # then summed a same-looking global left behind by another cell.
    extractors = (
        extract_delivery_usage_charges,
        extract_basic_charge,
        extract_delivery_surcharges,
        extract_generation_municipal_surcharges,
        extract_generation_usage_charges,
        extract_energy_surcharge,
    )
    # Amounts are decimal strings, so add them exactly and convert once at
    # the end; summing floats yields artefacts such as 627.1800000000001.
    total = sum(
        (Decimal(row[-1]) for extract in extractors for row in extract(pages)),
        Decimal('0'),
    )
    return float(total)

In [22]:
# Display the sum of all extracted charge amounts for this bill.
compute_total(pages)

627.1800000000001

In [32]:
def extract_prepared_date(pages):
    """Find the date(s) that follow a 12-digit number in the bill text.

    All pages are joined with single spaces and searched for a run of twelve
    digits, a space, and a date written as ``NN/NN/NN`` (e.g. ``07/09/25``).
    NOTE(review): the 12-digit run is presumably an account or statement
    number - confirm against the bill layout.

    Args:
        pages: Iterable of page texts, one string per page. ``None`` entries
            are treated as empty text.

    Returns:
        The list of date strings captured by the first pattern that matches
        anything, or ``None`` when no pattern matches.
    """
    all_text = ' '.join(page or '' for page in pages)
    date_patterns = [
        r'\d{12} (\d{2}/\d{2}/\d{2})',
    ]
    for pattern in date_patterns:
        # One consistent local name: the earlier code assigned `match` but
        # tested `matches`, which only resolved if a stale global existed.
        matches = re.findall(pattern, all_text)
        if matches:
            return matches
    return None

In [34]:
# Look up the bill date in the extracted text and display the result.
prepared_date = extract_prepared_date(pages)
prepared_date

['07/09/25']

## Refactor with dataclass

In [36]:
from dataclasses import dataclass
from typing import Optional
import csv
from datetime import datetime

@dataclass
class BillLineItem:
    """One charge line of a bill, convertible to a CSV row.

    The optional fields (``quantity``, ``unit``, ``rate``) default to ``None``,
    which :meth:`to_csv_row` writes as an empty cell.
    """

    bill_date: str
    category: str
    description: str
    quantity: Optional[float] = None  # None -> empty cell in the CSV row
    unit: Optional[str] = None        # None -> empty cell in the CSV row
    rate: Optional[float] = None      # None -> empty cell in the CSV row
    amount: float = 0.0

    def to_csv_row(self):
        """Return the seven CSV cell values for this line item.

        Column order: bill_date, category, description, quantity, unit, rate,
        amount. Optional fields are blanked only when they are ``None``, so a
        real ``0`` / ``0.0`` quantity or rate is written as-is.
        """
        # `is None` rather than truthiness: `x or ''` would also blank zeros.
        optional_cells = [
            '' if value is None else value
            for value in (self.quantity, self.unit, self.rate)
        ]
        return [self.bill_date, self.category, self.description, *optional_cells, self.amount]