In [112]:
import asyncio
import re
from playwright.async_api import async_playwright

# Helper to extract formatted links from a single run
async def extract_energy_links_from_page_range(start_page, page_limit=50):
    results = []
    seen = set()
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()
        await page.goto("https://leginfo.legislature.ca.gov/faces/codes.xhtml", wait_until="domcontentloaded")

        # Navigate to Text Search tab
        await page.click("text=Text Search")
        await page.wait_for_selector("input#codeSearchForm\\:or_one")

        # Fill out the form
        await page.fill("input#codeSearchForm\\:or_one", "energy")
        await page.check("input[type='checkbox']", strict=False)
        await page.click("input[type='submit'][value='Search']")
        await page.wait_for_selector("table.table_main")

        # Jump to start_page if necessary
        # Jump to start page if not 1
        if start_page > 1:
            await page.fill("input#datanavform\\:go_to_page", str(start_page))
            await page.click("input[name='datanavform:gotopage']")
            await page.wait_for_selector("table.table_main")
            await asyncio.sleep(1)

        # Extract for page_limit pages
        current_page = start_page
        while current_page < start_page + page_limit:
            rows = await page.query_selector_all("table.table_main a[onclick]")
            for a in rows:
                onclick = await a.get_attribute("onclick")
                if onclick and "callRedirect" in onclick:
                    args = re.findall(r"'([^']*)'", onclick)
                    if len(args) >= 3:
                        code, section = args[0], args[2]
                        article = args[-1]  # use last value
                        full_url = f"https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode={code}&sectionNum={section}&article={article}&highlight=true&keyword=energy"
                        if full_url not in seen:
                            results.append(full_url)
                            seen.add(full_url)

            next_button = await page.query_selector("input[type='submit'][value='Next 10 Sections >>']")
            if not next_button or await next_button.is_disabled():
                break
            await next_button.click()
            await page.wait_for_selector("table.table_main")
            await page.wait_for_timeout(1000)
            current_page += 1

        await browser.close()
        return results, current_page

# Repeatedly call until end is reached
async def loop_through_all_pages():
    all_links = []
    start = 1
    while True:
        print(f"🔄 Starting from page {start}")
        links, last_page = await extract_energy_links_from_page_range(start, page_limit=50)
        all_links.extend(links)
        print(f"✅ Pages {start} to {last_page - 1}: {len(links)} new links")

        if start == last_page:
            return all_links
            break
        start = last_page
    return all_links

# To run in notebook:
section_links = await loop_through_all_pages()
print(section_links[:5])
print(f"Total links: {len(section_links)}")


🔄 Starting from page 1
✅ Pages 1 to 50: 500 new links
🔄 Starting from page 51
✅ Pages 51 to 100: 499 new links
🔄 Starting from page 101
✅ Pages 101 to 150: 493 new links
🔄 Starting from page 151
✅ Pages 151 to 155: 58 new links
🔄 Starting from page 156
✅ Pages 156 to 155: 8 new links
['https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode=FIN&sectionNum=32201.&article=3.&highlight=true&keyword=energy', 'https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode=LAB&sectionNum=1720.6.&article=1.&highlight=true&keyword=energy', 'https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode=FIN&sectionNum=32209.&article=3.&highlight=true&keyword=energy', 'https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode=PCC&sectionNum=10709.&article=1.&highlight=true&keyword=energy', 'https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode=PUC&sectionNum=388.&article=9.&highlight=true&keyword=energy']
Total

In [128]:
len(section_links)

1558

In [130]:
section_links

['https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode=FIN&sectionNum=32201.&article=3.&highlight=true&keyword=energy',
 'https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode=LAB&sectionNum=1720.6.&article=1.&highlight=true&keyword=energy',
 'https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode=FIN&sectionNum=32209.&article=3.&highlight=true&keyword=energy',
 'https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode=PCC&sectionNum=10709.&article=1.&highlight=true&keyword=energy',
 'https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode=PUC&sectionNum=388.&article=9.&highlight=true&keyword=energy',
 'https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode=PUC&sectionNum=2801.&article=1.&highlight=true&keyword=energy',
 'https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode=SHC&sectionNum=157.1.&article=3.7.&highlight=true&keyword=energy',

In [139]:
import pandas as pd

# for a flat list:
H = list(set(section_links))
df = pd.DataFrame(H, columns=["Link_to_Statute"])
df.to_csv("california_energy_data_unfiltered.csv", index=False)

# # or for list-of-lists with headers:
# df = pd.DataFrame(rows[1:], columns=rows[0])
# df.to_csv("output.csv", index=False)
