In [5]:
from playwright.async_api import async_playwright

async def extract_michigan_energy_links(search_term="energy", *, headless=False):
    """Scrape result links from the Michigan Legislature MCL full-text search.

    Opens the MCL search page in Chromium, submits ``search_term`` through the
    full-text input, and walks every result page by clicking the "Next"
    pagination link until no such link is found.

    Args:
        search_term: Text typed into the full-text search box. Defaults to
            "energy", the term the function was originally written for.
        headless: Whether to launch Chromium without a visible window.
            Defaults to False (visible window, convenient for debugging).

    Returns:
        A list of ``(link_text, href)`` tuples, one per anchor found in the
        result tables, in page order. Site-relative hrefs (starting with "/")
        are prefixed with the site's base URL. ``href`` is None for an anchor
        that has no ``href`` attribute. Duplicates are not removed.

    Raises:
        playwright errors (e.g. a timeout) if the search input or the result
        table does not appear in time. The browser is closed in that case too.
    """
    results = []
    base_url = "https://www.legislature.mi.gov"
    search_input = "input[name='contentFullText']"
    result_links = "tbody tr td a"
    next_link = "a[aria-label='Next']"

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        try:
            page = await browser.new_page()

            # Step 1: open the search page and wait for the search box.
            await page.goto(f"{base_url}/Laws/MCLSearch", wait_until="load")
            await page.wait_for_selector(search_input, timeout=60000)

            # Step 2: submit the query by typing it and pressing Enter.
            await page.fill(search_input, search_term)
            await page.press(search_input, "Enter")

            # Step 3: wait for a table body (where the results are rendered).
            await page.wait_for_selector("tbody", timeout=10000)

            # Step 4: harvest links page by page.
            while True:
                for link in await page.query_selector_all(result_links):
                    href = await link.get_attribute("href")
                    text = await link.inner_text()

                    # Turn site-relative URLs into absolute ones.
                    if href and href.startswith("/"):
                        href = base_url + href

                    results.append((text.strip(), href))

                next_button = await page.query_selector(next_link)
                if not next_button:
                    # No "Next" link: this was the last page.
                    break

                await next_button.click()
                # NOTE(review): fixed pause rather than a condition-based
                # wait; if a page takes longer than 3 s to refresh, the same
                # rows may be read twice. Confirm against the live site
                # before replacing it with a selector/state wait.
                await page.wait_for_timeout(3000)
        finally:
            # Close the browser even when a wait above times out or raises.
            await browser.close()

    return results

# Run the scraper from a Jupyter cell. Top-level `await` works here because
# the notebook already runs an event loop; in a plain script this call would
# have to go through asyncio.run(...) instead.
section_links = await extract_michigan_energy_links()
print(section_links[:5])  # Preview the first 5 (text, url) pairs
print(f"Total links: {len(section_links)}")  # Count before any de-duplication


[('Act 38 of 1979', 'https://www.legislature.mi.gov/Home/GetObject?objectName=mcl-Act-38-of-1979&highlight=energy&queryID=170095218'), ('Act 191 of 1982', 'https://www.legislature.mi.gov/Home/GetObject?objectName=mcl-Act-191-of-1982&highlight=energy&queryID=170095218'), ('Act 625 of 2012', 'https://www.legislature.mi.gov/Home/GetObject?objectName=mcl-Act-625-of-2012&highlight=energy&queryID=170095218'), ('Act 230 of 1972', 'https://www.legislature.mi.gov/Home/GetObject?objectName=mcl-Act-230-of-1972&highlight=energy&queryID=170095218'), ('Act 593 of 2002', 'https://www.legislature.mi.gov/Home/GetObject?objectName=mcl-Act-593-of-2002&highlight=energy&queryID=170095218')]
Total links: 583


In [6]:
import pandas as pd

# Drop duplicate (statute, link) pairs while keeping first-seen order.
# set() would also de-duplicate, but its iteration order for string tuples
# changes between interpreter runs (string hashing is randomized), which
# made the row order of the exported CSV non-reproducible.
H = list(dict.fromkeys(section_links))

# One row per unique result, tagged with the state it was scraped from.
df = pd.DataFrame(H, columns=["Statute", "Document Link"])
df["state"] = "Michigan"
df.to_csv("michigan_energy_data_unfiltered.csv", index=False)