In [1]:
from playwright.async_api import async_playwright

async def extract_missouri_statute_links(search_term="energy", headless=False):
    """Search the Missouri Revisor of Statutes site and collect statute links.

    Opens the site's home page in Chromium, types ``search_term`` into the
    phrase search box, submits it, and gathers the ``href`` of every anchor
    on the result page that points at ``OneSection.aspx`` or
    ``PageSelect.aspx``.

    Args:
        search_term: Phrase typed into the search box. Defaults to
            ``"energy"``, the value that was previously hard-coded.
        headless: Passed to ``chromium.launch``. Defaults to ``False``
            (a visible browser window), matching the previous behavior.

    Returns:
        A list of absolute URLs (strings) in page order. Duplicates are not
        removed.

    Raises:
        Whatever Playwright raises when navigation fails or a selector does
        not appear within its timeout. The browser is closed in that case too.
    """
    # Local import so this cell does not depend on edits to the import line.
    from urllib.parse import urljoin

    search_box = "input[name='ctl00$tbPhrase1']"
    statute_anchor = "a[href*='OneSection.aspx'], a[href*='PageSelect.aspx']"

    results = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        try:
            page = await browser.new_page()

            # Navigate to the Missouri Statutes homepage.
            await page.goto("https://revisor.mo.gov/main/Home.aspx", wait_until="domcontentloaded")

            # Wait for the search box, fill it in, and submit with Enter.
            await page.wait_for_selector(search_box, timeout=30000)
            await page.fill(search_box, search_term)
            await page.press(search_box, "Enter")

            # Wait until at least one statute-style link is present.
            await page.wait_for_selector(statute_anchor, timeout=60000)

            # Resolve each href against the current page URL so that
            # root-relative, document-relative and already-absolute hrefs
            # all produce a valid absolute link.
            base_url = page.url
            for anchor in await page.query_selector_all(statute_anchor):
                href = await anchor.get_attribute("href")
                if href:
                    results.append(urljoin(base_url, href))
        finally:
            # Always release the browser, even if a wait above timed out.
            await browser.close()
    return results

# Run the scraper directly from the notebook cell. The bare top-level `await`
# relies on the notebook kernel accepting `await` at cell level; in a plain
# Python script this call would have to be driven by an event loop instead.
section_links = await extract_missouri_statute_links()
# Report how many links were collected.
print(f"Extracted {len(section_links)} links.")


Extracted 227 links.


In [2]:
import pandas as pd
from urllib.parse import parse_qs, urlsplit

OUTPUT_CSV = "missouri_energy_data_unfiltered.csv"


def _statute_number(link):
    """Return the value of the ``section`` query parameter of ``link``, or None.

    Parsing the query string (rather than splitting on the text "section=")
    avoids an IndexError on links that lack the parameter and avoids matching
    the text inside some other parameter's name or value.
    """
    values = parse_qs(urlsplit(link).query).get("section")
    return values[0] if values else None


# De-duplicate the scraped links while keeping their first-seen order, so the
# CSV rows come out in the same order on every run (a plain set() does not
# guarantee that for strings).
H = list(dict.fromkeys(section_links))

# Build (Statute, Document Link) rows; links without a section number are
# skipped instead of aborting the whole export.
data = []
skipped_links = []
for link in H:
    statute_number = _statute_number(link)
    if statute_number is None:
        skipped_links.append(link)
        continue
    data.append((statute_number, link))

if skipped_links:
    print(f"Skipped {len(skipped_links)} link(s) without a 'section' parameter.")

# Convert to a DataFrame and tag every row with the state.
df = pd.DataFrame(data, columns=["Statute", "Document Link"])
df['state'] = 'Missouri'

# Save to CSV
df.to_csv(OUTPUT_CSV, index=False)

print(f"Data saved to '{OUTPUT_CSV}'.")


Data saved to 'missouri_energy_data_unfiltered.csv'.
