In [3]:
import asyncio
from playwright.async_api import async_playwright

docs_actual = []

async def scrape_energy_all_pages() -> list[str]:
    all_texts = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        # 1) Navigate & dismiss the popup
        await page.goto(
            "https://advance.lexis.com/container?config=0345494EJAA5ZjE0MDIyYy1kNzZkLTRkNzktYTkxMS04YmJhNjBlNWUwYzYKAFBvZENhdGFsb2e4CaPI4cak6laXLCWyLBO9&crid=71f400f1-686d-4c50-8ecc-7711eca7c5a8",
            wait_until="networkidle"
        )
        await page.click("input#btnagreeterms", timeout=10000)

        # 2) Search “energy”
        await page.fill("textarea#searchTerms", "energy")
        await asyncio.gather(
            page.press("textarea#searchTerms", "Enter"),
            page.wait_for_load_state("networkidle")
        )

        while True:
            # wait for this page’s hits to render
            await page.wait_for_selector("li.usview", timeout=20000)
            rows = page.locator("li.usview")
            count = await rows.count()
            print(f"[+] Found {count} hits on this page")

            for i in range(count):
                row = rows.nth(i)

                # grab the teaser‐page link in <p class="min vis">
                link = row.locator("p.min.vis a").first
                await asyncio.gather(
                    link.click(),
                    page.wait_for_load_state("networkidle")
                )

                # extract the full document text
                await page.wait_for_selector("section#document", timeout=10000)
                full = await page.inner_text("section#document")
                all_texts.append(full)
                docs_actual.append(full)

                # go back to the hits list
                await asyncio.gather(
                    page.go_back(),
                    page.wait_for_load_state("networkidle")
                )

            # now try to click “Next”
            await page.wait_for_timeout(1000)
            page.wait_for_load_state("networkidle", timeout=10000)
            next_btn = page.locator("nav.pagination >> a:has-text('Next')")
            if await next_btn.count() and await next_btn.is_visible():
                print("[+] Clicking Next →")
                await asyncio.gather(
                    next_btn.first.click(),
                    page.wait_for_load_state("networkidle")
                )
                print("[+] Now on the next page!")
                # pause for 1 second
            await page.wait_for_timeout(1000)
            page.wait_for_load_state("networkidle", timeout=10000)

            
            # wait until all old rows are detached
            #await page.wait_for_selector("li.usview", state="detached", timeout=10000)
            # then wait for a fresh set of hits to appear
            #await page.wait_for_selector("li.usview", timeout=20000)

        await browser.close()
    return all_texts

# In your Jupyter cell:
docs = await scrape_energy_all_pages()
print(f"🏁 Scraped {len(docs)} documents total")


[+] Found 10 hits on this page


  page.wait_for_load_state("networkidle", timeout=10000)


[+] Clicking Next →
[+] Now on the next page!


  page.wait_for_load_state("networkidle", timeout=10000)


[+] Found 10 hits on this page
[+] Clicking Next →
[+] Now on the next page!
[+] Found 10 hits on this page
[+] Clicking Next →
[+] Now on the next page!
[+] Found 10 hits on this page
[+] Clicking Next →
[+] Now on the next page!
[+] Found 10 hits on this page
[+] Clicking Next →
[+] Now on the next page!
[+] Found 10 hits on this page
[+] Clicking Next →
[+] Now on the next page!
[+] Found 10 hits on this page
[+] Clicking Next →
[+] Now on the next page!
[+] Found 10 hits on this page
[+] Clicking Next →
[+] Now on the next page!
[+] Found 10 hits on this page
[+] Clicking Next →
[+] Now on the next page!
[+] Found 10 hits on this page
[+] Clicking Next →
[+] Now on the next page!
[+] Found 10 hits on this page
[+] Clicking Next →
[+] Now on the next page!
[+] Found 10 hits on this page
[+] Clicking Next →
[+] Now on the next page!
[+] Found 10 hits on this page
[+] Clicking Next →
[+] Now on the next page!
[+] Found 10 hits on this page
[+] Clicking Next →
[+] Now on the next page!

CancelledError: 

In [5]:
import pandas as pd

# for a flat list:
H = list(set(docs_actual))
df = pd.DataFrame(H, columns=["Document Text"])
df.to_csv("colorado_energy_data_unfiltered.csv", index=False)

# # or for list-of-lists with headers:
# df = pd.DataFrame(rows[1:], columns=rows[0])
# df.to_csv("output.csv", index=False)
