In [19]:
import re
from typing import Union
import ollama
import asyncio
from playwright.async_api import async_playwright
import pandas as pd

In [58]:
def extract_history_marker(text: str) -> Union[int, str, None]:
    """
    Classify a block of statute text by its history marker.

    The checks run in this order and the first one that matches wins:
      1. The first 'Ga. L. YYYY' citation -> the four-digit year as an int.
      2. A '[Reserved.]' tag (case-insensitive) -> the string 'Reserved'.
      3. A '[Repealed.]' tag (case-insensitive) -> the string 'Repealed'.
      4. Nothing matched -> None.

    Args:
        text: Full text of one statute document.

    Returns:
        An int year, the string 'Reserved' or 'Repealed', or None.
    """
    # 1) Try to find the first Ga. L. YYYY citation.
    m = re.search(r'Ga\.\s*L\.\s*(\d{4})', text)
    if m:
        return int(m.group(1))

    # 2) Fallbacks -- only reached when no year was found.
    if re.search(r'\[Reserved\.\]', text, re.IGNORECASE):
        return "Reserved"
    if re.search(r'\[Repealed\.\]', text, re.IGNORECASE):
        return "Repealed"

    return None

def trim_to_body(text: str) -> str:
    """
    Strip the leading metadata from a statute text.

    Everything up to and including the first run of three consecutive
    newlines is discarded and the remainder is returned.  A text without
    such a run is returned unchanged.
    """
    # Fast path: a literal run of three line feeds.
    _head, separator, body = text.partition("\n\n\n")
    if separator:
        return body

    # Slow path: the same idea via regex, which also accepts CRLF endings.
    gap = re.search(r'(?:\r?\n){3,}', text)
    if gap is None:
        return text
    return text[gap.end():]

In [6]:
import asyncio
import csv
from playwright.async_api import async_playwright

# Module-level accumulator: scrape_energy_all_pages() appends every captured
# document text (or "" for a failed hit) here as it goes, in addition to its
# own return list, so the texts stay reachable if the coroutine never returns.
docs_actual: list[str] = []

async def scrape_energy_all_pages() -> list[str]:
    """
    Scrape the full text of every Lexis search hit for "energy".

    Opens the Lexis container page in a visible (non-headless) Chromium
    window, submits the search term, then walks each results page: every hit
    is opened, the text of `section#document` is captured, and the browser
    navigates back to the results list.  Paging continues until no visible
    "next page" link remains.

    Each captured text is appended both to the returned list and to the
    module-level `docs_actual` list.

    Returns:
        One string per visited hit, in visit order.  A hit that could not be
        opened or read contributes an empty string as a placeholder.

    Raises:
        Whatever Playwright raises outside the per-hit handling (for example
        a timeout while waiting for the results list).  The browser is closed
        before the error propagates.
    """
    all_texts: list[str] = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            page = await browser.new_page()

            # 1) Navigate & dismiss popup if needed
            await page.goto(
                "https://advance.lexis.com/container?config=00JAAzZDgzNzU2ZC05MDA0LTRmMDItYjkzMS0xOGY3MjE3OWNlODIKAFBvZENhdGFsb2fcIFfJnJ2IC8XZi1AYM4Ne&crid=d1ef0e4a-f560-4a3f-bca1-09d66269998b",
                wait_until="networkidle"
            )
            # (uncomment if you get a popup)
            # await page.click("input#btnagreeterms", timeout=10000)

            # 2) Search "energy"
            await page.wait_for_selector("textarea#searchTerms", timeout=10000)
            await page.fill("textarea#searchTerms", "energy")
            await asyncio.gather(
                page.press("textarea#searchTerms", "Enter"),
                page.wait_for_load_state("networkidle")
            )

            while True:
                # Wait for this page's hits to render.
                await page.wait_for_selector("li.usview", timeout=20000)
                rows = page.locator("li.usview")
                count = await rows.count()
                print(f"[+] Found {count} hits on this page")

                for i in range(count):
                    row = rows.nth(i)
                    # True once the click has gone through, i.e. the page may
                    # no longer be showing the results list.
                    opened = False
                    # Stays "" (the placeholder) unless extraction succeeds.
                    full = ""
                    try:
                        print(f"    → Opening hit {i+1}/{count}")
                        teaser = row.locator("p.min.vis a").first
                        # Click first, then wait: a failure in the wait must
                        # still be recognised as "we left the results page".
                        await teaser.click()
                        opened = True
                        await page.wait_for_load_state("networkidle", timeout=10000)

                        # Extract the full document text.
                        await page.wait_for_selector("section#document", timeout=10000)
                        full = await page.inner_text("section#document")
                    except Exception as e:
                        print(f"    [ERROR] Hit {i+1} failed: {e}")

                    all_texts.append(full)
                    docs_actual.append(full)

                    # Return to the results list whether or not extraction
                    # worked; otherwise every remaining hit on this page would
                    # be looked up on the document view and fail as well.
                    if opened:
                        try:
                            await asyncio.gather(
                                page.go_back(),
                                page.wait_for_load_state("networkidle")
                            )
                        except Exception as e:
                            print(f"    [ERROR] Could not return to results after hit {i+1}: {e}")

                # --- try to advance to next page ---
                await asyncio.sleep(1)
                next_btn = page.locator("nav.pagination >> a[data-action='nextpage']").first
                has_next = await next_btn.count() > 0 and await next_btn.is_visible()
                print(f"[>] Next‐button visible? {has_next}")

                if not has_next:
                    print("[+] No more pages; done.")
                    break

                print("[+] Clicking Next →")
                await asyncio.gather(
                    next_btn.click(),
                    page.wait_for_load_state("networkidle")
                )
        finally:
            # Close the browser on every exit path, including errors.
            await browser.close()
    return all_texts

# Run the scraper from a notebook cell. Top-level `await` is valid here
# because Jupyter/IPython executes cells inside a running event loop.
# If the call is interrupted before it returns, `docs` is never assigned.
docs = await scrape_energy_all_pages()

[+] Found 10 hits on this page
    → Opening hit 1/10
    → Opening hit 2/10
    → Opening hit 3/10
    → Opening hit 4/10
    → Opening hit 5/10
    → Opening hit 6/10
    → Opening hit 7/10
    → Opening hit 8/10
    → Opening hit 9/10
    → Opening hit 10/10
[>] Next‐button visible? True
[+] Clicking Next →
[+] Found 10 hits on this page
    → Opening hit 1/10
    → Opening hit 2/10
    → Opening hit 3/10
    → Opening hit 4/10
    → Opening hit 5/10
    → Opening hit 6/10
    → Opening hit 7/10
    → Opening hit 8/10
    → Opening hit 9/10
    → Opening hit 10/10
[>] Next‐button visible? True
[+] Clicking Next →
[+] Found 10 hits on this page
    → Opening hit 1/10
    → Opening hit 2/10
    → Opening hit 3/10
    → Opening hit 4/10
    → Opening hit 5/10
    → Opening hit 6/10
    → Opening hit 7/10
    → Opening hit 8/10
    → Opening hit 9/10
    → Opening hit 10/10
[>] Next‐button visible? True
[+] Clicking Next →
[+] Found 10 hits on this page
    → Opening hit 1/10
    → Open

CancelledError: 

In [11]:
# De-duplicate the scraped texts while keeping first-seen order.  A plain
# set() orders strings by hash, which varies between kernel sessions and
# would make the row order of the CSV non-reproducible.
H = list(dict.fromkeys(docs_actual))
df = pd.DataFrame(H, columns=["Document Text"])
df.to_csv("georgia_energy_data_unfiltered.csv", index=False)

In [66]:
# Reload the raw scrape and drop rows whose text is missing.
df = pd.read_csv('georgia_energy_data_unfiltered.csv')
df = df.loc[df['Document Text'].notna()].copy()

# Tag every document with its history marker, then drop the ones that have
# none.  Series.map works column-wise, so it also behaves on an empty frame,
# unlike a row-wise DataFrame.apply(axis=1).
df['year'] = df['Document Text'].map(extract_history_marker)
df = df.loc[df['year'].notna()].copy()

# Strip the metadata header so only the statute body remains.
df['Document Text'] = df['Document Text'].map(trim_to_body)

In [68]:
def _normalize_year(year):
    """Pass string markers through; return numeric years as int, mapping values below 100 to 19YY."""
    if isinstance(year, str):
        return year
    # The column may have been promoted to float upstream; write whole years.
    year = int(year)
    return 1900 + year if year < 100 else year


df['location'] = 'Georgia'
df['year'] = df['year'].map(_normalize_year)
df.to_csv("georgia_energy_data.csv", index=False)