In [None]:
import re
from typing import Union
import ollama
import asyncio
from playwright.async_api import async_playwright
import pandas as pd

In [168]:
def extract_history_marker(text: str) -> Union[int, str, None]:
    """
    Classify a block of statute text by the first marker that applies.

    Checks are made in this order and the first hit wins:
      * an ``Acts YYYY`` citation (the word "Acts", whitespace, four digits)
        anywhere in ``text`` -> the year of the first such citation, as an int
      * a ``[Reserved.]`` tag, in any letter case -> the string "Reserved"
      * a ``[Repealed.]`` tag, in any letter case -> the string "Repealed"
      * none of the above -> None
    """
    acts_hit = re.search(r'Acts\s+(\d{4})', text)
    if acts_hit is not None:
        return int(acts_hit.group(1))

    # Bracketed status tags, listed in priority order; only consulted when
    # no year was found above.
    status_tags = (
        (r'\[Reserved\.\]', "Reserved"),
        (r'\[Repealed\.\]', "Repealed"),
    )
    for tag_pattern, label in status_tags:
        if re.search(tag_pattern, text, re.IGNORECASE):
            return label

    return None

def trim_to_body(text: str) -> str:
    """
    Drop the leading metadata from a statute text.

    The metadata is taken to be everything up to and including the first
    run of three blank-line newlines. If the text contains no such run,
    it is returned unchanged.
    """
    # Fast path: a literal run of three LF characters. Only those three
    # characters are consumed, so any further newlines stay in the body.
    _, separator, body = text.partition("\n\n\n")
    if separator:
        return body

    # Slow path: three or more line breaks where some or all are CRLF.
    # Here the entire run of line breaks is consumed.
    gap = re.search(r'(?:\r?\n){3,}', text)
    if gap is None:
        return text
    return text[gap.end():]

def ollama_chat(text):
    """
    Ask the 'mistral' model (via ``ollama.chat``) whether a statute text
    contains useful information on renewable energy incentives.

    A single user-role message is sent: a fixed instruction followed
    immediately by ``text``. The prompt asks for a one-word yes/no answer,
    but the model's reply is returned exactly as received, without any
    parsing or validation.

    Returns:
        The ``['message']['content']`` field of the chat response.
    """
    prompt = (
        "You are a legislation analyst. "
        "Does the following statute text contain useful information on renewable energy incentives? Please answer in one word yes or no:"
        f"{text}"
    )
    user_message = {'role': 'user', 'content': prompt}
    reply = ollama.chat(model='mistral', messages=[user_message])
    return reply['message']['content']

In [None]:
# Module-level accumulator. The scraper appends every document here as soon
# as it is read, so whatever was collected is still available if the coroutine
# raises or is interrupted part-way through. Re-running this cell resets it.
docs_actual = []


async def scrape_energy_all_pages() -> list[str]:
    """
    Scrape the full text of every hit returned by searching "energy" on the
    Lexis Advance container page, walking the paginated result list until no
    visible "Next" link remains.

    For each result page, every ``li.usview`` hit is opened in turn, the inner
    text of ``section#document`` is captured, and the browser navigates back
    to the hit list.

    Side effects:
        * Launches a visible (non-headless) Chromium window.
        * Appends each document to the module-level ``docs_actual`` list in
          addition to the returned list.
        * Prints the number of hits found on each result page.

    Returns:
        list[str]: one entry per visited hit, in visit order.

    Raises:
        Any Playwright error (for example a timeout from one of the waits)
        propagates to the caller; the browser is closed first.
    """
    all_texts = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            page = await browser.new_page()

            # 1) Navigate & dismiss the popup
            await page.goto(
                "https://advance.lexis.com/container?"
                "config=00JAA3ZTU0NTIzYy0zZDEyLTRhYmQtYmRmMS1iMWIxNDgxYWMxZTQK"
                "AFBvZENhdGFsb2cubRW4ifTiwi5vLw6cI1uX&crid=2fa8a237-2181-4f96-b629-ad5ee4a9cc92",
                wait_until="networkidle"
            )
            await page.click("input#btnagreeterms", timeout=10000)

            # 2) Search "energy"
            await page.fill("textarea#searchTerms", "energy")
            await asyncio.gather(
                page.press("textarea#searchTerms", "Enter"),
                page.wait_for_load_state("networkidle")
            )

            while True:
                # Wait for this page's hits to render.
                await page.wait_for_selector("li.usview", timeout=20000)
                rows = page.locator("li.usview")
                count = await rows.count()
                print(f"[+] Found {count} hits on this page")

                for i in range(count):
                    # Locators are resolved lazily, so rows.nth(i) is looked up
                    # afresh after each go_back() below.
                    row = rows.nth(i)

                    # Grab the teaser-page link in <p class="min vis">.
                    link = row.locator("p.min.vis a").first
                    await asyncio.gather(
                        link.click(),
                        page.wait_for_load_state("networkidle")
                    )

                    # Extract the full document text.
                    await page.wait_for_selector("section#document", timeout=10000)
                    full = await page.inner_text("section#document")
                    all_texts.append(full)
                    docs_actual.append(full)

                    # Go back to the hits list.
                    await asyncio.gather(
                        page.go_back(),
                        page.wait_for_load_state("networkidle")
                    )

                # Let the hit list settle before looking for pagination.
                # wait_for_load_state is a coroutine in the async API and has
                # no effect unless awaited.
                await page.wait_for_timeout(1000)
                await page.wait_for_load_state("networkidle", timeout=10000)

                # Use .first for every call so several matching links cannot
                # trip Playwright's single-element (strict) checks.
                next_btn = page.locator("nav.pagination >> a:has-text('Next')").first
                if not (await next_btn.count() and await next_btn.is_visible()):
                    # No further result page: stop instead of re-scraping the
                    # current page forever.
                    # NOTE(review): if the site keeps a visible but inert
                    # "Next" link on the last page this check will not detect
                    # it - confirm against the live markup.
                    break

                await asyncio.gather(
                    next_btn.click(),
                    page.wait_for_load_state("networkidle")
                )
                await page.wait_for_timeout(1000)
                await page.wait_for_load_state("networkidle", timeout=10000)
        finally:
            # Always release the browser, including when a wait times out.
            await browser.close()
    return all_texts

# Run the scraper. Top-level `await` is not valid in a plain Python script;
# this line is meant to be executed in a Jupyter cell. The same texts are also
# appended to the module-level `docs_actual` list by the scraper itself.
docs = await scrape_energy_all_pages()
print(f"🏁 Scraped {len(docs)} documents total")


In [243]:
# De-duplicate the scraped documents, keeping the first occurrence of each in
# scrape order. dict.fromkeys preserves insertion order, whereas the iteration
# order of a set of strings can change from one interpreter run to the next
# (string hash randomisation), which made the CSV row order non-reproducible.
H = list(dict.fromkeys(docs_actual))

# Persist the raw, de-duplicated texts so later cells can start from disk
# without repeating the scrape.
df = pd.DataFrame(H, columns=["Document Text"])
df.to_csv("arkansas_energy_data_unfiltered.csv", index=False)

In [206]:
# Reload the raw scrape from disk so this cell does not need a live scraping
# session.
df = pd.read_csv('arkansas_energy_data_unfiltered.csv')

# An empty document is written to CSV as an empty field and read back as NaN
# rather than a str; drop such rows so the regex helpers only ever see text.
df = df.dropna(subset=['Document Text'])

# Tag each document with its history marker (year, "Reserved" or "Repealed").
# Mapping over the column, rather than apply(axis=1) over rows, still yields a
# Series when the frame is empty. assign() returns a new frame.
df = df.assign(year=df['Document Text'].map(extract_history_marker))

# Keep only documents that have a marker. The explicit copy makes the column
# assignment below operate on an independent frame instead of a slice, which
# avoids pandas' SettingWithCopyWarning.
df = df.loc[df['year'].notna(), :].copy()

# Strip the metadata header, leaving only the statute body.
df['Document Text'] = df['Document Text'].map(trim_to_body)

# Optional LLM relevance pass, left disabled. Note that the second line would
# overwrite the unfiltered CSV with the filtered frame if re-enabled.
# df['relevant'] = df.apply(lambda row: ollama_chat(row['Document Text']), axis = 1)
# df.to_csv("arkansas_energy_data_unfiltered.csv", index=False)

In [208]:
def _expand_two_digit_year(row):
    """
    Return the row's 'year' value, adding 1900 when it is a number below 100.

    String values (the "Reserved" / "Repealed" markers) and numbers of 100 or
    more are returned unchanged.
    """
    year = row['year']
    if type(year) is str or int(year) >= 100:
        return year
    return 1900 + year


# Every record in this dataset gets the same location label.
df['location'] = 'Arkansas'

# Normalise the year column row by row, then write the final dataset.
df['year'] = df.apply(_expand_two_digit_year, axis=1)
df.to_csv("arkansas_energy_data.csv", index=False)