In [1]:
import re
from typing import Union
import ollama
import asyncio
from playwright.async_api import async_playwright
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed


In [5]:
import re
from typing import Union

def extract_history_marker(text: str) -> Union[int, str, None]:
    """
    Extract a year (or a Reserved/Repealed marker) from a statute's text.

    The search runs twice, first for 4-digit numbers and then for 2-digit
    numbers. Within each pass the first rule that matches wins:
      1) U+2002 (en-space) immediately followed by the digits, anywhere in
         `text`; the LAST such occurrence is returned.
      2) 'Acts <digits>' in the History section.
      3) 'Source: L. <digits>' in the History section.
    If neither pass matches, '[Reserved.]' / '[Repealed.]' (case-insensitive)
    in the History section yields "Reserved" / "Repealed".

    The "History section" is everything after the first case-insensitive
    occurrence of the word 'History'; if the word is absent, the whole text
    is used instead.

    Args:
        text: Full statute text. Non-string values (e.g. the NaN pandas
            produces for an empty CSV cell) are treated as "no marker".

    Returns:
        The matched number as an int (2-digit matches are returned as-is,
        without century expansion), the string "Reserved" or "Repealed",
        or None when nothing matches.
    """
    # Guard against NaN/None coming out of a DataFrame column; `re` would
    # otherwise raise TypeError on a non-string.
    if not isinstance(text, str):
        return None

    # Isolate the History section once; both passes below reuse it.
    parts = re.split(r'History', text, maxsplit=1, flags=re.IGNORECASE)
    history = parts[1] if len(parts) > 1 else text

    # Prefixes searched inside the History section, in priority order.
    history_prefixes = (r'Acts\s+', r'Source:\s*L\.\s*')

    # 4-digit years take precedence over 2-digit ones across ALL rules.
    for width in (4, 2):
        digits = r'(\d{%d})' % width

        en_space_hits = re.findall(r'\u2002' + digits, text)
        if en_space_hits:
            return int(en_space_hits[-1])

        for prefix in history_prefixes:
            found = re.search(prefix + digits, history)
            if found:
                return int(found.group(1))

    # Fall back to Reserved/Repealed markers.
    if re.search(r'\[Reserved\.\]', history, re.IGNORECASE):
        return "Reserved"
    if re.search(r'\[Repealed\.\]', history, re.IGNORECASE):
        return "Repealed"

    return None



def trim_to_body(text: str) -> str:
    """
    Strip the leading metadata from a statute text.

    Everything up to and including the first run of three line breaks is
    discarded and the remainder is returned. When no such run exists the
    text is returned unchanged.
    """
    # Fast path: a literal LF-LF-LF separator. Only those three characters
    # are consumed, so any further newlines stay at the start of the body.
    _, separator, body = text.partition("\n\n\n")
    if separator:
        return body

    # Slow path: three or more line breaks where each may be LF or CRLF.
    # The whole run is consumed here.
    gap = re.search(r'(?:\r?\n){3,}', text)
    if gap is None:
        return text
    return text[gap.end():]

# def ollama_chat(text):
#     response = ollama.chat(model='mistral', messages=[
#       {
#         'role': 'user',
#         'content': "You are a legislation analyst. "
#                 "Does the following statute text contain useful information on renewable energy incentives? Please answer in one word yes or no:"
#                 f"{text}"
#             ,
#       },
#     ])
#     return response['message']['content']

# from concurrent.futures import ThreadPoolExecutor, as_completed
# import ollama  # make sure your ollama python binding is imported

def ollama_chat(text: str) -> str:
    """
    Ask the 'mistral' model, via `ollama.chat`, whether `text` contains
    useful information on renewable energy incentives.

    A single user message is sent: a fixed instruction asking for a one-word
    yes/no answer, followed by the statute text. The content of the reply
    message is returned as-is (no normalisation of the answer is done here).
    """
    prompt = (
        "You are a legislation analyst. "
        "Does the following statute text contain useful information on renewable energy incentives? "
        "Please answer in one word yes or no:\n\n"
        f"{text}"
    )
    user_message = {'role': 'user', 'content': prompt}
    reply = ollama.chat(model='mistral', messages=[user_message])
    return reply['message']['content']

def batch_ollama_chat(texts: list[str], max_workers: int = 5) -> list[str]:
    """
    Classify many texts concurrently with `ollama_chat`.

    Every text is submitted to a thread pool of at most `max_workers`
    threads. The returned list is aligned with `texts`: element i is the
    answer for texts[i]. A call that raises is reported on stdout and
    contributes an empty string instead of aborting the batch.
    """
    answers_by_position: dict[int, str] = {}

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which input position each future belongs to, because
        # completion order is arbitrary.
        position_of = {}
        for position, document in enumerate(texts):
            position_of[pool.submit(ollama_chat, document)] = position

        for finished in as_completed(position_of):
            position = position_of[finished]
            try:
                answer = finished.result()
            except Exception as exc:
                print(f"[!] Error in ollama_chat for item {position}: {exc}")
                answer = ""  # placeholder keeps the output aligned with the input
            answers_by_position[position] = answer

    # Re-assemble in input order.
    ordered = []
    for position in range(len(texts)):
        ordered.append(answers_by_position[position])
    return ordered


In [None]:
# Module-level accumulator: scrape_energy_all_pages() appends every scraped
# document text to this list as it goes, and the CSV-export cell further down
# reads from it. Being a global, it keeps whatever was collected even if the
# scraping coroutine does not run to completion.
docs_actual = []

async def scrape_energy_all_pages() -> list[str]:
    """
    Search the Lexis container page for "energy" and collect the text of every hit.

    Opens a visible (non-headless) Chromium window, accepts the terms popup,
    submits the search, and then for each results page:
      * opens every hit (the link inside ``p.min.vis`` of each ``li.usview``),
      * reads the inner text of ``section#document``,
      * navigates back to the hit list.
    Paging continues through the pagination "Next" link and stops as soon as
    no visible "Next" link is present.

    Each document text is appended to the returned list AND to the
    module-level ``docs_actual`` list.

    Returns:
        The scraped document texts, in the order they were visited.

    Raises:
        Playwright's timeout error propagates if an expected element or load
        state is not reached within its timeout. The browser is closed in
        that case as well.
    """
    all_texts = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            page = await browser.new_page()

            # 1) Navigate & dismiss the popup
            await page.goto(
                "https://advance.lexis.com/container?config=0345494EJAA5ZjE0MDIyYy1kNzZkLTRkNzktYTkxMS04YmJhNjBlNWUwYzYKAFBvZENhdGFsb2e4CaPI4cak6laXLCWyLBO9&crid=71f400f1-686d-4c50-8ecc-7711eca7c5a8",
                wait_until="networkidle"
            )
            await page.click("input#btnagreeterms", timeout=10000)

            # 2) Search “energy”
            await page.fill("textarea#searchTerms", "energy")
            await asyncio.gather(
                page.press("textarea#searchTerms", "Enter"),
                page.wait_for_load_state("networkidle")
            )

            while True:
                # wait for this page’s hits to render
                await page.wait_for_selector("li.usview", timeout=20000)
                rows = page.locator("li.usview")
                count = await rows.count()
                print(f"[+] Found {count} hits on this page")

                for i in range(count):
                    row = rows.nth(i)

                    # grab the teaser‐page link in <p class="min vis">
                    link = row.locator("p.min.vis a").first
                    await asyncio.gather(
                        link.click(),
                        page.wait_for_load_state("networkidle")
                    )

                    # extract the full document text
                    await page.wait_for_selector("section#document", timeout=10000)
                    full = await page.inner_text("section#document")
                    all_texts.append(full)
                    docs_actual.append(full)

                    # go back to the hits list
                    await asyncio.gather(
                        page.go_back(),
                        page.wait_for_load_state("networkidle")
                    )

                # Let the hit list settle before looking for “Next”.
                # wait_for_load_state is a coroutine: without `await` it never runs.
                await page.wait_for_timeout(1000)
                await page.wait_for_load_state("networkidle", timeout=10000)

                next_btn = page.locator("nav.pagination >> a:has-text('Next')")
                # Use .first for the visibility check too, matching the click below,
                # so several matching links do not trip a strict-mode error.
                has_next = await next_btn.count() and await next_btn.first.is_visible()
                if not has_next:
                    # Last page reached. Without this exit the loop would re-scrape
                    # the final page forever and never close the browser or return.
                    print("[+] No further pages")
                    break

                print("[+] Clicking Next →")
                await asyncio.gather(
                    next_btn.first.click(),
                    page.wait_for_load_state("networkidle")
                )
                print("[+] Now on the next page!")

                # pause for 1 second, then let the new page settle
                await page.wait_for_timeout(1000)
                await page.wait_for_load_state("networkidle", timeout=10000)

                # wait until all old rows are detached
                #await page.wait_for_selector("li.usview", state="detached", timeout=10000)
                # then wait for a fresh set of hits to appear
                #await page.wait_for_selector("li.usview", timeout=20000)
        finally:
            # Always release the browser, including on timeouts or interruption.
            await browser.close()
    return all_texts

# Run the scraper from the notebook. Top-level `await` is only valid inside an
# IPython/Jupyter cell; a plain script would need asyncio.run(...) instead.
docs = await scrape_energy_all_pages()
print(f"🏁 Scraped {len(docs)} documents total")


In [5]:
# De-duplicate the scraped documents into a flat list. dict.fromkeys keeps the
# first-seen order, so the CSV row order is the same on every run; iterating a
# set of strings gives an order that changes between interpreter sessions.
H = list(dict.fromkeys(docs_actual))
df = pd.DataFrame(H, columns=["Document Text"])
df.to_csv("colorado_energy_data_unfiltered.csv", index=False)

In [7]:
# Reload the raw scrape and tag each document with its history marker
# (a year, "Reserved"/"Repealed", or nothing).
df = pd.read_csv('colorado_energy_data_unfiltered.csv')
df['year'] = df.apply(lambda row: extract_history_marker(row['Document Text']), axis = 1)
# Keep only the rows where a marker was found. The explicit .copy() makes the
# result an independent frame, so the column assignment below does not write
# into a filtered slice (which raises SettingWithCopyWarning).
df = df.loc[df['year'].notna(), :].copy()
# Drop the metadata header from each remaining document.
df['Document Text'] = df.apply(lambda row: trim_to_body(row['Document Text']), axis = 1)

In [13]:
# Tag every row with its jurisdiction.
df['location'] = 'Colorado'
# Numeric markers below 100 are two-digit years and are shifted into the 1900s;
# string markers and numbers of 100 or more pass through untouched.
df['year'] = df.apply(
    lambda row: (
        row['year']
        if type(row['year']) is str or int(row['year']) >= 100
        else 1900 + row['year']
    ),
    axis=1,
)
df.to_csv("colorado_energy_data.csv", index=False)