In [4]:
# Define regex filter function for exact `nPage=X` match
def exact_npage_filter(n):
    """
    Build the JavaScript callback used to pick pagination links for page `n`.

    The returned source is an arrow function that receives a list of anchor
    elements, maps them to their `href`, and keeps only the hrefs whose query
    string has `nPage=<n>` followed by `&` or the end of the URL (so asking for
    page 2 does not also match `nPage=20`).
    """
    js_lines = [
        "",
        "    els => els",
        "      .map(el => el.href)",
        f"      .filter(href => /[?&]nPage={n}(?:&|$)/.test(href))",
        "    ",
    ]
    return "\n".join(js_lines)

# Main scraping function
async def extract_connecticut_energy_statutes():
    """
    Search the CGA statute search site for "energy" and collect the leading
    label of every visible result cell across all result pages.

    A non-headless Firefox window is used. Each visible table cell is reduced
    to its shortest prefix that contains two periods (e.g. 'Sec. 1-24.'); cells
    without two periods are skipped. Pagination follows the link whose query
    string carries `nPage=<current page + 1>` and stops when no such link is
    present on the page.

    Returns:
        list[str]: the extracted labels, in the order they were encountered.
    """
    results = []
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=False)
        # Close the browser on every exit path, including a failed navigation
        # or selector timeout part-way through the crawl.
        try:
            page = await browser.new_page()

            await page.goto("https://search.cga.state.ct.us/r/statute/", wait_until="domcontentloaded")

            # Submit the search and wait for the first result rows to render.
            await page.fill("input#request", "energy")
            await page.press("input#request", "Enter")
            await page.wait_for_selector("table.footable tbody tr")

            page_num = 1
            while True:
                print(f"[INFO] Processing page {page_num}")

                # Every visible <td> of the results table is inspected, not just
                # the first column; the regex below filters out non-label cells.
                cells = await page.query_selector_all("table.footable tbody tr td.footable-visible")
                for cell in cells:
                    text = (await cell.inner_text()).strip()
                    match = re.match(r"^(.*?\..*?\.)", text)
                    if match:
                        results.append(match.group(1))

                # Filter the pagination anchors in the browser so only hrefs for
                # exactly the next page number come back.
                candidate_links = await page.eval_on_selector_all(
                    "a[href*='nPage=']", exact_npage_filter(page_num + 1)
                )
                if not candidate_links:
                    break

                # Wait for the network to go idle so the next page's table is
                # populated before it is queried.
                await page.goto(candidate_links[0], wait_until="networkidle")
                page_num += 1
        finally:
            await browser.close()

    return results

# Run the scraper (top-level `await` is allowed in a Jupyter cell) and preview the first few results.
section_titles = await extract_connecticut_energy_statutes()
print(section_titles[:5])
print(f"Total extracted titles: {len(section_titles)}")


[INFO] Processing page 1
[INFO] Processing page 2
[INFO] Processing page 3
[INFO] Processing page 4
[INFO] Processing page 5
[INFO] Processing page 6
[INFO] Processing page 7
[INFO] Processing page 8
[INFO] Processing page 9
[INFO] Processing page 10
[INFO] Processing page 11
[INFO] Processing page 12
[INFO] Processing page 13
[INFO] Processing page 14
[INFO] Processing page 15
[INFO] Processing page 16
[INFO] Processing page 17
[INFO] Processing page 18
[INFO] Processing page 19
[INFO] Processing page 20
[INFO] Processing page 21
[INFO] Processing page 22
[INFO] Processing page 23
[INFO] Processing page 24
[INFO] Processing page 25
[INFO] Processing page 26
[INFO] Processing page 27
[INFO] Processing page 28
[INFO] Processing page 29
[INFO] Processing page 30
[INFO] Processing page 31
[INFO] Processing page 32
[INFO] Processing page 33
[INFO] Processing page 34
[INFO] Processing page 35
[INFO] Processing page 36
['Sec. 1-24.', 'Sec. 1-84.', 'Sec. 1-206.', 'Sec. 1-217.', 'Sec. 2-20a.']

In [118]:
import asyncio
import re
from typing import Optional, Tuple, List
from playwright.async_api import async_playwright

def parse_section(section: str) -> Optional[str]:
    """
    Pull the bare section number out of a statute label.

    The first run of digits (with optional letter suffix), optionally followed
    by more hyphen-separated digit/letter groups, is returned:
      'Sec. 1-24.'    -> '1-24'
      'Section 1-206' -> '1-206'
      'Sec. 16a-244b' -> '16a-244b'

    Returns None when the label contains no digits at all.
    """
    found = re.search(r'(\d+[A-Za-z]*(?:-\d+[A-Za-z]*)*)', section)
    if found is None:
        return None
    return found.group(1)

def normalize(sec: str) -> List:
    """
    Tokenize a section number into digit runs (as ints), letter runs, and the
    separators between them, e.g. '16a-244b' -> [16, 'a', '-', 244, 'b'].

    Separators such as '-' are kept as their own tokens; empty pieces are dropped.
    """
    tokens: List = []
    for chunk in re.split(r'([A-Za-z]+|\d+)', sec):
        if not chunk:
            continue
        tokens.append(int(chunk) if chunk.isdigit() else chunk)
    return tokens

def denormalize(parts: List) -> str:
    """Concatenate the string form of each token back into one section string."""
    return ''.join(map(str, parts))

def _section_sort_key(sec: str) -> List[Tuple[int, int, str]]:
    """
    Build a comparison key for a section number such as '7-35aa'.

    The string is split into digit runs, ASCII letter runs, and separator runs.
    Digit runs compare numerically. Every other run compares by length first
    and then alphabetically, so a doubled-letter suffix sorts after every
    single-letter one ('z' < 'aa'), which is the order the chapter listings use
    (e.g. a chapter ending at 7-35z is followed by one starting at 7-35aa).
    Tagging each token with its kind also keeps a malformed input from raising
    on an int-vs-str comparison.
    """
    key: List[Tuple[int, int, str]] = []
    for tok in re.findall(r'\d+|[A-Za-z]+|[^A-Za-z\d]+', sec):
        if tok.isdecimal():
            key.append((0, int(tok), ''))
        else:
            key.append((1, len(tok), tok))
    return key


def section_in_range(target: str, start: str, end: str) -> bool:
    """
    Return True if `target` lies between `start` and `end`, inclusive.

    Sections are compared token by token (see `_section_sort_key`), so
    '1-24' is inside '1-22'..'1-25', '16a-1' is after every '16-…' section, and
    '7-35bb' is inside '7-35aa'..'7-35gg' but outside '7-16'..'7-35z'.
    """
    t = _section_sort_key(target)
    return _section_sort_key(start) <= t <= _section_sort_key(end)
async def fetch_ct_statute_text(section: str) -> Tuple[Optional[str], List[str]]:
    """
    Locate a statute section on www.cga.ct.gov and scrape its paragraphs.

    Given a label such as 'Sec. 1-24.', the lookup walks three pages:
      1. the titles index, to find the Title whose "Secs. X to Y" range
         contains the section;
      2. that Title's page, to find the chapter whose range (or single
         "Sec. Z") contains the section;
      3. the chapter page, where the <p> holding `span#sec_<number>` and the
         elements that follow it are collected, up to and including the first
         one with class `source-first`.

    Args:
        section: Section label; the number is extracted with `parse_section`.

    Returns:
        (final_url, lines) on success.
        (None, []) if the label cannot be parsed or no Title range matches.
        (title_url, []) if a Title matched but none of its chapters did.
        (final_url, []) if the chapter page loaded but extraction failed.
    """
    parsed = parse_section(section)
    if not parsed:
        print(f"[ERROR] Could not parse {section!r}.")
        return None, []

    # Round-trip through normalize/denormalize so the same canonical form is
    # used for range checks and for the element id looked up in step 3.
    target = denormalize(normalize(parsed))

    # "(Secs. X to Y)" range text, used on both the titles index and Title pages.
    range_re = re.compile(r'\(?Secs?\. ([\d\w\-]+)\s+to\s+([\d\w\-]+)\)?')

    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        # A single finally replaces the per-exit close() calls and also covers
        # exceptions raised by navigation or element queries.
        try:
            page = await browser.new_page()

            # ---- 1) Top-level TOC: find which Title contains our section ----
            await page.goto("https://www.cga.ct.gov/current/pub/titles.htm",
                            wait_until="domcontentloaded")
            top_href = None
            for row in await page.query_selector_all("table[style*='width:80%'] td.left_38pct"):
                a = await row.query_selector("a")
                span = await row.query_selector("span.toc_rng_chaps")
                if not a or not span:
                    continue
                txt = await span.inner_text()
                m = range_re.search(txt)
                if m and section_in_range(target, m.group(1), m.group(2)):
                    top_href = await a.get_attribute("href")
                    print(f"[MATCH] Top‐level range {m.group(1)}–{m.group(2)} → {top_href}")
                    break

            if not top_href:
                print(f"[WARN] No top‐level title for {section}.")
                return None, []

            # ---- 2) Load that Title, scan its chapter links ----
            title_url = f"https://www.cga.ct.gov/current/pub/{top_href}"
            await page.goto(title_url, wait_until="domcontentloaded")

            final_url = None
            for td in await page.query_selector_all("td.left_40pct"):
                a = await td.query_selector("a")
                span = await td.query_selector("span.toc_rng_secs")
                if not a or not span:
                    continue

                txt = (await span.inner_text()).strip()

                # First try "X to Y"; fall back to a single-section "Sec. Z".
                m = range_re.search(txt)
                if m:
                    start, end = m.groups()
                else:
                    m2 = re.search(r'\(?Sec\. ([\d\w\-]+)\)?', txt)
                    if not m2:
                        continue
                    start = end = m2.group(1)

                print(f"[CHECK] Chapter link covers {start}–{end} for {section}.")
                if section_in_range(target, start, end):
                    href = await a.get_attribute("href")
                    final_url = f"https://www.cga.ct.gov/current/pub/{href}"
                    print(f"[FOUND] In‐chapter {start}–{end} → {final_url}")
                    break

            if not final_url:
                print(f"[WARN] No chapter link matched {section}.")
                return title_url, []

            # ---- 3) Scrape paragraph up to and including <p class="source-first"> ----
            await page.goto(final_url, wait_until="domcontentloaded")
            print(f"[INFO] Extracting sec_{target}…")
            try:
                lines = await page.eval_on_selector(
                    f"span#sec_{target}",
                    """
                    span => {
                      const out = [];
                      const firstP = span.closest('p');
                      if (!firstP) return out;
                      out.push(firstP.innerText.trim());
                      let p = firstP.nextElementSibling;
                      while (p) {
                        out.push(p.innerText.trim());
                        if (p.classList.contains('source-first')) break;
                        p = p.nextElementSibling;
                      }
                      return out;
                    }
                    """
                )
            except Exception as e:
                # Best effort: a missing anchor or evaluation failure yields an
                # empty result for this section instead of aborting the caller.
                print(f"[ERROR] Failed to extract sec_{target}: {e}")
                lines = []

            return final_url, lines
        finally:
            await browser.close()

In [None]:
# Fetch every section collected in `section_titles`, one at a time.
# Each call returns a (url, lines) tuple, so `texts` ends up as a list of
# such tuples rather than plain strings.
texts = []
for section in section_titles:
    # Each call launches and closes its own headless Firefox instance.
    text = await fetch_ct_statute_text(section)
    texts.append(text)

[MATCH] Top‐level range 1-1–1-500 → title_01.htm
[CHECK] Chapter link covers 1-1–1-3b for Sec. 1-24..
[CHECK] Chapter link covers 1-4–1-6 for Sec. 1-24..
[CHECK] Chapter link covers 1-7–1-21l for Sec. 1-24..
[CHECK] Chapter link covers 1-22–1-25 for Sec. 1-24..
[FOUND] In‐chapter 1-22–1-25 → https://www.cga.ct.gov/current/pub/chap_004.htm
[INFO] Extracting sec_1-24…
[MATCH] Top‐level range 1-1–1-500 → title_01.htm
[CHECK] Chapter link covers 1-1–1-3b for Sec. 1-84..
[CHECK] Chapter link covers 1-4–1-6 for Sec. 1-84..
[CHECK] Chapter link covers 1-7–1-21l for Sec. 1-84..
[CHECK] Chapter link covers 1-22–1-25 for Sec. 1-84..
[CHECK] Chapter link covers 1-26–1-27 for Sec. 1-84..
[CHECK] Chapter link covers 1-28–1-41 for Sec. 1-84..
[CHECK] Chapter link covers 1-42–1-56 for Sec. 1-84..
[CHECK] Chapter link covers 1-56a–1-56g for Sec. 1-84..
[CHECK] Chapter link covers 1-56h–1-56q for Sec. 1-84..
[CHECK] Chapter link covers 1-56r–1-56r for Sec. 1-84..
[CHECK] Chapter link covers 1-57–1-65z 

Future exception was never retrieved
future: <Future finished exception=TargetClosedError('Target page, context or browser has been closed')>
playwright._impl._errors.TargetClosedError: Target page, context or browser has been closed
  funcs = list(self._events.get(event, OrderedDict()).values())


[MATCH] Top‐level range 7-1–7-623 → title_07.htm
[CHECK] Chapter link covers 7-1–7-9d for Sec. 7-121d..
[CHECK] Chapter link covers 7-10–7-15 for Sec. 7-121d..
[CHECK] Chapter link covers 7-16–7-35z for Sec. 7-121d..
[CHECK] Chapter link covers 7-35aa–7-35gg for Sec. 7-121d..
[CHECK] Chapter link covers 7-36–7-78 for Sec. 7-121d..
[CHECK] Chapter link covers 7-79–7-85 for Sec. 7-121d..
[CHECK] Chapter link covers 7-86–7-97 for Sec. 7-121d..
[CHECK] Chapter link covers 7-98–7-100j for Sec. 7-121d..
[CHECK] Chapter link covers 7-100k–7-100z for Sec. 7-121d..
[CHECK] Chapter link covers 7-101–7-147 for Sec. 7-121d..
[FOUND] In‐chapter 7-101–7-147 → https://www.cga.ct.gov/current/pub/chap_097.htm
[INFO] Extracting sec_7-121d…
[MATCH] Top‐level range 7-1–7-623 → title_07.htm
[CHECK] Chapter link covers 7-1–7-9d for Sec. 7-121f..
[CHECK] Chapter link covers 7-10–7-15 for Sec. 7-121f..
[CHECK] Chapter link covers 7-16–7-35z for Sec. 7-121f..
[CHECK] Chapter link covers 7-35aa–7-35gg for Sec. 