In [None]:
# ASIC Info Sheets — environment setup
# Refreshes the apt package index, then installs a set of system shared
# libraries plus a font package. These are presumably the runtime dependencies
# of the headless Chromium that Cell 1 installs via Playwright — TODO confirm
# the list against the Playwright version in use.
# `-qq` keeps apt output to a minimum; `-y` answers install prompts automatically.

!apt-get -qq update
!apt-get -qq install -y \
    libatk1.0-0 \
    libatk-bridge2.0-0 \
    libatspi2.0-0 \
    libcups2 \
    libdrm2 \
    libgbm1 \
    libgtk-3-0 \
    libnss3 \
    libnspr4 \
    libx11-xcb1 \
    libxcomposite1 \
    libxdamage1 \
    libxrandr2 \
    libxshmfence1 \
    libasound2 \
    fonts-liberation


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package libatspi2.0-0:amd64.
(Reading database ... 117528 files and directories currently installed.)
Preparing to unpack .../00-libatspi2.0-0_2.44.0-3_amd64.deb ...
Unpacking libatspi2.0-0:amd64 (2.44.0-3) ...
Selecting previously unselected package libxtst6:amd64.
Preparing to unpack .../01-libxtst6_2%3a1.2.3-1build4_amd64.deb ...
Unpacking libxtst6:amd64 (2:1.2.3-1build4) ...
Selecting previously unselected package session-migration.
Preparing to unpack .../02-session-migration_0.3.6_amd64.deb ...
Unpacking session-migration (0.3.6) ...
Selecting previously unselected package gsettings-desktop-schemas.
Preparing to unpack .../03-gsettings-desktop-schemas_42.0-1ubuntu1_all.deb ...
Unpacking gsettings-desktop-schemas (42.0-1ubuntu1) ...
Selecting previously unselected

In [None]:
# Cell 1
# Install the Python packages imported by the later cells, then download the
# Chromium build that Playwright drives.
# Only playwright is version-pinned here; the other packages install whatever
# version pip resolves at run time.

!pip -q install playwright==1.47.0 beautifulsoup4 pandas tqdm nest_asyncio lxml
!playwright install chromium


In [21]:
# Cell 2 — Collect candidate information-sheet URLs from the ASIC search listing

import re
import asyncio
import nest_asyncio
from playwright.async_api import async_playwright

# Patch asyncio so that asyncio.run() can be called from a notebook cell,
# where an event loop is typically already running.
nest_asyncio.apply()

# ASIC regulatory-resources search page. The query string selects documents of
# type "information sheet", excludes superseded ones, and sorts by last update.
LISTING_URL = "https://www.asic.gov.au/regulatory-resources/regulatory-resources-search/?filter=document&type=information+sheet&superseded=false&sort=lastUpdated"

async def scrape_info_sheet_urls():
    """Collect candidate information-sheet URLs from the ASIC search listing.

    Opens LISTING_URL in headless Chromium, dismisses a cookie banner if one is
    found, then clicks the "load more" button until the number of visible
    results reaches the total announced by the page, the button is no longer
    visible/enabled, the count stops growing for three consecutive clicks, or
    200 clicks have been made. Links are then harvested from the element that
    contains the most "Information sheet" labels.

    Progress and diagnostic counts are printed along the way.

    Returns:
        list[str]: sorted, de-duplicated absolute URLs that start with
        "https://www.asic.gov.au/" and do not contain
        "regulatory-resources-search".
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=["--no-sandbox","--disable-setuid-sandbox","--disable-dev-shm-usage"],
        )
        page = await browser.new_page()
        await page.goto(LISTING_URL, wait_until="domcontentloaded")
        # Fixed pause after navigation before interacting with the page.
        await page.wait_for_timeout(1500)

        # Cookie banner (if present): try a few likely button labels, stop at the first that clicks.
        for name in ["Accept all", "Accept", "I agree"]:
            try:
                await page.get_by_role("button", name=re.compile(name, re.I)).click(timeout=1500)
                break
            except Exception:
                # Best effort: a missing banner/button is expected. Catching Exception
                # (rather than a bare except) lets task cancellation and
                # KeyboardInterrupt propagate instead of being swallowed here.
                pass

        async def result_count():
            """Count "Information sheet" occurrences in the main content text (one per result)."""
            return await page.evaluate("""
                () => {
                  const main = document.querySelector('main') || document.body;
                  const txt = main.innerText || '';
                  const m = txt.match(/\\bInformation\\s+sheet\\b/gi);
                  return m ? m.length : 0;
                }
            """)

        # Total announced by the page ("Found N information sheets"); None if that text is absent.
        expected = await page.evaluate("""
            () => {
              const main = document.querySelector('main') || document.body;
              const txt = main.innerText || '';
              const m = txt.match(/Found\\s+(\\d+)\\s+information\\s+sheets/i);
              return m ? parseInt(m[1], 10) : null;
            }
        """)
        print("Expected (from page):", expected)

        load_more = page.get_by_role("button", name=re.compile(r"load more", re.I))

        prev = await result_count()
        print("Initial visible results:", prev)

        # Click "load more" repeatedly; the range bound is a hard cap on the number of clicks.
        stagnant_rounds = 0
        for i in range(200):
            if expected and prev >= expected:
                print("Reached expected count.")
                break

            if not (await load_more.is_visible()) or not (await load_more.is_enabled()):
                print("Load more not visible/enabled — stopping.")
                break

            await load_more.scroll_into_view_if_needed()
            await page.wait_for_timeout(300)

            await load_more.click()
            # Fixed pause to let the next batch of results render before recounting.
            await page.wait_for_timeout(1200)

            now = await result_count()
            print(f"After click {i+1}: {now}")

            # Give up after three consecutive clicks that add no results.
            if now <= prev:
                stagnant_rounds += 1
                if stagnant_rounds >= 3:
                    print("No growth after multiple attempts — stopping.")
                    break
            else:
                stagnant_rounds = 0
                prev = now

        # Harvest links: locate the results container heuristically, then read
        # every anchor href and data-href/data-url attribute inside it.
        extracted = await page.evaluate("""
            () => {
              const abs = (u) => {
                try { return new URL(u, location.origin).href; } catch(e) { return null; }
              };

              const main = document.querySelector('main') || document.body;

              // pick the element with the most "Information sheet" occurrences
              const candidates = Array.from(main.querySelectorAll('div, section, ul, ol')).slice(0, 2000);
              let best = main;
              let bestScore = 0;

              for (const el of candidates) {
                const txt = (el.innerText || '');
                const m = txt.match(/\\bInformation\\s+sheet\\b/gi);
                const score = m ? m.length : 0;
                if (score > bestScore) { bestScore = score; best = el; }
              }

              const urls = new Set();

              // anchors in the results container
              best.querySelectorAll('a[href]').forEach(a => {
                const u = abs(a.getAttribute('href'));
                if (u) urls.add(u);
              });

              // clickable cards
              best.querySelectorAll('[data-href],[data-url]').forEach(el => {
                const u = abs(el.getAttribute('data-href') || el.getAttribute('data-url'));
                if (u) urls.add(u);
              });

              return {
                bestScore,
                urls: Array.from(urls)
              };
            }
        """)

        raw_urls = sorted(set(extracted["urls"]))

        # Minimal filtering only: keep ASIC pages, drop the search page itself
        urls = [
            u for u in raw_urls
            if u.startswith("https://www.asic.gov.au/")
            and "regulatory-resources-search" not in u
        ]
        urls = sorted(set(urls))

        # Diagnostics: how many kept URLs a stricter '/regulatory-resources/' path filter would drop.
        excluded_by_old_filter = [u for u in urls if "/regulatory-resources/" not in u]
        print("Heuristic results-container score:", extracted["bestScore"])
        print("Raw URLs found in container:", len(raw_urls))
        print("URLs kept (minimal filter):", len(urls))
        print("Would be excluded by old '/regulatory-resources/' filter:", len(excluded_by_old_filter))
        print("Sample excluded (first 20):", excluded_by_old_filter[:20])
        print("First 10 kept:", urls[:10])

        await browser.close()
        return urls

# Run the scraper; the tuple on the last line is displayed as the cell output
# (URL count and a 10-item preview).
all_candidate_urls = asyncio.run(scrape_info_sheet_urls())
len(all_candidate_urls), all_candidate_urls[:10]


Expected (from page): 201
Initial visible results: 20
After click 1: 30
After click 2: 40
After click 3: 50
After click 4: 60
After click 5: 70
After click 6: 80
After click 7: 90
After click 8: 100
After click 9: 110
After click 10: 120
After click 11: 130
After click 12: 140
After click 13: 150
After click 14: 160
After click 15: 170
After click 16: 180
After click 17: 190
After click 18: 200
After click 19: 201
Reached expected count.
Heuristic results-container score: 201
Raw URLs found in container: 201
URLs kept (minimal filter): 201
Would be excluded by old '/regulatory-resources/' filter: 93
Sample excluded (first 20): ['https://www.asic.gov.au/about-asic/asic-investigations-and-enforcement/about-the-court-enforceable-undertakings-register/', 'https://www.asic.gov.au/about-asic/asic-investigations-and-enforcement/asic-s-approach-to-enforcement/', 'https://www.asic.gov.au/about-asic/asic-investigations-and-enforcement/asic-s-approach-to-involvement-in-private-court-proceedings/'

(201,
 ['https://www.asic.gov.au/about-asic/asic-investigations-and-enforcement/about-the-court-enforceable-undertakings-register/',
  'https://www.asic.gov.au/about-asic/asic-investigations-and-enforcement/asic-s-approach-to-enforcement/',
  'https://www.asic.gov.au/about-asic/asic-investigations-and-enforcement/asic-s-approach-to-involvement-in-private-court-proceedings/',
  'https://www.asic.gov.au/about-asic/asic-investigations-and-enforcement/asic-s-approach-to-involvement-in-private-court-proceedings/providing-information-and-documents-to-private-litigants/',
  'https://www.asic.gov.au/about-asic/asic-investigations-and-enforcement/asic-s-compulsory-information-gathering-powers/',
  'https://www.asic.gov.au/about-asic/asic-investigations-and-enforcement/asic-s-document-production-guidelines/',
  'https://www.asic.gov.au/about-asic/asic-investigations-and-enforcement/claims-of-legal-professional-privilege/',
  'https://www.asic.gov.au/about-asic/asic-investigations-and-enforcement

In [22]:
# Cell 3 — Visit each URL, confirm it’s an INFO sheet, extract metadata (incl issued/updated dates),
# save PDF, and write CSV.

import os
import re
import json
import asyncio
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
from playwright.async_api import async_playwright

# Output layout: everything is written under OUT_DIR, rendered PDFs go in its "pdf" subfolder.
OUT_DIR = "asic_information_sheets"
PDF_DIR = os.path.join(OUT_DIR, "pdf")
# Creating PDF_DIR also creates OUT_DIR; exist_ok=True makes re-running the cell safe.
os.makedirs(PDF_DIR, exist_ok=True)

def safe_filename(s: str, max_len: int = 140) -> str:
    """Turn arbitrary text into a filesystem-friendly name fragment.

    Characters other than word characters, whitespace, dots and hyphens are
    removed, whitespace runs become single underscores, and the result is cut
    to ``max_len`` characters with leading/trailing underscores trimmed.

    Args:
        s: Source text (``None`` is treated as empty).
        max_len: Maximum length of the returned fragment.

    Returns:
        The sanitised fragment, or ``"untitled"`` whenever nothing usable
        remains (empty input, only stripped characters, or only underscores).
    """
    cleaned = re.sub(r"[^\w\s\.-]", "", (s or "").strip(), flags=re.UNICODE).strip()
    cleaned = re.sub(r"\s+", "_", cleaned)
    # Truncate first, then trim: the cut may land right after an underscore.
    cleaned = cleaned[:max_len].strip("_")
    # Fall back here (after trimming) so inputs such as "_" never yield an empty name.
    return cleaned or "untitled"

def extract_ld_dates(html: str):
    """Try to pull datePublished/dateModified from JSON-LD if present.

    Every ``<script type="application/ld+json">`` block is parsed and searched
    recursively (dicts and lists) for the first truthy ``datePublished`` and
    ``dateModified`` values.

    Args:
        html: Page markup.

    Returns:
        tuple: ``(date_published, date_modified)`` exactly as stored in the
        JSON-LD; either element is ``None`` when not found.
    """
    soup = BeautifulSoup(html, "lxml")
    objs = []
    for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
        txt = script.get_text(strip=True)
        if not txt:
            continue
        try:
            objs.append(json.loads(txt))
        except (ValueError, RecursionError):
            # Malformed or pathologically nested JSON-LD: skip this block only.
            # (json.JSONDecodeError is a ValueError subclass.) Anything else —
            # including cancellation/KeyboardInterrupt — is allowed to propagate.
            continue

    def walk(x):
        """Yield every dict nested anywhere inside ``x`` (depth-first)."""
        if isinstance(x, dict):
            yield x
            for v in x.values():
                yield from walk(v)
        elif isinstance(x, list):
            for i in x:
                yield from walk(i)

    date_published = None
    date_modified = None
    # walk() only yields dicts, so .get() is always available here.
    for obj in walk(objs):
        if not date_published and obj.get("datePublished"):
            date_published = obj.get("datePublished")
        if not date_modified and obj.get("dateModified"):
            date_modified = obj.get("dateModified")
        if date_published and date_modified:
            # Both found; later objects could not change the result.
            break

    return date_published, date_modified

def extract_info_number_title_desc(html: str):
    """Parse a page and pull out its INFO number, title, description and text.

    Returns:
        tuple: ``(info_no, title, desc, main_text)`` where
        - ``info_no`` is the integer from the first matching INFO-number
          pattern in the page text, or ``None``;
        - ``title`` is the text of the first ``<h1>``, or ``None``;
        - ``desc`` is the text of the first ``<p>`` inside ``<main>``, or ``None``;
        - ``main_text`` is the newline-joined text of ``<main>`` (of the whole
          document when there is no ``<main>``).
    """
    document = BeautifulSoup(html, "lxml")
    heading = document.find("h1")
    main_el = document.find("main")

    # Text to search: <main> when available, otherwise the entire document.
    text_root = main_el if main_el else document
    body_text = text_root.get_text("\n", strip=True)

    title = heading.get_text(" ", strip=True) if heading else None

    # INFO number: patterns are tried in order and the first hit wins.
    number_patterns = (
        r"\bInformation\s*Sheet\s*(\d{1,4})\b",
        r"\bINFO\s*[- ]?\s*(\d{1,4})\b",
        r"\binformation\s*sheet\s*[–-]\s*(\d{1,4})\b",
    )
    attempts = (re.search(pattern, body_text, flags=re.I) for pattern in number_patterns)
    first_hit = next((hit for hit in attempts if hit), None)
    info_no = int(first_hit.group(1)) if first_hit else None

    # Short description: the first paragraph found inside <main>, if any.
    first_para = main_el.find("p") if main_el else None
    desc = first_para.get_text(" ", strip=True) if first_para else None

    return info_no, title, desc, body_text

def extract_issued_updated_dates_from_text(text: str):
    """
    ASIC INFO pages commonly include:
      - "This information sheet was issued in June 2022."
      - "This information sheet was updated in March 2025."
    We return (issued, updated) as strings (e.g., "June 2022") when found.

    The input is whitespace-normalised before matching, so sentences that are
    broken across lines (the caller joins text nodes with newlines) or that
    contain non-breaking spaces are still recognised.

    Args:
        text: Page text to search; ``None`` or empty yields ``(None, None)``.

    Returns:
        tuple: ``(issued, updated)``; each is the matched date text with single
        spaces ("June 2022" or "1 June 2022"), or ``None`` when not found.
    """
    if not text:
        return None, None

    # Collapse every whitespace run (newlines, tabs, NBSP, ...) into one space
    # so the literal spaces in the sentence patterns below can match.
    normalized = re.sub(r"\s+", " ", text)

    # Accept: "June 2022" or "1 June 2022"
    date_pat = r"([A-Za-z]+\s+\d{4}|\d{1,2}\s+[A-Za-z]+\s+\d{4})"

    def find_date(verb):
        """Return the date following 'This information sheet was <verb> in/on', or None."""
        m = re.search(
            rf"This information sheet was {verb} (?:in|on)\s+{date_pat}\.",
            normalized,
            re.I,
        )
        return m.group(1) if m else None

    return find_date("issued"), find_date("updated")

async def download_info_sheets(urls):
    """Visit each URL, keep the pages that look like INFO sheets, save them as PDFs and index them.

    For every URL the page is loaded in headless Chromium, its INFO number,
    title, description and dates are extracted, and the page is printed to an
    A4 PDF under PDF_DIR. Pages without an INFO number or title are skipped.
    Pages that raise an exception are reported on stdout after the run and are
    not part of the index.

    Args:
        urls: Iterable of page URLs (may be empty).

    Returns:
        tuple: ``(df_ok, csv_path)`` — the index DataFrame sorted by
        ``info_number`` then ``title``, and the path of the CSV written from it
        (under OUT_DIR). The index always has the same seven columns, even when
        no page was indexed.
    """
    index_columns = [
        "info_number",
        "title",
        "url",
        "date_published",
        "date_modified",
        "short_description",
        "pdf_filename",
    ]
    rows = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=[
                "--no-sandbox",
                "--disable-setuid-sandbox",
                "--disable-dev-shm-usage",
            ],
        )
        try:
            context = await browser.new_context()
            page = await context.new_page()

            for url in tqdm(urls):
                try:
                    await page.goto(url, wait_until="networkidle", timeout=60000)
                    html = await page.content()

                    info_no, title, desc, main_text = extract_info_number_title_desc(html)

                    # Filter: keep only pages that truly look like an INFO sheet
                    if info_no is None or not title:
                        continue

                    # Dates: prefer issued/updated text; fall back to JSON-LD if present
                    issued, updated = extract_issued_updated_dates_from_text(main_text)
                    ld_published, ld_modified = extract_ld_dates(html)

                    date_published = issued or ld_published or updated  # fallback: updated if issued missing
                    date_modified = updated or ld_modified

                    # Print to PDF
                    # NOTE(review): print-media emulation is never switched back, so it
                    # presumably stays active for the pages loaded afterwards — confirm
                    # this does not affect extraction.
                    await page.emulate_media(media="print")
                    fname = f"INFO-{info_no:03d}_{safe_filename(title)}.pdf"
                    pdf_path = os.path.join(PDF_DIR, fname)

                    await page.pdf(
                        path=pdf_path,
                        format="A4",
                        print_background=True,
                        margin={"top": "12mm", "bottom": "12mm", "left": "12mm", "right": "12mm"},
                    )

                    rows.append({
                        "info_number": info_no,
                        "title": title,
                        "url": url,
                        "date_published": date_published,
                        "date_modified": date_modified,
                        "short_description": desc,
                        "pdf_filename": fname,
                    })

                    # Short pause between pages.
                    await asyncio.sleep(0.15)

                except Exception as e:
                    # Record the failure and move on to the next URL.
                    rows.append({
                        "info_number": None,
                        "title": None,
                        "url": url,
                        "date_published": None,
                        "date_modified": None,
                        "short_description": None,
                        "pdf_filename": None,
                        "error": str(e),
                    })
        finally:
            # Close the browser even if the loop is interrupted or cancelled.
            await browser.close()

    # Explicit columns keep the frame well-formed when `rows` is empty or when
    # no row carries an "error" key.
    df = pd.DataFrame(rows, columns=index_columns + ["error"])

    # Surface failures instead of dropping them silently.
    failed = df[df["error"].notna()]
    if not failed.empty:
        print(f"{len(failed)} URL(s) failed and were left out of the index:")
        for failed_url, message in zip(failed["url"], failed["error"]):
            # Only the first line: browser errors can span many lines.
            first_line = str(message).split("\n", 1)[0]
            print(f"  {failed_url}: {first_line}")

    df_ok = df.loc[df["info_number"].notna(), index_columns].copy()
    df_ok["info_number"] = df_ok["info_number"].astype(int)
    df_ok = df_ok.sort_values(["info_number", "title"], ascending=[True, True])

    csv_path = os.path.join(OUT_DIR, "asic_information_sheets_index.csv")
    df_ok.to_csv(csv_path, index=False)

    return df_ok, csv_path

# Run it (expects all_candidate_urls created in Cell 2)
# NOTE(review): calling asyncio.run() from a notebook cell presumably also relies on
# nest_asyncio.apply() having been executed in Cell 2 — run that cell first on a fresh kernel.
# The tuple on the last line is displayed as the cell output.
df_index, csv_path = asyncio.run(download_info_sheets(all_candidate_urls))
df_index.head(), df_index.shape, csv_path


100%|██████████| 201/201 [05:41<00:00,  1.70s/it]


(     info_number                                              title  \
 42             1                            Administrative hearings   
 44             9                        ASIC decisions: Your rights   
 166           14      Bankruptcy and personal insolvency agreements   
 118           24                           Deeds of cross-guarantee   
 0             28  About the court enforceable undertakings register   
 
                                                    url date_published  \
 42   https://www.asic.gov.au/about-asic/dealing-wit...           None   
 44   https://www.asic.gov.au/about-asic/dealing-wit...           None   
 166  https://www.asic.gov.au/regulatory-resources/i...           None   
 118  https://www.asic.gov.au/regulatory-resources/f...           None   
 0    https://www.asic.gov.au/about-asic/asic-invest...           None   
 
     date_modified                                  short_description  \
 42           None  This is Information Sheet 1

In [23]:
# Sanity check: compare the candidate URLs from Cell 2 with the pages indexed as INFO sheets.
print("Total URLs from Cell 2:", len(all_candidate_urls))
print("INFO sheets detected:", df_index.shape[0])

# URLs that were visited but did not end up in the index.
missing = set(all_candidate_urls) - set(df_index["url"])
print("URLs visited but not classified as INFO:", len(missing))
# Sorted so the preview is the same on every run (set iteration order is not stable).
sorted(missing)[:20]


Total URLs from Cell 2: 201
INFO sheets detected: 199
URLs visited but not classified as INFO: 2


['https://www.asic.gov.au/regulatory-resources/financial-services/financial-product-disclosure/notification-requirements-for-product-disclosure-statements/',
 'https://www.asic.gov.au/regulatory-resources/financial-services/financial-product-disclosure/shorter-pdss-complying-with-requirements-for-superannuation-products-simple-managed-investment-schemes-and-simple-sub-fund-products/']

In [24]:
# Cell 4 — Check the outputs, zip the output folder, and download the zip and the CSV (Colab only)

import os, glob, shutil
from google.colab import files

# --- Safety: confirm these exist ---
# OUT_DIR and csv_path are defined in Cell 3; a NameError here means that cell has not been run.
print("OUT_DIR:", OUT_DIR)
print("CSV:", csv_path)

# Same folder as PDF_DIR in Cell 3, rebuilt here from OUT_DIR.
pdf_dir = os.path.join(OUT_DIR, "pdf")
pdfs = sorted(glob.glob(os.path.join(pdf_dir, "*.pdf")))

print(f"PDF folder exists? {os.path.isdir(pdf_dir)}")
print(f"PDF count: {len(pdfs)}")
if pdfs:
    # Preview by file name only (paths are sorted, so these are the first three alphabetically).
    print("First 3 PDFs:", [os.path.basename(p) for p in pdfs[:3]])

# Show folder size (MB)
def folder_size_mb(path):
    """Return the combined size of all files under ``path`` in MiB (bytes / 1024**2).

    The tree is walked recursively; entries that no longer exist when they are
    checked are left out. A path that cannot be walked yields 0.
    """
    byte_count = sum(
        os.path.getsize(candidate)
        for dirpath, _subdirs, names in os.walk(path)
        for candidate in (os.path.join(dirpath, entry) for entry in names)
        if os.path.exists(candidate)
    )
    return byte_count / (1024 * 1024)

print(f"OUT_DIR size: {folder_size_mb(OUT_DIR):.2f} MB")

# --- Create zip in the same OUT_DIR parent (usually /content) ---
# make_archive appends ".zip" to zip_base and archives the contents of OUT_DIR
# (root_dir), returning the path of the file it created.
zip_base = os.path.abspath(OUT_DIR)  # ensures correct path
zip_path = shutil.make_archive(zip_base, "zip", root_dir=OUT_DIR)

print("Created zip:", zip_path)
print(f"Zip size: {os.path.getsize(zip_path)/(1024*1024):.2f} MB")

# --- Download ---
# Uses google.colab's files helper, so this only works inside Colab.
# The CSV is downloaded separately even though it also sits inside the zip (it lives under OUT_DIR).
files.download(zip_path)
files.download(csv_path)


OUT_DIR: asic_information_sheets
CSV: asic_information_sheets/asic_information_sheets_index.csv
PDF folder exists? True
PDF count: 199
First 3 PDFs: ['INFO-001_Administrative_hearings.pdf', 'INFO-009_ASIC_decisions_Your_rights.pdf', 'INFO-014_Bankruptcy_and_personal_insolvency_agreements.pdf']
OUT_DIR size: 12.11 MB
Created zip: /content/asic_information_sheets.zip
Zip size: 9.71 MB


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>