In [6]:
import re
import requests
import pdfplumber
import pandas as pd
from io import BytesIO
from datetime import datetime

def _fetch_pdf_bytes(url: str) -> BytesIO:
    """Download ``url`` and return the response body as an in-memory stream.

    The request uses a 60 second timeout. A failed request propagates the
    exception raised by ``requests``, including the HTTP error raised by
    ``raise_for_status()``.
    """
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    payload = response.content
    return BytesIO(payload)

def _extract_full_text(pdf_bytes: BytesIO) -> str:
    """Return the text of every page in the PDF, joined with newlines.

    A page whose ``extract_text()`` result is falsy (e.g. ``None``)
    contributes an empty string, so the page count is still reflected in the
    number of newline separators.
    """
    with pdfplumber.open(pdf_bytes) as document:
        page_texts = [page.extract_text() or "" for page in document.pages]
    return "\n".join(page_texts)

def _slice_top10_block(full_text: str) -> str:
    # start markers are loose on purpose
    for pat in [r"Top\s+10\s+equity\s+holdings.*", r"Top\s+10\s+holdings.*"]:
        m = re.search(pat, full_text, flags=re.IGNORECASE | re.DOTALL)
        if m:
            start = m.start()
            break
    else:
        raise ValueError("Top 10 holdings section not found.")

    # end markers
    tail_markers = [
        r"\n\s*Equity portfolio factor exposure",
        r"\n\s*WAM Income Maximiser Limited",
        r"\n\s*October 2025",
        r"\n\s*Our proven investment process",
        r"\n\s*About the\s+Investment Manager",
    ]
    end_positions = []
    for pat in tail_markers:
        m2 = re.search(pat, full_text[start:], flags=re.IGNORECASE)
        if m2:
            end_positions.append(start + m2.start())
    end = min(end_positions) if end_positions else len(full_text)
    return full_text[start:end]

def _parse_holdings(block: str):
    holdings = []
    for line in block.splitlines():
        line = line.strip()
        if not line or re.search(r"Code\s+Company\s+Name", line, flags=re.IGNORECASE):
            continue
        m = re.match(r"^([A-Z]{2,5})\s+(.*)$", line)
        if not m:
            continue
        code, rest = m.group(1), m.group(2).strip()
        tokens = rest.split()
        while tokens and re.match(r"^[+-]?\d+(\.\d+)?x?$", tokens[-1], flags=re.IGNORECASE):
            tokens.pop()
        company = " ".join(tokens).strip()
        if code and company:
            holdings.append((code, company))
    return holdings

def parse_wmx_top10(url: str, date_label: str, accumulator_df: pd.DataFrame | None = None) -> pd.DataFrame:
    """
    Download a WMX PDF, extract its Top 10 holdings and return them as rows.

    url: location of the PDF to download.
    date_label: caller-chosen string stored in the 'date' column,
      e.g. '2025-09-30' or 'Sep-2025'.
    accumulator_df: optional frame of earlier results; when given, the new
      rows are appended to it (with a fresh index) and the combined frame is
      returned, otherwise only the new rows are returned.

    Raises ValueError when no holdings could be parsed from the PDF.
    """
    block = _slice_top10_block(_extract_full_text(_fetch_pdf_bytes(url)))

    # Keep the first 10 unique codes; a dict preserves first-seen order and
    # setdefault keeps the company name of the first occurrence.
    first_seen = {}
    for code, company in _parse_holdings(block):
        first_seen.setdefault(code, company)
        if len(first_seen) == 10:
            break
    if not first_seen:
        raise ValueError("No holdings parsed. Inspect PDF format.")

    df = pd.DataFrame(list(first_seen.items()), columns=["code", "company"])
    leading_columns = (
        ("firm", "Wilson Asset Management"),
        ("portfolio", "WMX"),
        ("date", date_label),
    )
    for position, (column, value) in enumerate(leading_columns):
        df.insert(position, column, value)

    if accumulator_df is not None:
        return pd.concat([accumulator_df, df], ignore_index=True)
    return df

def run_for_dates(dates: list[str], url_pattern: str) -> pd.DataFrame:
    """
    Run parse_wmx_top10 for each date and return all rows in one DataFrame.

    dates: list of date strings, one PDF per entry.
    url_pattern: a Python format string that yields the PDF URL for each date.

    Two usage modes, selected by whether '{date}' occurs in url_pattern:
    1) '{date}' present: each entry of dates is substituted verbatim for
       {date} and is also used unchanged as the 'date' column value.

    2) '{date}' absent: each entry is parsed with datetime.fromisoformat
       (e.g. '2025-09-30') and the pattern may use the derived fields
       {year} (%Y), {month} (%m), {month_name} (%B) and {month_abr} (%b),
       e.g. "https://.../{month_abr}-{year}_WMX.pdf". The 'date' column
       value is the parsed date formatted as YYYY-MM-DD.

    Returns the concatenated rows for all dates. For an empty dates list an
    empty DataFrame with the columns firm, portfolio, date, code, company is
    returned, so callers always receive a DataFrame.

    Raises whatever parse_wmx_top10 raises, ValueError from
    datetime.fromisoformat for a malformed date in mode 2, and KeyError if
    the pattern references a field that is not provided.
    """
    # The mode depends only on the pattern, so decide it once.
    uses_date_field = "{date}" in url_pattern

    out = None
    for d in dates:
        if uses_date_field:
            url = url_pattern.format(date=d)
            date_label = d
        else:
            # parse ISO date
            dt = datetime.fromisoformat(d)
            ctx = {
                "year": dt.strftime("%Y"),
                "month": dt.strftime("%m"),
                "month_name": dt.strftime("%B"),
                "month_abr": dt.strftime("%b"),
            }
            url = url_pattern.format(**ctx)
            date_label = dt.strftime("%Y-%m-%d")
        out = parse_wmx_top10(url, date_label, out)

    if out is None:
        # No dates were given: honour the declared return type instead of
        # returning None.
        return pd.DataFrame(columns=["firm", "portfolio", "date", "code", "company"])
    return out

if __name__ == "__main__":
    # Example 1: "{date}" mode. Each label is substituted verbatim into the
    # URL pattern and is also stored unchanged in the output's "date" column.
    dates = ["Sep-2025"]  # labels to use in the output
    pattern_direct = "https://wilsonassetmanagement.com.au/wp-content/uploads/2025/10/3.-{date}_WMX.pdf"
    df1 = run_for_dates(dates, pattern_direct)
    print("Direct pattern result:")
    print(df1)

    # Example 2: ISO-date mode. The pattern has no "{date}" field, so each
    # entry is parsed as YYYY-MM-DD and the URL is built from derived fields
    # (here only {month_abr}).
    # NOTE: the upload folder ("2025/10") and the year in the file name are
    # hard-coded in this pattern; adjust them if the folder naming changes by month.
    iso_dates = ["2025-09-30"]
    pattern_from_parts = "https://wilsonassetmanagement.com.au/wp-content/uploads/2025/10/3.-{month_abr}-2025_WMX.pdf"
    df2 = run_for_dates(iso_dates, pattern_from_parts)
    print("\nParts pattern result:")
    print(df2)


Direct pattern result:
                      firm portfolio      date code  \
0  Wilson Asset Management       WMX  Sep-2025  ANZ   
1  Wilson Asset Management       WMX  Sep-2025  BHP   
2  Wilson Asset Management       WMX  Sep-2025  CGF   
3  Wilson Asset Management       WMX  Sep-2025  CSL   
4  Wilson Asset Management       WMX  Sep-2025  NAB   
5  Wilson Asset Management       WMX  Sep-2025  ORA   
6  Wilson Asset Management       WMX  Sep-2025  RIO   
7  Wilson Asset Management       WMX  Sep-2025  SGP   
8  Wilson Asset Management       WMX  Sep-2025  WBC   
9  Wilson Asset Management       WMX  Sep-2025  WTC   

                                             company  
0                                 ANZ Group Holdings  
1                                          BHP Group  
2                                         Challenger  
3                                                CSL  
4                            National Australia Bank  
5                                        