In [5]:
import pdfplumber
import requests
import pandas as pd
import re
from io import BytesIO

URLS = [
    "https://wilsonassetmanagement.com.au/wp-content/uploads/2025/07/12.-June-2025_WAR.pdf",
    "https://wilsonassetmanagement.com.au/wp-content/uploads/2025/09/2.-Aug-2025_WMX.pdf",
]

def fetch_pdf_text(url):
    """Download PDF and extract all text."""
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    with pdfplumber.open(BytesIO(r.content)) as pdf:
        text = "\n".join(page.extract_text(x_tolerance=2, y_tolerance=2, layout=True) or "" for page in pdf.pages)
    return text

def extract_top_holdings(text):
    """
    Extract either Top 10 or Top 20 holdings.
    Returns list of (code, company).
    """
    pattern = r"Top\s+(?:10|20)\s+(?:equity\s+)?holdings.*?(?=WAM|Our proven|About|Key dividend|Page\s+\d+|$)"
    match = re.search(pattern, text, flags=re.I | re.S)
    if not match:
        return []
    block = match.group(0)
    holdings = []
    for line in block.splitlines():
        line = line.strip()
        if not line or "Code" in line or "Company" in line:
            continue
        m = re.match(r"^([A-Z]{2,5})\s+(.+)$", line)
        if not m:
            continue
        code, company = m.groups()
        company = re.sub(r"\s[+-]?\d+(\.\d+)?x?$", "", company).strip()
        holdings.append((code, company))
    return holdings[:20]

def main():
    rows = []
    for url in URLS:
        print(f"Processing: {url}")
        try:
            text = fetch_pdf_text(url)
            holdings = extract_top_holdings(text)
            if not holdings:
                print(f"  ⚠️ No holdings found in {url}")
                continue
            # derive month + ticker from filename
            m = re.search(r"([A-Za-z]+)-2025_(\w{3})", url)
            month, ticker = (m.group(1), m.group(2)) if m else ("Unknown","Unknown")
            for code, company in holdings:
                rows.append({
                    "firm": "Wilson Asset Management",
                    "portfolio": "WAM Funds",
                    "ticker": ticker,
                    "month": month,
                    "code": code,
                    "company": company,
                    "url": url
                })
        except Exception as e:
            print(f"  ❌ Error parsing {url}: {e}")

    df = pd.DataFrame(rows)
    print("\n=== Aggregated DataFrame ===")
    print(df)
    df.to_csv("top_holdings_war_wmx.csv", index=False)
    print("\nSaved to top_holdings_war_wmx.csv")

if __name__ == "__main__":
    main()


Cannot set gray non-stroke color because /'P229' is an invalid float value
Cannot set gray non-stroke color because /'P271' is an invalid float value


Processing: https://wilsonassetmanagement.com.au/wp-content/uploads/2025/07/12.-June-2025_WAR.pdf
  ⚠️ No holdings found in https://wilsonassetmanagement.com.au/wp-content/uploads/2025/07/12.-June-2025_WAR.pdf
Processing: https://wilsonassetmanagement.com.au/wp-content/uploads/2025/09/2.-Aug-2025_WMX.pdf
  ⚠️ No holdings found in https://wilsonassetmanagement.com.au/wp-content/uploads/2025/09/2.-Aug-2025_WMX.pdf

=== Aggregated DataFrame ===
Empty DataFrame
Columns: []
Index: []

Saved to top_holdings_war_wmx.csv


In [3]:
df