In [None]:
from datetime import date
from calendar import month_name, month_abbr

BASE = "https://wilsonassetmanagement.com.au/wp-content/uploads"
TICKERS = ["WAA","WAM","WAR","WAX","WGB","WLE","WMA","WMI","WMX"]

def prefix_for_report_month(m):  # m: 1..12
    """Return the filename prefix for calendar month ``m``.

    The prefix numbering starts in July: July -> 1, August -> 2, ...,
    June -> 12.
    """
    # Shifting by +5 is congruent to shifting by -7 modulo 12.
    return (m + 5) % 12 + 1

def dir_year_month(year, m):
    """Return the ``(year, month)`` of the upload directory for a report.

    The directory is one month after the report month; a December report
    rolls over to January of the following year.
    """
    return (year + 1, 1) if m == 12 else (year, m + 1)

def month_variants(m):
    """Return the sorted, de-duplicated month spellings used in filenames.

    Both the full name (e.g. "June") and the abbreviation from
    ``calendar.month_abbr`` (e.g. "Jun") are offered; when they coincide
    (May) only one entry is returned.
    """
    full, abbr = month_name[m], month_abbr[m]
    # A set collapses the May == May case; an empty abbreviation is not added.
    spellings = {full, abbr} if abbr else {full}
    return sorted(spellings)

def estimate_urls_for_year(year=2025):
    """Build every candidate report URL for the given report year.

    For each report month (January..December), each month spelling from
    ``month_variants`` and each ticker in ``TICKERS``, one candidate is
    produced. The filename carries the report month and report year; the
    directory is the month after the report month (see ``dir_year_month``).

    Args:
        year: Report year used both in the filename and to derive the
            upload directory.

    Returns:
        A list of dicts with keys ``report_year``, ``report_month``,
        ``dir_year``, ``dir_month``, ``prefix``, ``month_text``, ``ticker``
        and ``url``.
    """
    urls = []
    for report_month in range(1, 13):  # Jan..Dec reports
        prefix = prefix_for_report_month(report_month)
        dir_y, dir_m = dir_year_month(year, report_month)
        for mv in month_variants(report_month):
            for t in TICKERS:
                # Use the requested year in the filename rather than a
                # hard-coded 2025, so other years yield correct URLs.
                fname = f"{prefix}.-{mv}-{year}_{t}.pdf"
                url = f"{BASE}/{dir_y:04d}/{dir_m:02d}/{fname}"
                urls.append({
                    "report_year": year,
                    "report_month": report_month,
                    "dir_year": dir_y,
                    "dir_month": dir_m,
                    "prefix": prefix,
                    "month_text": mv,
                    "ticker": t,
                    "url": url,
                })
    return urls

if __name__ == "__main__":
    candidates = estimate_urls_for_year(2025)
    # Preview the first dozen candidate URLs, then report how many exist.
    preview = [entry["url"] for entry in candidates[:12]]
    for link in preview:
        print(link)
    print(f"\nTotal candidates: {len(candidates)}")


https://wilsonassetmanagement.com.au/wp-content/uploads/2025/02/7.-Jan-2025_WAA.pdf
https://wilsonassetmanagement.com.au/wp-content/uploads/2025/02/7.-Jan-2025_WAM.pdf
https://wilsonassetmanagement.com.au/wp-content/uploads/2025/02/7.-Jan-2025_WAR.pdf
https://wilsonassetmanagement.com.au/wp-content/uploads/2025/02/7.-Jan-2025_WAX.pdf
https://wilsonassetmanagement.com.au/wp-content/uploads/2025/02/7.-Jan-2025_WGB.pdf
https://wilsonassetmanagement.com.au/wp-content/uploads/2025/02/7.-Jan-2025_WLE.pdf
https://wilsonassetmanagement.com.au/wp-content/uploads/2025/02/7.-Jan-2025_WMA.pdf
https://wilsonassetmanagement.com.au/wp-content/uploads/2025/02/7.-Jan-2025_WMI.pdf
https://wilsonassetmanagement.com.au/wp-content/uploads/2025/02/7.-Jan-2025_WMX.pdf
https://wilsonassetmanagement.com.au/wp-content/uploads/2025/02/7.-January-2025_WAA.pdf
https://wilsonassetmanagement.com.au/wp-content/uploads/2025/02/7.-January-2025_WAM.pdf
https://wilsonassetmanagement.com.au/wp-content/uploads/2025/02/7.-J

In [None]:
"""
Brute-force WAM candidate report URLs (2025), send each PDF's text to ChatGPT (GPT-5) to
extract Top 10 or Top 20 holdings (codes only), and aggregate to a pandas DataFrame.

Output columns: date (e.g., 'June-2025'), client ('Wilson Asset Management'),
portfolio (ticker like 'WAR'), stock (e.g., 'BHP').

Notes:
- UNSAFE by request: API key is embedded directly.
- We HEAD-check URLs to avoid wasting tokens on 404s.
- We chunk PDF text to keep prompts within context limits and union codes from all chunks.
- Company names are ignored — only stock codes are returned.
"""

import re
import math
import json
import requests
import pdfplumber
import pandas as pd
from io import BytesIO
from calendar import month_name, month_abbr
from openai import OpenAI

# ── Config ──────────────────────────────────────────────────────────────────────
# Chat model name passed to every completion request.
MODEL = "gpt-5"                       # Use your preferred/latest ChatGPT model
# Root of the upload tree under which candidate report URLs are built.
BASE = "https://wilsonassetmanagement.com.au/wp-content/uploads"
# Default report year for estimate_urls_for_year().
YEAR = 2025
# Portfolio tickers; one candidate URL is generated per ticker per month spelling.
TICKERS = ["WAA","WAM","WAR","WAX","WGB","WLE","WMA","WMI","WMX"]
# Upper bound on the number of characters of PDF text sent in one prompt.
MAX_CHARS_PER_CHUNK = 12000
# Timeout (seconds) passed to every requests call.
HTTP_TIMEOUT = 45
DO_HEAD_CHECK = True  # set False to skip HEAD existence check

# NOTE(review): API_KEY is not defined anywhere in the visible source; it must
# be defined before this line runs or this raises NameError — confirm where it
# is expected to come from.
client = OpenAI(api_key=API_KEY)

# ── Helpers to generate candidate URLs per your rules ───────────────────────────
def prefix_for_report_month(m):
    """Return the filename prefix for calendar month ``m`` (July=1 ... June=12)."""
    months_since_july = (m - 7) % 12
    return months_since_july + 1

def dir_year_month(year, m):
    """Return the upload directory ``(year, month)``: one month after the report."""
    if m == 12:
        # A December report lands in January of the next year.
        return year + 1, 1
    return year, m + 1

def month_variants(m):
    """Return the sorted, unique spellings of month ``m`` (full name and abbreviation)."""
    spellings = {month_name[m]}   # e.g. "June"
    short = month_abbr[m]         # e.g. "Jun"
    if short:
        # Set semantics drop the abbreviation when it equals the full name (May).
        spellings |= {short}
    return sorted(spellings)

def estimate_urls_for_year(year=YEAR):
    """Build every candidate report URL for the given report year.

    One candidate is produced per report month (January..December), per
    month spelling from ``month_variants`` and per ticker in ``TICKERS``.

    Args:
        year: Report year. It appears in the filename and the ``date``
            label, and determines the upload directory via
            ``dir_year_month``.

    Returns:
        A list of dicts with keys ``date`` (e.g. "June-2025"), ``ticker``
        and ``url``.
    """
    rows = []
    for report_month in range(1, 13):  # Jan..Dec reports
        prefix = prefix_for_report_month(report_month)
        dy, dm = dir_year_month(year, report_month)
        for mv in month_variants(report_month):
            # Derive the label from `year` rather than a hard-coded 2025 so
            # that other years produce matching filenames and dates.
            date_label = f"{mv}-{year}"
            for t in TICKERS:
                fname = f"{prefix}.-{date_label}_{t}.pdf"
                rows.append({
                    "date": date_label,  # display date from filename (e.g., June-2025)
                    "ticker": t,
                    "url": f"{BASE}/{dy:04d}/{dm:02d}/{fname}",
                })
    return rows

# ── Networking / PDF text ───────────────────────────────────────────────────────
def url_exists(url: str) -> bool:
    """Return True when ``url`` answers with HTTP 200.

    A HEAD request (following redirects) is tried first. If the server
    answers 403 or 405, a streamed GET is used as a fallback. Any
    ``requests.RequestException`` is treated as "does not exist".
    """
    try:
        head = requests.head(url, timeout=HTTP_TIMEOUT, allow_redirects=True)
        status = head.status_code
        if status == 200:
            return True
        if status not in (403, 405):
            return False
        # Some servers don't allow HEAD; probe with a streamed GET instead.
        probe = requests.get(url, stream=True, timeout=HTTP_TIMEOUT)
        try:
            return probe.status_code == 200
        finally:
            probe.close()
    except requests.RequestException:
        return False

def fetch_pdf_text(url: str) -> str:
    """Download the PDF at ``url`` and return its text with page markers.

    Each page's text is preceded by a ``--- PAGE n ---`` header; pages with
    no extractable text contribute an empty body. HTTP errors are raised by
    ``raise_for_status``.
    """
    response = requests.get(url, timeout=HTTP_TIMEOUT)
    response.raise_for_status()
    with pdfplumber.open(BytesIO(response.content)) as document:
        page_texts = (
            page.extract_text(x_tolerance=2, y_tolerance=2, layout=True) or ""
            for page in document.pages
        )
        # The join consumes the generator while the PDF is still open.
        return "\n".join(
            f"\n\n--- PAGE {number} ---\n{body}"
            for number, body in enumerate(page_texts, start=1)
        )

def chunk_text(text: str, max_chars: int):
    """Yield ``(chunk, index, total)`` for consecutive slices of ``text``.

    Each chunk holds at most ``max_chars`` characters; ``index`` is 1-based.
    Empty text still yields a single empty chunk.
    """
    total = math.ceil(len(text) / max_chars) or 1
    for number in range(1, total + 1):
        start = (number - 1) * max_chars
        yield text[start:start + max_chars], number, total

# ── LLM prompt (strict: JSON only, codes only) ──────────────────────────────────
# System message sent with every request to pin the model to bare-JSON replies.
SYSTEM_MSG = "You are a strict extraction engine. Output JSON only. No prose, no markdown."

def prompt_for_codes(date_label: str, portfolio: str, url: str, text_chunk: str) -> str:
    """Build the user prompt asking for holding codes from one text chunk.

    Args:
        date_label: Report label inserted into the "Meta" section.
        portfolio: Portfolio ticker inserted into the "Meta" section.
        url: Source URL inserted into the "Meta" section.
        text_chunk: The slice of PDF text appended after ``TEXT_CHUNK:``.

    Returns:
        The prompt text with surrounding whitespace stripped. It asks for a
        JSON object with ``list_type`` and ``codes`` keys.
    """
    # Company names not required; codes only. Accept Top 10 or Top 20 if present.
    return f"""
Extract stock holding codes from this PDF text chunk for the section titled
"Top 10 holdings" or "Top 20 holdings". Return ONLY valid JSON with this shape:

{{
  "list_type": "Top 10" | "Top 20" | "None",
  "codes": ["ANZ","BHP","..."]   // up to 20 codes, uppercase, no $ sign
}}

Rules:
- If you cannot find a Top 10 or Top 20 holdings list in this chunk, return:
  {{"list_type": "None", "codes": []}}
- Do NOT include company names.
- Do NOT include duplicates.
- Codes should be A-Z / 0-9 / '.' / '-' only.
- Keep order as seen top-to-bottom if visible.

Meta:
- date: {date_label}
- portfolio: {portfolio}
- source_url: {url}

TEXT_CHUNK:
{text_chunk}
""".strip()

def llm_extract_codes(date_label: str, portfolio: str, url: str, text: str) -> list[str]:
    """Extract holding codes from a report's text by querying the LLM per chunk.

    ``text`` is split with ``chunk_text`` and each chunk is sent as its own
    chat completion. Valid codes from every parseable reply are merged and
    de-duplicated, keeping the order in which they were first seen.

    Args:
        date_label: Report label passed into the prompt, e.g. "June-2025".
        portfolio: Portfolio ticker passed into the prompt, e.g. "WAR".
        url: Source URL passed into the prompt.
        text: Full extracted PDF text.

    Returns:
        Up to 20 codes if any chunk reported a "Top 20" list, otherwise up
        to 10. Replies that are not valid JSON objects are skipped.
    """
    # A dict is used as an insertion-ordered set so that truncation below
    # keeps the first codes seen instead of the alphabetically smallest ones.
    ordered_codes: dict[str, None] = {}
    saw_top20 = False
    for chunk, _idx, _total in chunk_text(text, MAX_CHARS_PER_CHUNK):
        user_prompt = prompt_for_codes(date_label, portfolio, url, chunk)
        # No `temperature` argument: the API rejects temperature=0 for the
        # configured model ("Only the default (1) value is supported").
        resp = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": SYSTEM_MSG},
                {"role": "user", "content": user_prompt},
            ],
        )
        # content can be None (e.g. a refusal); treat that as an empty reply.
        raw = (resp.choices[0].message.content or "").strip()
        # Strip code fences defensively
        if raw.startswith("```"):
            raw = raw.strip("`")
            raw = re.sub(r"^json", "", raw, flags=re.I).strip()
        try:
            obj = json.loads(raw)
        except json.JSONDecodeError:
            continue
        if not isinstance(obj, dict):
            # Valid JSON but not the requested object shape.
            continue
        if str(obj.get("list_type", "None")).lower() == "top 20":
            saw_top20 = True
        codes = obj.get("codes") or []
        if not isinstance(codes, list):
            continue
        for c in codes:
            if not isinstance(c, str):
                continue
            c = c.strip().upper()
            if re.fullmatch(r"[A-Z0-9.\-]{1,10}", c):
                ordered_codes.setdefault(c, None)
    # If we saw Top 20 at least once, keep up to 20; else keep up to 10
    final_n = 20 if saw_top20 else 10
    return list(ordered_codes)[:final_n]

# ── Run: generate candidates → filter → extract → DataFrame ─────────────────────
def main():
    """Generate candidate URLs, extract holdings for those that exist, and print a DataFrame.

    Candidates that fail the optional existence check, whose PDF cannot be
    fetched or parsed, or that yield no codes are skipped silently.
    """
    records = []
    for candidate in estimate_urls_for_year(YEAR):
        date_label = candidate["date"]   # e.g., "June-2025"
        ticker = candidate["ticker"]     # portfolio code, e.g., "WAR"
        url = candidate["url"]

        if DO_HEAD_CHECK and not url_exists(url):
            continue

        try:
            text = fetch_pdf_text(url)
        except Exception:
            continue

        # An empty code list simply contributes no rows.
        records.extend(
            {
                "date": date_label,
                "client": "Wilson Asset Management",
                "portfolio": ticker,
                "stock": code,
            }
            for code in llm_extract_codes(date_label, ticker, url, text)
        )

    df = pd.DataFrame(records, columns=["date", "client", "portfolio", "stock"])
    print(df)
    # df.to_csv("wam_holdings_2025_llm.csv", index=False)

if __name__ == "__main__":
    main()



BadRequestError: Error code: 400 - {'error': {'message': "Unsupported value: 'temperature' does not support 0 with this model. Only the default (1) value is supported.", 'type': 'invalid_request_error', 'param': 'temperature', 'code': 'unsupported_value'}}