In [None]:
import os, textwrap, time, requests
from bs4 import BeautifulSoup
from openai import OpenAI
from dotenv import load_dotenv
from urllib.parse import urljoin

# ------------------ ENV & OpenAI ------------------
# Load variables from a local .env file, then build the shared OpenAI client
# from OPENAI_API_KEY (None is passed through if the variable is unset).
load_dotenv(override=True)
openai = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Browser-like User-Agent sent with every page request.
UA = " ".join((
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117 Safari/537.36",
))

# Site root (used to absolutise relative article links) and the page that
# lists the latest JFQA issue.
BASE_URL = "https://www.cambridge.org"
JFQA_URL = (
    BASE_URL
    + "/core/journals/journal-of-financial-and-quantitative-analysis/latest-issue"
)

# ------------------ Helpers ------------------
def fetch_latest_issue(url: str) -> list[dict]:
    """Return unique {title, link} dicts for each article linked from *url*.

    Every ``<a>`` whose ``href`` contains ``/article/`` is treated as an
    article link. Links are de-duplicated on their path (query string and
    fragment removed) and returned in the order they first appear.

    Args:
        url: Address of the issue listing page.

    Returns:
        A list of ``{"title": str, "link": str}`` dicts, where ``link`` is
        resolved against ``BASE_URL``. ``title`` may be empty if no anchor
        pointing at the article carries any text.

    Raises:
        requests.HTTPError: If the listing page answers with a 4xx/5xx
            status, so an error page is not silently parsed as "0 articles".
        requests.RequestException: For connection problems and timeouts.
    """
    response = requests.get(url, headers={"User-Agent": UA}, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Keyed by normalised href; dicts preserve insertion order, so the result
    # keeps the page order of first appearance.
    by_href: dict[str, dict] = {}
    for a in soup.find_all("a", href=lambda h: h and "/article/" in h):
        # Drop the fragment first, then the query string, so "#…" and "?…"
        # variants of the same article collapse to one entry.
        href = a["href"].split("#")[0].split("?")[0]
        title = a.get_text(" ", strip=True)

        entry = by_href.get(href)
        if entry is None:
            by_href[href] = {"title": title, "link": urljoin(BASE_URL, href)}
        elif not entry["title"] and title:
            # The first anchor had no text (e.g. it wrapped an image); take
            # the title from a later anchor to the same article.
            entry["title"] = title

    articles = list(by_href.values())
    print(f"Found {len(articles)} unique article links.")
    return articles

def fetch_article_details(link: str) -> dict:
    """Fetch one article page and extract its abstract, date and authors.

    Args:
        link: URL of the article page.

    Returns:
        A dict with string values under ``"abstract"``, ``"pub_date"`` and
        ``"authors"`` (authors joined with ``", "``). Any field that is
        missing or empty on the page is ``"N/A"``. If the page cannot be
        downloaded (network error, timeout, 4xx/5xx status), a warning is
        printed and every field is ``"N/A"`` so that one bad article does
        not abort a whole batch.
    """
    missing = "N/A"

    try:
        response = requests.get(link, headers={"User-Agent": UA}, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"⚠️  fetch error → {link}: {e}")
        return {"abstract": missing, "pub_date": missing, "authors": missing}

    soup = BeautifulSoup(response.text, "html.parser")

    # abstract: an empty <div class="abstract"> counts as missing
    abs_tag = soup.find("div", class_="abstract")
    abstract = (abs_tag.get_text(" ", strip=True) if abs_tag else "") or missing

    # publication date, read from the citation_publication_date meta tag
    meta_date = soup.find("meta", attrs={"name": "citation_publication_date"})
    pub_date = (meta_date.get("content") if meta_date else None) or missing

    # authors (multiple <meta name="citation_author"> tags); tags without a
    # usable content attribute are skipped instead of raising KeyError
    author_tags = soup.find_all("meta", attrs={"name": "citation_author"})
    authors = [m["content"] for m in author_tags if m.get("content")]
    authors_str = ", ".join(authors) or missing

    return {"abstract": abstract, "pub_date": pub_date, "authors": authors_str}

def summarise(txt: str) -> str:
    """Summarise a paper abstract in 2-3 sentences via the OpenAI chat API.

    Args:
        txt: The abstract text to summarise.

    Returns:
        The model's summary with surrounding whitespace removed, or the
        string ``"Summary unavailable."`` when *txt* is empty/blank, the API
        call fails, or the model returns no text.
    """
    fallback = "Summary unavailable."

    # Nothing to summarise: skip the API call rather than paying for a
    # request that can only produce an invented answer.
    if not txt or not txt.strip():
        return fallback

    prompt = ("Summarise the following finance‑paper abstract in 2‑3 sentences, "
              "mentioning the question, method, and main finding.\n\n"
              f"Abstract:\n{txt}")
    try:
        rsp = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system",
                 "content": "You are a helpful finance research assistant."},
                {"role": "user", "content": prompt}],
            temperature=0.2, max_tokens=120
        )
        content = rsp.choices[0].message.content
    except Exception as e:
        # Best-effort boundary: any client/network failure degrades to the
        # fallback text so the caller's loop keeps going.
        print(f"⚠️  summarise error → {e}")
        return fallback

    # message.content may be None or blank; treat both as "no summary".
    return (content or "").strip() or fallback

def scrape_jfqa_latest() -> None:
    """Print a summarised digest of each article in the latest JFQA issue.

    For every link returned by ``fetch_latest_issue(JFQA_URL)`` the article
    page is fetched; articles whose abstract is ``"N/A"`` get a one-line
    notice, all others are summarised and printed with authors and date.
    """
    divider = "-" * 90

    for entry in fetch_latest_issue(JFQA_URL):
        info = fetch_article_details(entry["link"])
        heading = entry["title"]

        if info["abstract"] != "N/A":
            digest = summarise(info["abstract"])
            # Cap very long summaries at 600 characters for the console.
            clipped = textwrap.shorten(digest, width=600, placeholder="…")

            print(f"\n📘 {heading}")
            print(f"   Authors: {info['authors']}")
            print(f"   Date   : {info['pub_date']}")
            print("   Journal: JFQA (Latest Issue)")
            print("   Summary:", clipped)
            print(divider)

            # polite gap between OpenAI calls
            time.sleep(1.0)
        else:
            print(f"\n📘 {heading}  —  no abstract found.")

# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    scrape_jfqa_latest()
