# Download PDFs for Extraction

## Libraries

In [2]:
import os, re, time, requests
import pandas as pd
from pathlib import Path

# === CONFIG ===
# Contact address sent with every request; it is REQUIRED for Unpaywall,
# so use a real email.
CONTACT_EMAIL = "ebuchanan@harrisburgu.edu"
# CSV with the records to process (or wherever your df lives).
INPUT_CSV = "../03.classify_data/classified_lab.csv"
DOI_COL = "doi"                  # column of INPUT_CSV holding the DOIs
OUT_DIR = Path("pdfs")           # directory the downloaded PDFs are written to
POLITE_DELAY = 0.3               # seconds between network calls

# === Helpers ===
# All requests share one identifying User-Agent; the JSON API calls
# additionally ask for a JSON response.
HEADERS_ANY = {"User-Agent": f"LAB-pdf-fetcher ({CONTACT_EMAIL})"}
HEADERS_JSON = {"Accept": "application/json", **HEADERS_ANY}

def clean_doi(doi: str) -> str:
    """Normalise a raw DOI cell to a bare DOI such as ``10.1234/abc``.

    Surrounding whitespace, a leading ``http(s)://doi.org/`` or
    ``http(s)://dx.doi.org/`` resolver prefix, and a leading ``doi:`` label
    are removed (all matched case-insensitively).

    Returns:
        The bare DOI, or ``""`` when *doi* is not a string or is only a
        placeholder for a missing value.
    """
    if not isinstance(doi, str):
        return ""
    doi = doi.strip()
    # Stringifying a missing cell (e.g. ``astype(str)`` on NaN) yields these
    # literals; they must not be looked up as if they were real DOIs.
    if doi.lower() in {"", "nan", "none", "null", "<na>"}:
        return ""
    doi = re.sub(r"^(?:https?://(?:dx\.)?doi\.org/|doi:)\s*", "", doi, flags=re.I)
    return doi.strip()

def safe_get(url, params=None, headers=None, timeout=30, stream=False, allow_redirects=True):
    """Issue a GET request and report failure as a value instead of raising.

    Returns:
        ``(response, None)`` when the request completed and
        ``raise_for_status()`` did not raise, otherwise
        ``(None, message)`` where *message* is the text of the
        ``requests`` exception.
    """
    request_options = {
        "params": params,
        "headers": headers,
        "timeout": timeout,
        "stream": stream,
        "allow_redirects": allow_redirects,
    }
    try:
        response = requests.get(url, **request_options)
        response.raise_for_status()
    except requests.exceptions.RequestException as exc:
        return None, str(exc)
    return response, None

def find_pdf_via_unpaywall(doi: str):
    """Look up a direct PDF link for *doi* via the Unpaywall API.

    Requests ``https://api.unpaywall.org/v2/{doi}`` with ``CONTACT_EMAIL``
    as the ``email`` query parameter.

    Returns:
        ``(pdf_url, None)`` for the first location carrying ``url_for_pdf``
        (``best_oa_location`` is checked before ``oa_locations``),
        ``(None, None)`` when the record lists no PDF link, and
        ``(None, message)`` when the request failed or the body could not
        be used.
    """
    url = f"https://api.unpaywall.org/v2/{doi}"
    r, err = safe_get(url, params={"email": CONTACT_EMAIL}, headers=HEADERS_JSON)
    if err or r is None:
        return None, f"unpaywall err: {err or 'unknown'}"
    try:
        js = r.json()
    except ValueError:
        # A non-JSON body (e.g. an HTML error page) must not abort the batch.
        return None, "unpaywall err: invalid JSON"
    if not isinstance(js, dict):
        return None, "unpaywall err: unexpected payload"
    # Preferred location first, then every other listed location.
    locations = [js.get("best_oa_location"), *(js.get("oa_locations") or [])]
    for loc in locations:
        if isinstance(loc, dict) and loc.get("url_for_pdf"):
            return loc["url_for_pdf"], None
    return None, None

def find_pdf_via_openalex(doi: str):
    """Look up a direct PDF link for *doi* via the OpenAlex API.

    Requests ``https://api.openalex.org/works/DOI:{doi}`` selecting only
    ``best_oa_location``, ``primary_location`` and ``open_access``.

    Returns:
        ``(pdf_url, None)`` from ``best_oa_location.pdf_url``, then
        ``primary_location.pdf_url``, then ``open_access.oa_url`` (the last
        only when it ends in ``.pdf``, since it may be a landing page),
        ``(None, None)`` when none of those is present, and
        ``(None, message)`` when the request failed or the body could not
        be used.
    """
    url = f"https://api.openalex.org/works/DOI:{doi}"
    r, err = safe_get(
        url,
        params={"select": "best_oa_location,primary_location,open_access"},
        headers=HEADERS_JSON,
    )
    if err or r is None:
        return None, f"openalex err: {err or 'unknown'}"
    try:
        js = r.json()
    except ValueError:
        # A non-JSON body (e.g. an HTML error page) must not abort the batch.
        return None, "openalex err: invalid JSON"
    if not isinstance(js, dict):
        return None, "openalex err: unexpected payload"
    for key in ("best_oa_location", "primary_location"):
        loc = js.get(key)
        if isinstance(loc, dict) and loc.get("pdf_url"):
            return loc["pdf_url"], None
    oa = js.get("open_access")
    oa_url = oa.get("oa_url") if isinstance(oa, dict) else None
    if isinstance(oa_url, str) and oa_url.lower().endswith(".pdf"):
        return oa_url, None
    return None, None

def find_pdf_via_crossref(doi: str):
    """Look up a direct PDF link for *doi* via the Crossref API.

    Requests ``https://api.crossref.org/works/{doi}`` and scans
    ``message.link`` for an entry whose ``content-type`` is
    ``application/pdf``.

    Returns:
        ``(pdf_url, None)`` for the first such entry that has a ``URL``,
        ``(None, None)`` when there is none, and ``(None, message)`` when
        the request failed or the body could not be used.
    """
    url = f"https://api.crossref.org/works/{doi}"
    r, err = safe_get(url, headers=HEADERS_JSON)
    if err or r is None:
        return None, f"crossref err: {err or 'unknown'}"
    try:
        js = r.json()
    except ValueError:
        # A non-JSON body (e.g. an HTML error page) must not abort the batch.
        return None, "crossref err: invalid JSON"
    if not isinstance(js, dict):
        return None, "crossref err: unexpected payload"
    # ``or {}`` also covers an explicit ``"message": null``.
    msg = js.get("message") or {}
    links = msg.get("link") if isinstance(msg, dict) else None
    for link in links or []:
        if not isinstance(link, dict):
            continue
        is_pdf = (link.get("content-type") or "").lower() == "application/pdf"
        if is_pdf and link.get("URL"):
            return link["URL"], None
    return None, None

def resolve_doi_to_pdf(doi: str):
    """Follow the ``https://doi.org/{doi}`` redirect chain and accept a PDF.

    The request is streamed so that only the headers and final URL are
    inspected here; the actual file is fetched later by the downloader.

    Returns:
        ``(final_url, None)`` when the final response declares
        ``application/pdf`` or its URL ends in ``.pdf``,
        ``(None, None)`` when it lands on anything else, and
        ``(None, message)`` when the request failed.
    """
    url = f"https://doi.org/{doi}"
    r, err = safe_get(url, headers=HEADERS_ANY, timeout=30, stream=True, allow_redirects=True)
    if err or r is None:
        return None, f"doi resolve err: {err}"
    try:
        ctype = (r.headers.get("Content-Type") or "").lower()
        final_url = r.url
    finally:
        # The body is never read, so the streamed connection has to be
        # released explicitly or it stays open.
        r.close()
    if "application/pdf" in ctype or final_url.lower().endswith(".pdf"):
        return final_url, None
    return None, None

def download_pdf(pdf_url: str, out_path: Path):
    """Stream *pdf_url* to *out_path*, keeping the file only if it is a PDF.

    The start of the body is always checked for the ``%PDF`` signature,
    whatever the ``Content-Type`` header or the URL suffix claim. Servers
    misreport in both directions, and an HTML login or landing page saved
    under a ``.pdf`` name would otherwise be counted as a success.

    Returns:
        ``(True, None)`` when the file was written completely,
        ``(False, message)`` when the request failed, the body is not a PDF,
        or the transfer broke part-way (the partial file is removed).
    """
    r, err = safe_get(pdf_url, headers=HEADERS_ANY, stream=True, timeout=60)
    if err or r is None:
        return False, f"download err: {err or 'unknown'}"
    ctype = (r.headers.get("Content-Type") or "").lower()
    out_path = Path(out_path)
    sniff_len = 1024  # tolerate a little leading junk before the signature
    started_writing = False
    try:
        # One iterator for the whole body, so sniffed bytes are never lost.
        chunks = r.iter_content(chunk_size=64 * 1024)
        head = b""
        for chunk in chunks:
            head += chunk
            if len(head) >= sniff_len:
                break
        if b"%PDF" not in head[:sniff_len]:
            return False, f"not pdf (ctype={ctype})"
        with open(out_path, "wb") as f:
            started_writing = True
            f.write(head)
            for chunk in chunks:
                if chunk:
                    f.write(chunk)
    except requests.exceptions.RequestException as e:
        # The connection dropped mid-transfer: do not leave a truncated PDF.
        if started_writing and out_path.exists():
            out_path.unlink()
        return False, f"download err: {e}"
    finally:
        # Streamed responses hold their connection until closed.
        r.close()
    return True, None

def safe_filename_from_doi(doi: str) -> str:
    """Build a filesystem-friendly PDF file name from *doi*.

    Each run of characters other than ASCII letters, digits, ``.``, ``_``
    and ``-`` collapses to a single underscore, and ``.pdf`` is appended,
    e.g. ``10.1234/abc.def`` -> ``10.1234_abc.def.pdf``.
    """
    stem = re.sub(r"[^A-Za-z0-9._-]+", "_", doi)
    return f"{stem}.pdf"

def fetch_pdf_for_doi(doi: str, out_dir: Path = OUT_DIR):
    """Find and download the PDF for one DOI, trying several sources in turn.

    Sources are consulted in order: Unpaywall, OpenAlex, Crossref, then the
    plain DOI redirect. A candidate URL is downloaded as soon as it is
    found. If that download fails, the remaining sources are still tried
    rather than giving up on the first link. A URL already attempted is not
    downloaded again.

    Args:
        doi: Raw DOI value; it is normalised with ``clean_doi`` first.
        out_dir: Directory for the PDF; created on demand.

    Returns:
        A ``(status, info, path_or_url)`` tuple:

        * ``("skip", "no-doi", None)`` - no usable DOI.
        * ``("ok", source, path)`` - saved; *source* names the step that
          supplied the link.
        * ``("fail", info, url)`` - links were found but none downloaded;
          *info* joins ``"<source> -> <error>"`` per attempt and *url* is
          the first link tried.
        * ``("miss", info, None)`` - no source offered a link; *info* joins
          the lookup errors, or is ``"no-pdf-found"``.
    """
    doi = clean_doi(doi)
    if not doi:
        return "skip", "no-doi", None
    steps = [
        ("unpaywall", find_pdf_via_unpaywall),
        ("openalex",  find_pdf_via_openalex),
        ("crossref",  find_pdf_via_crossref),
        ("resolve",   resolve_doi_to_pdf),
    ]
    out_path = out_dir / safe_filename_from_doi(doi)
    lookup_notes = []
    failed = []        # (source, url, error) for links that did not download
    attempted = set()  # the sources frequently report the very same link
    for name, func in steps:
        url, err = func(doi)
        if err:
            lookup_notes.append(f"{name}:{err}")
        if url and url not in attempted:
            attempted.add(url)
            out_dir.mkdir(parents=True, exist_ok=True)
            ok, derr = download_pdf(url, out_path)
            if ok:
                return "ok", name, str(out_path)
            failed.append((name, url, derr))
        time.sleep(POLITE_DELAY)
    if failed:
        info = "; ".join(f"{src} -> {derr}" for src, _, derr in failed)
        return "fail", info, failed[0][1]
    return "miss", "; ".join(lookup_notes) or "no-pdf-found", None

## Attempt Download

In [4]:
# === Load the dataframe the download loop runs over ===
df = pd.read_csv(filepath_or_buffer=INPUT_CSV)
df.head(n=5)

Unnamed: 0,title,year,doi,venue,authors,abstract,keywords,openalex_id,is_oa,cited_by,doi_clean,text,class
0,The Flickr frequency norms: What 17 years of i...,2022,10.3758/s13428-022-02031-y,Behavior Research Methods,Marco A. Petilli; Fritz Günther; Marco Marelli,Word frequency is one of the best predictors o...,"['Computer science', 'Natural language process...",https://openalex.org/W4311282624,True,8,10.3758/s13428-022-02031-y,computer science natural language processing i...,1
1,Translation norms for Malay and English words:...,2022,10.3758/s13428-022-01977-3,Behavior Research Methods,Soon Tat Lee; Walter J. B. van Heuven; Jessica...,,"['Malay', 'Linguistics', 'Ambiguity', 'Compute...",https://openalex.org/W4304196404,True,7,10.3758/s13428-022-01977-3,malay linguistics ambiguity computer science v...,1
2,The Flickr frequency norms: what 17 years of i...,2022,10.31234/osf.io/h4q86,,Marco A. Petilli; Fritz Guenther; Marco Marelli,Word frequency is one of the best predictors o...,"['Word lists by frequency', 'Computer science'...",https://openalex.org/W4283711714,True,1,10.31234/osf.io/h4q86,word lists frequency computer science natural ...,1
3,The Lancaster Sensorimotor Norms: multidimensi...,2019,10.3758/s13428-019-01316-z,Behavior Research Methods,Dermot Lynott; Louise Connell; Marc Brysbaert;...,Abstract Sensorimotor information plays a fund...,"['Cognitive psychology', 'Psychology', 'Percep...",https://openalex.org/W2982116886,True,307,10.3758/s13428-019-01316-z,cognitive psychology psychology perception cog...,1
4,As designações para o pão nosso de cada dia: a...,2021,10.17851/2237-2083.29.1.533-588,Revista de Estudos da Linguagem,Vanessa Yida,Este artigo tem como proposta a identificação ...,"['Lexico', 'Linguistics', 'Humanities', 'Art',...",https://openalex.org/W3095649193,True,1,10.17851/2237-2083.29.1.533-588,lexico linguistics humanities art lexicon phil...,1


In [5]:
# Fetch a PDF for every row of df, log one line per DOI, then write the
# per-row outcome next to the original columns.
if DOI_COL not in df.columns:
    raise ValueError(f"Column '{DOI_COL}' not found in {INPUT_CSV}")

results = []
for i, raw_doi in enumerate(df[DOI_COL]):
    # A missing cell must stay empty so the row is skipped; stringifying it
    # would produce the text "nan", which would then be looked up as a DOI.
    doi = "" if pd.isna(raw_doi) else str(raw_doi)
    status, info, path_or_url = fetch_pdf_for_doi(doi)
    if status == "ok":
        print(f"[{i}] OK  | {doi} -> {path_or_url}")
    elif status == "miss":
        print(f"[{i}] MISS| {doi} -> {info}")
    elif status == "fail":
        print(f"[{i}] FAIL| {doi} -> {info} (url={path_or_url})")
    elif status == "skip":
        print(f"[{i}] SKIP| {doi} -> {info}")
    results.append((status, info, path_or_url))
    time.sleep(POLITE_DELAY)

# Optional: attach results back to the dataframe for your own bookkeeping (not required)
df["pdf_status"] = [r[0] for r in results]
df["pdf_note"]   = [r[1] for r in results]
df["pdf_path"]   = [r[2] for r in results]
df.to_csv("with_pdf_status.csv", index=False)
print("Wrote status file -> with_pdf_status.csv")
print(f"PDFs saved in: {OUT_DIR.resolve()}")

[0] OK  | 10.3758/s13428-022-02031-y -> pdfs/10.3758_s13428-022-02031-y.pdf
[1] OK  | 10.3758/s13428-022-01977-3 -> pdfs/10.3758_s13428-022-01977-3.pdf
[2] OK  | 10.31234/osf.io/h4q86 -> pdfs/10.31234_osf.io_h4q86.pdf
[3] OK  | 10.3758/s13428-019-01316-z -> pdfs/10.3758_s13428-019-01316-z.pdf
[4] OK  | 10.17851/2237-2083.29.1.533-588 -> pdfs/10.17851_2237-2083.29.1.533-588.pdf
[5] OK  | 10.1371/journal.pone.0211336 -> pdfs/10.1371_journal.pone.0211336.pdf
[6] OK  | 10.3758/s13428-022-01923-3 -> pdfs/10.3758_s13428-022-01923-3.pdf
[7] OK  | 10.3758/s13428-022-01810-x -> pdfs/10.3758_s13428-022-01810-x.pdf
[8] OK  | 10.3389/fpsyg.2019.00278 -> pdfs/10.3389_fpsyg.2019.00278.pdf
[9] MISS| 10.1055/s-0039-3400972 -> openalex:openalex err: 404 Client Error: NOT FOUND for url: https://api.openalex.org/works/DOI:10.1055/s-0039-3400972?select=best_oa_location%2Cprimary_location%2Copen_access
[10] OK  | 10.3758/s13428-023-02198-y -> pdfs/10.3758_s13428-023-02198-y.pdf
[11] MISS| 10.1044/2018_ajsl

OSError: Cannot save file into a non-existent directory: 'output_data'