In [12]:
import feedparser

def get_recent_arxiv_ids(category="cs.LG", max_results=100):
    """Return identifiers of the most recently submitted papers in a category.

    Queries the arXiv export API for entries in ``category``, sorted by
    submission date in descending order, and extracts an identifier from
    each feed entry.

    Args:
        category: arXiv category used in the ``cat:`` search query.
        max_results: Value passed as the ``max_results`` query parameter.

    Returns:
        A list of strings, one per feed entry, each being the part of the
        entry's ``id`` that follows ``/abs/`` (e.g. ``"2503.16421v1"``).
    """
    query_url = f"https://export.arxiv.org/api/query?search_query=cat:{category}&sortBy=submittedDate&sortOrder=descending&max_results={max_results}"
    parsed = feedparser.parse(query_url)

    # Each entry id is an abstract URL; keep only what follows "/abs/".
    return [item.id.split('/abs/')[-1] for item in parsed.entries]

In [13]:
import requests
import os
from time import sleep

def download_arxiv_html(paper_id, output_dir="arxiv_html", timeout=30):
    """Download the arXiv HTML rendering of a paper and save it to disk.

    Fetches ``https://arxiv.org/html/{paper_id}`` and, on HTTP 200, writes
    the response text to ``{output_dir}/{paper_id}.html`` (UTF-8), creating
    ``output_dir`` if needed. Any failure is reported with a printed message
    rather than raised, so a batch loop keeps going.

    Args:
        paper_id: arXiv identifier, e.g. ``"2503.16421v1"``.
        output_dir: Directory the HTML file is written into.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        None. Progress is reported via ``print``.
    """
    url = f"https://arxiv.org/html/{paper_id}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors / timeouts are reported like HTTP failures so a
        # single bad request does not abort the whole download loop.
        print(f"❌ Failed to download {paper_id}: {exc}")
        return

    if response.status_code == 200:
        os.makedirs(output_dir, exist_ok=True)
        # Old-style arXiv ids contain "/" (e.g. "cs/0112017v1"); used as-is
        # they would point into a subdirectory that does not exist.
        safe_name = paper_id.replace("/", "_")
        filepath = os.path.join(output_dir, f"{safe_name}.html")
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(response.text)
        print(f"✅ Downloaded {paper_id}")
    else:
        print(f"❌ Failed to download {paper_id}: HTTP {response.status_code}")


In [None]:
# Example usage: fetch the ids of the 30 most recently submitted cs.AI papers
# and save each paper's HTML rendering under the "arXiv/" directory.
paper_ids = get_recent_arxiv_ids("cs.AI", 30)

for pid in paper_ids:
    download_arxiv_html(pid, "arXiv/")
    sleep(1)  # Pause one second between requests to be polite to the server

✅ Downloaded 2503.16421v1
✅ Downloaded 2503.16416v1
✅ Downloaded 2503.16412v1
❌ Failed to download 2503.16408v1: HTTP 404
✅ Downloaded 2503.16402v1
✅ Downloaded 2503.16399v1
✅ Downloaded 2503.16394v1
✅ Downloaded 2503.16392v1
✅ Downloaded 2503.16389v1
✅ Downloaded 2503.16385v1
✅ Downloaded 2503.16371v1
✅ Downloaded 2503.16365v1
❌ Failed to download 2503.16364v1: HTTP 404
✅ Downloaded 2503.16356v1
✅ Downloaded 2503.16348v1
✅ Downloaded 2503.16342v1
❌ Failed to download 2503.16335v1: HTTP 404
✅ Downloaded 2503.16328v1
✅ Downloaded 2503.16326v1
✅ Downloaded 2503.16311v1
❌ Failed to download 2503.16307v1: HTTP 404
✅ Downloaded 2503.16304v1
✅ Downloaded 2503.16302v1
✅ Downloaded 2503.16290v1
✅ Downloaded 2503.16248v1
✅ Downloaded 2503.16227v1
✅ Downloaded 2503.16212v1
✅ Downloaded 2503.16203v1
❌ Failed to download 2503.16191v1: HTTP 404
✅ Downloaded 2503.16184v1
