# In [None]:
import requests
from bs4 import BeautifulSoup
import re
import time
import os
import random

# Root of the site being scraped; every request URL is built from it.
BASE_URL = "https://arxiv.org"

# Pool of browser User-Agent strings; one is picked at random per request.
USER_AGENTS = [
    # Chrome on Windows 10
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
    # Safari on macOS 10.15
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
    # Chrome on Linux
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    # Firefox on Windows 7
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    # Chrome on macOS 11
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
]

def get_headers():
    """Build HTTP request headers with a randomly chosen User-Agent.

    Returns:
        A dict with a single ``"User-Agent"`` key whose value is drawn at
        random from ``USER_AGENTS``.
    """
    user_agent = random.choice(USER_AGENTS)
    headers = {"User-Agent": user_agent}
    return headers

def get_topics(timeout=30):
    """Return the archive slugs linked from the arXiv front page.

    Fetches ``BASE_URL + "/"`` and, for every anchor whose ``href`` starts
    with ``/archive/``, extracts the path segment that follows (for example
    ``/archive/cs`` yields ``"cs"``).

    Args:
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would block
            the scraper indefinitely.

    Returns:
        A sorted list of unique topic slugs. The list is empty when the
        server answers with a non-success status code.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(BASE_URL + "/", headers=get_headers(), timeout=timeout)
    if not response.ok:
        # Do not try to parse an error page; report it and return nothing.
        print(f"Failed to fetch topic index: HTTP {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    # Capture the slug itself rather than taking the last "/"-separated
    # piece, so "/archive/cs/" does not yield "" and "/archive/a/b" does not
    # yield "b". The lookahead requires the slug to end at a URL delimiter.
    slug_pattern = re.compile(r"/archive/([\w\-]+)(?=[/?#]|$)")
    topics = set()
    for a in soup.select("a[href^='/archive/']"):
        match = slug_pattern.match(a.get("href", ""))
        if match:
            topics.add(match.group(1))
    # Sorted so the crawl order is reproducible between runs.
    return sorted(topics)

def get_years_for_topic(topic, timeout=30):
    """Return the years listed on a topic's archive page.

    Fetches ``{BASE_URL}/archive/{topic}`` and collects the four-digit year
    from every anchor whose ``href`` starts with ``/year/{topic}/``.

    Args:
        topic: Archive slug, as returned by ``get_topics``.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error.

    Returns:
        A sorted list of unique year strings (e.g. ``["2023", "2024"]``).
        The list is empty when the server answers with a non-success status
        code or no matching links are present.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    url = f"{BASE_URL}/archive/{topic}"
    response = requests.get(url, headers=get_headers(), timeout=timeout)
    if not response.ok:
        # Do not try to parse an error page; report it and return nothing.
        print(f"Failed to fetch {url}: HTTP {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    # Escape the topic so any regex metacharacter in it is matched literally.
    year_pattern = re.compile(rf'/year/{re.escape(topic)}/(\d{{4}})')
    years = set()
    for a in soup.select(f"a[href^='/year/{topic}/']"):
        match = year_pattern.search(a['href'])
        if match:
            years.add(match.group(1))
    return sorted(years)

def get_months_for_year(topic, year):
    """Return the twelve month numbers as zero-padded two-digit strings.

    Args:
        topic: Unused; accepted so the call signature mirrors the other
            per-topic helpers.
        year: Unused; every year is treated as having all twelve months.

    Returns:
        ``["01", "02", ..., "12"]``.
    """
    padded = []
    for number in range(1, 13):
        padded.append(format(number, "02d"))
    return padded

def get_html_links(topic, year, month, timeout=30):
    """Collect links to HTML renderings from one monthly listing page.

    Fetches ``{BASE_URL}/list/{topic}/{year}-{month}?skip=0&show=2000`` and,
    for each ``<dt>`` entry, takes the first anchor whose ``href`` points at
    ``https://arxiv.org/html/<id>``. Each link found is also printed.

    Args:
        topic: Archive slug, as returned by ``get_topics``.
        year: Four-digit year string.
        month: Two-digit month string.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error.

    Returns:
        A list of HTML article URLs in page order. The list is empty when
        the server answers with a non-success status code (for instance for
        a month that has no listing) or no entry has an HTML link.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    # NOTE(review): only the first page (skip=0, show=2000) is requested, so
    # a month with more entries than that is presumably truncated - confirm
    # whether pagination is needed.
    url = f"{BASE_URL}/list/{topic}/{year}-{month}?skip=0&show=2000"
    response = requests.get(url, headers=get_headers(), timeout=timeout)
    if not response.ok:
        # Do not try to parse an error page; report it and return nothing.
        print(f"Failed to fetch {url}: HTTP {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    # Built once outside the loop; dots are escaped so they match literally.
    html_href = re.compile(r"^https://arxiv\.org/html/\d{4}\.\d{5}(v\d+)?")
    links = []
    # The page is already downloaded, so no per-entry delay is needed here:
    # iterating the parsed entries sends no further requests.
    for dt in soup.find_all("dt"):
        html_link = dt.find("a", href=html_href)
        if html_link:
            links.append(html_link["href"])
            print("Found HTML article:", html_link["href"])
    return links

def scrape_arxiv():
    """Crawl every topic, year and month and save the HTML article links.

    For each topic returned by ``get_topics`` and each year returned by
    ``get_years_for_topic``, all twelve monthly listings are fetched with
    ``get_html_links``. When a year produced at least one link, the links
    are written one per line to ``arxiv_links/{topic}/{year}.txt``,
    overwriting any existing file. Random pauses are inserted between
    months and between years.

    A request failure for a single topic or month is reported and skipped so
    that one transient network error does not abort the whole crawl; the
    file for that year then contains only the months that succeeded. A
    failure while fetching the topic index is not caught.
    """
    topics = get_topics()
    for topic in topics:
        print(f"Processing topic: {topic}")
        try:
            years = get_years_for_topic(topic)
        except requests.RequestException as exc:
            print(f"  Skipping topic {topic}: {exc}")
            continue
        for year in years:
            all_html_links = []
            for month in get_months_for_year(topic, year):
                print(f"  Checking {year}-{month}")
                try:
                    links = get_html_links(topic, year, month)
                except requests.RequestException as exc:
                    print(f"  Request failed for {topic} {year}-{month}: {exc}")
                    links = []
                all_html_links.extend(links)
                time.sleep(random.uniform(2, 5))  # Delay between months
            if all_html_links:
                os.makedirs(f"arxiv_links/{topic}", exist_ok=True)
                with open(f"arxiv_links/{topic}/{year}.txt", "w", encoding="utf-8") as f:
                    f.writelines(link + "\n" for link in all_html_links)
                print(f"Saved {len(all_html_links)} links for {topic}/{year}")
            time.sleep(random.uniform(5, 10))  # Delay between years

# Run the full crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    scrape_arxiv()
