http://export.arxiv.org/api/{method_name}?{parameters}


In [None]:
import requests
import xml.etree.ElementTree as ET
import time

In [None]:
# https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term=&terms-0-field=title&classification-computer_science=y&classification-physics_archives=all&classification-include_cross_list=include&date-year=&date-filter_by=date_range&date-from_date=2025-02-1&date-to_date=2025-02-2&date-date_type=submitted_date&abstracts=hide&size=200&order=-announced_date_first

In [None]:
# arXiv API query: every paper in a cs* category whose submittedDate lies
# between 2025-02-01 and 2025-02-28. Paging parameters are appended later.
base_url = (
    'http://export.arxiv.org/api/query'
    '?search_query=cat:cs*+AND+submittedDate:[20250201+TO+20250228]'
)
# Upper bound used when paging through the results.
# NOTE(review): presumably the total hit count reported for this query - confirm.
expected_papers = 11_574

In [None]:
# Getting the ids based on the expected/desired number of papers
# The API allows 2000 papers per request, with no more than one request being made every 3 seconds
# However, using fewer papers per request and sleeping for 5 seconds between requests
# was found to work best, as the results are more consistent
def get_paper_ids(base_url, expected_papers, *, page_size=1000, delay_seconds=5):
    """Collect the Atom ``<id>`` of every entry returned by a paged arXiv query.

    The query is fetched in consecutive, non-overlapping pages: page ``k``
    requests ``start=k * page_size`` with ``max_results=page_size``. Paging
    stops once ``start`` reaches ``expected_papers``.

    Args:
        base_url: Query URL that already contains ``search_query``; the
            ``&start=`` and ``&max_results=`` parameters are appended to it.
        expected_papers: Number of results to page through. A value of zero
            or less performs no request and returns an empty list.
        page_size: Results requested per call (keyword-only).
        delay_seconds: Pause between two consecutive requests (keyword-only).

    Returns:
        A list with the text of every ``entry/id`` element, in response
        order. Entries look like ``http://arxiv.org/abs/<id>``.

    Raises:
        ValueError: If ``page_size`` is not positive (the loop would never end).
        requests.HTTPError: If the API answers with an error status.
        xml.etree.ElementTree.ParseError: If a response body is not valid XML.
    """
    if page_size <= 0:
        raise ValueError("page_size must be a positive integer")

    id_path = './/{http://www.w3.org/2005/Atom}entry/{http://www.w3.org/2005/Atom}id'
    paper_ids = []
    start_search_index = 0
    rounds = 1

    while start_search_index < expected_papers:
        # Throttle only between requests; there is nothing to wait for
        # before the first one or after the last one.
        if rounds > 1:
            print("Sleeping...")
            time.sleep(delay_seconds)

        # max_results is the page length, not an end index, so it must stay
        # constant; growing it makes pages overlap and duplicates ids.
        paginated_url = f'{base_url}&start={start_search_index}&max_results={page_size}'
        print(f"Starting round {rounds} with url: {paginated_url}")
        print("Getting response from arxiv...")
        response = requests.get(paginated_url)
        # Fail loudly on an error page instead of parsing it as a feed.
        response.raise_for_status()

        print("Finding IDs...")
        # Parse the raw bytes so the XML declaration decides the encoding.
        root = ET.fromstring(response.content)
        paper_ids.extend(item.text for item in root.findall(id_path))

        start_search_index += page_size
        rounds += 1

    return paper_ids
        

In [None]:
# Get the ids
ids = get_paper_ids(base_url, expected_papers)
print(len(ids))

# Remove duplicates. dict.fromkeys keeps the first-seen order, so repeated
# runs produce the same list (a set gives no ordering guarantee).
new_ids = list(dict.fromkeys(ids))
print(len(new_ids))

# Creating final list of ids without http://arxiv.org/abs/
final_ids = [item.replace("http://arxiv.org/abs/", "") for item in new_ids]

# Print the last id as a quick sanity check; skip it when nothing came back
# so an empty result does not raise IndexError.
if final_ids:
    print(final_ids[-1])

In [None]:
import json

# Read the list of ids from ids.json in the working directory.
# NOTE(review): no visible cell writes ids.json - confirm it is created
# before this cell runs.
# The context manager closes the file handle as soon as the JSON is parsed.
with open("ids.json", "r", encoding="utf-8") as ids_file:
    final_ids = json.load(ids_file)

In [None]:
import os
import tarfile
import requests
def download_papers(final_ids):
    """Download each paper's arXiv source bundle and keep only its .tex files.

    For every id the archive at ``https://arxiv.org/e-print/<id>`` is fetched
    and written to a temporary file under ``temp/papers``. Its ``.tex``
    members are extracted into ``papers/<id>`` and the temporary file is
    removed again. A download that cannot be read as a tar.gz archive is
    reported and skipped, so one bad paper does not stop the run.

    Args:
        final_ids: Iterable of arXiv identifiers (without the
            ``http://arxiv.org/abs/`` prefix).
    """
    temp_dir = "temp/papers"
    # Use the stdlib "data" extraction filter where this Python provides it.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

    for final_id in final_ids:
        print(f"Downloading... {final_id}")

        url = f"https://arxiv.org/e-print/{final_id}"
        response = requests.get(url)

        # Create temp directory structure
        os.makedirs(temp_dir, exist_ok=True)

        # Save the file. Ids containing "/" are flattened so the temporary
        # file always sits directly inside temp_dir.
        filename = os.path.join(temp_dir, str(final_id).replace("/", "_"))
        with open(filename, 'wb') as f:
            f.write(response.content)

        print("Extracting files...")

        # Create output directory
        output_dir = f"papers/{final_id}"
        os.makedirs(output_dir, exist_ok=True)

        try:
            with tarfile.open(filename, "r:gz") as tar:
                for member in tar.getmembers():
                    # Only the LaTeX sources are of interest.
                    if not member.name.endswith('.tex'):
                        continue
                    # The archive is untrusted input: never follow links or
                    # write outside of output_dir.
                    if not _is_safe_tex_member(member, output_dir):
                        print(f"Skipped unsafe archive member: {member.name}")
                        continue
                    tar.extract(member, path=output_dir, **extract_kwargs)
                    print(f"Extracted: {member.name}")
        except Exception as exc:
            # Best effort per paper: report the failure and move on.
            # Ctrl-C (KeyboardInterrupt) is deliberately not caught here.
            content_type = response.headers.get('Content-Type')
            print(f"Failed to extract: {final_id}, it may not be a tar.gz file: {content_type} ({exc})")
        finally:
            # Delete the temporary file even when extraction was interrupted
            os.remove(filename)
            print(f"Temporary file deleted: {filename}")


def _is_safe_tex_member(member, output_dir):
    """Return True if *member* is a regular file that lands inside *output_dir*."""
    if not member.isfile():
        return False
    root = os.path.realpath(output_dir)
    target = os.path.realpath(os.path.join(root, member.name))
    return target.startswith(root + os.sep)

In [None]:
# Download the source archive of every id in final_ids and extract its .tex files
download_papers(final_ids)