In [1]:
import requests
import gzip
import tarfile
import shutil
import os
from lxml import etree
import re

In [2]:
# Variables
# Paging window into the arXiv API result list and the submission-date range.
start_result = 0
max_results = 100
start_date = "2020-01-01"
end_date = "2024-09-30"
# Working directory for downloaded archives and their extracted contents.
directory = "source_files/"

# Statistics (updated while the downloaded files are classified)
count_tar_gz = 0
count_gz = 0
count_other = 0

# Request query: entries whose submittedDate lies in [start_date, end_date]
url = f'http://export.arxiv.org/api/query?search_query=submittedDate:[{start_date} TO {end_date}]&start={start_result}&max_results={max_results}'

# Creating directory
os.makedirs(directory, exist_ok=True)

# Request response
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of handing an error
# page to the XML parser.
response = requests.get(url, timeout=60)
response.raise_for_status()
root = etree.fromstring(response.content)

# Locate the <id> tag of <entry>
namespaces = {'atom': 'http://www.w3.org/2005/Atom'}
entry_ids = root.xpath('//atom:entry/atom:id/text()', namespaces=namespaces)

# Store the extracted entry ID: keep only the part after "abs/".
# Ids without an "abs/" segment are skipped rather than raising IndexError.
entry_stems = [
    found_id.split('abs/', 1)[1]
    for found_id in entry_ids
    if 'abs/' in found_id
]
    
# Download the found ids and extract gz archive
#
# For every entry stem: fetch the source from arxiv.org/src/<stem>, read the
# file name from the Content-Disposition header, store the file temporarily,
# extract it when it is a .tar.gz archive, and finally delete the download.

# Compiled once outside the loop; [^"]+ stops at the closing quote so extra
# header parameters after the file name are not captured.
filename_pattern = re.compile(r'filename="([^"]+)"')

for stem in entry_stems:
    # Downloading
    source_link = "https://arxiv.org/src/" + stem
    try:
        response = requests.get(source_link, timeout=60)
    except requests.RequestException as e:
        # One failed or stalled download must not abort the whole batch.
        print("Error for " + stem + ": " + str(e).split("\n")[0])
        continue

    content_disposition = response.headers.get('content-disposition', '')
    filename_match = filename_pattern.search(content_disposition)
    # basename() drops any directory components from the server-supplied name
    # so the file can only be written inside `directory`.
    filename = os.path.basename(filename_match.group(1)) if filename_match else ""
    if not filename:
        print("File name was not found.")
        continue
    print(f"File name: {filename}")

    filepath = os.path.join(directory, filename)
    with open(filepath, "wb") as archive_file:
        archive_file.write(response.content)

    # Extracting
    try:
        if filename.endswith(".tar.gz"):  # tar.gz archive
            count_tar_gz += 1
            target_dir = os.path.join(directory, stem.replace('/', '-'))
            with tarfile.open(filepath) as t:
                if hasattr(tarfile, "data_filter"):
                    # Downloaded archives are untrusted input: the "data"
                    # filter rejects members that would escape target_dir.
                    t.extractall(path=target_dir, filter="data")
                else:
                    # Older Python without extraction filters.
                    t.extractall(path=target_dir)
        elif filename.endswith(".gz"):  # gz archive
            count_gz += 1
            # Plain .gz files mostly contain papers consisting purely of
            # mathematical equations, so they are only counted for now.
            # To include them later, decompress with gzip.open() and write
            # the result to <directory>/<stem with '/' -> '-'>/paper.tex.
        else:
            count_other += 1
            raise ValueError("File is of an unexpected data format.")
    except Exception as e:
        # Best-effort batch job: report the failure and move on to the next
        # entry instead of stopping the run.
        print("Error for " + filename + ": " + str(e).split("\n")[0])
    finally:
        # Deleting downloaded archive/file
        os.remove(filepath)

print("\nSuccess")
print(f"Number of tar.gz archives: {count_tar_gz}")
print(f"Number of gz archives    : {count_gz}")
print(f"Number of other files    : {count_other}")

File name: arXiv-2001.00113v2.gz
File name: arXiv-2001.00114v1.pdf
Error for arXiv-2001.00114v1.pdf: File is of an unexpected data format.
File name: arXiv-2001.00115v2.gz
File name: arXiv-2001.00116v2.tar.gz
File name: arXiv-2001.00117v1.tar.gz
File name: arXiv-2001.00118v2.gz
File name: arXiv-2001.00119v2.tar.gz
File name: arXiv-2001.00120v1.tar.gz
File name: arXiv-2001.00121v1.pdf
Error for arXiv-2001.00121v1.pdf: File is of an unexpected data format.
File name: arXiv-2001.00122v1.tar.gz
File name: arXiv-2001.00123v1.pdf
Error for arXiv-2001.00123v1.pdf: File is of an unexpected data format.
File name: arXiv-2001.00124v1.tar.gz
File name: arXiv-2001.00125v1.tar.gz
File name: arXiv-2001.00126v1.tar.gz
File name: arXiv-2001.00127v2.tar.gz
File name: arXiv-2001.00128v3.tar.gz
File name: arXiv-2001.00129v1.pdf
Error for arXiv-2001.00129v1.pdf: File is of an unexpected data format.
File name: arXiv-2001.00130v4.pdf
Error for arXiv-2001.00130v4.pdf: File is of an unexpected data format.
F