In [1]:
import requests
import tarfile
import shutil
import time
import csv
import os
import re
from lxml import etree

In [2]:
# Fixed variables
directory = "source_files/"
metadata_file_path = directory + "papers.csv"
MIN_CREATED_YEAR = 2020

# Creating directory
os.makedirs(directory, exist_ok=True)

# Creating metadata file
if os.path.isfile(metadata_file_path) == False:
    with open(metadata_file_path, "w"):
        pass

In [3]:
def query_arxiv(spec, date):
    
    #Statistics
    count_success = 0
    count_failed = 0
    count_already_existing = 0

    # Request query
    url = f'https://export.arxiv.org/oai2?verb=ListRecords&set={spec}&from={date}&until={date}&metadataPrefix=arXiv'
    print(url)

    # Request response
    response = requests.get(url)
    root = etree.fromstring(response.content)

    #print(etree.tostring(root, pretty_print=True, encoding='unicode'))
    
    entry_ids = []
    entry_titles = []
    
    namespaces = {
        'oai': 'http://www.openarchives.org/OAI/2.0/',
        'arxiv': 'http://arxiv.org/OAI/arXiv/'
    }
    print("\nFound records:")
    record_list = root.findall('.//oai:ListRecords/oai:record', namespaces=namespaces)
    print(len(record_list))
    for record in record_list:
        fid = record.find('.//arxiv:id', namespaces=namespaces)
        title = record.find('.//arxiv:title', namespaces=namespaces)
        license = record.find('.//arxiv:license', namespaces=namespaces)
        created = record.find('.//arxiv:created', namespaces=namespaces)
        #print(fid.text, title.text.replace("\n", "").replace("  ", " "), license.text, created.text)
        
        year_match = re.match("(\d\d\d\d)-\d\d\-\d\d", created.text)
        if year_match:
            if int(year_match.group(1)) >= MIN_CREATED_YEAR:
                if (license.text == "http://creativecommons.org/licenses/by/4.0/" or
                license.text == "http://creativecommons.org/licenses/by-sa/4.0/" or
                license.text == "http://creativecommons.org/licenses/by-nc-sa/4.0/" or
                license.text == "http://creativecommons.org/licenses/by-nc-nd/4.0/" or
                license.text == "http://creativecommons.org/publicdomain/zero/1.0/"):
                    entry_ids.append(fid.text)
                    entry_titles.append(title.text.replace("\n", "").replace("  ", " "))
                    #print("Success!")
                elif (license.text != "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"):
                    print("Bad license:", license.text)

    print("\nNumber of entries:", len(entry_ids))
    
    # Downloading the papers
    for i in range(len(entry_ids)):
        if check_new_paper(entry_ids[i]):
            time.sleep(3)
            if download_source_paper(entry_ids[i], entry_titles[i], spec):
                print(f"{i+1}. Paper: {entry_ids[i]} was successfully downloaded and extracted.")
                count_success += 1
            else:
                print(f"{i+1}. Paper: {entry_ids[i]} was not downloaded/extracted.")
                count_failed += 1
            #break
        else:
            print(f"{i+1}. Paper: {entry_ids[i]} is already existing in the collected dataset.")
            count_already_existing += 1
    
    print("\nStatistics:")
    print(f"Successfully downloaded papers: {count_success}")
    print(f"Failed downloaded papers: {count_failed}")
    print(f"Already existing papers: {count_already_existing}")
    
# Download the found ids and extract tar.gz archive
def download_source_paper(entry_id, entry_title, spec):       
    # Downloading
    source_link = "https://arxiv.org/src/" + entry_id

    response = requests.get(source_link)
    #print(response.content)
    content_disposition = response.headers.get('content-disposition', '')
    filename_match = re.search(r'filename="(.+)"', content_disposition)
    if filename_match:
        filename = filename_match.group(1)
        if re.fullmatch(r".*\.tar.gz", filename) == False:
            print(f"File is of unexpected data format. ({filename})")
            return False
    else:
        print(f"File name was not found. ({entry_id})")
        return False
    filepath = directory + filename
    with open(filepath, "wb") as file:
        file.write(response.content)

    # Extracting
    try:
        extracting_path = directory + entry_id.replace('/', '-')
        t = tarfile.open(filepath)
        t.extractall(path = extracting_path)
        t.close()

        # Saving meta information
        with open(metadata_file_path, "a", encoding="utf-8") as meta_file:
            meta_file.write(entry_id + ";" + entry_title + ";" + spec + "\n")

    except Exception as e:
        print("Error for " + filename + ": " + str(e).split("\n")[0])
        if os.path.isdir(extracting_path):
            shutil.rmtree(extracting_path) # Removing the paper
        os.remove(filepath)
        return False

    # Deleting downloaded archive/file
    os.remove(filepath)
    
    return True

# Check whether the paper is new or already downloaded
def check_new_paper(entry_id):
    with open (metadata_file_path, "r") as metadata_file:
        spamreader = csv.reader(metadata_file, delimiter=';', quotechar='|')
        for row in spamreader:
            if row[0] == entry_id:
                return False
    return True

In [4]:
query_arxiv("cs", "2024-09-30")

https://export.arxiv.org/oai2?verb=ListRecords&set=cs&from=2024-09-30&until=2024-09-30&metadataPrefix=arXiv

Found records:
598

Number of entries: 336
1. Paper: 2011.04094 was successfully downloaded and extracted.
2. Paper: 2102.00733 was successfully downloaded and extracted.
Error for arXiv-2103.11338v1.pdf: file could not be opened successfully:
3. Paper: 2103.11338 was not downloaded/extracted.
4. Paper: 2204.13215 was successfully downloaded and extracted.
5. Paper: 2206.07282 was successfully downloaded and extracted.
6. Paper: 2208.04284 was successfully downloaded and extracted.
7. Paper: 2303.11192 was successfully downloaded and extracted.
8. Paper: 2304.09510 was successfully downloaded and extracted.
9. Paper: 2305.01851 was successfully downloaded and extracted.
10. Paper: 2305.02785 was successfully downloaded and extracted.
11. Paper: 2307.16446 was successfully downloaded and extracted.
12. Paper: 2309.07322 was successfully downloaded and extracted.
13. Paper: 2309.1

118. Paper: 2409.17763 was successfully downloaded and extracted.
119. Paper: 2409.17827 was successfully downloaded and extracted.
120. Paper: 2409.17851 was successfully downloaded and extracted.
121. Paper: 2409.17985 was successfully downloaded and extracted.
122. Paper: 2409.17993 was successfully downloaded and extracted.
123. Paper: 2409.17994 was successfully downloaded and extracted.
Error for arXiv-2409.18052v2.pdf: file could not be opened successfully:
124. Paper: 2409.18052 was not downloaded/extracted.
Error for arXiv-2409.18132v1.gz: file could not be opened successfully:
125. Paper: 2409.18132 was not downloaded/extracted.
126. Paper: 2409.18156 was successfully downloaded and extracted.
Error for arXiv-2409.18157v1.pdf: file could not be opened successfully:
127. Paper: 2409.18157 was not downloaded/extracted.
128. Paper: 2409.18158 was successfully downloaded and extracted.
Error for arXiv-2409.18162v1.pdf: file could not be opened successfully:
129. Paper: 2409.18162

234. Paper: 2409.18585 was successfully downloaded and extracted.
235. Paper: 2409.18586 was successfully downloaded and extracted.
236. Paper: 2409.18590 was successfully downloaded and extracted.
237. Paper: 2409.18591 was successfully downloaded and extracted.
238. Paper: 2409.18592 was successfully downloaded and extracted.
Error for arXiv-2409.18597v1.pdf: file could not be opened successfully:
239. Paper: 2409.18597 was not downloaded/extracted.
240. Paper: 2409.18601 was successfully downloaded and extracted.
241. Paper: 2409.18606 was successfully downloaded and extracted.
242. Paper: 2409.18611 was successfully downloaded and extracted.
Error for arXiv-2409.18612v1.pdf: file could not be opened successfully:
243. Paper: 2409.18612 was not downloaded/extracted.
244. Paper: 2409.18616 was successfully downloaded and extracted.
245. Paper: 2409.18620 was successfully downloaded and extracted.
246. Paper: 2409.18626 was successfully downloaded and extracted.
247. Paper: 2409.18629