In [1]:
import requests
import tarfile
import shutil
import time
import csv
import os
import re
from lxml import etree
from requests.exceptions import HTTPError

In [47]:
# Fixed variables
directory = "source_files/"
metadata_file_path = directory + "papers.csv"

relevant_publication_types = ["JournalArticle", "Conference", "Review"]
MIN_CREATED_YEAR = 2023

# Creating directory
os.makedirs(directory, exist_ok=True)

# Creating metadata file
if os.path.isfile(metadata_file_path) == False:
    open(metadata_file_path, "w").close()

In [None]:
"""
# Check if the paper was published following the procedure of peer review.
def check_for_paper_publication(paper_id, paper_title):
    while (True):
        try:
            rsp = requests.get('https://api.semanticscholar.org/graph/v1/paper/search',
                                   params={'query': paper_title, 'limit': 1, 'fields': 'externalIds,publicationTypes'})
            rsp.raise_for_status()
            results = rsp.json()
            total = results["total"]
            
            if total >= 1:
                #print(results['data'][0])
                arxiv_id = results['data'][0]['externalIds']['ArXiv']
                found_publication_types = results['data'][0]['publicationTypes']
                if arxiv_id != paper_id or found_publication_types is None:
                    #print(results)
                    return False
                #print(found_publication_types)
                for pub_type in relevant_publication_types:
                    if pub_type in found_publication_types:
                        return True
            return False
        except HTTPError as e:
            time.sleep(0.5)
        except KeyError as e:
            pass
        except Exception as e:
            print(f"Error Type: {type(e).__name__}")
            error_message = str(e)[:100]
            print(f"Error Message: {error_message}")
            
    return False
"""

In [46]:
def query_arxiv(spec, date):
    
    #Statistics
    count_success = 0
    count_failed = 0

    # Request query
    url = f'https://export.arxiv.org/oai2?verb=ListRecords&set={spec}&from={date}&until={date}&metadataPrefix=arXiv'
    print(url)

    # Request response
    response = requests.get(url)
    root = etree.fromstring(response.content)

    #print(etree.tostring(root, pretty_print=True, encoding='unicode'))
    
    entry_ids = []
    entry_titles = []
    
    namespaces = {
        'oai': 'http://www.openarchives.org/OAI/2.0/',
        'arxiv': 'http://arxiv.org/OAI/arXiv/'
    }
    record_list = root.findall('.//oai:ListRecords/oai:record', namespaces=namespaces)
    print("\nFound records:",len(record_list))
    record_counter = 0
    for record in record_list:
        record_counter += 1
        if record_counter % 50 == 0:
            print(str(record_counter) + " records processed.")
        
        try:
            fid = record.find('.//arxiv:id', namespaces=namespaces).text
            title = record.find('.//arxiv:title', namespaces=namespaces).text.replace("\n", "").replace("  ", " ")
            license = record.find('.//arxiv:license', namespaces=namespaces).text
            created = record.find('.//arxiv:created', namespaces=namespaces).text
            #print(fid.text, title.text.replace("\n", "").replace("  ", " "), license.text, created.text)

            year_match = re.match("(\d\d\d\d)-\d\d\-\d\d", created)
            if year_match:
                if int(year_match.group(1)) >= MIN_CREATED_YEAR:
                    if (license == "http://creativecommons.org/licenses/by/4.0/" or
                    license == "http://creativecommons.org/licenses/by-sa/4.0/" or
                    license == "http://creativecommons.org/licenses/by-nc-sa/4.0/" or
                    license == "http://creativecommons.org/licenses/by-nc-nd/4.0/" or
                    license == "http://creativecommons.org/publicdomain/zero/1.0/"):
                        if check_new_paper(fid):
                            entry_ids.append(fid)
                            entry_titles.append(title)
                        #print("Success!")
                    elif (license != "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"):
                        print("Bad license:", license.text)
        except Exception as e:
            print(f"Error Type: {type(e).__name__}")
            error_message = str(e)[:100]
            print(f"Error Message: {error_message}")

    print("\nNumber of entries:", len(entry_ids))
    
    # Downloading the papers
    for i in range(len(entry_ids)):
        time.sleep(3)
        if download_source_paper(entry_ids[i], entry_titles[i], spec):
            print(f"{i+1}. Paper: {entry_ids[i]} was successfully downloaded and extracted.")
            count_success += 1
        else:
            print(f"{i+1}. Paper: {entry_ids[i]} was not downloaded/extracted.")
            count_failed += 1
        #break
    
    print("\nStatistics:")
    print(f"Successfully downloaded papers: {count_success}")
    print(f"Failed downloaded papers: {count_failed}")
    
# Download the found ids and extract tar.gz archive
def download_source_paper(entry_id, entry_title, spec):       
    # Downloading
    source_link = "https://arxiv.org/src/" + entry_id

    response = requests.get(source_link)
    #print(response.content)
    content_disposition = response.headers.get('content-disposition', '')
    filename_match = re.search(r'filename="(.+)"', content_disposition)
    if filename_match:
        filename = filename_match.group(1)
        if re.fullmatch(r".*\.tar.gz", filename) == False:
            print(f"File is of unexpected data format. ({filename})")
            return False
    else:
        print(f"File name was not found. ({entry_id})")
        return False
    filepath = directory + filename
    with open(filepath, "wb") as file:
        file.write(response.content)

    # Extracting
    try:
        extracting_path = directory + entry_id.replace('/', '-')
        t = tarfile.open(filepath)
        t.extractall(path = extracting_path)
        t.close()

        # Saving meta information
        with open(metadata_file_path, "a", encoding="utf-8") as meta_file:
            meta_file.write(entry_id + ";" + entry_title + ";" + spec + "\n")

    except Exception as e:
        print("Error for " + filename + ": " + str(e).split("\n")[0])
        try:
            if os.path.isdir(extracting_path):
                shutil.rmtree(extracting_path) # Removing the paper
            os.remove(filepath)
        except:
            pass
        return False

    # Deleting downloaded archive/file
    os.remove(filepath)
    
    return True

# Check whether the paper is new or already downloaded
def check_new_paper(entry_id):
    with open (metadata_file_path, "r") as metadata_file:
        spamreader = csv.reader(metadata_file, delimiter=';', quotechar='|')
        for row in spamreader:
            if row[0] == entry_id:
                return False
    return True

In [28]:
# Select spec and date
spec = "cs"
date = "2024-09-27"

# Download papers from a specific date
query_arxiv(spec, date)

https://export.arxiv.org/oai2?verb=ListRecords&set=cs&from=2024-09-27&until=2024-09-27&metadataPrefix=arXiv

Found records: 713
50 records processed.
100 records processed.
150 records processed.
200 records processed.
250 records processed.
300 records processed.
350 records processed.
400 records processed.
450 records processed.
500 records processed.
550 records processed.
600 records processed.
650 records processed.
700 records processed.

Number of entries: 387
1. Paper: 2106.02538 was successfully downloaded and extracted.
2. Paper: 2111.00231 was successfully downloaded and extracted.
Error for arXiv-2202.13215v1.pdf: file could not be opened successfully:
3. Paper: 2202.13215 was not downloaded/extracted.
4. Paper: 2205.05419 was successfully downloaded and extracted.
File name was not found. (2206.13773)
5. Paper: 2206.13773 was not downloaded/extracted.
6. Paper: 2207.10947 was successfully downloaded and extracted.
Error for arXiv-2301.10617v3.gz: file could not be opened 

113. Paper: 2409.06016 was successfully downloaded and extracted.
114. Paper: 2409.06364 was successfully downloaded and extracted.
115. Paper: 2409.07465 was successfully downloaded and extracted.
116. Paper: 2409.08201 was successfully downloaded and extracted.
117. Paper: 2409.09369 was successfully downloaded and extracted.
118. Paper: 2409.12014 was successfully downloaded and extracted.
Error for arXiv-2409.12124v2.pdf: file could not be opened successfully:
119. Paper: 2409.12124 was not downloaded/extracted.
120. Paper: 2409.13096 was successfully downloaded and extracted.
121. Paper: 2409.13221 was successfully downloaded and extracted.
122. Paper: 2409.13527 was successfully downloaded and extracted.
123. Paper: 2409.13740 was successfully downloaded and extracted.
124. Paper: 2409.14055 was successfully downloaded and extracted.
125. Paper: 2409.14509 was successfully downloaded and extracted.
126. Paper: 2409.14590 was successfully downloaded and extracted.
127. Paper: 2409

226. Paper: 2409.17484 was successfully downloaded and extracted.
227. Paper: 2409.17485 was successfully downloaded and extracted.
228. Paper: 2409.17487 was successfully downloaded and extracted.
229. Paper: 2409.17488 was successfully downloaded and extracted.
230. Paper: 2409.17494 was successfully downloaded and extracted.
231. Paper: 2409.17503 was successfully downloaded and extracted.
232. Paper: 2409.17504 was successfully downloaded and extracted.
233. Paper: 2409.17512 was successfully downloaded and extracted.
Error for arXiv-2409.17516v1.pdf: file could not be opened successfully:
234. Paper: 2409.17516 was not downloaded/extracted.
235. Paper: 2409.17523 was successfully downloaded and extracted.
Error for arXiv-2409.17525v1.pdf: file could not be opened successfully:
236. Paper: 2409.17525 was not downloaded/extracted.
237. Paper: 2409.17527 was successfully downloaded and extracted.
238. Paper: 2409.17533 was successfully downloaded and extracted.
239. Paper: 2409.17539

341. Paper: 2409.17912 was successfully downloaded and extracted.
342. Paper: 2409.17917 was successfully downloaded and extracted.
343. Paper: 2409.17920 was successfully downloaded and extracted.
344. Paper: 2409.17922 was successfully downloaded and extracted.
345. Paper: 2409.17929 was successfully downloaded and extracted.
Error for arXiv-2409.17931v1.pdf: file could not be opened successfully:
346. Paper: 2409.17931 was not downloaded/extracted.
347. Paper: 2409.17937 was successfully downloaded and extracted.
348. Paper: 2409.17939 was successfully downloaded and extracted.
349. Paper: 2409.17943 was successfully downloaded and extracted.
Error for arXiv-2409.17952v1.pdf: file could not be opened successfully:
350. Paper: 2409.17952 was not downloaded/extracted.
351. Paper: 2409.17954 was successfully downloaded and extracted.
352. Paper: 2409.17977 was successfully downloaded and extracted.
353. Paper: 2409.17978 was successfully downloaded and extracted.
354. Paper: 2409.17980

In [None]:
# Select spec, month and year
spec = "cs"
month = "05"
year = "2024"

# Download papers from a specific month
for day in range(1, 31):
    time.sleep(6)
    if i < 10:
        query_arxiv(spec, f"{year}-{month}-0{i}")
    else:
        query_arxiv(spec, f"{year}-{month}-{i}")