In [2]:
import calendar
import csv
import os
import re
import shutil
import tarfile
import time

import requests
from lxml import etree
from requests.exceptions import HTTPError

In [3]:
# Configuration used by the cells below
directory = "source_files/"
metadata_file_path = f"{directory}papers.csv"

relevant_publication_types = ["JournalArticle", "Conference", "Review"]
MIN_CREATED_YEAR = 2023

# Make sure the download directory is present
os.makedirs(directory, exist_ok=True)

# Start with an empty metadata file when none exists yet
if not os.path.isfile(metadata_file_path):
    with open(metadata_file_path, "w"):
        pass

In [None]:
def query_arxiv(spec, date):
    """Harvest one day of arXiv OAI records and download the new, openly licensed papers.

    The function requests all records of the set ``spec`` for the single day
    ``date``, keeps those that were created in ``MIN_CREATED_YEAR`` or later,
    carry one of the accepted Creative Commons licenses and are not yet listed
    in the metadata file, and then downloads each of them with
    ``download_source_paper``.

    Parameters
    ----------
    spec : str
        Set name inserted into the request URL (e.g. "cs").
    date : str
        Day used for both ``from`` and ``until`` of the request
        (e.g. "2024-09-27").

    Returns
    -------
    None
        Progress and the final success/failure counts are printed.
    """
    # Licenses for which a paper is downloaded.
    accepted_licenses = {
        "http://creativecommons.org/licenses/by/4.0/",
        "http://creativecommons.org/licenses/by-sa/4.0/",
        "http://creativecommons.org/licenses/by-nc-sa/4.0/",
        "http://creativecommons.org/licenses/by-nc-nd/4.0/",
        "http://creativecommons.org/publicdomain/zero/1.0/",
    }
    # Records with this license are skipped silently; any other unaccepted
    # license is reported as "Bad license".
    silent_license = "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"

    namespaces = {
        'oai': 'http://www.openarchives.org/OAI/2.0/',
        'arxiv': 'http://arxiv.org/OAI/arXiv/'
    }

    # Statistics
    count_success = 0
    count_failed = 0

    # Request query
    url = f'https://export.arxiv.org/oai2?verb=ListRecords&set={spec}&from={date}&until={date}&metadataPrefix=arXiv'
    print(url)

    # Request response
    # NOTE(review): only this single response is parsed; OAI-PMH allows a
    # server to split large result sets via a resumptionToken - confirm
    # whether one day of records can exceed one page.
    response = requests.get(url)
    root = etree.fromstring(response.content)

    record_list = root.findall('.//oai:ListRecords/oai:record', namespaces=namespaces)
    print("\nFound records:", len(record_list))

    entry_ids = []
    entry_titles = []
    for record_counter, record in enumerate(record_list, start=1):
        if record_counter % 50 == 0:
            print(str(record_counter) + " records processed.")

        try:
            # A missing element makes find() return None; the resulting
            # AttributeError is reported by the handler below and the record
            # is skipped.
            fid = record.find('.//arxiv:id', namespaces=namespaces).text
            title = record.find('.//arxiv:title', namespaces=namespaces).text.replace("\n", "").replace("  ", " ")
            license_url = record.find('.//arxiv:license', namespaces=namespaces).text
            created = record.find('.//arxiv:created', namespaces=namespaces).text

            year_match = re.match(r"(\d\d\d\d)-\d\d-\d\d", created)
            if not year_match or int(year_match.group(1)) < MIN_CREATED_YEAR:
                continue

            if license_url in accepted_licenses:
                if check_new_paper(fid):
                    entry_ids.append(fid)
                    entry_titles.append(title)
            elif license_url != silent_license:
                # license_url is already a string (the element's text).
                print("Bad license:", license_url)
        except Exception as e:
            print(f"Error Type: {type(e).__name__}")
            error_message = str(e)[:100]
            print(f"Error Message: {error_message}")

    print("\nNumber of entries:", len(entry_ids))

    # Downloading the papers
    for position, (entry_id, entry_title) in enumerate(zip(entry_ids, entry_titles), start=1):
        time.sleep(3)  # pause between consecutive download requests
        if download_source_paper(entry_id, entry_title, spec):
            print(f"{position}. Paper: {entry_id} was successfully downloaded and extracted.")
            count_success += 1
        else:
            print(f"{position}. Paper: {entry_id} was not downloaded/extracted.")
            count_failed += 1

    print("\nStatistics:")
    print(f"Successfully downloaded papers: {count_success}")
    print(f"Failed downloaded papers: {count_failed}")
    
# Download the found ids and extract tar.gz archive
def download_source_paper(entry_id, entry_title, spec):
    """Download the source archive of one paper, extract it and record it in the metadata file.

    Parameters
    ----------
    entry_id : str
        Paper identifier appended to ``https://arxiv.org/src/``. Any ``/`` in
        it is replaced by ``-`` for the name of the extraction directory.
    entry_title : str
        Title stored in the metadata file.
    spec : str
        Set name stored in the metadata file.

    Returns
    -------
    bool
        True when the archive was extracted and the metadata row was written;
        False when no file name was announced by the server, the file is not
        a ``.tar.gz`` archive, or extraction/metadata writing failed.
    """
    # Downloading
    source_link = "https://arxiv.org/src/" + entry_id
    response = requests.get(source_link)

    content_disposition = response.headers.get('content-disposition', '')
    filename_match = re.search(r'filename="(.+)"', content_disposition)
    if not filename_match:
        print(f"File name was not found. ({entry_id})")
        return False

    # The name comes from a response header, so strip any directory part
    # before it is joined to a local path.
    filename = os.path.basename(filename_match.group(1))
    # re.fullmatch returns a match object or None (never False), so test its
    # truthiness; both dots of the extension are matched literally.
    if not re.fullmatch(r".*\.tar\.gz", filename):
        print(f"File is of unexpected data format. ({filename})")
        return False

    filepath = directory + filename
    extracting_path = directory + entry_id.replace('/', '-')
    with open(filepath, "wb") as file:
        file.write(response.content)

    # Extracting
    try:
        # The context manager closes the archive even when extraction fails.
        with tarfile.open(filepath) as archive:
            if hasattr(tarfile, "data_filter"):
                # Extraction filters are available: refuse members that would
                # be written outside of extracting_path.
                archive.extractall(path=extracting_path, filter="data")
            else:
                archive.extractall(path=extracting_path)

        # Saving meta information with the same dialect the metadata file is
        # read with (';' delimiter, '|' quote character), so that a title
        # containing ';' does not shift the columns.
        with open(metadata_file_path, "a", encoding="utf-8") as meta_file:
            writer = csv.writer(meta_file, delimiter=';', quotechar='|', lineterminator="\n")
            writer.writerow([entry_id, entry_title, spec])

    except Exception as e:
        print("Error for " + filename + ": " + str(e).split("\n")[0])
        # Best-effort cleanup: a failure to remove the partial extraction
        # must not prevent the removal of the downloaded file.
        if os.path.isdir(extracting_path):
            shutil.rmtree(extracting_path, ignore_errors=True)  # Removing the paper
        try:
            os.remove(filepath)
        except OSError as cleanup_error:
            print("Could not remove " + filepath + ": " + str(cleanup_error))
        return False

    # Deleting downloaded archive/file
    os.remove(filepath)

    return True

# Check whether the paper is new or already downloaded
def check_new_paper(entry_id):
    """Tell whether ``entry_id`` is absent from the metadata file.

    Parameters
    ----------
    entry_id : str
        Identifier compared against the first column of every row.

    Returns
    -------
    bool
        False if a row of the metadata file starts with ``entry_id``,
        True otherwise.
    """
    # The metadata file is written as UTF-8, so read it as UTF-8 as well;
    # newline="" lets the csv module handle line endings itself.
    with open(metadata_file_path, "r", encoding="utf-8", newline="") as metadata_file:
        rows = csv.reader(metadata_file, delimiter=';', quotechar='|')
        # Blank lines are parsed as empty rows; skip them instead of indexing.
        return not any(row and row[0] == entry_id for row in rows)

In [None]:
# Single-day harvest: pick the set and the day to query
spec = "cs"
date = "2024-09-27"

# Fetch and download everything that matches for that day
query_arxiv(spec=spec, date=date)

In [None]:
# Select spec, month and year
spec = "cs"
month = "05"
year = "2024"

# Download papers from a specific month.
# The real length of the month is used so that day 31 is not skipped and
# non-existent dates (e.g. the 30th of February) are not requested.
days_in_month = calendar.monthrange(int(year), int(month))[1]
for day in range(1, days_in_month + 1):
    time.sleep(6)  # pause between consecutive list requests
    query_arxiv(spec, f"{year}-{month}-{day:02d}")

In [4]:
# Load ids and titles of already processed papers.
# NOTE(review): assumes papers_processed.csv has the same column order as
# papers.csv (id;title;spec) - confirm.
paper_ids = []
paper_titles = []

with open(directory + "papers_processed.csv") as metadata_file:
    csv_reader = csv.reader(metadata_file, delimiter=';', quotechar='|')
    for row in csv_reader:
        if not row:
            # Blank lines are parsed as empty rows.
            continue
        paper_ids.append(row[0])
        # The title is the second column; fall back to an empty title for
        # rows that only contain an id.
        paper_titles.append(row[1] if len(row) > 1 else "")

len(paper_ids)

11880

In [None]:
# Re-download a slice of the processed papers
start_index = 0
end_index = 5  # set to len(paper_ids) to process the whole list

# Never index past the end of the loaded list.
for i in range(start_index, min(end_index, len(paper_ids))):
    print(i)
    time.sleep(3)  # same pause between downloads as in query_arxiv
    download_source_paper(paper_ids[i], paper_titles[i], "cs")