In [1]:
import requests
import tarfile
import shutil
import time
import os
import re
from lxml import etree
from lxml import html

In [37]:
# Fixed variables
directory = "source_files/"
MIN_CREATED_YEAR = 2020

# Creating directory
os.makedirs(directory, exist_ok=True)

In [None]:
def check_license(paper_url):
    try:
        html_response = requests.get(paper_url)
        if html_response.status_code != 200:
            print(f"Failed to retrieve page with status code: {response.status_code}")
            return False

        tree = html.fromstring(html_response.content)
        #print(etree.tostring(tree, pretty_print=True).decode("utf-8"))
        license_link = tree.xpath('//a[@title="Rights to this article"]/@href')

        if license_link:
            license_url = license_link[0]

            if license_url == "http://arxiv.org/licenses/nonexclusive-distrib/1.0/":
                #print("Paper has no Creative Commons license and is dismissed.")
                return False
            elif (license_url == "http://creativecommons.org/licenses/by/4.0/" or
                license_url == "http://creativecommons.org/licenses/by-sa/4.0/" or
                license_url == "http://creativecommons.org/licenses/by-nc-sa/4.0/" or
                license_url == "http://creativecommons.org/licenses/by-nc-nd/4.0/" or
                license_url == "http://creativecommons.org/publicdomain/zero/1.0/"): 
                    #http://creativecommons.org/public-domain/cc0/
                    #print(f"Paper has a Creative Commons license.")
                    return True
            else:
                print(f"Paper has an unexpected license: {license_url}")
                return False
        else:
            print("License information for paper was not found.")
            return False
    except Exception as e:
        print("Error for " + paper_url + ": " + str(e).split("\n")[0])
        return False

In [55]:
def query_arxiv(spec, date):
    
    #Statistics
    count_success = 0
    count_failed = 0
    count_not_downloaded = 0

    # Request query
    url = f'https://export.arxiv.org/oai2?verb=ListRecords&set={spec}&from={date}&until={date}&metadataPrefix=arXiv'
    print(url)

    # Request response
    response = requests.get(url)
    root = etree.fromstring(response.content)

    #print(etree.tostring(root, pretty_print=True, encoding='unicode'))
    
    entry_ids = []
    entry_titles = []
    
    namespaces = {
        'oai': 'http://www.openarchives.org/OAI/2.0/',
        'arxiv': 'http://arxiv.org/OAI/arXiv/'
    }
    print("\n\nFound records:")
    record_list = root.findall('.//oai:ListRecords/oai:record', namespaces=namespaces)
    print(len(record_list))
    for record in record_list:
        fid = record.find('.//arxiv:id', namespaces=namespaces)
        title = record.find('.//arxiv:title', namespaces=namespaces)
        license = record.find('.//arxiv:license', namespaces=namespaces)
        created = record.find('.//arxiv:created', namespaces=namespaces)
        #print(fid.text, title.text, license.text, created.text)
        
        year_match = re.match("(\d\d\d\d)-\d\d\-\d\d", created.text)
        if year_match:
            if int(year_match.group(1)) >= MIN_CREATED_YEAR:
                if (license.text == "http://creativecommons.org/licenses/by/4.0/" or
                license.text == "http://creativecommons.org/licenses/by-sa/4.0/" or
                license.text == "http://creativecommons.org/licenses/by-nc-sa/4.0/" or
                license.text == "http://creativecommons.org/licenses/by-nc-nd/4.0/" or
                license.text == "http://creativecommons.org/publicdomain/zero/1.0/"):
                    entry_ids.append(fid.text)
                    entry_titles.append(title.text)
                    print("Success!")
                else:
                    print("Bad license:", license.text)
            else:
                print("Bad year:", year_match.group(1))

    print("Number of entries:", len(entry_ids))
    
    # Downloading the papers
    for i in range(len(entry_ids)):
        time.sleep(1)
        download_source_paper(entry_ids[i], entry_titles[i])
        break
    
# Download the found ids and extract tar.gz archive if the paper has Creative Commons license
def download_source_paper(entry_id, entry_title):       
    # Downloading
    source_link = "https://arxiv.org/src/" + entry_id

    response = requests.get(source_link)
    print(response.content)
    content_disposition = response.headers.get('content-disposition', '')
    filename_match = re.search(r'filename="(.+)"', content_disposition)
    if filename_match:
        filename = filename_match.group(1)
        if re.fullmatch(r".*\.tar.gz", filename):
            print(f"File name: {filename}")
        else:
            print(f"File is of unexpected data format. ({entry_id})")
            return False
    else:
        print(f"File name was not found. ({entry_id})")
        return False
    filepath = directory + filename
    with open(filepath, "wb") as file:
        file.write(response.content)

    # Extracting
    try:
        extracting_path = directory + entry_id.replace('/', '-')
        t = tarfile.open(filepath)
        t.extractall(path = extracting_path)
        t.close()
        count_success += 1

        # Saving meta information
        with open(extracting_path + "/FR_META_INFORMATION.txt", "w", encoding="utf-8") as meta_file:
            meta_file.write(entry_id + "\n" + entry_title)

    except Exception as e:
        print("Error for " + filename + ": " + str(e).split("\n")[0])
        if os.path.isdir(paper_path):
            shutil.rmtree(paper_path) # Removing the paper
        os.remove(filepath)
        return False

    # Deleting downloaded archive/file
    os.remove(filepath)
    
    return True

In [56]:
query_arxiv("cs", "2024-09-30")

https://export.arxiv.org/oai2?verb=ListRecords&set=cs&from=2024-09-30&until=2024-09-30&metadataPrefix=arXiv


Found records:
600
Bad year: 2018
Bad year: 2018
Bad year: 2019
Bad license: http://arxiv.org/licenses/nonexclusive-distrib/1.0/
Success!
Success!
Bad license: http://arxiv.org/licenses/nonexclusive-distrib/1.0/
Success!
Bad license: http://arxiv.org/licenses/nonexclusive-distrib/1.0/
Bad license: http://arxiv.org/licenses/nonexclusive-distrib/1.0/
Bad license: http://arxiv.org/licenses/nonexclusive-distrib/1.0/
Bad license: http://arxiv.org/licenses/nonexclusive-distrib/1.0/
Bad license: http://arxiv.org/licenses/nonexclusive-distrib/1.0/
Bad license: http://arxiv.org/licenses/nonexclusive-distrib/1.0/
Success!
Bad license: http://arxiv.org/licenses/nonexclusive-distrib/1.0/
Success!
Bad license: http://arxiv.org/licenses/nonexclusive-distrib/1.0/
Success!
Bad license: http://arxiv.org/licenses/nonexclusive-distrib/1.0/
Bad license: http://arxiv.org/licenses/nonexclusive-distr

b'\n       <html>\n     <head>\n       <title>arXiv reCAPTCHA</title>\n       <link rel="stylesheet" type="text/css" media="screen" href="https://static.arxiv.org/static/browse/0.3.2.8/css/arXiv.css?v=20220215" />\n       <script src="https://www.google.com/recaptcha/api.js" async defer></script>\n       <script>\n         var submitForm = function () {\n             document.forms[\'rrr\'].submit();\n         }\n       </script>\n     </head>\n\n     <body class="with-cu-identity">\n\n       <div id="cu-identity">\n         <div id="cu-logo">\n           <a href="https://www.cornell.edu/"><img src="https://static.arxiv.org/icons/cu/cornell-reduced-white-SMALL.svg"\n                                                   alt="Cornell University" width="200" border="0" /></a>\n         </div>\n         <div id="support-ack">\n           <a href="https://confluence.cornell.edu/x/ALlRF">We gratefully acknowledge support from<br />\n             the Simons Foundation and member institutions.</a