In [5]:
import requests
import tarfile
import shutil
import time
import os
import re
from lxml import etree
from lxml import html

In [10]:
# Variables
start_result = 0
slice_size = 100
max_results = 1000
start_date = "2020-01-01"
end_date = "2024-09-30"
directory = "source_files/"

In [7]:
def check_license(paper_url):
    try:
        html_response = requests.get(paper_url)
        if html_response.status_code != 200:
            print(f"Failed to retrieve page with status code: {response.status_code}")
            return False

        tree = html.fromstring(html_response.content)
        license_link = tree.xpath('//a[@title="Rights to this article"]/@href')

        if license_link:
            license_url = license_link[0]

            if license_url == "http://arxiv.org/licenses/nonexclusive-distrib/1.0/":
                #print("Paper has no Creative Commons license and is dismissed.")
                return False
            elif (license_url == "http://creativecommons.org/licenses/by/4.0/" or
                license_url == "http://creativecommons.org/licenses/by-sa/4.0/" or
                license_url == "http://creativecommons.org/licenses/by-nc-sa/4.0/" or
                license_url == "http://creativecommons.org/licenses/by-nc-nd/4.0/" or
                license_url == "http://creativecommons.org/publicdomain/zero/1.0/"): 
                    #http://creativecommons.org/public-domain/cc0/
                    #print(f"Paper has a Creative Commons license.")
                    return True
            else:
                print(f"Paper has an unexpected license: {license_url}")
                return False
        else:
            print("License information for paper was not found.")
            return False
    except Exception as e:
        print("Error for " + paper_url + ": " + str(e).split("\n")[0])
        return False

In [12]:
#Statistics
count_success = 0
count_failed = 0
count_not_downloaded = 0

# Creating directory
os.makedirs(directory, exist_ok=True)

for x in range(int(max_results / slice_size)):

    # Request query
    slice_start = start_result + slice_size * x
    url = f'http://export.arxiv.org/api/query?search_query=submittedDate:[{start_date} TO {end_date}]&start={slice_start}&max_results={slice_size}&sortBy=submittedDate&sortOrder=ascending'
    print(url)

    # Request response
    response = requests.get(url)
    root = etree.fromstring(response.content)

    #print(etree.tostring(root, pretty_print=True, encoding='unicode'))

    # Locate the <id> tag of <entry>
    namespaces = {'atom': 'http://www.w3.org/2005/Atom'}
    entry_ids = root.xpath('//atom:entry/atom:id/text()', namespaces=namespaces)
    entry_titles = root.xpath('//atom:entry/atom:title/text()', namespaces=namespaces)
    entry_stems = []

    # Store the extracted entry ID
    #print("\n\nFound ids:")
    for found_id in entry_ids:
        path_stem = found_id.split('abs/')[1]
        #print(path_stem)
        entry_stems.append(path_stem)

    print("Number of entries:", len(entry_stems))

    # Download the found ids and extract tar.gz archive if the paper has Creative Commons license
    for i in range(len(entry_stems)):
        break

        # Checking license
        if check_license(entry_ids[i]) == False:
            print(f"{x*slice_size+i+1}. Paper has no found Creative Commons license. ({entry_stems[i]})")
            count_not_downloaded += 1
            continue

        # Downloading
        source_link = "https://arxiv.org/src/" + entry_stems[i]

        response = requests.get(source_link)
        content_disposition = response.headers.get('content-disposition', '')
        filename_match = re.search(r'filename="(.+)"', content_disposition)
        if filename_match:
            filename = filename_match.group(1)
            if re.fullmatch(r".*\.tar.gz", filename):
                print(f"{x*slice_size+i+1}. File name: {filename}")
            else:
                print(f"{x*slice_size+i+1}. File is of unexpected data format. ({entry_stems[i]})")
                count_not_downloaded += 1
                continue
        else:
            print(f"{x*slice_size+i+1}. File name was not found. ({entry_stems[i]})")
            count_not_downloaded += 1
            continue
        filepath = directory + filename
        with open(filepath, "wb") as file:
            file.write(response.content)

        # Extracting
        try:
            extracting_path = directory + entry_stems[i].replace('/', '-')
            t = tarfile.open(filepath)
            t.extractall(path = extracting_path)
            t.close()
            count_success += 1

            # Saving meta information
            with open(extracting_path + "/FR_META_INFORMATION.txt", "w", encoding="utf-8") as meta_file:
                meta_file.write(entry_ids[i] + "\n" + entry_titles[i].replace("\n", "").replace("  ", " "))

        except Exception as e:
            print("Error for " + filename + ": " + str(e).split("\n")[0])
            count_failed += 1       
            if os.path.isdir(paper_path):
                shutil.rmtree(paper_path) # Removing the paper

        # Deleting downloaded archive/file
        os.remove(filepath)
        
    time.sleep(3)
    
print("\nSuccess")
print(f"Number of successfully extracted archives: {count_success}")
print(f"Number of failed extracted archives      : {count_failed}")
print(f"Number of not downloaded files           : {count_not_downloaded}")

http://export.arxiv.org/api/query?search_query=submittedDate:[2020-01-01 TO 2024-09-30]&start=0&max_results=100&sortBy=submittedDate&sortOrder=ascending
Number of entries: 100
http://export.arxiv.org/api/query?search_query=submittedDate:[2020-01-01 TO 2024-09-30]&start=100&max_results=100&sortBy=submittedDate&sortOrder=ascending
Number of entries: 0
http://export.arxiv.org/api/query?search_query=submittedDate:[2020-01-01 TO 2024-09-30]&start=200&max_results=100&sortBy=submittedDate&sortOrder=ascending
Number of entries: 0
http://export.arxiv.org/api/query?search_query=submittedDate:[2020-01-01 TO 2024-09-30]&start=300&max_results=100&sortBy=submittedDate&sortOrder=ascending
Number of entries: 0
http://export.arxiv.org/api/query?search_query=submittedDate:[2020-01-01 TO 2024-09-30]&start=400&max_results=100&sortBy=submittedDate&sortOrder=ascending
Number of entries: 0
http://export.arxiv.org/api/query?search_query=submittedDate:[2020-01-01 TO 2024-09-30]&start=500&max_results=100&sortBy

KeyboardInterrupt: 