In [1]:
import requests
import tarfile
import shutil
import os
import re
from lxml import etree
from lxml import html

In [2]:
# Variables
# Query parameters interpolated into the arXiv API request URL built in the next cell.
start_result = 0  # value sent as the `start` query parameter
max_results = 100  # value sent as the `max_results` query parameter
start_date = "2020-01-01"  # lower bound of the `submittedDate:[... TO ...]` filter
end_date = "2024-09-30"  # upper bound of the `submittedDate:[... TO ...]` filter
# Working directory for downloaded archives and extracted papers. Later cells build
# paths by plain string concatenation (`directory + name`), so keep the trailing slash.
directory = "source_files/"

In [3]:
# Download the source archive of every arXiv entry matched by the query and
# unpack it into its own sub-directory of `directory`, together with a small
# meta file (entry URL on the first line, title on the second).

# Statistics
count_success = 0
count_failed = 0
count_not_downloaded = 0

# Seconds to wait on the server before giving up on a single request.
request_timeout = 60

# Request query
url = f'http://export.arxiv.org/api/query?search_query=submittedDate:[{start_date} TO {end_date}]&start={start_result}&max_results={max_results}'

# Creating directory
os.makedirs(directory, exist_ok=True)

# Request response
response = requests.get(url, timeout=request_timeout)
response.raise_for_status()  # fail loudly instead of trying to parse an error page
root = etree.fromstring(response.content)

# Locate the <id> and <title> tags of every <entry>
namespaces = {'atom': 'http://www.w3.org/2005/Atom'}
entry_ids = root.xpath('//atom:entry/atom:id/text()', namespaces=namespaces)
entry_titles = root.xpath('//atom:entry/atom:title/text()', namespaces=namespaces)

# The part after "abs/" identifies the paper (e.g. "2003.00001v1")
entry_stems = [found_id.split('abs/')[1] for found_id in entry_ids]

print(f"Number of entries: {len(entry_stems)}")

# Download the found ids and extract gz archive
for i, entry_stem in enumerate(entry_stems):
    # Downloading
    source_link = "https://arxiv.org/src/" + entry_stem
    try:
        response = requests.get(source_link, timeout=request_timeout)
    except requests.RequestException as e:
        # One unreachable paper must not abort the remaining downloads.
        print(f"{i+1}. Download failed: " + str(e).split("\n")[0])
        count_not_downloaded += 1
        continue

    content_disposition = response.headers.get('content-disposition', '')
    filename_match = re.search(r'filename="(.+)"', content_disposition)
    if not filename_match:
        print(f"{i+1}. File name was not found.")
        count_not_downloaded += 1
        continue

    # The header is server-controlled: keep only the final path component so it
    # can never point outside of `directory`.
    filename = os.path.basename(filename_match.group(1))
    if not filename.endswith(".tar.gz"):
        print(f"{i+1}. File is of unexpected data format.")
        count_not_downloaded += 1
        continue
    print(f"{i+1}. File name: {filename}")

    filepath = directory + filename
    with open(filepath, "wb") as file:
        file.write(response.content)

    # Extracting
    # Old-style ids contain a slash (e.g. "math/0301001v1"); flatten it.
    extracting_path = directory + entry_stem.replace('/', '-')
    try:
        with tarfile.open(filepath) as archive:
            if hasattr(tarfile, "data_filter"):
                # Archives are untrusted input: reject members that would escape
                # the target directory (absolute paths, "..", unsafe links).
                archive.extractall(path=extracting_path, filter="data")
            else:
                # Interpreter without extraction-filter support.
                archive.extractall(path=extracting_path)

        # Saving meta information
        with open(extracting_path + "/FR_META_INFORMATION.txt", "w", encoding="utf-8") as meta_file:
            meta_file.write(entry_ids[i] + "\n" + entry_titles[i].replace("\n", "").replace("  ", " "))

        # Count only once both the extraction and the meta file succeeded.
        count_success += 1

    except Exception as e:
        print("Error for " + filename + ": " + str(e).split("\n")[0])
        count_failed += 1
        # Remove the partially extracted paper
        if os.path.isdir(extracting_path):
            shutil.rmtree(extracting_path)

    finally:
        # Deleting downloaded archive/file
        os.remove(filepath)

print("\nSuccess")
print(f"Number of successfully extracted archives: {count_success}")
print(f"Number of failed extracted archives      : {count_failed}")
print(f"Number of not downloaded files           : {count_not_downloaded}")

100
File name: arXiv-2003.00001v1.tar.gz
File is of unexpected data format.
File name: arXiv-2003.00003v1.tar.gz
File is of unexpected data format.
File name: arXiv-2003.00005v2.tar.gz
File name: arXiv-2003.00006v1.tar.gz
File name: arXiv-2003.00007v2.tar.gz
File name: arXiv-2003.00008v2.tar.gz
File is of unexpected data format.
File name: arXiv-2003.00010v2.tar.gz
File name: arXiv-2003.00011v2.tar.gz
File name: arXiv-2003.00012v2.tar.gz
File name: arXiv-2003.00013v3.tar.gz
File name: arXiv-2003.00014v2.tar.gz
File name: arXiv-2003.00015v1.tar.gz
File name: arXiv-2003.00016v1.tar.gz
File name: arXiv-2003.00017v4.tar.gz
File name: arXiv-2003.00018v1.tar.gz
File name: arXiv-2003.00019v2.tar.gz
File name: arXiv-2003.00020v1.tar.gz
File name: arXiv-2003.00021v3.tar.gz
File name: arXiv-2003.00022v3.tar.gz
File name: arXiv-2003.00023v2.tar.gz
File name: arXiv-2003.00024v1.tar.gz
File name: arXiv-2003.00025v2.tar.gz
File name: arXiv-2003.00026v3.tar.gz
File name: arXiv-2003.00027v2.tar.gz
Fil

In [5]:
# For every extracted paper, look up its license on the arXiv abstract page
# (URL stored on the first line of the paper's meta file) and delete papers
# that are only available under arXiv's non-exclusive distribution license.

# Statistics
count_common = 0
count_not_common = 0
count_unknown = 0

# License URLs as found in the "Rights to this article" link of an abstract page.
non_cc_license_url = "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"
cc_license_urls = {
    "http://creativecommons.org/licenses/by/4.0/",
    "http://creativecommons.org/licenses/by-sa/4.0/",
    "http://creativecommons.org/licenses/by-nc-sa/4.0/",
    "http://creativecommons.org/licenses/by-nc-nd/4.0/",
    "http://creativecommons.org/public-domain/cc0/",
    # NOTE(review): CC0 deed URL; believed to be the form arXiv links to - confirm.
    "http://creativecommons.org/publicdomain/zero/1.0/",
}

for paper in os.listdir(directory):
    paper_path = directory + paper
    try:
        # Skip stray files (e.g. leftover archives); only paper directories count.
        if not os.path.isdir(paper_path):
            continue

        # First line of the meta file holds the abstract page URL.
        with open(paper_path + "/FR_META_INFORMATION.txt", "r", encoding="utf-8") as meta_file:
            paper_url = meta_file.readline().replace("\n", "")

        html_response = requests.get(paper_url, timeout=60)
        if html_response.status_code != 200:
            print(f"Failed to retrieve page with status code: {html_response.status_code}")
            continue

        tree = html.fromstring(html_response.content)
        license_link = tree.xpath('//a[@title="Rights to this article"]/@href')

        if not license_link:
            print(f"License information for {paper} was not found.")
            count_unknown += 1
            continue

        license_url = license_link[0]
        if license_url == non_cc_license_url:
            print(f"Paper {paper} has no Creative Commons license and is dismissed.")
            count_not_common += 1
            shutil.rmtree(paper_path)
        elif license_url in cc_license_urls:
            print(f"Paper {paper} has a Creative Commons license.")
            count_common += 1
        else:
            print(f"Paper {paper} has an unexpected license: {license_url}")
            count_unknown += 1
    except Exception as e:
        # Best effort: report the problem and carry on with the next paper.
        print("Error for " + paper + ": " + str(e).split("\n")[0])

print("\nSuccess")
print(f"Number of papers with a Creative Commons license : {count_common}")
print(f"Number of papers with no Creative Commons license: {count_not_common}")
print(f"Number of papers with an unknown license:        : {count_unknown}")

Paper 2003.00001v1 has no Creative Commons license and is dismissed.
Paper 2003.00003v1 has no Creative Commons license and is dismissed.
Paper 2003.00005v2 has no Creative Commons license and is dismissed.
Paper 2003.00006v1 has no Creative Commons license and is dismissed.
Paper 2003.00007v2 has no Creative Commons license and is dismissed.
Paper 2003.00008v2 has a Creative Commons license.
Paper 2003.00010v2 has a Creative Commons license.
Paper 2003.00011v2 has no Creative Commons license and is dismissed.
Paper 2003.00012v2 has no Creative Commons license and is dismissed.
Paper 2003.00013v3 has no Creative Commons license and is dismissed.
Paper 2003.00014v2 has no Creative Commons license and is dismissed.
Paper 2003.00015v1 has no Creative Commons license and is dismissed.
Paper 2003.00016v1 has no Creative Commons license and is dismissed.
Paper 2003.00017v4 has no Creative Commons license and is dismissed.
Paper 2003.00018v1 has no Creative Commons license and is dismissed.
P