In [1]:
import os
import boto3

# Build a boto3 session from credentials held in environment variables.
# Indexing os.environ directly makes a missing variable fail here with a
# KeyError rather than later, on the first S3 request.
session = boto3.Session(
    aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
)

# S3 client shared by the download helper defined in the next cell
s3 = session.client("s3")

# Name of the bucket every download in this notebook reads from
bucket_name = "arxiv"

In [2]:
def download_arxiv_file(origin_path, target_path=None, verbose=True):
    """Download one object from the bucket into the local ./downloaded/ tree.

    Args:
        origin_path: Key of the object inside ``bucket_name``.
        target_path: Destination path relative to ``./downloaded/``.
            Defaults to ``origin_path`` so the local layout mirrors the bucket.
        verbose: When True, print a line for skipped and successful downloads.
            Failures are printed regardless of this flag.

    Returns:
        None. A file that already exists locally is skipped. Download errors
        are caught and reported on stdout instead of being raised, so a batch
        loop keeps going after a single failure.
    """
    if target_path is None:
        target_path = origin_path
    target_path = './downloaded/' + target_path

    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the same directory.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    if os.path.exists(target_path):
        if verbose:
            print(f'🐤 {origin_path} already downloaded to {target_path}')
        return

    try:
        # The request is made with RequestPayer='requester'.
        s3.download_file(bucket_name, origin_path, target_path, ExtraArgs={'RequestPayer': 'requester'})
        if verbose:
            print(f'🥳 Download ({origin_path}) successfully.')
    except Exception as e:
        # Name the object that failed: callers loop over many files with
        # verbose=False, and the exception text alone may not identify it.
        print(f"❗️ An error occurred while downloading {origin_path}: {e}")

In [3]:
# Fetch both manifests (PDF first, then source); each lists the tar archives
# available under its prefix.
for manifest_key in ('pdf/arXiv_pdf_manifest.xml', 'src/arXiv_src_manifest.xml'):
    download_arxiv_file(manifest_key)

🥳 Download (pdf/arXiv_pdf_manifest.xml) successfully.
🥳 Download (src/arXiv_src_manifest.xml) successfully.


# Process PDF & SRC manifest files

In [4]:
import xmltodict

# Open the manifests in binary mode and hand the stream to the parser, so the
# bytes are decoded by the XML parser (honouring the encoding declared in the
# document) instead of by the platform's locale-dependent text-mode default.
with open('./downloaded/pdf/arXiv_pdf_manifest.xml', 'rb') as manifest_pdf_file:
    manifest_pdf_dict = xmltodict.parse(manifest_pdf_file)

with open('./downloaded/src/arXiv_src_manifest.xml', 'rb') as manifest_src_file:
    manifest_src_dict = xmltodict.parse(manifest_src_file)

# One entry per <file> element of each manifest
manifest_pdf_files = manifest_pdf_dict['arXivPDF']['file']
manifest_src_files = manifest_src_dict['arXivSRC']['file']

len(manifest_pdf_files), len(manifest_src_files)

(6908, 7381)

In [5]:
# Single combined list: PDF entries first, then source entries.
manifest_files = [*manifest_pdf_files, *manifest_src_files]
len(manifest_files)

14289

In [9]:
# YYMM tokens of the months to sample
sample_dates = [
    '0001',
    '0002',
    '0003',
    '0004',
    '0005',
    '0006',
    '0007',
    '0008',
    '0009',
    '0010',
    '0011',
    '0012',
    '0501',
    '1001',
    '1501',
    '2001',
]

# Group manifest entries by month in a single pass over the manifest.
# The YYMM token is compared exactly rather than searched for anywhere in the
# path, so a date can never match another part of the name (such as a longer
# sequence number) and no archive can be listed under two dates.
sample_files = {date: [] for date in sample_dates}
for item in manifest_files:
    # 'pdf/arXiv_pdf_0001_001.tar' -> ['pdf/arXiv_pdf', '0001', '001.tar']
    name_parts = item['filename'].rsplit('_', 2)
    if len(name_parts) == 3 and name_parts[1] in sample_files:
        sample_files[name_parts[1]].append(item)

sample_filenames = sorted(
    item['filename'] for matches in sample_files.values() for item in matches
)
sample_filenames

['pdf/arXiv_pdf_0001_001.tar',
 'pdf/arXiv_pdf_0001_002.tar',
 'pdf/arXiv_pdf_0002_001.tar',
 'pdf/arXiv_pdf_0002_002.tar',
 'pdf/arXiv_pdf_0003_001.tar',
 'pdf/arXiv_pdf_0003_002.tar',
 'pdf/arXiv_pdf_0004_001.tar',
 'pdf/arXiv_pdf_0004_002.tar',
 'pdf/arXiv_pdf_0005_001.tar',
 'pdf/arXiv_pdf_0005_002.tar',
 'pdf/arXiv_pdf_0006_001.tar',
 'pdf/arXiv_pdf_0006_002.tar',
 'pdf/arXiv_pdf_0007_001.tar',
 'pdf/arXiv_pdf_0007_002.tar',
 'pdf/arXiv_pdf_0008_001.tar',
 'pdf/arXiv_pdf_0008_002.tar',
 'pdf/arXiv_pdf_0011_001.tar',
 'pdf/arXiv_pdf_0011_002.tar',
 'pdf/arXiv_pdf_0012_001.tar',
 'pdf/arXiv_pdf_0012_002.tar',
 'pdf/arXiv_pdf_0501_001.tar',
 'pdf/arXiv_pdf_0501_002.tar',
 'pdf/arXiv_pdf_0501_003.tar',
 'pdf/arXiv_pdf_1001_001.tar',
 'pdf/arXiv_pdf_1001_002.tar',
 'pdf/arXiv_pdf_1001_003.tar',
 'pdf/arXiv_pdf_1001_004.tar',
 'pdf/arXiv_pdf_1001_005.tar',
 'pdf/arXiv_pdf_1001_006.tar',
 'pdf/arXiv_pdf_1001_007.tar',
 'pdf/arXiv_pdf_1501_001.tar',
 'pdf/arXiv_pdf_1501_002.tar',
 'pdf/ar

In [7]:
# # The following files might be missing from the manifest file, so list them here to download them manually.
# # Leave this block commented out unless you want to override the download list.
# sample_filenames = [
#     'pdf/arXiv_pdf_0009_001.tar',
#     'pdf/arXiv_pdf_0009_002.tar',
#     'pdf/arXiv_pdf_0010_001.tar',
#     'pdf/arXiv_pdf_0010_002.tar',
# ]

In [11]:
from tqdm.notebook import tqdm
import time

# Keep a handle on the bar so its label can show the archive currently in flight.
pbar = tqdm(iterable=sample_filenames, desc="Downloading files")
for filename in pbar:
    pbar.set_description("Downloading {}".format(filename))
    # Per-file messages are silenced; the progress bar already reports progress.
    download_arxiv_file(origin_path=filename, verbose=False)

Downloading files:   0%|          | 0/161 [00:00<?, ?it/s]

# Pre-process the downloaded data

In [12]:
import tarfile
import os
import re
import shutil

from tqdm.notebook import tqdm
import time

# Root of the organized output tree, and the scratch directory archives are
# unpacked into before their contents are moved under that root.
arxiv_output_dir = './arxiv'
temp_extraction_dir = './arxiv/temp_extracted_files'

# Function to extract tar files
def extract_tar(file_path, extraction_path=temp_extraction_dir):
    """Unpack a tar archive into its own sub-directory of ``extraction_path``.

    Args:
        file_path: Path of the tar archive to unpack.
        extraction_path: Parent directory for the unpacked tree. The default
            is the value ``temp_extraction_dir`` had when this function was
            defined.

    Returns:
        The directory the archive was unpacked into:
        ``extraction_path`` joined with the archive's base name up to its
        first dot.
    """
    temp_output_path = os.path.join(extraction_path, os.path.basename(file_path).split('.')[0])
    with tarfile.open(file_path, 'r') as tar:
        if hasattr(tarfile, 'data_filter'):
            # The archives are downloaded, so apply the 'data' extraction
            # filter: it rejects members that would be written outside
            # temp_output_path (absolute paths, '..' components, escaping links).
            tar.extractall(path=temp_output_path, filter='data')
        else:
            # Interpreter predates extraction filters; keep the plain call.
            tar.extractall(path=temp_output_path)
    return temp_output_path

# Function to organize files based on dates
def organize_by_date_and_type(filename, extracted_path):
    """Move an unpacked archive's files to ``<arxiv_output_dir>/<date>/<type>/``.

    Args:
        filename: Archive key such as ``'pdf/arXiv_pdf_0001_001.tar'``. The
            leading ``pdf``/``src`` component and the four-digit token are
            parsed out of it to build the destination.
        extracted_path: Directory the archive was unpacked into. The entries
            to move are read from its ``<date>`` sub-directory.

    Returns:
        None. A ``filename`` that does not match the expected pattern is
        ignored and nothing is moved.

    Raises:
        OSError: If ``<extracted_path>/<date>`` cannot be listed (for example
            because it does not exist) or an entry cannot be moved.
    """
    match = re.match(r"(pdf|src)/arXiv_.*_(\d{4})_.*\.tar", filename)
    if not match:
        return

    type_dir, date_dir = match.groups()
    target_dir = os.path.join(arxiv_output_dir, date_dir, type_dir)

    # Archives for the same month and type are processed by parallel workers;
    # a separate exists()-then-makedirs() pair can raise FileExistsError when
    # two of them create the directory at once, so let makedirs tolerate it.
    os.makedirs(target_dir, exist_ok=True)

    temp_dir = os.path.join(extracted_path, date_dir)
    # Move the extracted files to the date and type directory
    for file in os.listdir(temp_dir):
        shutil.move(os.path.join(temp_dir, file), os.path.join(target_dir, file))

In [16]:
!pip install joblib-progress

Collecting joblib-progress
  Obtaining dependency information for joblib-progress from https://files.pythonhosted.org/packages/bb/21/154604847039c93296caea054eb4da9f80b5f86d9fca4a1058d8d84c5c00/joblib_progress-1.0.4-py3-none-any.whl.metadata
  Downloading joblib_progress-1.0.4-py3-none-any.whl.metadata (1.9 kB)
Downloading joblib_progress-1.0.4-py3-none-any.whl (3.3 kB)
Installing collected packages: joblib-progress
Successfully installed joblib-progress-1.0.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [17]:
import os
import subprocess
from joblib import Parallel, delayed
from joblib_progress import joblib_progress

print("CPU:", os.cpu_count())

# exist_ok=True makes the separate existence check unnecessary.
os.makedirs(arxiv_output_dir, exist_ok=True)
# Start from an empty scratch directory so leftovers from an earlier,
# interrupted run cannot be mixed into this one.
if os.path.exists(temp_extraction_dir):
    shutil.rmtree(temp_extraction_dir)
os.makedirs(temp_extraction_dir)

def inner_func(filename):
    """Unpack one downloaded archive, file its contents, and remove the scratch copy.

    Args:
        filename: Archive key relative to ``./downloaded/``.

    Returns:
        ``filename``, unchanged.
    """
    temp_output_path = extract_tar('./downloaded/' + filename)
    try:
        organize_by_date_and_type(filename, temp_output_path)
    finally:
        # Remove the unpacked tree even when organizing fails, so a failed
        # archive does not leave its extracted files on disk.
        shutil.rmtree(temp_output_path)
    return filename

# One worker per CPU; pre_dispatch='1*n_jobs' queues only one task per worker
# ahead of time.
with joblib_progress("Processing...", total=len(sample_filenames)):
    Parallel(n_jobs=os.cpu_count(), pre_dispatch='1*n_jobs')(
        delayed(inner_func)(f) for f in sample_filenames
    )

print('Done!')

shutil.rmtree(temp_extraction_dir)

Output()

CPU: 64


Done!


In [18]:
!du -sh ./arxiv

74G	./arxiv
