In [1]:
import os

# Root working directory. The functions below read and write per-date
# sub-folders (src/, src_invalid/, tex/, html/, html_log/) underneath it.
arxiv_output_dir = './arxiv'

In [2]:
# Collect the entries under the arXiv root in sorted order, leaving out the
# '.DS_Store' entry if one is present.
date_folders = sorted(
    entry for entry in os.listdir(arxiv_output_dir) if entry != '.DS_Store'
)

date_folders

['0001',
 '0002',
 '0003',
 '0004',
 '0005',
 '0006',
 '0007',
 '0008',
 '0009',
 '0010',
 '0011',
 '0012',
 '0501',
 '1001',
 '1501',
 '2001']

In [3]:
from tqdm.notebook import tqdm

import os
import gzip
import tarfile
import shutil

from joblib import Parallel, delayed
from joblib_progress import joblib_progress

def extract_mixed_gz(gz_path, output_folder):
    """
    Extract a .gz file which could either contain a single file
    or multiple files (e.g., in the form of a .tar).

    The archive is first gunzipped to ``<output_folder>/<stem>-temp``, where
    ``<stem>`` is the base name of ``gz_path`` without its trailing ``.gz``.
    If the decompressed payload is a tar archive, its members are unpacked
    into ``<output_folder>/<stem>/``; otherwise the payload itself is moved to
    ``<output_folder>/<stem>/<stem>.tex``.

    Args:
    - gz_path: Path to the .gz file.
    - output_folder: Folder where the .gz file should be extracted to.

    Returns:
    None.

    Raises:
    Any error raised by gzip/tarfile/shutil while reading or unpacking the
    archive (e.g. OSError for data that is not valid gzip). The temporary
    decompressed file is removed before the error propagates.
    """
    base_name = os.path.basename(gz_path)
    # Strip only the trailing extension; str.replace would also delete any
    # '.gz' occurring in the middle of the name.
    stem = base_name[:-len('.gz')] if base_name.endswith('.gz') else base_name

    decompressed_file_path = os.path.join(output_folder, stem + '-temp')
    extraction_folder = os.path.join(output_folder, stem)

    # exist_ok avoids the check-then-create race when several workers
    # extract into the same output folder at the same time.
    os.makedirs(output_folder, exist_ok=True)

    try:
        # First, we decompress the gz file
        with gzip.open(gz_path, 'rb') as f_in, open(decompressed_file_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

        os.makedirs(extraction_folder, exist_ok=True)

        # Check if the decompressed result is a .tar file
        if tarfile.is_tarfile(decompressed_file_path):
            with tarfile.open(decompressed_file_path, 'r') as tar:
                if hasattr(tarfile, 'data_filter'):
                    # Archives come from outside: refuse members that would
                    # escape extraction_folder (absolute paths, '..', links).
                    tar.extractall(path=extraction_folder, filter='data')
                else:
                    # Older Python without extraction filters.
                    tar.extractall(path=extraction_folder)
        else:
            # Single-file payload: treat it as the TeX source itself.
            shutil.move(decompressed_file_path, os.path.join(extraction_folder, stem + '.tex'))
    finally:
        # Remove the temporary decompressed file (still present after a tar
        # extraction or after any failure; already gone after the move).
        if os.path.exists(decompressed_file_path):
            os.remove(decompressed_file_path)

def extract_gz_to_tex(date_dir):
    """
    Sort the downloaded sources of one date folder into usable TeX folders.

    Every non-empty file in ``<arxiv_output_dir>/<date_dir>/src`` is handled
    by extension, in parallel:

    - ``.pdf``: moved to ``src_invalid/`` (no TeX source available).
    - ``.gz``:  unpacked into ``tex/<stem>/`` via ``extract_mixed_gz``.
    - ``.tex``: moved to ``tex/<stem>/<filename>`` so that every entry of
      ``tex/`` is a directory, which is what ``convert_tex_to_html`` lists.
    - anything else is left in place.

    Errors for an individual file are printed and do not stop the run.

    Args:
    - date_dir: Name of the date folder under ``arxiv_output_dir``.

    Returns:
    None.
    """
    date_root = os.path.join(arxiv_output_dir, date_dir)
    src_dir = os.path.join(date_root, 'src')
    invalid_dir = os.path.join(date_root, 'src_invalid')
    tex_output_dir = os.path.join(date_root, 'tex')

    os.makedirs(invalid_dir, exist_ok=True)
    # Create tex/ up front: the .tex branch moves files into it, and callers
    # list it afterwards even when no archive was extracted.
    os.makedirs(tex_output_dir, exist_ok=True)

    pending_files = sorted(os.listdir(src_dir))

    def inner_func(filename):
        # Returns None for missing/empty files, otherwise the filename.
        try:
            src_path = os.path.join(src_dir, filename)
            if not os.path.exists(src_path) or os.path.getsize(src_path) == 0:
                return None
            filename_stem, filename_ext = os.path.splitext(filename)
            if filename_ext == '.pdf':
                shutil.move(src_path, os.path.join(invalid_dir, filename))
            elif filename_ext == '.gz':
                extract_mixed_gz(src_path, tex_output_dir)
            elif filename_ext == '.tex':
                # Same layout as a single-file .gz: tex/<stem>/<stem>.tex
                paper_dir = os.path.join(tex_output_dir, filename_stem)
                os.makedirs(paper_dir, exist_ok=True)
                shutil.move(src_path, os.path.join(paper_dir, filename))
        except Exception as e:
            # Best effort: report which file failed and keep going.
            print('[ERROR]', filename, str(e))
        return filename

    with joblib_progress("Preprocessing...", total=len(pending_files)):
        Parallel(n_jobs=os.cpu_count(), pre_dispatch='1*n_jobs')(
            delayed(inner_func)(f) for f in pending_files
        )

In [4]:
# Date folders to process: '0001' through '0012', followed by four later folders.
processing_dates = [str(index).zfill(4) for index in range(1, 13)] + [
    '0501',
    '1001',
    '1501',
    '2001',
]

In [5]:
# Unpack each selected date folder, then report how many entries its tex/ directory holds.
for date_folder_name in processing_dates:
    extract_gz_to_tex(date_folder_name)
    tex_dir = os.path.join(arxiv_output_dir, date_folder_name, 'tex')
    print(date_folder_name, len(os.listdir(tex_dir)))

Output()

Output()

# LaTeXML

In [6]:
import os
import subprocess
import random  # Temp.
from joblib import Parallel, delayed
from joblib_progress import joblib_progress

def convert_tex_to_html(datedir, sample_size=800, min_html_bytes=4000, seed=None):
    """
    Convert a random sample of one date folder's TeX sources to HTML with LaTeXML.

    For each sampled entry ``tex/<name>/`` the first ``.tex`` file (in sorted
    order) is passed to ``latexmlc``, writing ``html/<name>.html`` and
    ``html_log/<name>.log``. Entries whose HTML already exists are not
    converted again.

    Args:
    - datedir: Name of the date folder under ``arxiv_output_dir``.
    - sample_size: Maximum number of entries to process; capped at the number
      of entries available. ``None`` processes every entry.
    - min_html_bytes: HTML outputs smaller than this many bytes are deleted
      and reported as 'Invalid size'.
    - seed: Optional seed for a reproducible sample; ``None`` uses the
      module-level ``random`` generator.

    Returns:
    A list with one status string per processed entry (in sorted entry
    order): 'OK', 'Not found' (no .tex file / not a directory),
    'Failed in LaTeXML' (no HTML produced) or 'Invalid size'.
    """
    tex_root = os.path.join(arxiv_output_dir, datedir, 'tex')
    html_output_dir = os.path.join(arxiv_output_dir, datedir, 'html')
    log_output_dir = os.path.join(arxiv_output_dir, datedir, 'html_log')
    os.makedirs(html_output_dir, exist_ok=True)
    os.makedirs(log_output_dir, exist_ok=True)

    def inner_func(dirname):
        dirpath = os.path.join(tex_root, dirname)
        # A stray plain file in tex/ must not abort the whole parallel run.
        if not os.path.isdir(dirpath):
            return 'Not found'
        # Sorted so the chosen file does not depend on os.listdir order.
        tex_files = sorted(f for f in os.listdir(dirpath) if f.lower().endswith('.tex'))
        if not tex_files:
            return 'Not found'
        first_tex_filepath = os.path.join(dirpath, tex_files[0])
        output_path = os.path.join(html_output_dir, dirname + '.html')
        log_path = os.path.join(log_output_dir, dirname + '.log')

        if not os.path.exists(output_path):
            command = ['latexmlc', '--dest=' + output_path, '--log=' + log_path, first_tex_filepath]
            try:
                # Argument list, no shell: file names taken from archives may
                # contain spaces or shell metacharacters.
                subprocess.run(command, stderr=subprocess.DEVNULL)
            except OSError:
                # latexmlc could not be started; reported below because no
                # output file exists.
                pass

        if not os.path.exists(output_path):
            return 'Failed in LaTeXML'

        # Outputs below the size threshold are discarded.
        if os.path.getsize(output_path) < min_html_bytes:
            os.remove(output_path)
            return 'Invalid size'
        return 'OK'

    print("CPU:", os.cpu_count())
    # Sort before sampling so that a given seed always selects the same entries.
    pending_tex_files = sorted(os.listdir(tex_root))
    if sample_size is not None:
        rng = random if seed is None else random.Random(seed)
        # Cap the sample size: random.sample raises ValueError if asked for
        # more items than the population holds.
        pending_tex_files = rng.sample(pending_tex_files, min(sample_size, len(pending_tex_files)))
        pending_tex_files.sort()

    with joblib_progress("Processing...", total=len(pending_tex_files)):
        result = Parallel(n_jobs=os.cpu_count(), pre_dispatch='1*n_jobs')(
            delayed(inner_func)(dirname) for dirname in pending_tex_files
        )

    print('Done!')
    return result

In [7]:
def preprocess(date_folder_name):
    """
    Run the TeX-to-HTML conversion for one date folder and print a tally.

    Prints the folder name followed by the number of 'OK' statuses and the
    number of all other statuses, then returns the status list unchanged.
    """
    statuses = convert_tex_to_html(date_folder_name)
    ok_count = 0
    other_count = 0
    for status in statuses:
        if status == 'OK':
            ok_count += 1
        else:
            other_count += 1
    print(date_folder_name, ok_count, other_count)
    return statuses

In [None]:
result = preprocess('0001')

Output()

CPU: 64


In [11]:
preprocess('0002')

Output()

CPU: 64


Done!
0002 734 66


['OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'Failed in LaTeXML',
 'OK',
 'OK',
 'OK',
 'Failed in LaTeXML',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'Failed in LaTeXML',
 'OK',
 'Failed in LaTeXML',
 'Failed in LaTeXML',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'Failed in LaTeXML',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'Failed in LaTeXML',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'Failed in LaTeXML',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK'

In [12]:
preprocess('0003')

Output()

CPU: 64


Done!
0003 733 67


['OK',
 'OK',
 'Failed in LaTeXML',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'Invalid size',
 'Failed in LaTeXML',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'Failed in LaTeXML',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'Invalid size',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'Failed in LaTeXML',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'Failed in LaTeXML',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'Failed in LaTeXML',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'Failed in LaTeXML',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'Failed in LaTeXML',
 'OK',
 'Failed in LaTeXML',
 'OK',
 'OK',
 'OK',
 'OK',
 'Failed in LaTeXML',
 'Failed in LaTeXML',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'N

In [13]:
preprocess('0004')

Output()

CPU: 64


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Done!
0004 735 65


['OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'Invalid size',
 'Failed in LaTeXML',
 'OK',
 'OK',
 'Failed in LaTeXML',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'Invalid size',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'Failed in LaTeXML',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'Failed in LaTeXML',
 'OK',
 'Not found',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'OK',
 'Failed in LaTeXML',
 'OK',
 'OK',
 'OK',
 'Invalid size',
 'OK',
 'OK',

In [12]:
preprocess('0005')

Output()

Done!
0005 0 800


In [13]:
preprocess('0006')

Output()

Done!
0006 0 800


In [14]:
preprocess('0007')

Output()

Done!
0007 0 800


In [15]:
preprocess('0008')

Output()

Done!
0008 0 800


In [16]:
preprocess('0009')

Output()

Done!
0009 0 800


In [17]:
preprocess('0010')

Output()

Done!
0010 0 800


In [18]:
preprocess('0011')

Output()

Done!
0011 0 800


In [19]:
preprocess('0012')

Output()

Done!
0012 0 800


In [20]:
preprocess('0501')

Output()

Done!
0501 0 800


In [21]:
preprocess('1001')

Output()

Done!
1001 0 800


In [22]:
preprocess('1501')

Output()

Done!
1501 0 800


In [23]:
preprocess('2001')

Output()

Done!
2001 0 800


In [24]:
# # This block is used to clean up unnecessary logs in the working dir.

# import glob
# import os

# # Find all .log files in the current directory
# log_files = glob.glob('./*.log')

# # Remove each file
# for file in log_files:
#     os.remove(file)
#     print(f"Removed {file}")