In [1]:
from bs4 import BeautifulSoup
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
import re
import os

In [2]:
def clean_html(path_to_input: str, path_to_output: str) -> str:
    """Extract the body text of one HTML article and save it as plain text.

    Processing steps:
      1. Parse the HTML file with BeautifulSoup (``lxml`` parser).
      2. Replace every ``<math>`` element with `` [[[formula]]] `` and every
         ``<cite>`` element with `` [[[cite]]] ``.
      3. Collect the text of all ``<p>``/``<div>`` elements carrying the
         ``ltx_para`` class, skipping empty ones, joined by blank lines.
      4. Mask textual equation / figure / section references with placeholders
         and drop remaining bare ``(N)`` numbers.
      5. Write the result to ``path_to_output`` (UTF-8), creating the parent
         directory if needed.

    Args:
        path_to_input: Path to the source HTML file (read as UTF-8).
        path_to_output: Path of the text file to write.

    Returns:
        ``path_to_output``, so callers can tell a completed task from a
        failed one.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    with open(path_to_input, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "lxml")

    # --------- CLEANUP: replace unwanted elements with placeholders ---------
    # <math> holds MathML equations, <cite> holds bibliographic citations.
    for tag_name, placeholder in (("math", " [[[formula]]] "), ("cite", " [[[cite]]] ")):
        for tag in soup.find_all(tag_name):
            tag.replace_with(placeholder)

    # --------- BODY TEXT ---------
    # Keep only non-empty paragraph containers marked with the "ltx_para" class.
    text_parts = []
    for paragraph in soup.find_all(["p", "div"], class_="ltx_para"):
        paragraph_text = paragraph.get_text(separator=" ", strip=True)
        if paragraph_text:
            text_parts.append(paragraph_text)

    text = "\n\n".join(text_parts)

    # --------- REFERENCE MASKING ---------
    # "Eq. (3)", "Eqs. (3)-(5)"
    text = re.sub(r'Eqs?\.\s*\(\s*\d+\s*\)(?:\s*[–-]\s*\(\s*\d+\s*\))?', '[[[Equation Reference]]]', text)
    # "Fig. 2", "Fig. S3(a)", "Figs. 2 and 3"
    text = re.sub(
        r'Figs?\.\s*(?:S)?\d+(?:\s*(?:\([a-z]\)|\([a-z]\)-\([a-z]\)))?(?:\s*(?:and|–|-)\s*(?:S)?\d+(?:\s*\([a-z]\))?)?',
        '[[[Figure Reference]]]',
        text,
        flags=re.IGNORECASE
    )
    # "Sec. 4", "Section IV.A", "Appendix B in SI"
    # NOTE(review): with IGNORECASE, [A-Z]+ also matches lowercase words, so
    # phrases such as "section we" are replaced too — confirm this is intended.
    # NOTE(review): the placeholder says "Sequence" although the pattern targets
    # section/appendix references; left unchanged in case downstream code relies
    # on the literal.
    text = re.sub(
        r'(?:Sec(?:tion)?\.?|Appendix)\s+(?:[A-Z]+|\d+)(?:\.(?:\d+|[A-Z]+)){0,3}(?:\s+in\s+\[?SI\]?)?',
        '[[[Sequence Reference]]]',
        text,
        flags=re.IGNORECASE
    )
    # Drop leftover bare parenthesised numbers such as "(12)".
    text = re.sub(r'\(\s*\d+\s*\)', '', text)

    # --------- OUTPUT ---------
    # dirname is "" for a bare file name; os.makedirs("") would raise, so only
    # create the directory when there is one to create.
    output_dir = os.path.dirname(path_to_output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write processed text to file
    with open(path_to_output, 'w', encoding='utf-8') as out_f:
        out_f.write(text)
    print(f"{path_to_output} saved!")

    return path_to_output

In [3]:
def process_files_in_parallel(base_input_dir=None, base_output_dir=None, max_workers=None):
    """Convert every HTML file under an input tree to text, in parallel.

    The input directory is searched recursively for ``*.html`` files. Each file
    is passed to ``clean_html`` in a worker process, and the result is written
    to the same relative location under the output directory with a ``.txt``
    suffix. Progress and per-file failures are reported with ``print``; a
    failure in one file does not stop the others.

    Args:
        base_input_dir: Root directory to scan (``str`` or ``Path``). Defaults
            to the project's raw arXiv HTML directory.
        base_output_dir: Root directory for the ``.txt`` results (``str`` or
            ``Path``). Defaults to the project's parsed-text directory.
        max_workers: Number of worker processes. Defaults to ``os.cpu_count()``.

    Returns:
        None.

    Note:
        ``ProcessPoolExecutor`` needs the submitted function to be importable
        by the worker processes; with a notebook-defined function this depends
        on the platform's process start method — confirm on the target machine.
    """
    if base_input_dir is None:
        # Source directory
        base_input_dir = "/home/kdemyokhin_1/concept-tree-course-work/articles_raw/arxiv-html-cs/"
    if base_output_dir is None:
        # Destination directory
        base_output_dir = "/home/kdemyokhin_1/concept-tree-course-work/articles_parsed/arxiv-txt-cs/"
    if max_workers is None:
        max_workers = os.cpu_count()

    base_input_dir = Path(base_input_dir)
    base_output_dir = Path(base_output_dir)

    # Recursively find all HTML files
    html_files = list(base_input_dir.rglob('*.html'))
    print(f"Found {len(html_files)} HTML files in input directory.")

    if not html_files:
        print("No HTML files found. Please check the input directory.")
        return

    # Mirror the input tree under the output root, swapping the suffix to .txt.
    tasks = []
    for path in html_files:
        relative_path = path.relative_to(base_input_dir)
        output_path = base_output_dir / relative_path.with_suffix('.txt')
        tasks.append((str(path), str(output_path)))

    print(f"Scheduled {len(tasks)} tasks for processing.")

    # Use ProcessPoolExecutor for parallel processing
    try:
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            # Map each future back to its (input, output) pair for reporting.
            futures = {
                executor.submit(clean_html, input_path, output_path): (input_path, output_path)
                for input_path, output_path in tasks
            }

            for future in as_completed(futures):
                task = futures[future]
                try:
                    result = future.result()
                    if result is None:
                        print(f"Task for file {task[0]} returned None (possible error).")
                except Exception as e:
                    # Report and continue: one bad file must not abort the batch.
                    print(f"Exception occurred during processing of file {task[0]}: {e}")

    except Exception as e:
        # Pool-level failure (e.g. workers could not be started).
        print(f"An error occurred during parallel processing: {e}")

In [4]:
# Run the batch conversion of all HTML files found under the input directory.
process_files_in_parallel()

Found 0 HTML files in input directory.
No HTML files found. Please check the input directory.
