In [30]:
import os
import shutil
from glob import glob

from bs4 import BeautifulSoup
from tqdm import tqdm

from src.downloader import download_dataset

In [31]:
# Notebook settings: filesystem locations first, then the download switch.
dataset_dir = "./dataset"  # passed to download_dataset and used to locate articles.tsv
source_dir = './dataset/wpcd/wp'  # searched recursively for the .htm article files
plaintext_output_dir = './output/clean_plaintext_articles'  # receives the cleaned .txt files
download = False  # set to True to call download_dataset(dataset_dir) in the next cell

In [32]:
# Call download_dataset(dataset_dir) only when the `download` flag is enabled.
# NOTE(review): presumably this fetches the raw dataset into dataset_dir --
# confirm against src.downloader.
if download:
    download_dataset(dataset_dir)

In [None]:
def extract_html_to_txt(source_folder, output_folder, valid_article_names):
    """
    Extract the paragraph text of HTML articles and save it as .txt files.

    Every ``.htm`` file found recursively under ``source_folder`` is considered,
    except files whose path contains a component named ``index``. A file is
    processed only if its base name (without extension) is listed in
    ``valid_article_names``. For each processed file, the inner text of all
    ``<p>`` tags is joined with newlines and written to
    ``<output_folder>/<article_name>.txt``. Reading only ``<p>`` tags removes the
    redundant leading text found in the plaintext files that originally come
    with the dataset.

    Args:
        source_folder: Root directory searched recursively for ``.htm`` files.
        output_folder: Directory that receives the ``.txt`` files. If it already
            exists, it is deleted together with its contents and recreated.
        valid_article_names: Iterable of article names (file names without
            extension) that should be extracted.
    """
    # Start from a clean output directory. shutil.rmtree is needed because the
    # folder normally still holds the .txt files of a previous run, and os.rmdir
    # can only remove empty directories.
    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)
    os.makedirs(output_folder)

    # A set keeps the per-file membership check O(1).
    valid_names = set(valid_article_names)

    # Get list of all HTML files, excluding those in the "index" subfolder
    html_files = glob(f'{source_folder}/**/*.htm', recursive=True)
    html_files = [f for f in html_files if 'index' not in f.split(os.sep)]

    for html_file_path in tqdm(html_files):
        article_name = os.path.splitext(os.path.basename(html_file_path))[0]
        if article_name not in valid_names:
            continue
        output_file_path = os.path.join(output_folder, article_name + '.txt')

        # Read HTML file and extract text from <p> tags; undecodable bytes are
        # dropped (errors='ignore') rather than aborting the whole run.
        with open(html_file_path, 'r', encoding='utf-8', errors='ignore') as html_file:
            soup = BeautifulSoup(html_file, 'html.parser')
            paragraphs = soup.find_all('p')
            plain_text = "\n".join([p.get_text() for p in paragraphs])

        with open(output_file_path, 'w', encoding='utf-8') as txt_file:
            txt_file.write(plain_text)

In [None]:
# The wpcd directory holds additional .htm files, so the list of valid article
# names is read from articles.tsv and used to select the correct HTML files.
article_list_path = os.path.join(dataset_dir, "wikispeedia_paths-and-graph", "articles.tsv")

with open(article_list_path, 'r') as tsv_file:
    # Skip lines starting with '#', trim surrounding whitespace, drop empty lines.
    stripped_lines = (raw.strip() for raw in tsv_file if not raw.startswith('#'))
    article_names = [name for name in stripped_lines if name]

extract_html_to_txt(source_dir, plaintext_output_dir, article_names)

100%|██████████| 5232/5232 [01:38<00:00, 53.21it/s] 


: 