In [1]:
import xml.etree.ElementTree as xml_tree
import os
from tqdm.notebook import tqdm

In [2]:
def get_parallel_corpus(xml_file):
    """Extract Japanese/English pairs from one article XML file.

    The root element must carry ``orl="ja"`` and ``trl="en"``. For the title
    (``<tit>``) and for every ``./par/sen`` element, the text of the ``<j>``
    child is paired with the text of the *last* ``<e>`` child, which must have
    ``type="check"``.

    Args:
        xml_file: Path (or file object) accepted by ``ElementTree.parse``.

    Returns:
        ``(sentence_pairs, file_id, (title_ja, title_en))`` where
        ``sentence_pairs`` is a list of ``(ja_text, en_text)`` tuples and
        ``file_id`` is the text of the ``<inf>`` element, or ``None`` when the
        file is not well-formed XML.

    Raises:
        AssertionError: If the language attributes or a ``type`` attribute do
            not have the expected value.
    """
    # Only parsing can raise ParseError, so the guard is kept tight.
    try:
        document = xml_tree.parse(xml_file).getroot()
    except xml_tree.ParseError:
        return None

    def ja_en_pair(node):
        # Japanese text first, then the last <e> child (asserted to be "check").
        japanese = node.find("j").text
        english_node = node.findall("e")[-1]
        assert english_node.attrib["type"] == "check"
        return japanese, english_node.text

    assert document.get("orl") == "ja" and document.get("trl") == "en"
    article_id = document.find("inf").text
    title_pair = ja_en_pair(document.find("tit"))
    sentence_pairs = [ja_en_pair(sen) for sen in document.findall("./par/sen")]
    return sentence_pairs, article_id, title_pair

In [3]:
def get_combined_parallel_corpora(category):
    """Merge the parallel corpora of every file directly inside ``category``.

    Only the first directory level is read; sub-directories are not visited.
    For each file that ``get_parallel_corpus`` can parse, the title pair is
    appended first, followed by that file's sentence pairs.

    Args:
        category: Path of the directory holding one category's files.

    Returns:
        A list of ``(ja_text, en_text)`` tuples. The list is empty when the
        directory is missing or unreadable (``os.walk`` yields nothing then).
    """
    # Take just the top-level entry of the walk; None means there was nothing
    # to walk, which previously left the result list unbound.
    top_level = next(os.walk(category), None)
    if top_level is None:
        return []

    dirpath, _, filenames = top_level
    parallel_corpus = []
    for file in tqdm(filenames):
        data = get_parallel_corpus(os.path.join(dirpath, file))
        if data is not None:
            corpora, _, title_both = data
            parallel_corpus.append(title_both)
            parallel_corpus.extend(corpora)
    return parallel_corpus

In [4]:
def save_corpus_as_csv(corpus, path):
    """Write ``corpus`` to ``path`` as a UTF-8 encoded, two-column CSV file.

    The file starts with the header line ``ja_source, en_target``. Each pair
    is then written as ``"<source>", "<target>"``. Double quotes inside a
    field are doubled (``"`` becomes ``""``) so that a field containing quotes
    no longer produces a malformed row; fields without quotes are written
    exactly as before.

    Args:
        corpus: Iterable of ``(source, target)`` pairs.
        path: Destination file path; an existing file is overwritten.
    """
    def quoted(value):
        # str() keeps the previous f-string formatting for non-string values.
        return '"' + str(value).replace('"', '""') + '"'

    # newline="" disables newline translation so "\n" is written verbatim.
    with open(path, "w", encoding="utf-8", newline="") as csv_file:
        csv_file.write("ja_source, en_target\n")
        for source, target in corpus:
            csv_file.write(f"{quoted(source)}, {quoted(target)}\n")
    return

In [5]:
def get_categories(base_path=r"./data/wiki_corpus_2.01"):
    """Return the names of the directories directly inside ``base_path``.

    Args:
        base_path: Directory whose immediate sub-directories are listed.

    Returns:
        A list of directory names (not full paths), in the order reported by
        ``os.walk``. The list is empty when ``base_path`` is missing or
        unreadable, instead of failing on an unbound local variable.
    """
    # os.walk yields nothing for a missing directory, hence the default tuple.
    _, dirnames, _ = next(os.walk(base_path), (base_path, [], []))
    return dirnames

In [6]:
def create_category_csv(category, base_path=r"./data/wiki_corpus_2.01", out_folder=r"./data-post/wiki_corpus_2.01"):
    """Convert one category directory into ``<out_folder>/<category>.csv``.

    Nothing is written when the target CSV already exists; a message is
    printed instead.

    Args:
        category: Name of a directory directly inside ``base_path``.
        base_path: Directory containing the category directories.
        out_folder: Directory that receives the CSV; created if missing.

    Raises:
        AssertionError: If ``category`` is not a directory in ``base_path``.
    """
    assert category in get_categories(base_path=base_path), f"category [{category}] does not exist!"
    out_path = f"{out_folder}/{category}.csv"
    if os.path.exists(out_path):
        print(f"skipping file [{out_path}], file already exists!")
        return
    # Create the output directory before parsing, so a missing folder cannot
    # make the write fail after the whole category has been processed.
    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
    corpus = get_combined_parallel_corpora(f"{base_path}/{category}")
    save_corpus_as_csv(corpus, out_path)
    return

In [7]:
def create_all_csv(base_path=r"./data/wiki_corpus_2.01", out_folder=r"./data-post/wiki_corpus_2.01"):
    """Run ``create_category_csv`` for every category found in ``base_path``.

    Args:
        base_path: Directory containing the category directories.
        out_folder: Directory handed to ``create_category_csv`` for output.
    """
    category_names = get_categories(base_path=base_path)
    # The progress bar advances once per category.
    for name in tqdm(category_names):
        create_category_csv(name, base_path=base_path, out_folder=out_folder)

In [8]:
# Run configuration.
# NOTE(review): PATH is not referenced by any code visible in this notebook;
# create_all_csv() below runs with its own default base_path / out_folder.
PATH = r"./data/wiki_corpus_2.01/"
# Category name; only used by the disabled single-category call below.
CATEGORY = "CLT"

# Single-category alternative, kept disabled:
# create_category_csv(CATEGORY)
# Convert every category found under the default base path.
create_all_csv()

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/1061 [00:00<?, ?it/s]

  0%|          | 0/496 [00:00<?, ?it/s]

  0%|          | 0/2044 [00:00<?, ?it/s]

  0%|          | 0/772 [00:00<?, ?it/s]

  0%|          | 0/232 [00:00<?, ?it/s]

  0%|          | 0/343 [00:00<?, ?it/s]

  0%|          | 0/1966 [00:00<?, ?it/s]

  0%|          | 0/678 [00:00<?, ?it/s]

  0%|          | 0/4678 [00:00<?, ?it/s]

  0%|          | 0/290 [00:00<?, ?it/s]

  0%|          | 0/191 [00:00<?, ?it/s]

  0%|          | 0/534 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/413 [00:00<?, ?it/s]

  0%|          | 0/367 [00:00<?, ?it/s]