In [None]:
import glob
import itertools
import logging
import pandas
import xml.etree.ElementTree as ET

from dataclasses import dataclass

In [None]:
# Set up the root logger: INFO and above, each record prefixed with
# its timestamp and level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [None]:
# Get list of all xml files from the data folder.
# glob returns matches in arbitrary (filesystem-dependent) order, so the list
# is sorted to keep the processing order reproducible between runs.
dirname = "data/Dogmatics/ChurchDogmatics/"
file_pattern = "*.xml"
xml_files: list[str] = sorted(glob.glob(f"{dirname}{file_pattern}"))
xml_files

In [None]:
@dataclass()
class TextChunk:
    """One paragraph of text plus the metadata of the document it came from.

    Attributes:
        text: Text of the paragraph.
        excursus: True when the paragraph sits inside an "excursus" div.
        title: Title of the source document.
        group: Value of the document's "group" metadata element.
        volume: Value of the document's "volumefull" metadata element.
        date: Date taken from the document's bibliographic description.
        biblScope: biblScope taken from the document's bibliographic description.
        sectionhead: Section heading; defaults to an empty string because
            callers may not supply one.
    """

    text: str
    excursus: bool
    title: str
    group: str
    volume: str
    date: str
    biblScope: str
    # Was `sectionhead = str`, which bound the attribute to the `str` type
    # itself instead of declaring a dataclass field. The default keeps
    # constructor calls that omit sectionhead working.
    sectionhead: str = ""

In [None]:
def get_metadata(root) -> dict:
    """Collect document-level metadata from a parsed XML root element.

    Each metadata key is looked up with its own path expression and the
    matching element's ``.text`` is stored under that key.

    Args:
        root: Root element of the parsed XML document.

    Returns:
        dict with the keys "title", "source", "group", "volumefull",
        "volumealt", "date" and "biblScope".

    Raises:
        AttributeError: if any of the looked-up elements is missing
            (``find`` returns None, which has no ``.text``).
    """
    bibl = "./teiHeader/fileDesc/sourceDesc/bibl"
    lookups = {
        "title": "./teiHeader/fileDesc/titleStmt/title",
        "source": './/*[@type="source"]',
        "group": './/*[@type="group"]',
        "volumefull": './/*[@type="volumefull"]',
        "volumealt": './/*[@type="volumealt"]',
        "date": f"{bibl}/date",
        "biblScope": f"{bibl}/biblScope",
    }
    # dicts keep insertion order, so elements are looked up in the order listed
    return {key: root.find(path).text for key, path in lookups.items()}

In [None]:
def get_paragraphs(root, metadata) -> list[TextChunk]:
    """Build one TextChunk per paragraph found in the document body.

    Walks the ``div`` elements directly under ``text/body``, skips divs of
    type "abstract", and turns every direct ``p`` child of the remaining divs
    into a TextChunk carrying the document metadata.

    Args:
        root: Root element of the parsed XML document.
        metadata: dict providing the "title", "group", "volumefull", "date"
            and "biblScope" keys.

    Returns:
        List of TextChunk objects in document order. A paragraph with no text
        at all yields an empty string as its text.

    Raises:
        KeyError: if ``metadata`` lacks one of the required keys.
    """
    paragraphs = []
    for div in root.findall(".//text/body/div"):
        div_type = div.get("type")
        # Skip the abstract
        if div_type == "abstract":
            continue
        # Paragraphs inside an excursus div are flagged as such
        excursus = div_type == "excursus"
        for p in div.findall("p"):
            paragraphs.append(
                TextChunk(
                    # p.text only holds the text before the first child
                    # element, which truncates paragraphs with inline markup;
                    # itertext() also yields the text inside and after
                    # nested elements.
                    text="".join(p.itertext()),
                    excursus=excursus,
                    title=metadata["title"],
                    group=metadata["group"],
                    volume=metadata["volumefull"],
                    date=metadata["date"],
                    biblScope=metadata["biblScope"],
                    # sectionhead is not set here: get_metadata does not
                    # extract a "sectionhead" value.
                )
            )
    return paragraphs

In [None]:
def parse_doc(fn):
    """Parse one XML file into a list of paragraph records.

    Args:
        fn: Path of the XML file to parse.

    Returns:
        A list of dicts, one per paragraph (``vars()`` of each chunk returned
        by ``get_paragraphs``), or ``None`` when the file could not be parsed
        or processed. Exceptions are logged rather than propagated, so one
        bad file does not abort a batch run.
    """
    # Get xml tree, and extract metadata and content
    try:
        tree = ET.parse(fn)
        root = tree.getroot()

        # Log the filename being processed
        logging.info("Processing file: %s", fn)

        metadata = get_metadata(root)
        paragraphs = get_paragraphs(root, metadata)
        return [vars(p) for p in paragraphs]

    except ET.ParseError as e:
        # Malformed XML: the error text already carries the line and column
        logging.error("Error parsing file: %s", fn)
        logging.error(str(e))
        return None

    except Exception:
        # Any other failure (missing file, missing metadata element, ...).
        # logging.exception records the exception type and traceback, so the
        # failing lookup can be identified; str(e) alone often cannot.
        logging.exception("Error processing file: %s", fn)
        return None

In [None]:
# Run the parser over every XML filename; a file that failed contributes None
data = list(map(parse_doc, xml_files))
data[0]

In [None]:
# Drop the files that failed (None entries), then flatten the per-file
# lists of records into one list
filtered_data = list(filter(lambda doc: doc is not None, data))
combined_data = list(itertools.chain.from_iterable(filtered_data))
len(combined_data)

In [None]:
# Peek at the first paragraph record to check its fields
combined_data[0]

In [None]:
# Create a pandas dataframe from the combined data:
# one row per paragraph record, one column per record key
df = pandas.DataFrame(combined_data)
df

In [None]:
# Write the full dataset to parquet, leaving out the dataframe index
df.to_parquet("dogmatics-text-w-meta.parquet", index=False)

In [None]:
# Count the rows in each volume (each row is one paragraph, not a whole document)
df["volume"].value_counts()

In [None]:
# Mean character length of the "text" column across all rows
avg_text_length = df["text"].str.len().mean()
avg_text_length

In [None]:
# List every title containing "Editor" (case-insensitive) and how many rows
# carry each, to spot variations on "Editors' Preface"
contains_editor = df.loc[df["title"].str.contains("Editor", case=False)]
contains_editor["title"].value_counts()

In [None]:
# Drop the rows whose title is exactly "Editors' Preface", then recompute the
# average text length on what remains.
# NOTE(review): only the exact title is removed; other titles containing
# "Editor" are kept — confirm this is intended.
df_no_editors = df.loc[df["title"].ne("Editors' Preface")]
avg_text_length = df_no_editors["text"].str.len().mean()
avg_text_length


In [None]:
# Write the final data, without the "Editors' Preface" rows, to parquet
# (dataframe index left out)
df_no_editors.to_parquet("dogmatics-text-w-meta-no-editors.parquet", index=False)