# Parsing TEI files

## 1 Installing needed packages

When running on a remote JupyterLab, the required packages have to be installed explicitly:

In [None]:
# Install a pip package in the current Jupyter kernel
!pip install --quiet beautifulsoup4==4.12.3
!pip install --quiet lxml==5.0.0
!pip install --quiet pandas==2.1.4
!pip install --quiet sparqlwrapper==2.0.0
!pip install --quiet python-dateutil==2.9.0.post0
!pip install --quiet pyarrow==16.0.0
!pip install --quiet tqdm==4.66.4

from xml.sax.handler import ContentHandler
from xml.sax import make_parser
from pathlib import Path
from tqdm.notebook import tqdm
import pandas as pd
import concurrent.futures
import time
import re
import os

from utils import (
    check_xml,
    bkv_nkv_from_verse_id,
    gap_clean,
    get_data_from_tei,
)

## 2 Parsing

BeautifulSoup is used for parsing, as it is fast and reliable.
The objects 'Source', 'Verse' and 'Manuscript' are modelled as dataclasses, which makes the code more readable and easier to maintain. These classes are defined in `TEIFile.py`.
The `TEIFile` class, which is also defined in `TEIFile.py`, is used to extract data from a given TEI file.


## 3 Get data from TEI files

One goal is to extract all verses from all transcriptions (from NTVMR and IGNTP). This is done with the following code blocks.

### 3.1 Check files for validity

In [None]:
# SAX parser with a plain (no-op) ContentHandler; it is handed to check_xml
# together with each file path.
parser = make_parser()
parser.setContentHandler(ContentHandler())

# Collect every XML file below the transcription directory, in sorted order.
raw_files = sorted(Path("../data/transcriptions").rglob("*.xml"))
# raw_files = sorted(Path("../data/transcriptions/igntp/ecm_romans").rglob("*.xml"))
# raw_files = [Path("../data/transcriptions/ntvmr/40211.xml")]

# Results reported by check_xml for files that passed the check.
# NOTE(review): presumably check_xml returns the file path for a well-formed
# file and None otherwise — confirm in utils.
well_formed_files = []

# Check all files in a pool of worker processes.
with concurrent.futures.ProcessPoolExecutor() as executor:
    futures = [executor.submit(check_xml, xml_path, parser) for xml_path in raw_files]

    # Results arrive in completion order, not in submission order.
    for done in concurrent.futures.as_completed(futures):
        try:
            outcome = done.result()
        except Exception as e:
            # A failing check is reported but does not stop the remaining ones.
            print(f"Error: {e}")
            continue
        if outcome is not None:
            well_formed_files.append(outcome)

In [None]:
# Files that were found on disk but were not collected as well-formed.
# The well-formed entries are put into a set once, so each membership test is
# O(1) instead of a scan over the whole list for every raw file.
# NOTE(review): assumes the collected entries are hashable (e.g. Path or str).
well_formed_lookup = set(well_formed_files)
malformed_files = [x for x in raw_files if x not in well_formed_lookup]

print(malformed_files)

### 3.2 Get data from TEI file

#### 3.2.1 Data extraction from multiple TEI files in parallel

As there are many TEI files, the data extraction is run in parallel for speed. Pass `max_workers` to `ProcessPoolExecutor` to set the number of worker processes; if it is omitted, the executor chooses the number itself based on the available CPU cores.

In [7]:
# Output directories for the extraction step: index 0 is passed as man_out_dir,
# index 1 as trans_out_dir. Each directory is created if it does not exist yet.
out_dirs = ["../data/parsed/man", "../data/parsed/trans"]
for out_dir in out_dirs:
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Run get_data_from_tei for every well-formed file in a pool of worker processes.
# NOTE(review): with write_to_file=True the results are presumably written to
# trans_out_dir / man_out_dir by the workers themselves — confirm in utils.
with concurrent.futures.ProcessPoolExecutor() as executor:
    # Remember which file each future belongs to. as_completed() yields futures
    # in completion order, so the path cannot be recovered by position.
    future_to_path = {
        executor.submit(
            get_data_from_tei,
            file_path,
            clear_only=True,
            verbose=False,
            write_to_file=True,
            trans_out_dir=out_dirs[1],
            man_out_dir=out_dirs[0],
        ): file_path
        for file_path in well_formed_files
    }

    # The context manager closes the progress bar even if the loop is interrupted.
    with tqdm(total=len(future_to_path), desc="Processing") as progress_bar:
        for future in concurrent.futures.as_completed(future_to_path):
            try:
                # result() re-raises any exception from the worker; without this
                # call a failed extraction would go unnoticed.
                future.result()
            except Exception as e:
                # Report the failure and continue with the remaining files.
                print(f"Error in {future_to_path[future]}: {e}")
            progress_bar.update(1)

In [None]:
# Concatenating like this is done, as we know all files do have the same header.
# Also, this is computationally more efficient than first reading each file into a pd.DataFrame and then merging those into one.

# Construct the shell commands for concatenating
command1 = f"awk 'NR == 1 || FNR > 1' ../data/parsed/man/*.csv > ../data/parsed/manuscripts.csv"
command2 = (
    f"awk 'NR == 1 || FNR > 1' ../data/parsed/trans/*.csv > ../data/parsed/verses.csv"
)

# Execute the shell commands
!{command1}
!{command2}

In [None]:
# Column types for the concatenated verse table: every listed column is read as
# text, except the two version columns, which are read as floats.
verse_dtypes = dict.fromkeys(
    [
        "lection",
        "verse",
        "transcript",
        "publisher",
        "source",
        "ga",
        "sponsor",
        "founder",
        "edition_date",
        "published_date",
    ],
    "string",
)
verse_dtypes.update({"edition_version": "float", "encoding_version": "float"})
verses_df = pd.read_csv("../data/parsed/verses.csv", dtype=verse_dtypes)

# All listed columns of the manuscript table are read as text.
manuscript_dtypes = dict.fromkeys(["ga", "docID", "label", "source"], "string")
manuscripts_df = pd.read_csv("../data/parsed/manuscripts.csv", dtype=manuscript_dtypes)

#### 3.2.3 Apply cleanup and aggregation functions to dataframes

In [None]:
# Run bkv_nkv_from_verse_id on every row (axis=1), then remove the "verse" column
# and all rows that have no transcript.
# NOTE(review): presumably the function derives the "bkv" column used further
# down from the "verse" identifier — confirm in utils.
verses_df = (
    verses_df.apply(bkv_nkv_from_verse_id, axis=1)
    .drop(columns=["verse"])
    .dropna(subset=["transcript"])
)

In [None]:
# Measure how long cleaning all transcripts takes.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump when it is adjusted.
start_time = time.perf_counter()
# verses_df["ntvmrLink"] = verses_df.apply(generate_transcription_url, axis=1)
# Fill the "text" column by running gap_clean on every transcript.
verses_df["text"] = verses_df["transcript"].apply(gap_clean)
end_time = time.perf_counter()
print(f"{end_time - start_time:.4f} seconds")

In [None]:
# manuscripts_df["docID"] = manuscripts_df.apply(ga_to_docID, axis=1)

#### 3.2.4 Dropping rows without verse identifier

In [None]:
# Discard every row that has no "bkv" value, then preview the first rows.
verses_df = verses_df.dropna(subset=["bkv"])
verses_df.head()

### 3.3 Save data frames to CSV files


In [None]:
# Sort by GA, then by BKV.
verses_df.sort_values(by=["ga", "bkv"], inplace=True)
# Add a unique integer verse_id (1-based, following the sort order), as the
# transcription (or metadata like encoding_version or edition_version) can change over time.
verses_df["verse_id"] = range(1, len(verses_df) + 1)
# Write to file without the DataFrame index. index_label only names the index
# column, so it has no effect with index=False and is left out.
verses_df.to_csv("../data/verses.csv", index=False)
# verses_df.to_parquet("../data/verses.parquet", index=False)

In [None]:
# Sort by GA and remove exact duplicate rows before writing.
manuscripts_df.sort_values(by="ga", inplace=True)
manuscripts_df.drop_duplicates(inplace=True)
# Write to file without the DataFrame index. index_label only names the index
# column, so it has no effect with index=False and is left out.
manuscripts_df.to_csv("../data/manuscripts_tei.csv", index=False)
# manuscripts_df.to_parquet("../data/manuscripts_tei.parquet", index=False)