# Parsing TEI files

## 1 Installing needed packages

When running on a remote JupyterLab, packages that are needed have to be explicitly installed:

In [None]:
# Install a pip package in the current Jupyter kernel
# The lines starting with `!` are IPython shell escapes; `{sys.executable}` is
# interpolated into the command so that pip is run through the interpreter
# that `sys` reports for this session.
import sys

# Every package is pinned to an exact version with `==`.
!{sys.executable} -m pip install beautifulsoup4==4.12.3
!{sys.executable} -m pip install lxml==5.0.0
!{sys.executable} -m pip install pandas==2.1.4
!{sys.executable} -m pip install sparqlwrapper==2.0.0

from xml.sax.handler import ContentHandler
from xml.sax import make_parser
from pathlib import Path
import pandas as pd
import concurrent.futures
import time

from utils import (
    check_xml,
    get_data_from_tei,
    fix_bkv,
    generate_transcription_url,
)

from converters import bkv_to_nkv, ga_to_docID

## 2 Parsing

BeautifulSoup is used for parsing, as it is fast and reliable.
There are some dataclasses in use, which handle the objects 'Source', 'Verse' and 'Manuscript'. This is done to make the code more readable and easier to maintain. These classes are defined in `TEIFile.py`. 
The TEIFile class is used to work on/extract data from a given TEI file. The class is defined in `TEIFile.py`. 


## 3 Get data from TEI files

One goal is to extract all verses from all transcriptions (from NTVMR and IGNTP). This is done with the following code blocks.

### 3.1 Check files for validity

In [None]:
# Check every transcription file for validity in parallel and collect the
# paths that pass in `well_formed_files`.

# Record the start time. perf_counter() is a monotonic clock, so the measured
# duration cannot be distorted by system clock adjustments.
tic = time.perf_counter()

# xml parser init: a SAX parser with the default ContentHandler attached
parser = make_parser()
parser.setContentHandler(ContentHandler())

# get files (sorted so the submission order is deterministic)
raw_files = sorted(Path("../data/transcriptions/").rglob("*.xml"))
#raw_files = [Path("../data/transcriptions/igntp/ecm_1corinthians/NT_GRC_1_1Cor.xml")]

# Store paths to good files
well_formed_files = []

# Execute tasks and gather results
with concurrent.futures.ProcessPoolExecutor() as executor:
    # Submit one task per file and remember which file each future belongs to,
    # so that a failure can be reported together with the offending path.
    future_to_file = {
        executor.submit(check_xml, file_path, parser): file_path
        for file_path in raw_files
    }

    # Gather results in completion order (not submission order)
    for future in concurrent.futures.as_completed(future_to_file):
        try:
            result = future.result()
        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"Error while checking {future_to_file[future]}: {e}")
        else:
            # check_xml signals "not usable" with None; only keep real results.
            if result is not None:
                well_formed_files.append(result)

# Record the end time
toc = time.perf_counter()
# Calculate the runtime
runtime = toc - tic
print(f"The script took {runtime:.4f} seconds to execute.")

In [None]:
# Every discovered file that did not come back from the validity check is
# reported as malformed. The lookup uses a set built once, so the comparison
# stays linear in the number of files instead of scanning a list per file.
well_formed_lookup = set(well_formed_files)
malformed_files = [x for x in raw_files if x not in well_formed_lookup]

print(malformed_files)

### 3.2 Get data from TEI file

#### 3.2.1 Data extraction from multiple TEI files in parallel

As there are many TEI files, the data extraction is run in parallel for speed. Use `max_workers` to set the number of CPU cores to be used.

In [None]:
# Extract manuscript and verse data from all well-formed TEI files in
# parallel worker processes.

# Record the start time. perf_counter() is a monotonic clock, so the measured
# duration cannot be distorted by system clock adjustments.
tic = time.perf_counter()

# Store results: one manuscript entry and one collection of verses per file
manuscripts_list = []
verses_list = []

# Execute tasks and gather results
with concurrent.futures.ProcessPoolExecutor(max_workers=16) as executor:
    # Submit one task per file and remember which file each future belongs to,
    # so that a failure can be reported together with the offending path.
    future_to_file = {
        executor.submit(get_data_from_tei, file_path): file_path
        for file_path in well_formed_files
    }

    # Gather results in completion order (not submission order)
    for future in concurrent.futures.as_completed(future_to_file):
        try:
            manuscript, verses = future.result()
        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"Error while extracting {future_to_file[future]}: {e}")
        else:
            manuscripts_list.append(manuscript)
            verses_list.append(verses)

# Record the end time
toc = time.perf_counter()
# Calculate the runtime
runtime = toc - tic
print(f"The script took {runtime:.4f} seconds to execute.")

# takes ~4min with 16 cores

#### 3.2.2 Lists to data frames

In [None]:
# verses_list holds one collection of verse records per processed file;
# collapse it into a single flat list of records.
flattened_verses_list = [
    verse for file_verses in verses_list for verse in file_verses
]

# Turn both record lists into data frames.
manuscripts_df = pd.DataFrame(manuscripts_list)
verses_df = pd.DataFrame(flattened_verses_list)

#### 3.2.3 Apply cleanup and aggregation functions to dataframes

In [None]:
# TODO: Better check if nkv or bkv, set it to appropriate column, then generate the other column
# Each (column, function) pair is applied row by row, in the listed order:
# a later function sees the columns written by the earlier ones.
column_builders = (
    ("bkv", fix_bkv),
    ("docID", ga_to_docID),
    ("nkv", bkv_to_nkv),
    ("ntvmrLink", generate_transcription_url),
)
for column_name, row_function in column_builders:
    verses_df[column_name] = verses_df.apply(row_function, axis=1)

In [None]:
# Fill the "docID" column by calling ga_to_docID (imported from converters)
# once per row (axis=1) of the manuscripts frame.
manuscripts_df["docID"] = manuscripts_df.apply(ga_to_docID, axis=1)

#### 3.2.4 Grouping
Currently we do not want any grouping, in order to keep the data separated by source (here IGNTP and NTVMR, later also DBpedia).

In [None]:
# manuscripts_df["docID"] = manuscripts_df["docID"].astype(int)
# Setting the type for "int" has to be done as otherwise docID could be of different type (e.g. "10001",10001,10001.0). With different types it can't be grouped.

# Merge rows by docID
# manuscripts_df = (
#    manuscripts_df.groupby("docID")
#    .agg(
#        {
#            "ga": lambda x: " ".join(set(x)),
#            "label": lambda x: " ".join(set(x)),
#            "source": lambda x: ",".join(set(x)),
#        }
#    )
#    .reset_index()
# )

#### 3.2.5 Dropping rows

Some rows in verses_df do not contain a bkv – marked with NONE by fix_bkv() – as they are an inscriptio or a subscriptio. We drop those rows. 

In [None]:
# Remove, in place, every row whose "bkv" value is missing.
verses_df.dropna(subset=["bkv"], inplace=True)
# Last expression of the cell: previews the first rows of the remaining data.
verses_df.head()

### 3.3 Save data frames to CSV files


In [None]:
# Order the verses by document and verse reference, then write them to disk.
verses_sort_keys = ["docID", "bkv"]
verses_csv_path = "../data/verses.csv"

verses_df.sort_values(by=verses_sort_keys, inplace=True)
# verses_df.drop_duplicates(subset=["docID", "bkv"], inplace=True)
# The data frame index is not written to the file (index=False).
verses_df.to_csv(verses_csv_path, index=False, index_label="index")
# verses_df.to_json("../data/verses.json")

In [None]:
# Order the manuscripts by document ID, remove fully identical rows and
# write the result to disk.
manuscripts_csv_path = "../data/manuscripts_tei.csv"

manuscripts_df.sort_values(by="docID", inplace=True)
manuscripts_df.drop_duplicates(inplace=True)
# The data frame index is not written to the file (index=False).
manuscripts_df.to_csv(manuscripts_csv_path, index=False, index_label="index")
# manuscripts_df.to_json("../data/manuscripts.json")