In [None]:
# Corpus_Processing_Pandas_Polars.ipynb

# Import necessary libraries
from pathlib import Path
import pandas as pd
import polars as pl
import xml.etree.ElementTree as ET

# -----------------------------
# Step 1: Define paths to your corpus folders
# Replace the dummy paths below with the actual folders that contain your XMI files
cdlk_path = r"/path/to/CDLK/learner_xmi"  # e.g., "/home/user/data/CDLK/learner_xmi"
klp1_path = r"/path/to/KLP1/learner_xmi"  # e.g., "/home/user/data/KLP1/learner_xmi"

# -----------------------------
# Step 2: Define functions to parse XMI files and extract basic info

def parse_xmi_basic_info(xmi_file):
    """
    Parse an XMI file and extract basic corpus statistics.

    Element counts are based on each element's local name (the tag with its
    "{namespace}" prefix removed), so a namespace URI that happens to contain
    "Token", "Sentence" or "Annotation" does not inflate the counts. A local
    name is counted when it contains the respective word, e.g.
    "DocumentAnnotation" counts as an annotation.

    Parameters:
        xmi_file (Path): Path object to the XMI file

    Returns:
        dict: Parsed information with the keys "filename", "filepath",
            "doc_text_length", "token_count", "sentence_count" and
            "annotation_count". The four statistics are None when the file
            is not well-formed XML. "doc_text_length" is also None when no
            Sofa element carries a non-empty "sofaString" attribute.
    """
    # Start from the "nothing could be extracted" record; it is returned
    # unchanged when the file cannot be parsed.
    info = {
        "filename": xmi_file.name,
        "filepath": str(xmi_file),
        "doc_text_length": None,
        "token_count": None,
        "sentence_count": None,
        "annotation_count": None,
    }

    # Only the parse itself can raise ParseError, so keep the try body minimal.
    try:
        root = ET.parse(xmi_file).getroot()
    except ET.ParseError:
        return info

    # Document text length: first Sofa element with a non-empty 'sofaString'
    for sofa in root.findall(".//{*}Sofa"):
        text = sofa.attrib.get("sofaString")
        if text:
            info["doc_text_length"] = len(text)
            break

    # Count tokens, sentences and annotations in a single pass over the tree
    token_count = sentence_count = annotation_count = 0
    for elem in root.iter():
        tag = elem.tag
        if not isinstance(tag, str):
            # Comment / processing-instruction nodes have a non-string tag
            continue
        # ElementTree spells qualified tags as "{namespace-uri}local"
        local_name = tag.rpartition("}")[2]
        if "Token" in local_name:
            token_count += 1
        if "Sentence" in local_name:
            sentence_count += 1
        if "Annotation" in local_name:
            annotation_count += 1

    info["token_count"] = token_count
    info["sentence_count"] = sentence_count
    info["annotation_count"] = annotation_count
    return info

# -----------------------------
# Step 3: Define processing functions for Pandas and Polars

def process_corpus_folder_pandas(folder_path):
    """
    Process all XMI files in folder using pandas DataFrame.

    Files are handled in sorted path order so the row order is reproducible
    across runs and platforms. Only files directly inside the folder that
    match "*.xmi" are considered (no recursion into sub-folders).

    Parameters:
        folder_path (str): Path to corpus folder

    Returns:
        pandas.DataFrame: One row per XMI file, as produced by
            parse_xmi_basic_info. When no file matches, an empty frame that
            still carries the expected columns is returned, so downstream
            column access and CSV headers keep working.
    """
    # Path.glob yields entries in an unspecified order; sort for determinism.
    files = sorted(Path(folder_path).glob("*.xmi"))
    records = [parse_xmi_basic_info(f) for f in files]

    if not records:
        # pd.DataFrame([]) would have no columns at all
        return pd.DataFrame(
            columns=[
                "filename",
                "filepath",
                "doc_text_length",
                "token_count",
                "sentence_count",
                "annotation_count",
            ]
        )
    return pd.DataFrame(records)

def process_corpus_folder_polars(folder_path):
    """
    Process all XMI files in folder using polars DataFrame.

    Files are handled in sorted path order so the row order is reproducible
    across runs and platforms. Only files directly inside the folder that
    match "*.xmi" are considered (no recursion into sub-folders).

    Parameters:
        folder_path (str): Path to corpus folder

    Returns:
        polars.DataFrame: One row per XMI file, as produced by
            parse_xmi_basic_info. The column names and dtypes are fixed by
            an explicit schema, so an empty folder (or a folder in which
            every statistic is None) still yields the same columns.
    """
    # Path.glob yields entries in an unspecified order; sort for determinism.
    files = sorted(Path(folder_path).glob("*.xmi"))
    records = [parse_xmi_basic_info(f) for f in files]

    # Without a schema, an empty record list gives a frame with no columns,
    # and columns holding only None values get a Null dtype.
    schema = {
        "filename": pl.Utf8,
        "filepath": pl.Utf8,
        "doc_text_length": pl.Int64,
        "token_count": pl.Int64,
        "sentence_count": pl.Int64,
        "annotation_count": pl.Int64,
    }
    return pl.DataFrame(records, schema=schema)

# -----------------------------
# Step 4: Process corpora using both Pandas and Polars (for demonstration)

def _process_and_preview(message, processor, folder_path):
    """Print *message*, run *processor* on *folder_path*, print the head of the result and return it."""
    print(message)
    frame = processor(folder_path)
    print(frame.head())
    return frame


# Pandas pass over both corpora
df_cdlk_pd = _process_and_preview(
    "Processing CDLK corpus with Pandas...",
    process_corpus_folder_pandas,
    cdlk_path,
)
df_klp1_pd = _process_and_preview(
    "\nProcessing KLP1 corpus with Pandas...",
    process_corpus_folder_pandas,
    klp1_path,
)

# Polars pass over the same two corpora
df_cdlk_pl = _process_and_preview(
    "\nProcessing CDLK corpus with Polars...",
    process_corpus_folder_polars,
    cdlk_path,
)
df_klp1_pl = _process_and_preview(
    "\nProcessing KLP1 corpus with Polars...",
    process_corpus_folder_polars,
    klp1_path,
)

# -----------------------------
# Step 5: Save processed data to CSV for later analysis or sharing

# Pandas frames: index=False keeps the row index out of the CSV
for pandas_frame, csv_name in (
    (df_cdlk_pd, "xmi_basic_info_cdlk_pandas.csv"),
    (df_klp1_pd, "xmi_basic_info_klp1_pandas.csv"),
):
    pandas_frame.to_csv(csv_name, index=False)

# Polars frames
for polars_frame, csv_name in (
    (df_cdlk_pl, "xmi_basic_info_cdlk_polars.csv"),
    (df_klp1_pl, "xmi_basic_info_klp1_polars.csv"),
):
    polars_frame.write_csv(csv_name)

print("\nData saved as CSV files. You can use these CSVs for further analysis.")
