In [1]:
from pathlib import Path
import pandas as pd
import xml.etree.ElementTree as ET

# Path to CDLK corpus XMI files
cdlk_path = r"C:\Users\Vedang Deshmukh\Desktop\dakoda-corpus-recipes\data\CDLK\learner_xmi"

# Path to KLP1 corpus XMI files
klp1_path = r"C:\Users\Vedang Deshmukh\Desktop\dakoda-corpus-recipes\data\KLP1\learner_xmi"

def parse_xmi_basic_info_pandas(xmi_file):
    try:
        tree = ET.parse(xmi_file)
        root = tree.getroot()

        doc_text_length = None
        for sofa in root.findall(".//{*}Sofa"):
            text = sofa.attrib.get("sofaString")
            if text:
                doc_text_length = len(text)
                break

        token_count = sum(1 for elem in root.iter() if "Token" in elem.tag)
        sentence_count = sum(1 for elem in root.iter() if "Sentence" in elem.tag)
        annotation_count = sum(1 for elem in root.iter() if "Annotation" in elem.tag)

        return {
            "filename": xmi_file.name,
            "filepath": str(xmi_file),
            "doc_text_length": doc_text_length,
            "token_count": token_count,
            "sentence_count": sentence_count,
            "annotation_count": annotation_count,
        }
    except ET.ParseError:
        return {
            "filename": xmi_file.name,
            "filepath": str(xmi_file),
            "doc_text_length": None,
            "token_count": None,
            "sentence_count": None,
            "annotation_count": None,
        }

def process_corpus_folder_pandas(folder_path):
    folder = Path(folder_path)
    files = list(folder.glob("*.xmi"))
    records = []
    for f in files:
        info = parse_xmi_basic_info_pandas(f)
        records.append(info)
    return pd.DataFrame(records)

# Example usage:
df_cdlk = process_corpus_folder_pandas(cdlk_path)
df_klp1 = process_corpus_folder_pandas(klp1_path)

print("CDLK data sample:")
print(df_cdlk.head())

print("\nKLP1 data sample:")
print(df_klp1.head())

# Save to CSV locally if needed
df_cdlk.to_csv("xmi_basic_info_cdlk_pandas.csv", index=False)
df_klp1.to_csv("xmi_basic_info_klp1_pandas.csv", index=False)


CDLK data sample:
          filename                                           filepath  \
0  201006ZW005.xmi  C:\Users\Vedang Deshmukh\Desktop\dakoda-corpus...   
1  201006ZW012.xmi  C:\Users\Vedang Deshmukh\Desktop\dakoda-corpus...   
2  201006ZW019.xmi  C:\Users\Vedang Deshmukh\Desktop\dakoda-corpus...   
3  201006ZW021.xmi  C:\Users\Vedang Deshmukh\Desktop\dakoda-corpus...   
4  201006ZW022.xmi  C:\Users\Vedang Deshmukh\Desktop\dakoda-corpus...   

   doc_text_length  token_count  sentence_count  annotation_count  
0             1106         2028             106                 0  
1             1327         2242             156                 0  
2              869         1554              84                 0  
3             1354         2306             110                 0  
4             1076         2046             124                 0  

KLP1 data sample:
     filename                                           filepath  \
0  3360_1.xmi  C:\Users\Vedang Deshmukh\Desktop\

In [2]:
from pathlib import Path
import polars as pl
import xml.etree.ElementTree as ET
import datetime

# Same paths as above
cdlk_path = r"C:\Users\Vedang Deshmukh\Desktop\dakoda-corpus-recipes\data\CDLK\learner_xmi"
klp1_path = r"C:\Users\Vedang Deshmukh\Desktop\dakoda-corpus-recipes\data\KLP1\learner_xmi"

def parse_xmi_basic_info_polars(xmi_file):
    try:
        tree = ET.parse(xmi_file)
        root = tree.getroot()

        doc_text_length = None
        for sofa in root.findall(".//{*}Sofa"):
            text = sofa.attrib.get("sofaString")
            if text:
                doc_text_length = len(text)
                break

        token_count = sum(1 for elem in root.iter() if "Token" in elem.tag)
        sentence_count = sum(1 for elem in root.iter() if "Sentence" in elem.tag)
        annotation_count = sum(1 for elem in root.iter() if "Annotation" in elem.tag)

        return {
            "filename": xmi_file.name,
            "filepath": str(xmi_file),
            "doc_text_length": doc_text_length,
            "token_count": token_count,
            "sentence_count": sentence_count,
            "annotation_count": annotation_count,
        }
    except ET.ParseError:
        return {
            "filename": xmi_file.name,
            "filepath": str(xmi_file),
            "doc_text_length": None,
            "token_count": None,
            "sentence_count": None,
            "annotation_count": None,
        }

def process_corpus_folder_polars(folder_path):
    folder = Path(folder_path)
    files = list(folder.glob("*.xmi"))
    records = []
    for f in files:
        info = parse_xmi_basic_info_polars(f)
        records.append(info)
    return pl.DataFrame(records)

# Example usage:
df_cdlk_pl = process_corpus_folder_polars(cdlk_path)
df_klp1_pl = process_corpus_folder_polars(klp1_path)

print("CDLK data sample (polars):")
print(df_cdlk_pl.head())

print("\nKLP1 data sample (polars):")
print(df_klp1_pl.head())

# Save to CSV locally if needed
df_cdlk_pl.write_csv("xmi_basic_info_cdlk_polars.csv")
df_klp1_pl.write_csv("xmi_basic_info_klp1_polars.csv")


CDLK data sample (polars):
shape: (5, 6)
┌────────────────┬────────────────┬────────────────┬─────────────┬────────────────┬────────────────┐
│ filename       ┆ filepath       ┆ doc_text_lengt ┆ token_count ┆ sentence_count ┆ annotation_cou │
│ ---            ┆ ---            ┆ h              ┆ ---         ┆ ---            ┆ nt             │
│ str            ┆ str            ┆ ---            ┆ i64         ┆ i64            ┆ ---            │
│                ┆                ┆ i64            ┆             ┆                ┆ i64            │
╞════════════════╪════════════════╪════════════════╪═════════════╪════════════════╪════════════════╡
│ 201006ZW005.xm ┆ C:\Users\Vedan ┆ 1106           ┆ 2028        ┆ 106            ┆ 0              │
│ i              ┆ g Deshmukh\Des ┆                ┆             ┆                ┆                │
│                ┆ kt…            ┆                ┆             ┆                ┆                │
│ 201006ZW012.xm ┆ C:\Users\Vedan ┆ 1327          

In [4]:
from pathlib import Path
import xml.etree.ElementTree as ET
import polars as pl

def parse_xmi_basic_info(xmi_file: Path):
    try:
        tree = ET.parse(xmi_file)
        root = tree.getroot()

        # Extract document text length from Sofa element attribute "sofaString"
        doc_text_length = None
        for sofa in root.findall(".//{*}Sofa"):
            text = sofa.attrib.get("sofaString")
            if text:
                doc_text_length = len(text)
                break

        # Count tokens, sentences, annotations
        token_count = sum(1 for elem in root.iter() if "Token" in elem.tag)
        sentence_count = sum(1 for elem in root.iter() if "Sentence" in elem.tag)
        annotation_count = sum(1 for elem in root.iter() if "Annotation" in elem.tag)

        return {
            "filename": xmi_file.name,
            "filepath": str(xmi_file),
            "doc_text_length": doc_text_length,
            "token_count": token_count,
            "sentence_count": sentence_count,
            "annotation_count": annotation_count,
        }
    except ET.ParseError as e:
        print(f"Error parsing {xmi_file}: {e}")
        return {
            "filename": xmi_file.name,
            "filepath": str(xmi_file),
            "doc_text_length": None,
            "token_count": None,
            "sentence_count": None,
            "annotation_count": None,
        }

def process_xmi_folder_polars(xmi_folder: str):
    folder = Path(xmi_folder)
    xmi_files = sorted(folder.glob("*.xmi"))
    print(f"Processing {len(xmi_files)} XMI files from {xmi_folder}...")
    records = []
    for f in xmi_files:
        info = parse_xmi_basic_info(f)
        records.append(info)
    df = pl.DataFrame(records)
    return df

# KLP1 path
xmi_path_klp1 = r"C:\Users\Vedang Deshmukh\Desktop\dakoda-corpus-recipes\data\KLP1\learner_xmi"

# Process and save CSV
df_info_klp1 = process_xmi_folder_polars(xmi_path_klp1)
df_info_klp1.write_csv("xmi_basic_info_klp1.csv")


Processing 523 XMI files from C:\Users\Vedang Deshmukh\Desktop\dakoda-corpus-recipes\data\KLP1\learner_xmi...


In [5]:
df_info_klp1.write_csv("xmi_basic_info_klp1.csv")
