In [3]:
import glob
import os
import lxml.etree as ET
import pandas as pd

# Path to your TEI files (all years)
tei_path = "/Users/elisaonder/Desktop/ParlaMint-PT.TEI"

# Only session XML files (skip Schema folder)
files = glob.glob(os.path.join(tei_path, "20*/**/*.xml"), recursive=True)

print(f"Found {len(files)} XML files.")

# Prepare a list to store utterances
data = []

for f in files:
    # Skip schema files explicitly
    if "Schema" in f:
        continue

    try:
        tree = ET.parse(f)
        root = tree.getroot()

        # Extract session date
        date_el = root.find(".//{*}date")
        session_date = date_el.attrib.get("when") if date_el is not None else os.path.basename(f)[:10]
        year = session_date[:4]

        # Extract speakers if present
        speakers = {}
        for person in root.findall(".//{*}person"):
            pid = person.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
            name_el = person.find(".//{*}persName")
            name = name_el.text if name_el is not None else pid
            affil_el = person.find(".//{*}affiliation")
            party = affil_el.attrib.get("ref") if affil_el is not None else "Unknown"
            speakers[pid] = {"name": name, "party": party}

        # Extract utterances <u>
        for u in root.findall(".//{*}u"):
            speech_id = u.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
            speaker_id = u.attrib.get("who")
            if speaker_id:
                speaker_id = speaker_id.lstrip("#")
            else:
                speaker_id = "Unknown"

            # Use speaker info if exists, else fallback
            speaker_info = speakers.get(speaker_id, {"name": speaker_id, "party": "Unknown"})

            # Combine all <seg> text
            text = " ".join(seg.text for seg in u.findall(".//{*}seg") if seg.text)
            word_count = len(text.split())

            data.append([
                speech_id, session_date, year, speaker_id,
                speaker_info["name"], speaker_info["party"],
                word_count, text
            ])
    except Exception as e:
        print(f"⚠️ Error parsing {f}: {e}")

# Create a DataFrame
df = pd.DataFrame(data, columns=[
    "speech_id", "date", "year", "speaker_id",
    "speaker_name", "party", "word_count", "text"
])

# Display first 10 rows
print(df.head(10))

# Optionally, save to CSV
# df.to_csv("ParlaMint_PT_corpus.csv", index=False)


Found 901 XML files.
                     speech_id        date  year  \
0   ParlaMint-PT_2022-06-02.u1  2025-06-20  2025   
1   ParlaMint-PT_2022-06-02.u2  2025-06-20  2025   
2   ParlaMint-PT_2022-06-02.u3  2025-06-20  2025   
3   ParlaMint-PT_2022-06-02.u4  2025-06-20  2025   
4   ParlaMint-PT_2022-06-02.u5  2025-06-20  2025   
5   ParlaMint-PT_2022-06-02.u6  2025-06-20  2025   
6   ParlaMint-PT_2022-06-02.u7  2025-06-20  2025   
7   ParlaMint-PT_2022-06-02.u8  2025-06-20  2025   
8   ParlaMint-PT_2022-06-02.u9  2025-06-20  2025   
9  ParlaMint-PT_2022-06-02.u10  2025-06-20  2025   

                                speaker_id  \
0                AugustoErnestoSantosSilva   
1     PedroSaraivaGonçalvesdosSantosFrazão   
2             AntónioFilipeDiasMeloPeixoto   
3     PedroSaraivaGonçalvesdosSantosFrazão   
4  MariaAntóniaMorenoAreiasdeAlmeidaSantos   
5     PedroSaraivaGonçalvesdosSantosFrazão   
6                AugustoErnestoSantosSilva   
7                JoanaRitaMadalenoCord