# PARLAMINT-PT Data Extraction and Preprocessing

In [11]:
import os
import zipfile
import shutil
from pathlib import Path
import xml.etree.ElementTree as ET
import pandas as pd
from tqdm import tqdm

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Path set up and unzip the TEI-XLM files
The files were ulpoaded in ZIP format as they were too heavy to upload otherwise.

In [12]:

drive_path = '/content/drive/MyDrive/Thesis/ParlaMint-PT.TEI.zip'
local_path = '/content'
extract_to_path = '/content/ParlaMint-PT'

shutil.copy(drive_path, local_path)
print("ParlaMint-PT zip in local runtime.")

with zipfile.ZipFile(f'{local_path}/ParlaMint-PT.TEI.zip', 'r') as zip_ref:
    zip_ref.extractall(extract_to_path)
print("Unzipped ParlaMint-PT data")

root_path = Path(extract_to_path)
NS = {"tei": "http://www.tei-c.org/ns/1.0"}

ParlaMint-PT zip in local runtime.
Unzipped ParlaMint-PT data


## Data Preprocessing

In [13]:
# Funtion that parses the @ana annotation string to extract the speaker role and topic. For example: from this "#chair topic:other" will be role="chair", topic="other"
def parse_ana(ana: str):
    role, topic = None, None
    if ana:
        for token in ana.strip().split():
            if token.startswith("#"):
                role = token[1:]
            elif token.startswith("topic:"):
                topic = token.split(":", 1)[1]
    return role, topic

# Function that converts a concatenated speaker ID into a human-readable name when no structured personography data is available
def readable_name_from_who(who_id: str | None):
    if not who_id:
        return None
    name = re.sub(r"(?<!^)(?=[A-ZÁÀÂÃÉÊÍÓÔÕÚÜÇ])", " ", who_id)
    return re.sub(r"\s{2,}", " ", name).strip()

# Function that extracts the person or entity to whom the intervention is being agressed to (e.g., Presidente, Deputados) by looking at the first 160 words in the intervention as tipically the interventions include the adressee in the beginning
def guess_addressee(text: str | None):
    if not text:
        return None
    first = text[:160]
    patterns = [
        (r"\bSr\.?a?\.?\s+Presidente\b", "Presidente"),
        (r"\bSrs?\.?\s+Deputad[oa]s?\b", "Deputados"),
        (r"\bSr\.?a?\.?\s+Deputad[oa]\b", "Deputado/Deputada"),
        (r"\bSr\.?a?\.?\s+Secretári[oa]\b", "Secretário/Secretária"),
        (r"\bSr\.?a?\.?\s+Ministr[oa]\b", "Ministro/Ministra"),
    ]
    for pat, label in patterns:
        if re.search(pat, first, flags=re.IGNORECASE):
            return label
    return None


In [14]:
from lxml import etree

# Function that loads speaker metadata from a TEI listPerson XML file and returns a mapping from speaker ID to their name and political party
def load_person_map(root_path: Path):
    person_files = list(root_path.glob("**/*-listPerson.xml"))
    if not person_files:
        print(" No listPerson file found. Will infer names from speaker_id.")
        return {}
    # Parse the first listPerson file found
    pf = person_files[0]
    tree = etree.parse(str(pf))
    # Extract all <person> elements from the TEI listPerson
    persons = tree.xpath("//tei:listPerson/tei:person", namespaces=NS)

    mapping = {}
    for p in persons:
        # Extract the unique speaker identifier (xml:id)
        pid = p.get("{http://www.w3.org/XML/1998/namespace}id") or p.get("xml:id")
        if not pid:
            continue

        # Extract the full name of the person
        pers = p.find(".//tei:persName", namespaces=NS)
        name = " ".join(pers.itertext()).strip() if pers is not None else None

        # Extract the political party affiliation
        party = None
        for aff in p.xpath(".//tei:affiliation", namespaces=NS):
            if "party" in (aff.get("role") or "").lower():
                org = aff.find(".//tei:orgName", namespaces=NS)
                if org is not None:
                    party = " ".join(org.itertext()).strip()
            if not party and aff.get("ref"):
                party = aff.get("ref").split("/")[-1]
        # Store the extracted metadata for the speaker
        mapping[pid] = {"name": name, "party": party}
    return mapping

# Load the person metadata mapping once at startup
person_map = load_person_map(root_path)
# Print the number of successfully loaded persons
print(f" Persons loaded: {len(person_map)}")

 Persons loaded: 836


In [15]:
import re
from lxml import etree

# Funtion that parses a TEI XML parlamentary session file and returns a list of structured intervention with speaker, text, metadata, and inferred attributes.
def parse_file(file_path: Path, person_map: dict):
    rows = []
    try:
        tree = etree.parse(str(file_path))
    except Exception as e:
        print(f" Skipping {file_path.name}: parse error -> {e}")
        return rows

    doc = tree.getroot()
    # Extract the parlamentary session date from the TEI header,
    dates = doc.xpath("//tei:teiHeader//tei:settingDesc//tei:date/@when", namespaces=NS)
    session_date = dates[0] if dates else None

    # Iterate over each <u> element from each parlamentary sessionn
    for i, u in enumerate(doc.xpath("//tei:text//tei:u", namespaces=NS), start=1):

        # Extract speech and speaker identifiers
        speech_id = u.get("{http://www.w3.org/XML/1998/namespace}id") or u.get("xml:id")
        speaker_id = (u.get("who") or "").lstrip("#") or None

        # Extract role and topic annotations from the @ana attribute
        role, topic = parse_ana(u.get("ana") or "")

        # Extract the intervention text
        segs = u.xpath(".//tei:seg", namespaces=NS)
        if segs:
            parts = [" ".join(seg.itertext()).strip() for seg in segs]
            text = " ".join([p for p in parts if p])
        else:
            text = " ".join(u.itertext()).strip()

        # Normalize whitespace and compute text length
        text = re.sub(r"\s+", " ", text).strip()
        text_length = len(text.split()) if text else 0

        # Match speaker name and party using the person map when available
        if speaker_id and speaker_id in person_map:
            speaker_name = person_map[speaker_id].get("name") or readable_name_from_who(speaker_id)
            party = person_map[speaker_id].get("party")
        else:
            speaker_name = readable_name_from_who(speaker_id)
            party = None

        # Create the structured intervention record
        rows.append({
            "speech_id": speech_id,
            "date": session_date,
            "intervention_id": i,
            "speaker_id": speaker_id,
            "speaker_name": speaker_name,
            "party": party,
            "role": role,
            "text": text,
            "text_length": text_length,
            "topic": topic,
            "addressed_to": guess_addressee(text)
        })
    return rows

In [16]:
# Collect all TEI XML files, excluding 9 person metadata files
tei_files = [
    p for p in root_path.glob("**/*.xml")
    if "-listPerson.xml" not in p.name
]
print(f"Found {len(tei_files)} XML files to parse.")

# Aggregate all parsed interventions across files
all_rows = []
for fp in tqdm(tei_files, desc="Parsing TEI files"):
    # Parse each TEI file and extend the list with its interventions
    all_rows.extend(parse_file(fp, person_map))

# Print the total number of extracted interventions
print(f"Parsed {len(all_rows)} interventions.")


Found 910 XML files to parse.


Parsing TEI files:  25%|██▌       | 229/910 [00:10<00:24, 28.31it/s]

 Skipping ParlaMint-schemaSpecs.odd.xml: parse error -> ID ParlaMint-GB already defined, line 105, column 59 (ParlaMint-schemaSpecs.odd.xml, line 105)


Parsing TEI files: 100%|██████████| 910/910 [00:36<00:00, 25.25it/s]

Parsed 248577 interventions.





In [17]:
# Define the column order for the final DataFrame
cols = [
    "speech_id", "date", "intervention_id", "speaker_id",
    "speaker_name", "party", "role", "text", "text_length",
    "topic", "addressed_to"
]

# Create a DataFrame from the parsed interventions
df = pd.DataFrame(all_rows, columns=cols).sort_values(
    ["date", "speech_id", "intervention_id"], na_position="last"
).reset_index(drop=True)

# Visualize the top 3 lines of the resulting dataset and calculate its size again for confirmation
print("DataFrame ready:")
display(df.head(3))
print(f"Total rows: {len(df):,}")


DataFrame ready:


Unnamed: 0,speech_id,date,intervention_id,speaker_id,speaker_name,party,role,text,text_length,topic,addressed_to
0,ParlaMint-PT_2015-01-07.u1,2015-01-07,1,MariadaAssunçãoAndradeEsteves,Maria \n da \n Assunção \n ...,#PSD,chair,"Srs. Deputados, Srs. Jornalistas, Srs. Funcion...",82,other,Deputados
1,ParlaMint-PT_2015-01-07.u10,2015-01-07,10,JoãoGuilhermeRamosRosadeOliveira,João \n Guilherme \n Ramos \...,#PCP,regular,"Sr.ª Presidente, Sr.as e Srs. Deputados: Queri...",183,lawcr,Deputados
2,ParlaMint-PT_2015-01-07.u100,2015-01-07,100,MariadaAssunçãoAndradeEsteves,Maria \n da \n Assunção \n ...,#PSD,chair,"Para uma declaração política, tem, agora, a pa...",13,other,Deputados


Total rows: 248,577


## Sort the interventions chronologically and in correct order within-document

In [18]:
# Extract the intervention identifier (everything after "u#"") from the speech_id, for example, ParlaMint-PT_2015-01-07.u123 and intercention index= 123
df["u_num"] = df["speech_id"].str.extract(r"\.u(\d+)$").astype(int)

# Extract the document identifier (everything before "u#"") from the speech_id, so we sort within each parlamentary session file
df["speech_doc"] = df["speech_id"].str.replace(r"\.u\d+$", "", regex=True)

# Basic data cleaning: Normalize whitespace in speaker names and remove leading '#' from party identifiers
df["speaker_name"] = df["speaker_name"].apply(lambda s: " ".join(s.split()) if isinstance(s, str) else s)
df["party"] = df["party"].apply(lambda s: s.lstrip("#") if isinstance(s, str) else s)

# Sort interventions chronologically and in correct within-document order
df = df.sort_values(["date", "speech_doc", "u_num"]).reset_index(drop=True)

display(df.head(3))

Unnamed: 0,speech_id,date,intervention_id,speaker_id,speaker_name,party,role,text,text_length,topic,addressed_to,u_num,speech_doc
0,ParlaMint-PT_2015-01-07.u1,2015-01-07,1,MariadaAssunçãoAndradeEsteves,Maria da Assunção Andrade Esteves,PSD,chair,"Srs. Deputados, Srs. Jornalistas, Srs. Funcion...",82,other,Deputados,1,ParlaMint-PT_2015-01-07
1,ParlaMint-PT_2015-01-07.u2,2015-01-07,2,DuarteRogérioMatosVenturaPacheco,Duarte Rogério Matos Ventura Pacheco,PSD,regular,"Sr.ª Presidente, Srs. Deputados, deram entrada...",837,gover,Deputados,2,ParlaMint-PT_2015-01-07
2,ParlaMint-PT_2015-01-07.u3,2015-01-07,3,MariadaAssunçãoAndradeEsteves,Maria da Assunção Andrade Esteves,PSD,chair,"Muito obrigada, Sr. Deputado Duarte Pacheco. P...",33,other,Deputados,3,ParlaMint-PT_2015-01-07


## Save CSV file

In [20]:
# path to your Thesis folder
out_path = "/content/drive/MyDrive/Thesis/ParlaMint_PT_interventions.csv"

# save
df.to_csv(out_path, index=False)

print(f"CSV saved to: {out_path}")


CSV saved to: /content/drive/MyDrive/Thesis/ParlaMint_PT_interventions.csv
