# Preprocessing

This notebook handles the preprocessing of raw, anonymized multilingual medical text for HMM models:

- Loading and parsing raw JSON
- Language detection and cleaning
- Tokenization and feature extraction
- Creating observation sequences for HMM models
- Saving results for later training and evaluation

## Setup

In [None]:
import os
import sys
import json
import re
import pickle
import numpy as np
from typing import Dict, List, Tuple, Set, Any, Optional
import spacy
from langdetect import detect, LangDetectException

# The project root is taken to be the parent of the current working directory,
# so this cell assumes it is run from a direct sub-folder of the project.
PATH_ROOT = os.path.dirname(os.getcwd())  # Path to the root of the project
# Make sure <root>/data/processed exists before anything is written into it.
os.makedirs(os.path.join(PATH_ROOT, "data", "processed"), exist_ok=True)  # Directory output

# NOTE(review): when a model is missing, the handler only prints the install
# commands; nlp_es / nlp_ca are then left undefined and later cells that use
# them will fail with NameError.
try:  # Load spaCy models for Spanish and Catalan
    nlp_es = spacy.load("es_core_news_sm")
    nlp_ca = spacy.load("ca_core_news_sm")
    print("spaCy models loaded successfully\n")
except OSError:
    print("ERROR: You need to download spaCy models")
    print("python -m spacy download es_core_news_sm")
    print("python -m spacy download ca_core_news_sm")

# Add project root to system path
# (".." is a relative entry, so it is resolved against the current working directory)
sys.path.append("..")

# Star imports: the names they bring in are not visible in this notebook.
# Presumably load_json_data and detect_sentence_language (used in later cells)
# come from these modules - TODO confirm.
try:  # Import utility functions
    from utils.json import *
    from utils.language import *
except ImportError:
    print("Warning: Unable to import utility modules. Some functions may not work.")

# Directories that hold the raw inputs and the processed outputs
_DIR_RAW = os.path.join(PATH_ROOT, "data", "raw")
_DIR_PROCESSED = os.path.join(PATH_ROOT, "data", "processed")

# Raw JSON inputs
FILE_TRAIN = os.path.join(_DIR_RAW, "train.json")
FILE_TEST = os.path.join(_DIR_RAW, "test.json")

# Pickle outputs: baseline observations
FILE_TRAIN_BASELINE_PKL = os.path.join(_DIR_PROCESSED, "train_baseline.pkl")
FILE_TEST_BASELINE_PKL = os.path.join(_DIR_PROCESSED, "test_baseline.pkl")

# Pickle outputs: observations with POS tags
FILE_TRAIN_POS_PKL = os.path.join(_DIR_PROCESSED, "train_pos.pkl")
FILE_TEST_POS_PKL = os.path.join(_DIR_PROCESSED, "test_pos.pkl")

# Pickle outputs: BIO states with POS tags
FILE_TRAIN_BIO_POS_PKL = os.path.join(_DIR_PROCESSED, "train_bio_pos.pkl")
FILE_TEST_BIO_POS_PKL = os.path.join(_DIR_PROCESSED, "test_bio_pos.pkl")

## Utility Functions

In [None]:
# Default Spanish replacements based on analysis (keys are matched against lowercased text)
_DEFAULT_ES_REPLACEMENTS = {
    # Common medical abbreviations from the analysis
    "d.": "de",
    "dr.": "doctor",
    "dra.": "doctora",
    "sr.": "señor",
    "sra.": "señora",
    "t.a.": "tensión arterial",
    "hosp.": "hospital",
    "tto.": "tratamiento",
    "u.": "unidad",
    "h.": "hospital",
    "e.": "enfermedad",
    "c.": "centro",
    "n.": "número",
    "t.": "tiempo",
    "p.": "paciente",
    "b.": "bacteria",
    # Contractions
    "a el": "al",
    "d el": "del",
}

# Default Catalan replacements based on analysis (keys are matched against lowercased text)
_DEFAULT_CA_REPLACEMENTS = {
    # Common medical abbreviations
    "dr.": "doctor",
    "dra.": "doctora",
    "sr.": "senyor",
    "sra.": "senyora",
    "h.": "hospital",
    "t.": "temps",
    "c.": "centre",
    "u.": "unitat",
    "i.": "informe",
    "dia.": "dia",
}

# Catalan elided forms, expanded with plain substring replacement
_CA_APOSTROPHE_REPLACEMENTS = {"d'": "de ", "l'": "el ", "s'": "se ", "n'": "en ", "m'": "me ", "t'": "te "}

# Spans that must survive punctuation removal untouched. They are combined into a
# single alternation, so at any position the FIRST alternative that matches wins:
# the more specific patterns (dates, lab values, measurements) therefore come before
# the generic ones (ranges, blood pressure, bare decimals) that would otherwise
# swallow a prefix of them. IGNORECASE keeps the mixed-case units (mmHg, °C, ºC)
# usable on the lowercased text.
_PRESERVED_PATTERN = re.compile(
    "|".join(
        f"(?:{pattern})"
        for pattern in (
            # Dates in various formats
            r"\b\d{1,2}[\/-]\d{1,2}[\/-]\d{2,4}\b",
            r"\b\d{1,2}[\.-]\d{1,2}[\.-]\d{2,4}\b",
            # Times
            r"\b\d{1,2}:\d{1,2}(?::\d{1,2})?\b",
            # Lab values with units (before plain measurements, so "mg/dl" is not cut at "mg")
            r"\b\d+(?:[\.,]\d+)?[\s-]*(?:g\/dl|mg\/dl|mmol\/l|µg|ng\/ml|ui\/l|u\/ml)\b",
            # Measurements with units (including temperature)
            r"\b\d+(?:[\.,]\d+)?[\s-]*(?:mg|kg|g|ml|l|cm|mm|mmHg)\b",
            r"\b\d+(?:[\.,]\d+)?[\s-]*°[CF]\b",
            r"\b\d+(?:[\.,]\d+)?[\s-]*ºC\b",
            # Percentages (no trailing \b: "%" is not a word character)
            r"\b\d+(?:[\.,]\d+)?[\s-]*%",
            # Range expressions (common in dosing)
            r"\b\d+-\d+\b",
            # Blood pressure values
            r"\b\d+\/\d+\b",
            # Decimal numbers (both . and , as decimal separators)
            r"\b\d+[\.,]\d+\b",
        )
    ),
    re.IGNORECASE,
)

_PUNCTUATION_PATTERN = re.compile(r"[^\w\s-]")  # Everything except word chars, whitespace and hyphens
_WHITESPACE_PATTERN = re.compile(r"\s+")


def _apply_replacements(text: str, replacements: Dict[str, str], skip=()) -> str:
    """
    Replaces every stand-alone occurrence of each key of `replacements` in `text`

    Keys listed in `skip` are ignored. Look-arounds are used instead of `\\b` because
    most keys end in ".": a trailing `\\b` after a dot only matches when a word
    character follows, which is the opposite of what an abbreviation needs.
    """
    for original, replacement in replacements.items():
        if original in skip:
            continue
        pattern = r"(?<!\w)" + re.escape(original) + r"(?!\w)"
        text = re.sub(pattern, replacement, text)
    return text


def clean_text_by_language(text: str, language: str, es_replacements: Optional[Dict[str, str]] = None, ca_replacements: Optional[Dict[str, str]] = None) -> str:
    """
    Applies language-specific cleaning to text while preserving critical patterns

    The text is lowercased, abbreviations are expanded for the given language,
    punctuation (except hyphens) is removed everywhere outside numeric patterns
    (dates, times, measurements, lab values, percentages, ranges, blood pressure,
    decimals), and runs of whitespace are collapsed to a single space.

    Args:
        text (str): The original text to clean
        language (str): Language code ('es' or 'ca'); any other value skips the
            language-specific replacements
        es_replacements: Optional custom Spanish replacements dictionary. Only None
            selects the defaults; an empty dictionary disables the replacements
        ca_replacements: Optional custom Catalan replacements dictionary (same rule)

    Returns:
        str: Cleaned, lowercased text with important patterns preserved
    """
    lowered = text.lower()

    if language == "es":
        table = _DEFAULT_ES_REPLACEMENTS if es_replacements is None else es_replacements
        lowered = _apply_replacements(lowered, table)
    elif language == "ca":
        table = _DEFAULT_CA_REPLACEMENTS if ca_replacements is None else ca_replacements

        # Handle Catalan apostrophes
        for elided, expanded in _CA_APOSTROPHE_REPLACEMENTS.items():
            lowered = lowered.replace(elided, expanded)

        # Apply all Catalan replacements (elisions were already handled above)
        lowered = _apply_replacements(lowered, table, skip=_CA_APOSTROPHE_REPLACEMENTS)

    # Strip punctuation only from the text BETWEEN preserved spans. Working on slices
    # avoids placeholder tokens entirely, so nothing can collide on restore.
    pieces = []
    cursor = 0
    for match in _PRESERVED_PATTERN.finditer(lowered):
        pieces.append(_PUNCTUATION_PATTERN.sub("", lowered[cursor:match.start()]))
        pieces.append(match.group(0))
        cursor = match.end()
    pieces.append(_PUNCTUATION_PATTERN.sub("", lowered[cursor:]))

    return _WHITESPACE_PATTERN.sub(" ", "".join(pieces).strip())  # Remove extra whitespace


In [None]:

def get_token_label(token_span: Dict, annotations: List[Dict]) -> str:
    """
    Picks the label of a token from the character-level annotations covering it

    An annotation counts only when its first label is one of UNC, USCO, NEG, NSCO
    and it covers strictly more than half of the token. When several annotations
    qualify, the label with the highest priority (UNC > USCO > NEG > NSCO) wins.

    Parameters:
        token_span (Dict): Token info with 'start' and 'end' character positions
        annotations (List[Dict]): Annotation spans; each usable entry has a 'value'
            dict with 'start', 'end' and a non-empty 'labels' list

    Returns:
        str: The winning label, or "O" when no annotation qualifies
    """
    label_rank = {"UNC": 4, "USCO": 3, "NEG": 2, "NSCO": 1}

    tok_lo = token_span["start"]
    tok_hi = token_span["end"]

    chosen = "O"  # Default when nothing overlaps enough
    chosen_rank = 0  # Lower than every real rank

    for entry in annotations:
        usable = "value" in entry and "labels" in entry["value"] and entry["value"]["labels"]
        if not usable:
            continue

        span = entry["value"]
        span_lo = span["start"]
        span_hi = span["end"]
        candidate = span["labels"][0]

        rank = label_rank.get(candidate)
        if rank is None:  # Label outside the set of interest
            continue

        # Fraction of the token covered by this annotation
        shared = max(0, min(tok_hi, span_hi) - max(tok_lo, span_lo))
        tok_len = max(1, tok_hi - tok_lo)  # Guards against zero-length tokens

        if shared / tok_len > 0.5 and rank > chosen_rank:
            chosen = candidate
            chosen_rank = rank

    return chosen

## Preprocessing Functions

In [None]:
def prepare_data_for_hmm(data: List[Dict], include_pos: bool = False, use_bio: bool = False, use_context_window: bool = False, window_size: int = 1) -> Dict:
    """
    Builds HMM observation/state sequences from raw annotated documents

    Every document is split into sentences with the Spanish spaCy model; each sentence
    is then re-tokenized with the model that matches its detected language, and every
    non-space token is labelled (get_token_label) and cleaned (clean_text_by_language).
    One observation sequence and one state sequence is produced per document.

    Parameters:
        data (List[Dict]): Documents; the text is read from document["data"]["text"] and
            the annotations from the "result" lists under document["predictions"]
        include_pos (bool): Currently only echoed in the start message; observations
            are the cleaned token text regardless
        use_bio (bool): Selects the BIO variant of the returned state space. The state
            sequences themselves are NOT converted to BIO tags here
        use_context_window (bool): Currently only echoed in the start message
        window_size (int): Currently unused

    Returns:
        Dict: "sequences" (per-document dicts with observations, states and token
        details), "observations", "states", "vocabulary", "state_space" and
        "doc_languages"
    """
    bio_states = {"O", "B-NEG", "I-NEG", "B-NSCO", "I-NSCO", "B-UNC", "I-UNC", "B-USCO", "I-USCO"}
    plain_states = {"O", "NEG", "NSCO", "UNC", "USCO"}
    state_space = bio_states if use_bio else plain_states

    sequences = []  # Per-document detail records
    observations = []  # Observation sequence of every kept document
    states = []  # State sequence of every kept document
    vocab = set()  # Unique observations
    doc_languages = set()  # Every sentence language seen

    print(f"Starting data preparation with include_pos={include_pos}, use_bio={use_bio}, use_context_window={use_context_window}...")
    processed_docs = 0

    for doc_index, document in enumerate(data):
        if "data" not in document or "text" not in document["data"]:
            print(f"Warning: Skipping document {doc_index} due to missing 'data' or 'text'.")
            continue

        text = document["data"]["text"]
        if not text or text.isspace():
            print(f"Warning: Skipping document {doc_index} due to empty text.")
            continue

        # Gather the annotation spans of all predictions attached to the document
        annotations = []
        for prediction in document.get("predictions") or []:
            if "result" in prediction:
                annotations.extend(prediction["result"])

        # Sentence boundaries always come from the Spanish model; the per-sentence
        # language is only decided afterwards
        try:
            segmented = nlp_es(text)
        except Exception as e:
            print(f"ERROR: Initial document segmentation failed for document {doc_index}. Error: {e}")
            print(f"Text snippet: {text[:200]}")
            continue

        doc_tokens = []  # Detailed info of every kept token
        for sent in segmented.sents:
            sent_text = sent.text
            if not sent_text or sent_text.isspace():
                continue

            sent_offset = sent.start_char
            sent_lang = detect_sentence_language(sent_text)
            doc_languages.add(sent_lang)

            # Catalan model for Catalan sentences, Spanish model for everything else
            pipeline = nlp_ca if sent_lang == "ca" else nlp_es
            try:
                parsed = pipeline(sent_text)
            except Exception as e:
                print(f"ERROR: Sentence processing failed. Language: {sent_lang}. Error: {e}")
                print(f"Sentence: {sent_text}")
                continue

            for token in parsed:
                if token.is_space:
                    continue

                # token.idx is relative to the sentence, annotations are relative to the document
                begin = sent_offset + token.idx
                finish = begin + len(token.text)

                label = get_token_label({"text": token.text, "start": begin, "end": finish}, annotations)
                cleaned_text = clean_text_by_language(token.text, sent_lang)
                if not cleaned_text:  # Token consisted only of removable characters
                    continue

                doc_tokens.append(
                    {
                        "text": token.text,
                        "cleaned_text": cleaned_text.lower(),
                        "pos": token.pos_,
                        "lemma": token.lemma_,
                        "start": begin,
                        "end": finish,
                        "label": label,
                        "language": sent_lang,
                    }
                )

        # Observation = cleaned token text, state = token label
        doc_observations = [entry["cleaned_text"] for entry in doc_tokens]
        doc_states = [entry["label"] for entry in doc_tokens]
        vocab.update(doc_observations)

        if doc_observations:  # Documents without any usable token are left out
            observations.append(doc_observations)
            states.append(doc_states)
            sequences.append({"observations": doc_observations, "states": doc_states, "tokens": doc_tokens})

        processed_docs += 1
        if processed_docs % 100 == 0:
            print(f"  Processed {processed_docs}/{len(data)} documents...")

    print(f"Finished data preparation. Processed {processed_docs} documents.")
    print(f"Found {len(vocab)} unique observations in vocabulary.")
    print(f"Document languages encountered: {doc_languages}")

    return {
        "sequences": sequences,
        "observations": observations,
        "states": states,
        "vocabulary": vocab,
        "state_space": state_space,
        "doc_languages": doc_languages,
    }


In [None]:

def save_processed_data(data: Dict, output_file: str) -> None:
    """
    Saves processed data to a pickle file

    The data is first pickled into "<output_file>.tmp" and only moved over
    `output_file` once the dump has succeeded, so a failed save never truncates or
    corrupts a previously saved file. Failures are reported on stdout and are not
    raised.

    Parameters:
        data (Dict): Object to pickle
        output_file (str): Destination path of the pickle file
    """
    temp_file = f"{output_file}.tmp"

    try:
        with open(temp_file, "wb") as f:
            pickle.dump(data, f)
        os.replace(temp_file, output_file)  # Swap in the complete file in one step
    except Exception as e:
        try:
            os.remove(temp_file)  # Do not leave a partial dump behind
        except OSError:
            pass  # The temporary file was never created or is already gone
        print(f"ERROR: Failed to save processed data to {output_file}. Error: {e}")
        return

    print(f"Processed data saved successfully to {output_file}")


def load_processed_data(input_file: str) -> Dict:
    """
    Loads processed data from a pickle file

    Parameters:
        input_file (str): Path of the pickle file to read

    Returns:
        Dict: The unpickled object

    Raises:
        FileNotFoundError: Re-raised after an error message is printed
    """
    # NOTE: pickle can execute arbitrary code while loading; only open files that
    # were produced by this project
    try:
        with open(input_file, "rb") as source:
            loaded = pickle.load(source)
    except FileNotFoundError:
        print(f"ERROR: Processed data file not found at {input_file}")
        raise

    print(f"Processed data loaded successfully from {input_file}")
    return loaded

## Process

In [None]:
# Read the raw train/test documents.
# NOTE(review): load_json_data is not defined in this notebook; presumably it is
# provided by the `from utils.json import *` star import - confirm.
print("Loading raw data...")
train_data = load_json_data(FILE_TRAIN)
test_data = load_json_data(FILE_TEST)
print(f"Loaded {len(train_data)} training documents")
print(f"Loaded {len(test_data)} test documents")


# Process Baseline Data (Without POS or BIO)
# Both splits go through the same preparation with identical flags.
print("\nProcessing data for baseline model...")
train_baseline = prepare_data_for_hmm(train_data, include_pos=False, use_bio=False)
test_baseline = prepare_data_for_hmm(test_data, include_pos=False, use_bio=False)

# Save processed data
# Each split is pickled to its own file under data/processed.
save_processed_data(train_baseline, FILE_TRAIN_BASELINE_PKL)
save_processed_data(test_baseline, FILE_TEST_BASELINE_PKL)