In [1]:
import re
from pathlib import Path

In [2]:
def segment_letter(letter_path, letter_type):
    """
    Returns the part of a letter that precedes its 'Risk Factors' heading, with whitespace normalised.

    The heading is located with a regex chosen by letter type. If no heading matches,
    the whole letter is returned (still normalised).

    Args:
    - letter_path (str or Path): Path to the letter file (read as UTF-8).
    - letter_type (str): The type of the letter, either 'mca' or 'oh'.

    Returns:
        str: The letter content up to (excluding) the 'Risk Factors' heading, with runs of
        newlines collapsed to one newline and runs of spaces collapsed to one space.

    Raises:
        ValueError: If letter_type is not one of the supported letter types.
    """
    # Heading patterns per letter type: an optional section number, the heading text,
    # and the surrounding blank lines.
    # NOTE(review): the 'oh' pattern requires a literal '.' before the heading while the
    # 'mca' pattern makes it optional - confirm this difference is intended.
    risk_patterns = {
        'mca': r'\n+\s*[34]?\.?\s*[Rr]isk [Ff]actors\.?\s*\n+',
        'oh': r'\n+\s*[345]?\.\s*[Rr]isk(?: [Aa]nd [Pp]rotective)? [Ff]actors:?\s*\n+',
    }

    # Validate before touching the file so an unsupported type fails fast with a clear
    # message instead of an unbound-variable error further down.
    if letter_type not in risk_patterns:
        supported = ', '.join(repr(key) for key in sorted(risk_patterns))
        raise ValueError(
            f"Unsupported letter_type {letter_type!r}; expected one of: {supported}"
        )
    risk_re = risk_patterns[letter_type]

    # Read the content of the letter
    with open(letter_path, 'r', encoding='utf-8') as letter:
        letter_content = letter.read()

    # Keep only the text before the first heading match; maxsplit is passed by keyword
    # because passing it positionally is deprecated in recent Python versions.
    letter_content_split = re.split(risk_re, letter_content, maxsplit=1)[0]

    # Replace multiple consecutive newlines with a single newline
    letter_content_split_newline = re.sub(r'\n+', r'\n', letter_content_split)
    # Replace multiple consecutive spaces with a single space
    letter_content_split_space = re.sub(r' +', ' ', letter_content_split_newline)

    return letter_content_split_space

In [3]:
def run_segmentation(folder_path_pseudon, folder_path_segment, letter_type):
    """
    Segments every pseudonymised '.txt' letter in a folder and writes the results to another folder.

    Each input file is passed to segment_letter and the returned text is written to a file
    with the same name in the output folder, overwriting any existing file of that name.

    Args:
        folder_path_pseudon (str): Path to the directory containing input letters (pseudonymised letters).
        folder_path_segment (str): Path to the directory where segmented letters will be saved.
            It is created (including parents) if it does not exist yet.
        letter_type (str): The type of the letters, passed through to segment_letter
            ('mca' or 'oh').
    """
    output_dir = Path(folder_path_segment)
    # Ensure the output directory exists so the first write does not fail on a fresh checkout
    output_dir.mkdir(parents=True, exist_ok=True)

    # Collect all letter paths up front, sorted so the processing order is deterministic
    letter_paths = sorted(Path(folder_path_pseudon).glob('*.txt'))

    # Process each letter and save the segmented content under the same file name
    for letter_path in letter_paths:
        letter_content_segment = segment_letter(letter_path, letter_type)
        letter_path_segment = output_dir / letter_path.name

        # Write the segmented content to a new file in the output directory
        with open(letter_path_segment, 'w', encoding='utf-8') as letter_segment:
            letter_segment.write(letter_content_segment)

In [4]:
# Segment the pseudonymised 'mcadl' letters (letter type 'mca')
run_segmentation(
    folder_path_pseudon='data/primary_data/letters/pseudon_dls/mcadl/',
    folder_path_segment='data/primary_data/letters/segmented_dls/mcadl/',
    letter_type='mca',
)
# Segment the pseudonymised 'ohdl' letters (letter type 'oh')
run_segmentation(
    folder_path_pseudon='data/primary_data/letters/pseudon_dls/ohdl/',
    folder_path_segment='data/primary_data/letters/segmented_dls/ohdl/',
    letter_type='oh',
)