In [1]:
import re
from pathlib import Path
import spacy
from docloader import DocLoader

In [2]:
def _extract_name_candidates(raw_text, ignore):
    """
    Splits a header or signature fragment into cleaned name candidates.

    Punctuation is stripped BEFORE the ignore list is applied, so that tokens such as
    '(aka' or 'the,' are recognised as ignorable words instead of being kept as names.

    Args:
    - raw_text (str): The (already lowercased) text captured after a header/signature label.
    - ignore (collection of str): Words that must never be treated as names.

    Returns:
        list of str: The cleaned candidates, in order of appearance.
    """
    candidates = []
    # Hyphens and slashes separate name parts, so treat them as whitespace
    for word in raw_text.replace('-', ' ').replace('/', ' ').split():
        word = word.strip().replace(',', '').replace('(', '').replace(')', '')
        # Skip fragments that were pure punctuation as well as ignorable words
        if word and word not in ignore:
            candidates.append(word)
    return candidates


def pseudonymise_content(doc):
    """
    Extracts and replaces names from a doc's content using SpaCy's Named Entity Recognition (NER)
    and regular expressions for headers and signatures.

    Every distinct name is replaced by a placeholder 'X<n>', where n is the order in which the
    name first appears in the document. Matching is case-insensitive and single-character
    tokens are never replaced. Documents without a 'name:' header or a 'parole board'
    signature are handled: those sources simply contribute no names.

    Args:
    - doc: A SpaCy document object.

    Returns:
        str: The pseudonymised content.
    """
    # List of words to ignore during name extraction
    ignore_list = [
        's', 'prison', 'distribution', 'and', 'now', 'known', 'as', 'formerly',
        'aka', 'the', 'secretary', 'of', 'state', '#', '&', ',', '-', ' '
    ]

    # Convert the full document text to lowercase for case-insensitive matching
    full_text_lower = doc.text.lower()

    # Extract names using SpaCy NER, remove unnecessary punctuation, and filter out names in ignore_list
    ner_names_filtered = []
    for token in doc:
        if token.ent_type_ != 'PERSON':
            continue
        name = token.text.lower().replace('(', '').replace(')', '').strip()
        if name and name not in ignore_list:
            ner_names_filtered.append(name)

    # Extract names from the header line ('name: ...'); stays empty when there is no header
    header_names_filtered = []
    header_match = re.search(r'name:(.*)\n', full_text_lower)
    if header_match:
        header_names_filtered = _extract_name_candidates(header_match.group(1), ignore_list)

    # Extract names from the signature; the 'parole board (member):' form takes precedence over
    # the 'parole board ... distribution' form. Stays empty when neither form is present.
    sig_names_filtered = []
    sig_match = (
        re.search(r'parole board(?: member)?:(.*)\n', full_text_lower)
        or re.search(r'parole board(?::)?(.*)(\s*)distribution', full_text_lower)
    )
    if sig_match:
        sig_names_filtered = _extract_name_candidates(sig_match.group(1), ignore_list)

    # Combine all extracted names (NER, headers, signatures) and remove duplicates by placing them into a set
    all_names = set(ner_names_filtered + header_names_filtered + sig_names_filtered)

    # Number the names in order of first appearance; a dict gives constant-time placeholder lookup
    placeholder_ids = {}
    for token in doc:
        token_lower = token.lower_
        if token_lower in all_names and token_lower not in placeholder_ids:
            placeholder_ids[token_lower] = len(placeholder_ids)

    # Replace names with anonymous placeholders
    anonymised_tokens = []
    for token in doc:
        token_lower = token.lower_
        # Do not replace single character strings
        if token_lower in placeholder_ids and len(token.text) > 1:
            anonymised_tokens.append('X' + str(placeholder_ids[token_lower]))
        else:
            anonymised_tokens.append(token.text)
        # Keep the original trailing whitespace so the layout of the text is preserved
        anonymised_tokens.append(token.whitespace_)

    # Reconstruct and return the anonymised text
    return ''.join(anonymised_tokens)

In [3]:
def pseudonymise_letters(letter_paths, folder_path_pseudon, loader):
    """
    Pseudonymises the content per letter from a list of letter paths and saves the content in a pseudon folder.

    Each letter is loaded through the loader, pseudonymised with pseudonymise_content, and written
    as UTF-8 text to '<folder_path_pseudon>/<letter file name with its suffix replaced by .txt>'.
    The output folder is created if it does not exist yet.

    Args:
    - letter_paths (iterable of Path): Letter paths to be pseudonymised.
    - folder_path_pseudon (str): Path to the folder where pseudonymised letters will be saved.
    - loader (DocLoader): A DocLoader instance to load documents.
    """
    folder_pseudon = Path(folder_path_pseudon)
    # Make sure the target folder exists so the first write cannot fail with FileNotFoundError
    folder_pseudon.mkdir(parents=True, exist_ok=True)

    for letter_path in letter_paths:
        # Load the doc
        doc = loader.load_pseudon(letter_path)

        # Pseudonymise the doc
        letter_content_pseudon = pseudonymise_content(doc)

        # Save the pseudonymised content as a .txt file named after the original letter
        output_path = (folder_pseudon / letter_path.name).with_suffix('.txt')
        output_path.write_text(letter_content_pseudon, encoding='utf-8')

In [4]:
def run_pseudonymisation(loader, folder_path_pseudon):
    """
    Runs the pseudonymisation process on letters that haven't been pseudonymised yet.

    A letter counts as already pseudonymised when the pseudon folder contains a .txt file with
    the same name that pseudonymise_letters would write for it (the letter's file name with its
    suffix replaced by '.txt'). Comparing full paths would never match, because the source
    letters and the pseudonymised outputs live in different folders.

    Args:
    - loader (DocLoader): A DocLoader instance to load docs.
    - folder_path_pseudon (str): Path to the folder where pseudonymised letters will be stored.
    """
    # Load all paths from the letter folder
    all_letter_paths = set(loader.all_letter_paths_pseudon())
    # File names of the outputs that already exist in the pseudon folder
    already_pseudonymised_names = {
        pseudon_path.name for pseudon_path in Path(folder_path_pseudon).glob('*.txt')
    }

    # Only process new letters (those without an existing output file); sorted for a stable order
    letter_paths = sorted(
        letter_path for letter_path in all_letter_paths
        if Path(letter_path).with_suffix('.txt').name not in already_pseudonymised_names
    )

    # Perform the pseudonymisation process
    pseudonymise_letters(letter_paths, folder_path_pseudon, loader)

In [5]:
# Load the SpaCy 'en_core_web_trf' model once; the same `nlp` object is passed to
# both DocLoader instances created below
nlp = spacy.load('en_core_web_trf')

In [6]:
# Define paths for letters, caches, and pseudonymised folders.
# There is one set of paths per letter collection ('mcadl' and 'ohdl').

# Source letters (passed to DocLoader as its second argument)
folder_path_letters_mcadl = '../data/primary_data/letters/mcadl/original_dls/'
folder_path_letters_ohdl = '../data/primary_data/letters/ohdl/original_dls/'

# Cache locations (passed to DocLoader as its third argument)
folder_path_cache_mcadl = '../data/primary_data/letters/mcadl/caches/en_core_web_trf_pseudon'
folder_path_cache_ohdl = '../data/primary_data/letters/ohdl/caches/en_core_web_trf_pseudon'

# Output folders that receive the pseudonymised .txt files
folder_path_pseudon_mcadl = '../data/primary_data/letters/mcadl/pseudon_dls/'
folder_path_pseudon_ohdl = '../data/primary_data/letters/ohdl/pseudon_dls/'

In [7]:
# Initialize one DocLoader instance per letter collection; each receives the shared
# `nlp` model plus that collection's letters folder and cache folder paths
loader_mcadl = DocLoader(nlp, folder_path_letters_mcadl, folder_path_cache_mcadl)
loader_ohdl = DocLoader(nlp, folder_path_letters_ohdl, folder_path_cache_ohdl)

In [8]:
# Run the pseudonymisation process for both collections; run_pseudonymisation is
# intended to skip letters that already have an output file in the pseudonymised folder
run_pseudonymisation(loader_mcadl, folder_path_pseudon_mcadl)
run_pseudonymisation(loader_ohdl, folder_path_pseudon_ohdl)