In [1]:
import re
import spacy
import pandas as pd
from pathlib import Path
from docloader import DocLoader

In [2]:
def extract(doc_trf, doc_ner):
    """
    Extracts crime entities and their sentence contexts from a single letter.

    Two parses of the same letter are combined: `doc_trf` supplies the sentence
    boundaries and `doc_ner` supplies the entities. They are matched purely on
    character offsets, so both are expected to be parses of the same text.

    Args:
    - doc_trf: Parsed document exposing `.sents`; every sentence must provide
      `start_char`, `end_char` and `text`.
    - doc_ner: Parsed document exposing `.ents`; every entity must provide
      `label_`, `start_char`, `end_char` and `text`.

    Returns:
    - pd.DataFrame: Always has the columns 'crime' and 'context'.
      * One row per 'CRIM' entity that lies completely inside one sentence. The
        context is the previous, current and next sentence joined by a space; a
        neighbouring sentence is left out when it contains a crime itself.
        Rows follow the order in which the crimes appear in the letter.
      * A single ('no crime', 'no context') row when the letter has no 'CRIM'
        entity at all (including when it only has entities with other labels).
      * No rows when 'CRIM' entities exist but none fits inside a single
        sentence (e.g. an entity that straddles a sentence boundary).
    """
    columns = ['crime', 'context']

    # (start, end, text) for every sentence, leaving out sentences that consist
    # of a single dash only
    sents_info = [
        (sent.start_char, sent.end_char, sent.text)
        for sent in doc_trf.sents
        if sent.text != '-'
    ]

    # Unique 'CRIM' entities as ((start, end), text). Sorting by character
    # position makes the row order reproducible; iterating a plain set is not.
    crimes = sorted({
        ((ent.start_char, ent.end_char), ent.text)
        for ent in (doc_ner.ents or ())
        if ent.label_ == 'CRIM'
    })

    # Decide on the actual 'CRIM' entities rather than on "any entity": a letter
    # whose entities all carry other labels contains no crime either.
    if not crimes:
        return pd.DataFrame(
            [{'crime': 'no crime', 'context': 'no context'}],
            columns=columns,
        )

    # Link crimes to their respective sentences: sentence index -> crimes that
    # fall entirely within that sentence's character bounds
    offence_lookup = {}
    for i, (sent_start, sent_end, _) in enumerate(sents_info):
        for crime in crimes:
            crime_start, crime_end = crime[0]
            if sent_start <= crime_start and crime_end <= sent_end:
                offence_lookup.setdefault(i, []).append(crime)

    # Collapse each run of newlines into one space and trim the ends
    sents_cleaned = [re.sub(r'\n+', ' ', text).strip() for _, _, text in sents_info]

    final_rows = []
    last_index = len(sents_cleaned) - 1
    for i, sentence in enumerate(sents_cleaned):
        if i not in offence_lookup:
            continue  # Sentence contains no crime

        context = []

        # Previous sentence, unless it contains an offence of its own
        if i > 0 and (i - 1) not in offence_lookup:
            context.append(sents_cleaned[i - 1])

        # The sentence holding the crime
        context.append(sentence)

        # Next sentence, unless it contains an offence of its own
        if i < last_index and (i + 1) not in offence_lookup:
            context.append(sents_cleaned[i + 1])

        context_text = ' '.join(context)

        # Every crime in the sentence shares the same context
        for _, crime_text in offence_lookup[i]:
            final_rows.append({'crime': crime_text, 'context': context_text})

    # Explicit columns keep the schema stable even when no crime could be linked
    return pd.DataFrame(final_rows, columns=columns)

In [3]:
def run_extraction(loader_trf, loader_ner):
    """
    Runs the extraction process for every letter path reported by the NER loader.

    Args:
    - loader_trf (DocLoader): Loader for the documents used for sentence parsing.
    - loader_ner (DocLoader): Loader for the documents produced by the custom NER
      model; it also supplies the list of letter paths to process.

    Returns:
        pd.DataFrame: A combined DataFrame with the columns 'letter_id', 'crime'
        and 'context' for all letters, in letter-path order, with a fresh
        0..n-1 index. Empty (but with those columns) when there are no letters.
    """
    columns = ['letter_id', 'crime', 'context']

    # De-duplicate the paths, then sort them: iterating the bare set would give
    # a row order that is not guaranteed to be the same from run to run.
    letter_paths = sorted(set(loader_ner.all_letter_paths_extract()), key=str)

    # Collect the per-letter frames and concatenate once at the end; growing a
    # DataFrame with concat inside the loop re-copies every earlier row each time.
    letter_frames = []
    for letter_path in letter_paths:
        doc_trf = loader_trf.load_extract(letter_path)
        doc_ner = loader_ner.load_extract(letter_path)
        letter_extract = extract(doc_trf, doc_ner)
        letter_extract['letter_id'] = letter_path.stem  # File name without suffix
        letter_frames.append(letter_extract)

    # pd.concat raises on an empty list, so handle "no letters" explicitly
    if not letter_frames:
        return pd.DataFrame(columns=columns)

    letters_extract = pd.concat(letter_frames, ignore_index=True)

    # reindex fixes the column order and tolerates a missing column
    return letters_extract.reindex(columns=columns)

In [4]:
# Load the two spaCy pipelines: a packaged one referenced by name and a
# custom NER one referenced by its location on disk
TRF_MODEL_NAME = 'en_core_web_trf'
NER_MODEL_PATH = '../data/models/ner'

nlp_trf = spacy.load(TRF_MODEL_NAME)
nlp_ner = spacy.load(NER_MODEL_PATH)



In [5]:
# Letter folders and cache locations, grouped per collection

# MCADL: segmented letters, then the caches for the trf and ner pipelines
folder_path_segment_mcadl = '../data/primary_data/letters/mcadl/segmented_dls/'
cache_path_trf_mcadl = '../data/primary_data/letters/mcadl/caches/en_core_web_trf_extract'
cache_path_ner_mcadl = '../data/primary_data/letters/mcadl/caches/ner_extract'

# OHDL: segmented letters, then the caches for the trf and ner pipelines
folder_path_segment_ohdl = '../data/primary_data/letters/ohdl/segmented_dls/'
cache_path_trf_ohdl = '../data/primary_data/letters/ohdl/caches/en_core_web_trf_extract'
cache_path_ner_ohdl = '../data/primary_data/letters/ohdl/caches/ner_extract'

In [6]:
# Build one DocLoader per (pipeline, collection) pair; each receives the
# pipeline, the folder with the segmented letters and its own cache path

# Loaders backed by the trf pipeline
loader_trf_mcadl = DocLoader(
    nlp_trf,
    folder_path_segment_mcadl,
    cache_path_trf_mcadl,
)
loader_trf_ohdl = DocLoader(
    nlp_trf,
    folder_path_segment_ohdl,
    cache_path_trf_ohdl,
)

# Loaders backed by the custom ner pipeline
loader_ner_mcadl = DocLoader(
    nlp_ner,
    folder_path_segment_mcadl,
    cache_path_ner_mcadl,
)
loader_ner_ohdl = DocLoader(
    nlp_ner,
    folder_path_segment_ohdl,
    cache_path_ner_ohdl,
)

In [7]:
# Extract crimes and their contexts for each collection (MCADL first, then OHDL)
extract_mcadl = run_extraction(
    loader_trf=loader_trf_mcadl,
    loader_ner=loader_ner_mcadl,
)
extract_ohdl = run_extraction(
    loader_trf=loader_trf_ohdl,
    loader_ner=loader_ner_ohdl,
)

In [8]:
# Save the extraction results
# The output folder is created first: to_excel does not create missing
# directories, and failing here would discard the extraction that already ran.
for extract_df, output_path in (
    (extract_mcadl, '../data/primary_data/extract/mcadl/extract_mcadl.xlsx'),
    (extract_ohdl, '../data/primary_data/extract/ohdl/extract_ohdl.xlsx'),
):
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    extract_df.to_excel(output_path, index=False)