In [5]:
# Import 
import os
import pandas as pd
import regex as re
from pathlib import Path
from collections import Counter
import csv

## Read in the sample text

In [1]:
# Read in the sample text, here: 'random_sample_d_prose_clean_text.txt' or 'Beispieltext.txt'.
# The encoding is passed explicitly so that umlauts, ß and » « are decoded the same way on
# every platform instead of depending on the locale default.
# NOTE(review): assumes the corpus file is UTF-8 encoded — confirm for other input files.
with open('random_sample_d_prose_clean_text.txt', encoding='utf-8') as f:
    text = f.read()

# Preview the first 200 characters; the file handle is no longer needed for this.
print(text[:200])

random_text
"Das Tanzlegendchen Nach der Aufzeichnung des heiligen Gregorius war Musa die Tänzerin unter den Heiligen. Guter Leute Kind, war sie ein anmutvolles Jungfräulein, welches der Mutter Gottes


In [2]:
# Count the characters of the sample text currently held in `text` and print the total
text_length = len(text)
print(text_length)

8145122


## Preprocessing

In [8]:
# Text preparation steps to clean the input text data.
#
# The (pattern, replacement) pairs below are applied in the listed order. All patterns are
# raw strings so that regex escapes such as \s and \d are not interpreted as (invalid)
# Python string escapes.
cleanup_steps = [
    (r'\s+', ' '),     # collapse every whitespace run (newlines included) into one space
    (r'--+', ' -- '),  # set runs of two or more hyphens apart with surrounding spaces
    (r'&', 'und'),     # replace the ampersand with the word 'und'
    (r'<|/', ' '),     # replace '<' and '/' with a space
    (r'>', ' '),       # replace '>' with a space
    (r"'s", ' es'),    # replace every "'s" with ' es' (presumably the clitic in e.g. "gibt's" — verify)
    (r';', '.'),       # replace semicolons with full stops
    # NOTE(review): every digit is replaced by a space here, but the date patterns at the end
    # of `letter_openings` require digits (\d+, \d{4}) and therefore cannot match the cleaned
    # text. Decide whether digits should be kept or the date patterns dropped.
    (r'\d', ' '),
]

single_spaces_text = text
for cleanup_regex, cleanup_replacement in cleanup_steps:
    single_spaces_text = re.sub(cleanup_regex, cleanup_replacement, single_spaces_text)

# The replacements above can introduce new runs of spaces, so collapse whitespace once more.
# `text` is overwritten on purpose: the matching cell below reads the cleaned text from it.
text = re.sub(r'\s+', ' ', single_spaces_text)

print(text[:200])

random_text "Das Tanzlegendchen Nach der Aufzeichnung des heiligen Gregorius war Musa die Tänzerin unter den Heiligen. Guter Leute Kind, war sie ein anmutvolles Jungfräulein, welches der Mutter Gottes


## Define the Regular Expressions for the detection process

In [95]:
# List of regular expressions for letter openings (salutations and date lines).
#
# Conventions used below:
# - `[»]?` allows an optional opening guillemet directly before the salutation.
# - The word following a salutation is matched with `[A-Za-zÄÖÜäöüß]*` so that names
#   containing umlauts or ß (e.g. "Käthe", "Jörg") are matched completely.
# - Patterns ending in `[!]` require an exclamation mark; `[.!,]?` makes the punctuation optional.
letter_openings = [
    r'[»]?Mein[e]? [lL]iebe[rsn]?\s[A-Za-zÄÖÜäöüß]*[!]',
    r'[»]?Hochverehrte[rsn]?\s[A-Za-zÄÖÜäöüß]*[!]',
    # `e?r?` also accepts "Einziggeliebter"; a single-character class `[er]?` could not.
    r'[»]?Einziggeliebte?r?\s[A-Za-zÄÖÜäöüß]*[.!,]?',
    r'[»]?Geehrte[rsn]?\s[A-Za-zÄÖÜäöüß]*[.!,]?',
    r'[»]?Sehr geehrte[srn]?\s[A-Za-zÄÖÜäöüß]*[.!,]?',
    r'[»]?Sehr verehrte[rs]?\s[A-Za-zÄÖÜäöüß]*[.!,]?',
    r'[»]?Grüss dich\s[A-Za-zÄÖÜäöüß]*[.!,]?',
    r'[»]?Grüß dich\s[A-Za-zÄÖÜäöüß]*[.!,]?',
    # Disabled variants that match "Liebe ..." without a leading guillemet:
    #r'[»]?\b(?!die\s+)Liebe\b\s[A-Za-z]*[.!,]?',
    #r'[»]?Liebe[sr]+\s[A-Za-z]*[.!,]?',
    r'»Liebe[sr]+\s[A-ZÄÖÜ][a-zäöüß\s]*[A-Za-zÄÖÜäöüß!]*',
    r'»Liebste[sr]?\s[A-Za-zÄÖÜäöüß]*[A-Za-zÄÖÜäöüß!]*',
    r'[»]?Lieber Vater[!]',
    r'[»]?Liebster Vater[!]',
    r'[»]?Liebe Mutter[!]',
    r'[»]?Liebste Mutter[!]',
    r'[»]?Lieber Freund[!]',
    r'[»]?Geliebteste[r]?[!]',
    r'[»]?Geliebte[r][!]',
    r'[»]?Geliebte[!]',
    r'[»]?Einzig geliebteste[r]?[.!,]?',
    r'[»]?Einzig geliebte[r]?[.!,]?',
    r'[»]?Hochverehrter Herr[.!,]?',
    r'[»]?Hochverehrte Frau[.!,]?',
    r'[»]?Werte[rs]?\s[A-Za-zÄÖÜäöüß]*[!]',
    r'[»]?Mein geliebte[rs]?\s[A-Za-zÄÖÜäöüß]*[.!,]?',
    r'[»]?Teuerste[sr]?\s[A-Za-zÄÖÜäöüß]*[.!,]?',
    r'[»]?Teure[sr]?\s[A-Za-zÄÖÜäöüß]*[.!,]?',
    r'[»]?Mein Liebchen[.!,]?',
    # Date lines: either "<place>, den <day>. <month> <year>." or a bare "<day>. <month> [<year>]".
    # The place-and-date form used to be listed a second time as a separate entry, which
    # reported every such date line twice; the combined pattern below covers both forms.
    # NOTE(review): these alternatives need digits in the searched text — confirm that the
    # preprocessing step does not strip them before matching.
    r'(?:[A-ZÄÖÜa-zäöüß\s]+,\s*den\s+\d+\.\s+[A-ZÄÖÜa-zäöüß]+\s+\d{4}\.)|\b\d{1,2}\. (Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)(?: \d{2,4})?\b'
    # Add further patterns below this line
]


## Do the matching and get the output as a dictionary

In [111]:
# Build a dictionary that maps every opening pattern to the list of its hits in `text`.
# Each hit is stored as a tuple: (matched text + the 100 characters that follow it, start index).
extracted_matches = {
    pattern: [
        (hit.group(0) + text[hit.end():hit.end() + 100], hit.start())
        for hit in re.finditer(pattern, text)
    ]
    for pattern in letter_openings
}

# Show every hit with its trailing context, followed by its start index in `text`
for pattern_matches in extracted_matches.values():
    for combined_text, match_start in pattern_matches:
        print(f"'{combined_text}'")
        print(f"Start Index: {match_start}")


'»Mein liebes Kind!« Der alte Junker trat näher an sie heran und dämpfte seine Stimme. »Du hast mir wahr und offenherzi'
Start Index: 780999
'Meine liebe Rose! Gestern sind wir zu Hause angekommen, nachdem wir eine wunderschöne kleine Reise gemacht haben. So '
Start Index: 4308528
'»Mein lieber Paolo! Sieh, der Herr hier will uns noch viel mehr Geld geben, so daß wir sicher bald heiraten können.« »J'
Start Index: 4680003
'»Hochverehrter Herr! Der ergebenst Unterzeichnete gibt sich die Ehre, Sie zu einer Besprechung einzuladen, betreffend ei'
Start Index: 1353269
'Hochverehrte Festversammlung! Nil admirari sagt jener berühmte Horatius, welchem wir auch das andere Wort verdanken, es ist schön'
Start Index: 2520217
'Hochverehrte Festversammlung! Ist es doch wahr, dieses Wort des lateinischen Dichters! Denn wohin wir auch blicken, immer wieder '
Start Index: 2520438
'Hochverehrte Festversammlung! Nil admirari! Welch ein Unterschied zwischen heute und gestern! Der Totgeglaubte steht gesund 

In [117]:
#print the dictionary with the extracted letter openings plus the start_index
#print(extracted_matches)

## Postprocessing

Postprocessing starts here: it removes the most common false positives from the extracted matches.

In [118]:

# Inspect the output of the matching step and collect the most common false positives here.
# Every entry is a regular expression; in the postprocessing step, any extracted match whose
# text is hit by one of these expressions is removed from the results.
# Name positions use `[A-Za-zÄÖÜäöüß...]` so that names with umlauts or ß are filtered as well.

common_false_positives = [
    r'[»]?Lieber Gott',
    r'Liebe und',
    r'Liebe zu',
    r'Liebe von',
    r'[»]?Liebe[sr]+\s[A-Za-zÄÖÜäöüß,!]*[«]? sagte',
    r'Grüß dich Gott, [A-Za-zÄÖÜäöüß]',
    r'Teurer [A-Za-zÄÖÜäöüß,\s]* sagte',
    r'Hochverehrte Festversammlung!',
]

In [120]:
# Use this function to remove the most common false positives that you documented in
# the `common_false_positives` list.

def remove_common_false_positives(extracted_matches, common_false_positives):
    """Return a new dictionary without the matches hit by a false-positive pattern.

    Parameters
    ----------
    extracted_matches : dict
        Maps each opening pattern to a list of ``(match_text, match_start)`` tuples.
    common_false_positives : iterable of str
        Regular expressions; a match is discarded when ``re.search`` finds any of them
        anywhere in its ``match_text``.

    Returns
    -------
    dict
        Same structure as ``extracted_matches``, containing only the surviving matches.
        Patterns left without any match are omitted. The input dictionary is not modified.
    """

    def is_false_positive(candidate_text):
        # True as soon as one false-positive expression is found in the text
        return any(re.search(fp_pattern, candidate_text) for fp_pattern in common_false_positives)

    cleaned_matches = {}
    for pattern, matches in extracted_matches.items():
        kept = [
            (match_text, match_start)
            for match_text, match_start in matches
            if not is_false_positive(match_text)
        ]
        # Skip patterns for which nothing survived the filter
        if kept:
            cleaned_matches[pattern] = kept

    return cleaned_matches



# Filter the extracted matches against the documented false-positive patterns
cleaned_matches = remove_common_false_positives(
    extracted_matches=extracted_matches,
    common_false_positives=common_false_positives,
)

# Uncomment to inspect the cleaned_matches dictionary
#print(cleaned_matches)


## Save the dictionary to a CSV file

In [115]:
# Specify the CSV file path
csv_file_path = "cleaned_letter_openings_random_sample_text.csv"

# Write the cleaned matches (after false-positive removal) to the CSV file.
# encoding='utf-8' is set explicitly so that umlauts, ß and » « in patterns and matched text
# are written identically on every platform; newline='' is what the csv module expects.
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)

    # Header row
    writer.writerow(['Pattern', 'Detected Text', 'Start Index'])

    # One row per match: the pattern that produced it, the detected text, and its start index
    writer.writerows(
        [pattern, match_text, match_start]
        for pattern, matches in cleaned_matches.items()
        for match_text, match_start in matches
    )

print(f"The dictionary has been saved to '{csv_file_path}'.")


The dictionary has been saved to 'cleaned_letter_openings_random_sample_text.csv'.


End of the Notebook.