Remove personal names with Flair (flair/ner-german-large)

In [7]:
from os import listdir
from os.path import isfile, join
import re

# Folder that holds the input text files (trailing slash kept for callers
# that build paths by concatenation)
input_folder="5213/pdf/txt/"

# Keep only regular files whose name ends in ".txt"; a plain substring test
# would also pick up names such as "txt_notes.csv" or "report.txt.bak".
txt_files = sorted(
    f for f in listdir(input_folder)
    if isfile(join(input_folder, f)) and f.endswith(".txt")
)
print(txt_files)
print(len(txt_files))

['22_5213_1.txt', '22_5213_2.txt', '22_5213_3.txt', '22_5213_4.txt', '22_5213_5.txt', '22_5213_6.txt']
6


In [8]:
!pip install flair



In [11]:
#this is minimum pre-requisites
!pip install langchain
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-3.4.1


In [4]:
# using model from huggingface: https://huggingface.co/flair/ner-german-large
import pandas as pd
from flair.data import Sentence
from flair.models import SequenceTagger

# Identifier of the pretrained German NER model, then the tagger built from it
NER_MODEL_NAME = "flair/ner-german-large"
tagger = SequenceTagger.load(NER_MODEL_NAME)

pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

2025-02-19 15:45:36,483 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, B-PER, E-PER, S-LOC, B-MISC, I-MISC, E-MISC, S-PER, B-ORG, E-ORG, S-ORG, I-ORG, B-LOC, E-LOC, S-MISC, I-PER, I-LOC, <START>, <STOP>


In [5]:
# Demo input containing both a person and a place called "Washington"
sentence = Sentence("George Washington ging nach Washington")

# Run the tagger; the spans are read back from the sentence object below
tagger.predict(sentence)

# Show the annotated sentence
print(sentence)

# List every recognised entity span, one per line
print('The following NER tags are found:')
ner_spans = sentence.get_spans('ner')
for entity in ner_spans:
    print(entity)

Sentence[5]: "George Washington ging nach Washington" → ["George Washington"/PER, "Washington"/LOC]
The following NER tags are found:
Span[0:2]: "George Washington" → PER (1.0000)
Span[4:5]: "Washington" → LOC (1.0000)


In [17]:
import os
# Function to extract person names (PER entities) from a given text
def extract_person_names(text, verbose=True):
    """
    Return the text of every span the NER tagger labels as a person.

    Parameters:
    text (str): Text to analyse; it is passed to the tagger as one Sentence.
    verbose (bool): When True (the default), print each name as it is found.
        Pass False to keep personal names out of the notebook output.

    Returns:
    list[str]: Person names in the order the tagger reports them; duplicates
        are kept. Empty or whitespace-only input yields an empty list.
    """
    # Nothing to tag: skip the model call entirely
    if not text or not text.strip():
        return []

    sentence = Sentence(text)
    tagger.predict(sentence)

    names = []
    for entity in sentence.get_spans('ner'):
        # Only person entities are of interest; LOC/ORG/MISC are ignored
        if entity.get_label('ner').value == 'PER':
            names.append(entity.text)
            if verbose:
                print("found name: " + entity.text )
    return names


In [18]:
# Helper that loads a whole text file into memory
def read_text_file(file_path):
    """Return the complete contents of the file at ``file_path``, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

In [19]:
# One record per (document, distinct person name) pair
extracted_names = []

for file_name in txt_files:
    print(f"Processing file: {file_name}")
    content = read_text_file(os.path.join(input_folder, file_name))
    names = extract_person_names(content)
    # Document id is the file name without its extension
    doc_number = os.path.splitext(file_name)[0]
    # dict.fromkeys drops duplicates but keeps first-seen order, so the
    # records come out in the same order on every run (set order does not).
    for name in dict.fromkeys(names):
        extracted_names.append({'doc_number': doc_number, 'name': name})


extracted_names

Processing file: 22_5213_1.txt
found name: juliane timmermann
Processing file: 22_5213_2.txt
Processing file: 22_5213_3.txt
Processing file: 22_5213_4.txt
found name: carl cohn
Processing file: 22_5213_5.txt
Processing file: 22_5213_6.txt


[{'doc_number': '22_5213_1', 'name': 'juliane timmermann'},
 {'doc_number': '22_5213_4', 'name': 'carl cohn'}]

In [21]:
# Create a DataFrame with the extracted names.
# The columns are given explicitly so that the frame still has a 'name'
# column when no names were found (an empty list would otherwise produce a
# frame without columns, and names_df['name'] would raise KeyError).
names_df = pd.DataFrame(extracted_names, columns=['doc_number', 'name'])
names_df

Unnamed: 0,doc_number,name
0,22_5213_1,juliane timmermann
1,22_5213_4,carl cohn


In [22]:
def remove_names_from_text(input_file, output_file, names_df):
    """
    Redact the given names in a text file and write the result to a new file.

    Matching is case-insensitive and limited to whole words; every match is
    replaced by the literal marker ``[REDACTED]``. When there are no usable
    names the text is copied unchanged.

    Parameters:
    input_file (str): Path to input text file (read as UTF-8)
    output_file (str): Path to output text file (written as UTF-8)
    names_df (pandas.DataFrame): DataFrame whose 'name' column holds the names to remove

    Returns:
    None. Failures while reading, substituting or writing are reported with
    print instead of being raised; a missing 'name' column raises KeyError.
    """
    # Drop missing values, blank entries and duplicates
    raw_names = names_df['name'].dropna().astype(str).tolist()
    unique_names = {name.strip() for name in raw_names if name.strip()}

    # Longest first: regex alternation takes the first alternative that
    # matches, so "carl" must not be tried before "carl cohn".
    ordered_names = sorted(unique_names, key=lambda name: (-len(name), name))

    pattern = None
    if ordered_names:
        # Words of a multi-word name may be separated by any whitespace in the
        # file (e.g. a line break), so join the escaped words with \s+.
        alternatives = [
            r'\s+'.join(re.escape(part) for part in name.split())
            for name in ordered_names
        ]
        # Lookarounds act like \b for names that start/end with a word
        # character, and still work for names that start/end with punctuation.
        pattern = r'(?<!\w)(?:' + '|'.join(alternatives) + r')(?!\w)'

    try:
        # Read input file
        with open(input_file, 'r', encoding='utf-8') as f:
            text = f.read()

        # Remove names (case insensitive). An empty pattern group would match
        # at every word boundary, so skip substitution when there are no names.
        if pattern is None:
            processed_text = text
        else:
            processed_text = re.sub(pattern, '[REDACTED]', text, flags=re.IGNORECASE)

        # Write to output file
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(processed_text)

        print(f"Successfully processed {input_file} and saved to {output_file}")

    except Exception as e:
        # Best effort: report the failure and let the caller continue
        print(f"An error occurred: {str(e)}")

In [24]:
# Remove the collected names from every input file and write the redacted
# copies, under the same file names, into the output folder.
output_folder = "5213/pdf/out/"
# Create the output folder up front; otherwise every write fails when it is missing
os.makedirs(output_folder, exist_ok=True)
for file_name in txt_files:
    print(f"Processing file: {file_name}")
    remove_names_from_text(
        os.path.join(input_folder, file_name),
        os.path.join(output_folder, file_name),
        names_df,
    )

Processing file: 22_5213_1.txt
Successfully processed 5213/pdf/txt/22_5213_1.txt and saved to 5213/pdf/out/22_5213_1.txt
Processing file: 22_5213_2.txt
Successfully processed 5213/pdf/txt/22_5213_2.txt and saved to 5213/pdf/out/22_5213_2.txt
Processing file: 22_5213_3.txt
Successfully processed 5213/pdf/txt/22_5213_3.txt and saved to 5213/pdf/out/22_5213_3.txt
Processing file: 22_5213_4.txt
Successfully processed 5213/pdf/txt/22_5213_4.txt and saved to 5213/pdf/out/22_5213_4.txt
Processing file: 22_5213_5.txt
Successfully processed 5213/pdf/txt/22_5213_5.txt and saved to 5213/pdf/out/22_5213_5.txt
Processing file: 22_5213_6.txt
Successfully processed 5213/pdf/txt/22_5213_6.txt and saved to 5213/pdf/out/22_5213_6.txt
