In [None]:
import spacy #3.5.0
import json
import re
from os import listdir

### Redacting sensitive info in texts of court rulings
This notebook helps identify potentially sensitive and/or personal information in the texts of court rulings. We use a pretrained spacy model for Russian (see https://spacy.io/models/ru#ru_core_news_lg) to recognise named entities automatically. For full addresses, phone numbers, IP addresses, passport numbers, and emails, we use regular expressions (regex).

0. Before redacting, we converted all files of court rulings into txt.
1. Then, in each txt file, we identified spans with sensitive info with spacy and regex, and put all these spans into a json file.
2. We reviewed the resulting json file manually to minimise mistakes.
3. We redacted sensitive spans in all txt files by replacing them with an ellipsis '[...]'.
4. Lastly, we reviewed the redacted txt files to check that no other potentially sensitive/personal info remained in them. However, we cannot guarantee that we redacted all potentially sensitive/personal info.

This notebook can be reused to redact sensitive/personal info in Russian court rulings.

In [None]:
# spacy.cli.download("ru_core_news_lg")

In [None]:
# load the pretrained Russian spaCy pipeline; it is passed to get_redact_info_single_file,
# which reads the named entities labelled 'PER' from its output
nlp_ru = spacy.load("ru_core_news_lg")

In [None]:
# a directory with the txt files to anonymise (placeholder: fill in before running the cells below)
path_to_files = ""

In [None]:
def get_redact_info_single_file(files_directory:str, filename:str, spacy_model) -> set:
    '''
    Collecting the spans of potentially sensitive info (names, physical addresses, phones, emails,
    passport numbers, IP addresses) found in a single txt file; the file itself is not modified.
    To identify names, we use the entities labelled 'PER' by the spacy model; for other info, we use regex
    Takes 1 txt file as an input
    files_directory: str, path to the directory with txt files (it is joined to filename with "/")
    filename: str, name of txt file incl .txt
    spacy_model: loaded spacy model (called on the whole text of the file)
    Returns a set of non-empty str to redact in a file
    '''
    # entities of this length or shorter are ignored, they are mostly false positives
    min_name_length = 5

    with open(f"{files_directory}/{filename}", 'r', encoding='utf-8') as file:
        case = file.read()

    process = spacy_model(case)

    # this pattern includes words that are often mistakenly identified as personal names by spacy
    placeholder_pattern = re.compile(r"(Свидетел\w*\s*(N|№)?\d*)|(Потерпевш\w*\s*(N|№)?\d*)|(ФИО\s*\d*)|(Лицо\s*\d*)", re.IGNORECASE)

    to_redact = set()

    # names
    for ent in process.ents:
        if ent.label_ != 'PER' or len(ent.text) <= min_name_length:
            continue

        # an entity may run over a line break: keep only its first line
        name = ent.text.split('\n')[0]

        if placeholder_pattern.search(name):
            # do not redact already redacted names (such as 'ФИО5'), keep only what is left around them
            name = placeholder_pattern.sub('', name)
            if len(name) <= min_name_length:
                continue

        # surrounding whitespace is not part of the name; an empty span must never be returned,
        # because an empty alternative in the redaction regex matches at every position of a text
        name = name.strip()
        if name:
            to_redact.add(name)

    # regex for physical addresses, phones, emails, passport numbers, IP addresses
    # NB: in a raw string the word boundary is written as \b ("\\b" would match a literal backslash + "b")
    sensitive_patterns = (
        # physical addresses
        re.compile(r"(?:улиц(?:\.|\w)|ул\.?)\s+[А-Яа-яЁё\s\-]+,?\s*(?:дом|д\.?)\s*\d+\s*[А-Яа-яЁё]?(?:/\d+)?\s*,?\s*(?:(?:корпус|корп\.?|кор\.?)\s*\d+\s*,?\s*)?(?:(?:строение|стр\.?)\s*\d+\s*,?\s*)?(?:(?:квартира|кв\.?)\s*\d+\s*)?"),
        # phones
        re.compile(r"(\+?[78][\s(]*\d{3}[\s)]*\d{3}[-\s]*\d{2}[-\s]*\d{2})"),
        # emails
        re.compile(r"(\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b)"),
        # passport numbers
        re.compile(r"(\b\d{2}\s\d{2}\s\d{6}\b)"),
        # IP addresses
        re.compile(r"(\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b)"),
    )

    # every pattern has at most one capturing group, so findall yields plain strings
    for sensitive_pattern in sensitive_patterns:
        for span in sensitive_pattern.findall(case):
            # the address pattern can end on whitespace; do not swallow it when redacting
            span = span.strip()
            if span:
                to_redact.add(span)

    return to_redact

In [None]:
def redact_info(path_to_redact_info:str, path_to_cases:str, path_to_save='') -> str:
    '''
    Redacting info in txt files: every span listed for a file is replaced with '[...]'
    path_to_redact_info: str, path to a json file containing potentially sensitive info to redact,
        in the form {'file_name': ['a list of spans to be redacted']}
    path_to_cases: str, path to a directory with txt files to redact (read as UTF-8)
    path_to_save: str, path to an existing directory where to save redacted files (written as UTF-8),
        default is '' (the current working directory); redacted txt files have the suffix "_redacted";
    Returns a str
    '''
    # local import: only json and re are needed from the notebook's import cell
    import os

    ellipsis = '[...]'
    txt_suffix = '.txt'

    # reading info to redact; UTF-8 first, then the platform default encoding
    # for json files that were written without an explicit encoding
    try:
        with open(path_to_redact_info, 'r', encoding='utf-8') as jf:
            to_redact = json.load(jf)
    except UnicodeDecodeError:
        with open(path_to_redact_info, 'r') as jf:
            to_redact = json.load(jf)

    for filename, redact_list in to_redact.items():

        # opening a file to redact
        with open(os.path.join(path_to_cases, filename), 'r', encoding='utf-8') as file:
            case = file.read()

        # Empty spans are dropped: an empty alternative matches at every position of the text.
        # Longer spans go first: alternatives are tried from left to right, so a short span listed
        # before a longer one that starts with it would leave the rest of the longer one unredacted.
        spans = sorted({span for span in redact_list if span}, key=lambda span: (-len(span), span))

        if spans:
            pattern = re.compile('|'.join(re.escape(span) for span in spans))
            case = pattern.sub(ellipsis, case)
            # non-breaking spaces are replaced only after matching, since the spans were extracted
            # from the original text; NOTE(review): as before, files with nothing to redact
            # keep their non-breaking spaces
            case = re.sub(r'\u00A0', ' ', case)

        # only the trailing '.txt' is removed from the name
        stem = filename[:-len(txt_suffix)] if filename.endswith(txt_suffix) else filename

        # saving redacted file
        with open(os.path.join(path_to_save, f"{stem}_redacted.txt"), 'w', encoding='utf-8') as txt_file:
            txt_file.write(case)

    return f"Redacted files are saved in {path_to_save}"

### The redaction process

In [None]:
# (0) getting all .txt files to redact (we grouped txt files by year)
# listdir returns names in arbitrary order; sorting keeps the processing order the same on every run
txts = sorted(doc for doc in listdir(path_to_files) if doc.endswith('.txt'))

In [None]:
# (1) making a dictionary with the info to redact in all .txt files in a directory
# {'file_name':['a list of spans to be redacted']}
redact_dict = {
    txt_name: list(get_redact_info_single_file(path_to_files, txt_name, nlp_ru))
    for txt_name in txts
}

# the dictionary is stored next to the txt files, with non-ASCII characters kept readable for the manual review
# NOTE(review): the file is opened without an explicit encoding, so it is written in the platform default encoding
redact_json = json.dumps(redact_dict, ensure_ascii=False, indent=4)
with open(f"{path_to_files}/to_redact_info.json", 'w') as jf:
    jf.write(redact_json)

In [None]:
# (2) review 'to_redact_info.json' manually

In [None]:
# (3) Redacting info in .txt files
# path_to_redact_info – path to 'to_redact_info.json' (written in step (1), reviewed in step (2))
path_to_redact_info = f"{path_to_files}/to_redact_info.json"
# path_to_save – where to save redacted cases (placeholder: fill in with an existing directory)
path_to_save = ""
redact_info(path_to_redact_info, path_to_files, path_to_save)

In [None]:
# (4) review redacted .txt files manually