In [1]:
import spacy
from tqdm import tqdm
import pandas as pd
import pickle

In [2]:
# Read the raw data into the working frame `df`.
# NOTE(review): unpickling can execute arbitrary code, so this file must come from a trusted source.
df = pd.read_pickle('./data/jenia_data/train/high_conf_pseudo_m3.pkl')

In [28]:
# First inspect the raw data, assess the relevant columns and create new dataframes including only the needed information

# For VUMC Notities (Notes)
# Create a 'text' column from the content of column 8, cast to string.
# Missing values are deliberately kept as NaN: a plain astype(str) would turn
# them into the literal string 'nan', which passes the pd.notna() guard in the
# sentence-splitting functions and ends up as a bogus "nan" sentence.
df['text'] = df[8].astype(str).where(df[8].notna().to_numpy())
# We can create new columns with whatever we need from the raw data
# df['Note_ID'] = df[3].astype(str)

# Create a new DataFrame 'df_note_text' by dropping the columns that are not needed
# In this case, dropping columns 0 to 10 except for the newly created 'text' column

#df_note_text = df.drop([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], axis=1)

# Print the column names of the DataFrame to inspect the remaining columns
print(df.columns)


In [21]:
# Write the extracted dataframe which includes only relevant columns (NoteID + text) into a pickle file to refer later.
# `df_note_text` is not created by any active statement in the notebook (the drop
# step that would define it is commented out), so running this cell on a fresh
# kernel raised NameError. Build it here from the two columns the later cells read.
# NOTE(review): assumes `df` already carries a 'NoteID' column, as the next cell does — confirm.
df_note_text = df[['NoteID', 'text']]
df_note_text.to_pickle('data1/2023_Notities.pkl')

In [32]:
# Keep only the identifier and text columns (has no further effect if an earlier step already reduced the frame)
df = df.loc[:, ['NoteID', 'text']]

# Preprocessing 800k-1m notes in one go takes very long, so the notes are handled in batches.
# This batch is an independent copy of the first 200k notes.
df_notes_first200 = df.head(200000).copy()

Preprocessing functions

In [48]:
# Sentence splitter for rows addressed by integer column labels (text in 8, id in 1)
def process_row(row):
    """
    Split the text held in column 8 of a row into sentences.

    The text is run through the module-level `nlp` pipeline; a missing (NaN)
    value is treated as an empty string.

    Parameters:
    row (pd.Series): A row from the DataFrame; column 8 holds the text and
        column 1 the value used as identifier prefix.

    Returns:
    list: One `[identifier, sentence_text]` list per sentence, where the
        identifier is `<column 1 value>_<sentence index>`.
    """
    raw_text = row[8]
    if pd.isna(raw_text):
        raw_text = ""  # nothing to split for a missing note
    parsed = nlp(raw_text)
    return [
        [f'{row[1]}_{idx}', sentence.text]
        for idx, sentence in enumerate(parsed.sents)
    ]

# Sentence splitter variant that reads the named 'text' and 'NoteID' columns
def process_row2(row):
    """
    Split the 'text' value of a row into sentences.

    The text is run through the module-level `nlp` pipeline; a missing (NaN)
    value is treated as an empty string.

    Parameters:
    row (pd.Series): A row from the DataFrame with 'NoteID' and 'text' entries.

    Returns:
    list: One `(identifier, sentence_text)` tuple per sentence, where the
        identifier is `<NoteID>_<sentence index>`.
    """
    note_text = row['text']
    if pd.isna(note_text):
        note_text = ""  # nothing to split for a missing note
    parsed = nlp(note_text)
    return [
        (f"{row['NoteID']}_{idx}", sentence.text)
        for idx, sentence in enumerate(parsed.sents)
    ]

# The function to anonymize a text by replacing specific entities
def anonymize(txt, nlp):
    """
    Replace entities of type PERSON and GPE with 'PERSON' and 'GPE'.
    Return anonymized text and its length.

    Replacement is string based: every occurrence of a recognised entity
    string in the text is replaced by its label, not only the span the model
    tagged. Entity strings are substituted longest first, so an entity that
    contains a shorter one (e.g. "Jan de Vries" and "Jan") is replaced as a
    whole before the shorter string is handled; otherwise the remainder of
    the longer name would be left in the output.

    Parameters:
    txt (str): The input text.
    nlp (Language): The spaCy language model.

    Returns:
    tuple: A tuple containing the anonymized text and `len(doc)` of the
        processed document (for a spaCy Doc this is its token count).
    """
    doc = nlp(txt)  # Process the text with the spaCy language model
    anonym = str(doc)  # Work on the plain-text form of the document
    # Map each entity string to its label; a string seen with several labels keeps the last one
    to_repl = {str(ent): ent.label_ for ent in doc.ents if ent.label_ in ('PERSON', 'GPE')}
    # Longest entity strings first, so shorter entities cannot break up longer ones
    for string in sorted(to_repl, key=len, reverse=True):
        if not string:
            # str.replace('', label) would insert the label between every character
            continue
        anonym = anonym.replace(string, to_repl[string])
    return anonym, len(doc)  # Return the anonymized text and the document length


In [34]:
# Load the Dutch language model from spaCy.
# This binds the module-level name `nlp` that process_row / process_row2 look up
# when they are called, and that is passed explicitly to anonymize().
nlp = spacy.load('nl_core_news_lg')

# Initialize tqdm to add a progress bar for pandas operations (this is what makes `progress_apply` available)
tqdm.pandas(desc="Processing rows")

In [37]:
# Process each row in the DataFrame to split text into sentences, with a tqdm progress bar.
# process_row2 returns a list of (identifier, sentence) tuples per note; explode() then
# turns every list element into its own entry of the resulting Series (the note's index
# label is repeated for each of its sentences). A note that yields an empty list becomes NaN.
sentences_data_series = df_notes_first200.progress_apply(process_row2, axis=1).explode()

Processing rows: 100%|████████████████| 200000/200000 [1:56:15<00:00, 28.67it/s]


In [None]:
# Save the sentence-split notes (a Series of (identifier, sentence) tuples) so the
# slow splitting step above does not have to be repeated.
sentences_data_series.to_pickle('./data1/sentences_data_series.pkl')

In [40]:
# Load the segmented data.
# pd.read_pickle is the counterpart of the Series.to_pickle call that wrote this file and
# matches how the raw data is read at the top of the notebook; it also manages the file handle.
# NOTE: unpickling can execute arbitrary code, so only load files written by this notebook.
sentences_data_series = pd.read_pickle('./data1/sentences_data_series.pkl')

# Create a dataframe from the series data: each element is an (identifier, sentence) tuple,
# so 'NoteID' holds the "<NoteID>_<sentence index>" identifier and 'text' the sentence.
sentences_df = pd.DataFrame(sentences_data_series.tolist(), columns=['NoteID', 'text'])

In [59]:
# The sentence-level data is processed in smaller batches to keep the work manageable.
# This batch is an independent copy of the first 2m sentences; any missing cell is
# filled with the string 'None' so that no NaN reaches the anonymization step.
subset_df = (
    sentences_df
    .head(2000000)
    .copy()
    .fillna('None')
)

# Sanity check: every column should report False here
print(subset_df.isna().any())

In [61]:
# Enable the progress bar for pandas apply method (re-registered with a new description)
tqdm.pandas(desc="Anonymizing text")

# Apply the anonymize function to each sentence in the 'text' column and store the result
# in a new column 'Anonymized'. anonymize() returns (text, length); only the text ([0]) is kept.
subset_df['Anonymized'] = subset_df['text'].progress_apply(lambda x: anonymize(x, nlp)[0])

In [62]:
# Save the segmented and anonymized batch into a pickle file to refer later.
# NOTE(review): the file name says "third_3.5m" while the batch built above is
# sentences_df.iloc[0:2000000] — confirm the name matches the batch before running.
subset_df.to_pickle('./data1/2023_notities_third_3.5m_sentences.pkl')