In [None]:
import pandas as pd
import re

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at a named mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Read the preprocessed CSV from the mounted Drive into a DataFrame.
file_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/PREPROCESSED_DATA_03.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Record how many rows were loaded (first entry of the frame's shape) and show it.
initial_rows = df.shape[0]
print(initial_rows)

In [None]:
# The 5th and 6th columns (spreadsheet columns E and F) carry no header labels,
# so give them descriptive names, addressed by position.
flag_col, name_col = df.columns[4], df.columns[5]
df.rename(columns={flag_col: 'Flag', name_col: 'MentionedName'}, inplace=True)

# Keep only rows marked 'REMOVE' that also have a mentioned name to replace.
is_flagged = df['Flag'] == 'REMOVE'
has_name = df['MentionedName'].notna()
to_anonymize = df[is_flagged & has_name]

# Maps each mentioned name (whitespace-stripped) to its placeholder code
# ('Anon1', 'Anon2', ...) so a repeated name always receives the same code.
anonymize_dict = {}


def anonymize_text(row):
    """Replace the row's mentioned name in its text with a stable placeholder code.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide ``row['text']`` and ``row['MentionedName']``.

    Returns
    -------
    The text with every literal, case-sensitive occurrence of the name replaced
    by its code. The text is returned unchanged when it is not a string
    (e.g. NaN) or when the name is not a string or is blank.

    Side effects
    ------------
    Registers unseen names in the module-level ``anonymize_dict`` and prints a
    one-line summary. The summary deliberately omits the original name and the
    original text so the notebook output does not leak what is being removed.
    """
    text = row['text']
    mentioned_name = row['MentionedName']

    # re.sub / re.escape raise TypeError on non-string input (e.g. NaN cells),
    # so leave such rows untouched instead of aborting the whole apply().
    if not isinstance(text, str) or not isinstance(mentioned_name, str):
        return text

    # Stray whitespace around the name would prevent a match and would also
    # give the same person several different codes.
    mentioned_name = mentioned_name.strip()
    if not mentioned_name:
        # An empty pattern matches between every character of the text.
        return text

    if mentioned_name not in anonymize_dict:
        anonymize_dict[mentioned_name] = f'Anon{len(anonymize_dict) + 1}'
    anonymized_name = anonymize_dict[mentioned_name]

    # NOTE(review): matching is a case-sensitive substring match, as before;
    # a differently-cased mention of the name is not replaced. A count of 0 in
    # the summary below is the signal to inspect such rows.
    anonymized_text, n_replaced = re.subn(re.escape(mentioned_name), anonymized_name, text)
    print(f"Replaced {n_replaced} occurrence(s) with {anonymized_name}")
    return anonymized_text

# Rows to rewrite: flagged 'REMOVE' and carrying a mentioned name. Computed once
# and reused for both the write-back and the preview below.
flagged_mask = (df['Flag'] == 'REMOVE') & df['MentionedName'].notna()

# DataFrame.apply(axis=1) on an empty selection does not yield a Series of
# texts, so only write back when there is at least one row to anonymize.
if not to_anonymize.empty:
    df.loc[flagged_mask, 'text'] = to_anonymize.apply(anonymize_text, axis=1)

# Preview only the rows that were actually anonymized (sampling the whole frame
# would mostly show untouched comments). Cap the sample at the number of rows
# available, because sample() raises when asked for more rows than exist, and
# fix the seed so the preview is reproducible across runs.
sample_size = 10
anonymized_rows = df.loc[flagged_mask, ['text']]
print("\nSample of anonymized comments:")
print(anonymized_rows.sample(min(sample_size, len(anonymized_rows)), random_state=42))


In [None]:
# Write the processed frame to the next dataset file; the row index is not written.
output_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/PREPROCESSED_DATA_04.csv'
df.to_csv(path_or_buf=output_path, index=False)