In [None]:
# Mount Google Drive at /content/drive; every Excel path used in this notebook
# lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the Excel file with human decisions, check for duplicate rows and drop them, then write the result back to an Excel file.
import pandas as pd
import numpy as np

In [None]:
# Path of the source workbook loaded by the next cell.
# NOTE: later cells in this notebook reassign INPUT_EXCEL_FILE to other files,
# so re-run this cell before re-running the load cell below.
INPUT_EXCEL_FILE = "/content/drive/MyDrive/worker_comp_work/WC_Final/filtered_extracted_data.xlsx"  # <<< CHANGE TO YOUR INPUT FILENAME

In [None]:
# Load the source workbook into a DataFrame and display its column names.
df = pd.read_excel(INPUT_EXCEL_FILE)
df.columns

In [None]:
# De-duplicate on 'Case ID': the first row seen for each ID is kept and any
# later rows with the same ID are dropped.
df_unique = df.drop_duplicates(subset=['Case ID'], keep='first')

In [None]:
# Write the de-duplicated rows to a new Excel file; index=False leaves the
# DataFrame index out of the sheet.
df_unique.to_excel("/content/drive/MyDrive/worker_comp_work/WC_Final/filtered_extracted_data_unique.xlsx", index=False)

In [None]:
# Display the number of rows remaining after de-duplication.
len(df_unique)
# Expected result (per the original author's run): 15406 records.

In [None]:
# We use only 'Case ID', 'Issues', 'Findings of Fact', 'Order/Award' and 'Decision' in our research.
# Select those columns and add two empty placeholder columns, 'Annonymized_Facts' and
# 'Annonymized_Issues', that the anonymization cells fill in later.
# .assign() returns a new, independent DataFrame, so the new columns are not set on a
# selection of df_unique (which is what triggered pandas' SettingWithCopyWarning).
df_research = df_unique[['Case ID', 'Issues', 'Findings of Fact', 'Order/Award', 'Decision']].assign(
    Annonymized_Facts='',
    Annonymized_Issues='',
)
# Write the research-ready frame to an Excel file (without the index column).
df_research.to_excel("/content/drive/MyDrive/worker_comp_work/WC_Final/research_ready.xlsx", index=False)

In [None]:
# Display the columns of the research frame to confirm the selection and the
# two placeholder columns.
df_research.columns

In [None]:
import spacy
import re # Import regex library for potential pattern-based additions

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
!python -m spacy download en_core_web_lg

In [None]:
!python -m spacy download en_core_web_trf

In [None]:
import spacy
import re # Import regex library for potential pattern-based additions
# --- Configuration ---
# Name of the spaCy pipeline to load below. Options:
# 'en_core_web_sm' -> Small, fast, less accurate
# 'en_core_web_md' -> Medium (NOTE: not downloaded by the download cells in this notebook)
# 'en_core_web_lg' -> Large, slower, potentially more accurate
# 'en_core_web_trf' -> Transformer-based, potentially most accurate but slowest
NLP_MODEL_NAME = "en_core_web_trf"

# Entity labels to anonymize (spaCy NER labels plus the custom regex labels below).
# Reference list of spaCy labels:
# PERSON: People, including fictional.
# NORP: Nationalities or religious or political groups.
# FAC: Buildings, airports, highways, bridges, etc.
# ORG: Companies, agencies, institutions, etc.
# GPE: Countries, cities, states.
# LOC: Non-GPE locations, mountain ranges, bodies of water.
# PRODUCT: Objects, vehicles, foods, etc. (Use with caution, can be broad)
# EVENT: Named hurricanes, battles, wars, sports events, etc.
# WORK_OF_ART: Titles of books, songs, etc.
# LAW: Named documents made into laws.
# DATE: Absolute or relative dates or periods.
# TIME: Times smaller than a day.
# MONEY: Monetary values, including unit.
# QUANTITY: Measurements, as of weight or distance.
# ORDINAL: "first", "second", etc.
# CARDINAL: Numerals that do not fall under another type.

# **Choose carefully based on your needs!** Start specific.
# anonymize_text replaces only entities/patterns whose label is in this set.
PII_LABELS_TO_ANONYMIZE = {
    "PERSON",
    "NORP",
    "FAC",
    "GPE",      # Cities, States, Countries
    "LOC",      # Other locations (mountains, rivers)
    "ORG",      # Organizations, companies
    "EVENT",
    "WORK_OF_ART",
    "LAW",
    "DATE",
    "TIME",
    "MONEY",
    "QUANTITY",
    #"ORDINAL",
    "CARDINAL",
    "PHONE",    # Custom label; matches the "PHONE" key in REGEX_PATTERNS
    "EMAIL",    # Custom label; matches the "EMAIL" key in REGEX_PATTERNS
    # Add or remove labels as needed (PRODUCT and ORDINAL are currently excluded).
}

# Custom regex patterns for things spaCy might miss, keyed by the label used
# to build the "[LABEL]" placeholder.
# (Simple examples, enhance as needed)
REGEX_PATTERNS = {
    "EMAIL": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
    "PHONE": r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}" # Simple US-like phone format
    # Add patterns for IDs, addresses etc. if needed
}

# --- Load spaCy Model ---
# On failure (OSError from spacy.load) the notebook keeps running with
# nlp = None, so downstream code must check for None before using nlp.
try:
    nlp = spacy.load(NLP_MODEL_NAME)
    print(f"Loaded spaCy model '{NLP_MODEL_NAME}'")
except OSError:
    print(f"spaCy model '{NLP_MODEL_NAME}' not found.")
    print(f"Please run: python -m spacy download {NLP_MODEL_NAME}")
    nlp = None # Set nlp to None if loading fails

# --- Anonymization Function ---
def anonymize_text(text):
    """Replace PII in ``text`` with bracketed label placeholders.

    Two passes are made:

    1. Every pattern in ``REGEX_PATTERNS`` whose label is listed in
       ``PII_LABELS_TO_ANONYMIZE`` is matched against the text and each
       match is replaced by ``[LABEL]``.
    2. The result is run through the spaCy pipeline ``nlp``; every entity
       whose label is in ``PII_LABELS_TO_ANONYMIZE`` is replaced by
       ``[LABEL]``.

    Within a pass, spans are handled in ascending start order and a span
    that begins inside an already replaced span is skipped.

    Args:
        text: The string to anonymize.

    Returns:
        The anonymized string. If ``nlp`` is falsy (model not loaded), a
        message is printed and ``text`` is returned unchanged.
    """

    def _splice(source, spans):
        # Rebuild `source`, substituting each (start, end, placeholder) span
        # in start order and ignoring spans that overlap an earlier one.
        pieces = []
        cursor = 0
        for begin, finish, tag in sorted(spans):
            if begin < cursor:
                continue
            pieces.append(source[cursor:begin])
            pieces.append(tag)
            cursor = finish
        pieces.append(source[cursor:])
        return "".join(pieces)

    if not nlp:
        print("spaCy model not loaded. Cannot perform NER-based anonymization.")
        return text # Or raise an error

    # --- Pass 1: regex-based replacements ---
    # Collect all matches against the untouched input first, then splice once.
    pattern_hits = [
        (hit.start(), hit.end(), f"[{kind}]")
        for kind, regex in REGEX_PATTERNS.items()
        if kind in PII_LABELS_TO_ANONYMIZE
        for hit in re.finditer(regex, text)
    ]
    masked = _splice(text, pattern_hits)

    # --- Pass 2: spaCy NER replacements on the regex-masked text ---
    parsed = nlp(masked)
    entity_hits = [
        (entity.start_char, entity.end_char, f"[{entity.label_}]")
        for entity in parsed.ents
        if entity.label_ in PII_LABELS_TO_ANONYMIZE
    ]
    return _splice(masked, entity_hits)

In [None]:
# Anonymizing the "Findings of Fact" column

import pandas as pd
import os

# --- Excel File & Column Configuration ---
INPUT_EXCEL_FILE = "/content/drive/MyDrive/worker_comp_work/WC_Final/research_ready.xlsx"
OUTPUT_EXCEL_FILE = "/content/drive/MyDrive/worker_comp_work/WC_Final/research_ready2.xlsx"
FACTS_COLUMN_NAME = "Findings of Fact"
ANONYMIZED_COLUMN_NAME = "Annonymized_Facts"

# --- Main Processing Logic ---
# The model-loading cell sets nlp = None when the model cannot be loaded, so
# checking only that the name exists is not enough: with nlp = None,
# anonymize_text returns its input unchanged and raw text would be saved in
# the "anonymized" column. Require an actual loaded model instead.
if globals().get('nlp') is not None:
    print(f"\nReading Excel file: {INPUT_EXCEL_FILE}")
    try:
        df = pd.read_excel(INPUT_EXCEL_FILE)
        print(f"Successfully read {len(df)} rows.")

        # Check if the input column exists. Do not call exit() here: inside a
        # notebook it shuts the kernel down instead of just stopping this cell.
        if FACTS_COLUMN_NAME not in df.columns:
            print(f"Error: Column '{FACTS_COLUMN_NAME}' not found in the Excel file.")
        else:
            # Apply the anonymization function to the source column
            print(f"Anonymizing text in column '{FACTS_COLUMN_NAME}'...")

            def anonymize_and_print_row(row):
                """Return the anonymized source text of one row ('' when the cell is empty)."""
                if pd.notna(row[FACTS_COLUMN_NAME]):
                    # row.name is the row's index label; printed as a progress indicator.
                    print(f"Anonymizing row: {row.name}")
                    return anonymize_text(str(row[FACTS_COLUMN_NAME]))
                else:
                    return ""

            df[ANONYMIZED_COLUMN_NAME] = df.apply(anonymize_and_print_row, axis=1)

            print("Anonymization complete.")

            # Save the updated DataFrame to a new Excel file
            print(f"Saving results to: {OUTPUT_EXCEL_FILE}")
            try:
                df.to_excel(OUTPUT_EXCEL_FILE, index=False)
                print("Successfully saved anonymized data.")
            except Exception as e:
                print(f"Error saving results to Excel file: {e}")

    except FileNotFoundError:
        print(f"Error: Input file not found at '{INPUT_EXCEL_FILE}'")
    except Exception as e:
        print(f"An error occurred during Excel processing: {e}")
else:
    print("\nCannot proceed with Excel processing because spaCy model ('nlp') is not loaded or defined.")

print("\nScript finished.")

In [None]:
# Anonymizing the "Issues" column

import pandas as pd
import os

# --- Excel File & Column Configuration ---
INPUT_EXCEL_FILE = "/content/drive/MyDrive/worker_comp_work/WC_Final/research_ready2.xlsx"
OUTPUT_EXCEL_FILE = "/content/drive/MyDrive/worker_comp_work/WC_Final/research_ready3.xlsx"
FACTS_COLUMN_NAME = "Issues"
ANONYMIZED_COLUMN_NAME = "Annonymized_Issues"

# --- Main Processing Logic ---
# The model-loading cell sets nlp = None when the model cannot be loaded, so
# checking only that the name exists is not enough: with nlp = None,
# anonymize_text returns its input unchanged and raw text would be saved in
# the "anonymized" column. Require an actual loaded model instead.
if globals().get('nlp') is not None:
    print(f"\nReading Excel file: {INPUT_EXCEL_FILE}")
    try:
        df = pd.read_excel(INPUT_EXCEL_FILE)
        print(f"Successfully read {len(df)} rows.")

        # Check if the input column exists. Do not call exit() here: inside a
        # notebook it shuts the kernel down instead of just stopping this cell.
        if FACTS_COLUMN_NAME not in df.columns:
            print(f"Error: Column '{FACTS_COLUMN_NAME}' not found in the Excel file.")
        else:
            # Apply the anonymization function to the source column
            print(f"Anonymizing text in column '{FACTS_COLUMN_NAME}'...")

            def anonymize_and_print_row(row):
                """Return the anonymized source text of one row ('' when the cell is empty)."""
                if pd.notna(row[FACTS_COLUMN_NAME]):
                    # row.name is the row's index label; printed as a progress indicator.
                    print(f"Anonymizing row: {row.name}")
                    return anonymize_text(str(row[FACTS_COLUMN_NAME]))
                else:
                    return ""

            df[ANONYMIZED_COLUMN_NAME] = df.apply(anonymize_and_print_row, axis=1)

            print("Anonymization complete.")

            # Save the updated DataFrame to a new Excel file
            print(f"Saving results to: {OUTPUT_EXCEL_FILE}")
            try:
                df.to_excel(OUTPUT_EXCEL_FILE, index=False)
                print("Successfully saved anonymized data.")
            except Exception as e:
                print(f"Error saving results to Excel file: {e}")

    except FileNotFoundError:
        print(f"Error: Input file not found at '{INPUT_EXCEL_FILE}'")
    except Exception as e:
        print(f"An error occurred during Excel processing: {e}")
else:
    print("\nCannot proceed with Excel processing because spaCy model ('nlp') is not loaded or defined.")

print("\nScript finished.")