# Text Preprocessing for Consumer Complaints

This notebook implements text cleaning and preprocessing steps for the consumer complaints data before vector embedding and analysis.

## Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import re
import string
from tqdm.notebook import tqdm

# Notebook display: show every column and up to 200 characters per cell
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 200

# Ignore all warnings (the default category is Warning, so every warning matches)
import warnings
warnings.filterwarnings(action='ignore')

## Load the Raw Data

In [None]:
# Read the raw complaints CSV into a DataFrame
data_path = '../data/raw/consumer_complaints.csv'
complaints_df = pd.read_csv(filepath_or_buffer=data_path)

# Report the frame's dimensions and column names, then preview the first rows
print(f'Dataset shape: {complaints_df.shape}')
print('\nColumns:', [*complaints_df.columns])
complaints_df.head()

## Text Cleaning Functions

In [None]:
# Alternatives are tried left to right at each position of the text.
_BOILERPLATE_PATTERNS = (
    # Redaction markers (XXXX, XX/XX/XXXX, ...). Scoped case-sensitive with
    # (?-i:...) so the global IGNORECASE flag does not make it eat the "xx"
    # inside ordinary words such as "Maxx".
    r'(?-i:XX+)',
    r'\d{2}/\d{2}/\d{4}',  # Numeric dates such as 01/31/2020
    r'\b\d{10,}\b',  # Long numbers (likely account/SSN)
    r'\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b',  # Email addresses
    r'\bhttps?://\S+\b',  # URLs
    r'\b\d{3}-\d{2}-\d{4}\b',  # SSN pattern
    r'\b\d{4}[-\.]?\d{4}[-\.]?\d{4}[-\.]?\d{4}\b',  # Credit card numbers
    r'\[.*?\]',  # Anything in square brackets
    r'\(.*?\)',  # Anything in parentheses
    # Redaction keywords
    r'\b(?:redacted|omitted|removed|confidential|private|personal information|PII|PHI)\b',
    # Labelled fields ("name: ..."): removes from the label up to the next
    # "word:" label or the end of the string ('.' does not span newlines).
    r'\b(?:name|address|phone|ssn|account|number|social security|credit card|debit card|bank account)\s*:.*?(?=\s+\w+:|$)',
)

# Compiled once at definition time instead of being rebuilt on every call.
_BOILERPLATE_RE = re.compile('|'.join(_BOILERPLATE_PATTERNS), flags=re.IGNORECASE)


def remove_boilerplate(text):
    """Remove common boilerplate and redacted information.

    Every match of the boilerplate patterns (redaction markers, dates, long
    numbers, emails, URLs, SSN/credit-card shapes, bracketed or parenthesised
    spans, redaction keywords, labelled fields) is replaced by a single space.
    Matching is case-insensitive except for the ``XX+`` redaction marker,
    which must be upper case.

    Args:
        text: The raw text. Any non-string value is treated as missing.

    Returns:
        The text with matches blanked out and leading/trailing whitespace
        stripped, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    return _BOILERPLATE_RE.sub(' ', text).strip()

import unicodedata

# Anything that is not a word character, whitespace, or one of . , ! ? -
_DISALLOWED_CHARS_RE = re.compile(r'[^\w\s.,!?-]')


def remove_special_characters(text):
    """Remove special characters while preserving basic punctuation and alphanumeric characters.

    Characters other than word characters, whitespace and ``. , ! ? -`` are
    replaced by a space. The remaining text is then folded to ASCII: accented
    letters are reduced to their base letter (``é`` -> ``e``) rather than
    being deleted, and anything with no ASCII equivalent is dropped.

    Args:
        text: The text to filter. Any non-string value is treated as missing.

    Returns:
        The ASCII-only filtered text with leading/trailing whitespace
        stripped, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Filter first so non-ASCII symbols (curly quotes, dashes) become a space
    # and do not glue neighbouring words together when dropped below.
    text = _DISALLOWED_CHARS_RE.sub(' ', text)

    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII encode then drops only the mark, keeping the base letter.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

    # Compatibility decomposition can expand a kept character into ASCII
    # punctuation outside the allowed set, so filter once more.
    text = _DISALLOWED_CHARS_RE.sub(' ', text)

    return text.strip()

def remove_extra_whitespace(text):
    """Collapse every run of whitespace into one space and trim both ends.

    Non-string input yields an empty string.
    """
    if isinstance(text, str):
        # split() with no argument discards leading/trailing whitespace and
        # treats any whitespace run as one separator.
        tokens = text.split()
        return ' '.join(tokens)

    return ""

def clean_text(text):
    """Run the full cleaning pipeline on a single piece of text.

    Non-string input yields an empty string. Otherwise the text is trimmed and
    passed, in order, through remove_boilerplate, remove_special_characters
    and remove_extra_whitespace.
    """
    if not isinstance(text, str):
        return ""

    cleaned = text.strip()
    # Order matters: each step consumes the previous step's output.
    for step in (remove_boilerplate, remove_special_characters, remove_extra_whitespace):
        cleaned = step(cleaned)

    return cleaned

## Apply Text Cleaning

In [None]:
# Column holding the free-text complaint; change this to match your data
text_column = 'Consumer complaint narrative'

# Work on a copy so complaints_df itself is left untouched
cleaned_df = complaints_df.copy()

# Run clean_text over every narrative, showing a progress bar
print("Cleaning text data...")
tqdm.pandas()
cleaned_df['cleaned_narrative'] = cleaned_df[text_column].progress_apply(clean_text)

# Keep only rows whose cleaned narrative is non-empty, and report the drop
initial_count = cleaned_df.shape[0]
cleaned_df = cleaned_df.loc[cleaned_df['cleaned_narrative'].str.len().gt(0)].copy()
final_count = cleaned_df.shape[0]

print(f"\nRemoved {initial_count - final_count} empty rows after cleaning.")
print(f"Final dataset size: {final_count} rows")

## Save Cleaned Data

In [None]:
# Write the cleaned frame to CSV, creating its parent folder first if needed
import os

output_path = '../data/processed/cleaned_complaints.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
cleaned_df.to_csv(output_path, index=False)

print(f"Cleaned data saved to: {output_path}")

## Sample Before and After Cleaning

In [None]:
# Display a sample of original and cleaned text for a quick visual check.
# The sample size is capped at the number of rows available because
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement).
sample_size = min(5, len(cleaned_df))
sample = cleaned_df.sample(sample_size, random_state=42)

# The row index is not needed, only the row contents.
for _, row in sample.iterrows():
    print("="*100)
    print("\nORIGINAL TEXT:")
    print(row[text_column])
    print("\nCLEANED TEXT:")
    print(row['cleaned_narrative'])
    print("\n" + "-"*50 + "\n")