In [1]:
import pandas as pd

In [4]:
import os
import sys
# Put the parent directory (absolute form) on the import path so the
# `src.*` packages imported by this notebook can be resolved.
project_root = os.path.abspath(os.pardir)
already_on_path = project_root in sys.path
if not already_on_path:
    # Only append once; re-running the cell must not grow sys.path.
    sys.path.append(project_root)

In [6]:
from src.preprocessing.cleaner import clean_complaints_df
from src.preprocessing.transformer import transform_features

In [8]:
# Load the raw complaints CSV.
# low_memory=False asks pandas to parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference within a column.
raw_csv_path = "../data/raw/consumer_complaints.csv"
df_raw = pd.read_csv(raw_csv_path, low_memory=False)

In [10]:
# Run the project's cleaning helper over the raw frame. The result is bound
# to a new name, so `df_raw` itself is not rebound by this cell.
df_cleaned = df_raw.pipe(clean_complaints_df)

In [11]:
# Apply feature transformation
# NOTE(review): transform_features is a project helper whose body is not
# visible from this notebook. Later cells call .shape, .to_csv and read a
# `text_cleaned` column on the result, so it presumably returns a DataFrame
# containing that column — confirm in src/preprocessing/transformer.py.
df_transformed = transform_features(df_cleaned)

In [13]:
# Save cleaned and transformed data
processed_csv_path = "../data/processed/clean_consumer_complaints.csv"
# Create the target folder first: to_csv does not create missing parent
# directories and would raise OSError on a fresh checkout.
os.makedirs(os.path.dirname(processed_csv_path), exist_ok=True)
# index=False keeps the DataFrame index out of the written file.
df_transformed.to_csv(processed_csv_path, index=False)
print("Cleaned & processed data saved.")

Cleaned & processed data saved.


In [17]:
#Summary
# Write a short Markdown report describing the preprocessing result:
# row count, dropped columns, added features and one sample cleaned text.
retained_records = df_transformed.shape[0]
# Guard the sample lookup: .iloc[0] raises IndexError on an empty frame.
sample_text = (
    df_transformed['text_cleaned'].iloc[0]
    if retained_records
    else "(no records retained)"
)

summary_path = "../outputs/profiling/preprocessing_summary.md"
# NOTE(review): this list is maintained by hand and is not derived from the
# cleaning step — confirm it matches the columns that are actually dropped.
dropped_columns = [
    "tags", "company_public_response", "consumer_consent_provided",
    "consumer_complaint_narrative", "complaint_id", "date_received", "date_sent_to_company",
    "submitted_via", "zipcode", "sub_issue", "sub_product",
    "consumer_disputed?", "company_response_to_consumer"
]

# The report folder may not exist yet; open() would fail without it.
os.makedirs(os.path.dirname(summary_path), exist_ok=True)
# Explicit UTF-8: the sample complaint text may contain non-ASCII characters,
# and the platform default encoding is not guaranteed to handle them.
with open(summary_path, "w", encoding="utf-8") as f:
    f.write("### Preprocessing Summary\n\n")
    f.write(f"- Records with complaint narrative retained: {retained_records:,}\n")
    f.write(f"- Dropped columns: {', '.join(dropped_columns)}\n")
    f.write("- Cleaned text: Lowercased, special chars/HTML/URLs/emails removed, extra whitespace stripped\n")
    f.write("- Binary and encoded fields added: `consumer_disputed_binary`, `timely_response_binary`, `company_response_encoded`\n")
    f.write("- Text length feature added\n")
    f.write("- State nulls filled with 'Unknown'\n")
    f.write("\n---\n\n")
    f.write("####  Sample Cleaned Complaint\n")
    f.write(f"> {sample_text}\n")

print(f"Summary written to {summary_path}")

Summary written to ../outputs/profiling/preprocessing_summary.md
