In [3]:
import pandas as pd
import os
import sys
# Put the parent of the working directory on the import path so that the
# `src.*` packages imported below can be resolved.
project_root = os.path.abspath("..")
repo_on_path = project_root in sys.path
# Guarded so that re-running this cell never appends a duplicate entry.
if not repo_on_path:
    sys.path.append(project_root)
from src.preprocessing.cleaner import clean_complaints_df
from src.preprocessing.transformer import transform_features

In [4]:
# Read the raw consumer complaints export into memory.
# low_memory=False makes pandas parse the file in a single pass instead of in
# chunks, which avoids mixed-dtype inference on wide, messy columns.
raw_csv_path = "../data/raw/consumer_complaints.csv"
df_raw = pd.read_csv(raw_csv_path, low_memory=False)
print(f"Loaded {len(df_raw)} raw records.")

Loaded 555957 raw records.


In [5]:
# Apply cleaning from cleaner.py
# `clean_complaints_df` is imported from src.preprocessing.cleaner; its exact
# filtering rules are not visible in this notebook.
# NOTE(review): the recorded run keeps only 66,806 of 555,957 rows — presumably
# rows without a usable complaint narrative are dropped; confirm in cleaner.py.
df_cleaned = clean_complaints_df(df_raw)
# Report the surviving row count so an unexpected drop is noticed immediately.
print(f"Retained {len(df_cleaned)} records after initial cleaning.")

Retained 66806 records after initial cleaning.


In [6]:
# Apply feature transformation from transformer.py
# `transform_features` is imported from src.preprocessing.transformer. The
# result is bound to a new name so the cleaned frame stays available for
# comparison (assumes the helper does not mutate its input — TODO confirm).
df_transformed = transform_features(df_cleaned)

In [7]:
# Sanity-check the engineered features: render the first rows of the
# transformed frame, then list every column it contains.
print("\n Sample of the transformed data with new features:")
preview_rows = df_transformed.head()
display(preview_rows)

print("\nColumns in the final DataFrame:")
column_names = df_transformed.columns.tolist()
print(column_names)


 Sample of the transformed data with new features:


Unnamed: 0,product,issue,company,state,timely_response,text_cleaned,consumer_disputed_binary,timely_response_binary,text_length,product_dispute_rate,company_dispute_rate,sentiment,sentiment_intensity,sentiment_encoded,sentiment_timely_interaction,company_timely_interaction
190126,Debt collection,Cont'd attempts collect debt not owed,"Diversified Consultants, Inc.",NY,Yes,xxxx has claimed i owe them for xxxx years des...,0,1,52,0.222881,0.181818,negative,0.0,1,1,0.181818
190135,Consumer Loan,Managing the loan or lease,M&T Bank Corporation,VA,Yes,due to inconsistencies in the amount owed that...,0,1,238,0.247961,0.223464,neutral,0.085,0,0,0.223464
190155,Mortgage,"Loan modification,collection,foreclosure",Wells Fargo & Company,CA,Yes,in xxxxxxxx my wages that i earned at my job d...,1,1,655,0.267176,0.279024,extreme_negative,-0.019294,2,2,0.279024
190207,Mortgage,"Loan servicing, payments, escrow account",JPMorgan Chase & Co.,CA,Yes,i have an open and current mortgage with chase...,1,1,74,0.267176,0.281903,neutral,0.0,0,0,0.281903
190208,Mortgage,Credit decision / Underwriting,Rushmore Loan Management Services LLC,CA,Yes,xxxx was submitted xxxxxxxx at the time i subm...,1,1,259,0.267176,0.365585,negative,-0.051136,1,1,0.365585



Columns in the final DataFrame:
['product', 'issue', 'company', 'state', 'timely_response', 'text_cleaned', 'consumer_disputed_binary', 'timely_response_binary', 'text_length', 'product_dispute_rate', 'company_dispute_rate', 'sentiment', 'sentiment_intensity', 'sentiment_encoded', 'sentiment_timely_interaction', 'company_timely_interaction']


In [8]:
# Persist the cleaned and feature-engineered frame for downstream notebooks.
# index=False omits the row labels from the file, so only the data columns
# are written.
output_path = "../data/processed/clean_consumer_complaints.csv"
df_transformed.to_csv(path_or_buf=output_path, index=False)
print(f"Cleaned & processed data saved to {output_path}.")

Cleaned & processed data saved to ../data/processed/clean_consumer_complaints.csv.


In [9]:
# Save Preprocessing Summary
#
# Writes a short Markdown report of this preprocessing run: the number of
# records in the transformed frame, the list of steps/features, and the first
# cleaned narrative as an example.

retained_records = df_transformed.shape[0]
# First cleaned narrative; quoted at the bottom of the report as a sample.
sample_text = df_transformed['text_cleaned'].iloc[0]

summary_path = "../outputs/profiling/preprocessing_summary.md"

# Create the report directory if it is missing (e.g. on a fresh checkout)
# instead of failing inside open().
os.makedirs(os.path.dirname(summary_path), exist_ok=True)

# Every entry carries its own trailing newline so that each bullet lands on a
# separate Markdown line. The first bullet previously had none, which fused it
# with the following bullet in the rendered report.
summary_lines = [
    " Deep Preprocessing Summary\n\n",
    f"**Records Retained:** {retained_records:,}\n\n",

    "###  Major Preprocessing Steps:\n",
    "- Dropped columns irrelevant for sentiment modeling\n",
    "- Text cleaning applied: Lowercased, special characters/HTML/URLs/emails/numbers removed, normalized spacing.\n",
    "- Smart Weak Sentiment Labeling using:\n",
    "  - TextBlob polarity & subjectivity\n",
    "  - Keyword-based checks (extreme negative, negative, neutral indicators)\n",
    "  - POS tagging for emphasis and negation adjustments\n",
    "- Added Encoded/Binary Columns:\n",
    "  - `consumer_disputed_binary`\n",
    "  - `timely_response_binary`\n",
    "  - `sentiment_encoded` (0: Neutral, 1: Negative, 2: Extreme Negative)\n",
    "- Added Features:\n",
    "  - `sentiment_intensity`: Sentiment score from TextBlob.\n",
    "  - `text_length` (word count of cleaned text)\n",
    "  - `product_dispute_rate` (target encoding with smoothing)\n",
    "  - `company_dispute_rate` (target encoding with smoothing)\n",
    "  - `sentiment_timely_interaction` (sentiment * timely response)\n",
    "  - `company_timely_interaction` (company dispute rate * timely response)\n",
    "- Missing states filled with 'Unknown'.\n",
    "- Carefully preserved complaint narratives with sufficient information.\n",

    "\n---\n\n",
    "Sample of a Cleaned Complaint Narrative:\n\n",
    f"> {sample_text}\n",
]

# Explicit UTF-8 so writing free-text narratives does not depend on the
# platform's default encoding.
with open(summary_path, "w", encoding="utf-8") as f:
    f.writelines(summary_lines)

print(f"Preprocessing summary written to {summary_path}")


Preprocessing summary written to ../outputs/profiling/preprocessing_summary.md
