In [1]:
import pandas as pd

In [2]:
import os
import sys
# Put the parent of the current working directory on the import path (once),
# so the `src` package imported in the next cell can be found.
# Assumes the notebook is run from a direct subdirectory of the project root.
project_root = os.path.abspath(os.pardir)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [3]:
from src.preprocessing.cleaner import clean_complaints_df
from src.preprocessing.transformer import transform_features

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/deeasonsitaula/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/deeasonsitaula/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
# Read the raw complaints export into memory.
# low_memory=False disables pandas' chunked parsing, which avoids mixed-dtype
# inference on columns at the cost of higher peak memory while reading.
df_raw = pd.read_csv(
    filepath_or_buffer="../data/raw/consumer_complaints.csv",
    low_memory=False,
)

In [5]:
# Apply cleaning
# Passes the raw frame through the project's `clean_complaints_df`
# (imported from src.preprocessing.cleaner) and binds the result to a new name.
# NOTE(review): whether the helper mutates `df_raw` in place or returns a copy
# is not visible from this notebook — confirm before reusing `df_raw` later.
df_cleaned = clean_complaints_df(df_raw)

In [6]:
# Apply feature transformation
# Passes the cleaned frame through the project's `transform_features`
# (imported from src.preprocessing.transformer). Later cells read
# `df_transformed['text_cleaned']`, so the result is expected to carry that column.
# NOTE(review): the exact set of columns added is defined in the helper, not here — confirm.
df_transformed = transform_features(df_cleaned)

In [7]:
# Save cleaned and transformed data
processed_path = "../data/processed/clean_consumer_complaints.csv"

# `DataFrame.to_csv` raises OSError when the target directory is missing
# (e.g. on a fresh clone where data/processed is not checked in), so make
# sure it exists before writing. `exist_ok=True` keeps re-runs idempotent.
os.makedirs(os.path.dirname(processed_path), exist_ok=True)

# index=False: the row index is not written as a column of the CSV.
df_transformed.to_csv(processed_path, index=False)
print("Cleaned & processed data saved.")

Cleaned & processed data saved.


In [8]:
# Save preprocessing summary
#
# Writes a Markdown report of this preprocessing run to `summary_path`:
# the number of rows in `df_transformed`, a fixed description of the
# preprocessing steps, and the first cleaned narrative as a sample.

retained_records = df_transformed.shape[0]

# `.iloc[0]` raises IndexError on an empty frame; fall back to a placeholder
# so the report can still be produced when no rows survive preprocessing.
if retained_records:
    sample_text = df_transformed['text_cleaned'].iloc[0]
else:
    sample_text = "(no records retained)"

summary_path = "../outputs/profiling/preprocessing_summary.md"

# NOTE(review): this list is maintained by hand for the report only; it is not
# read back from the cleaner, so verify it still matches what is actually dropped.
dropped_columns = [
    "tags", "company_public_response", "consumer_consent_provided",
    "consumer_complaint_narrative", "complaint_id", "date_received", "date_sent_to_company",
    "submitted_via", "zipcode", "sub_issue", "sub_product",
    "consumer_disputed?", "company_response_to_consumer"
]

# Build the whole document first, then write it in one call. Every entry
# carries its own trailing newline(s), so `writelines` emits them verbatim.
summary_lines = [
    "##  Sprint 2: Deep Preprocessing Summary\n\n",
    f"**Records Retained:** {retained_records:,}\n\n",

    "###  Major Preprocessing Steps:\n",
    "- Dropped columns irrelevant for sentiment modeling: " + ", ".join(dropped_columns) + "\n",
    "- Text cleaning applied: Lowercased, special characters/HTML/URLs/emails/numbers removed, normalized spacing.\n",
    "- Smart Weak Sentiment Labeling using:\n",
    "  - TextBlob polarity & subjectivity\n",
    "  - Keyword-based checks (extreme negative, negative, neutral indicators)\n",
    "  - POS tagging for emphasis and negation adjustments\n",
    "- Added Encoded/Binary Columns:\n",
    "  - `consumer_disputed_binary`\n",
    "  - `timely_response_binary`\n",
    "  - `sentiment_encoded` (0: Neutral, 1: Negative, 2: Extreme Negative)\n",
    "- Added Features:\n",
    "  - `text_length` (word count of cleaned text)\n",
    "  - `product_dispute_rate` (target encoding with smoothing)\n",
    "  - `company_dispute_rate` (target encoding with smoothing)\n",
    "  - `sentiment_timely_interaction` (sentiment * timely response)\n",
    "  - `company_timely_interaction` (company dispute rate * timely response)\n",
    "- Missing states filled with 'Unknown'.\n",
    "- Carefully preserved complaint narratives with sufficient information.\n",

    "\n---\n\n",
    "### 📋 Sample of a Cleaned Complaint Narrative:\n\n",
    f"> {sample_text}\n",
]

# The target folder may not exist yet; create it so `open` does not fail.
os.makedirs(os.path.dirname(summary_path), exist_ok=True)

# Explicit UTF-8: the report contains an emoji, and without `encoding` Python
# uses the platform locale encoding, which raises UnicodeEncodeError on
# non-UTF-8 locales (e.g. cp1252 on Windows).
with open(summary_path, "w", encoding="utf-8") as f:
    f.writelines(summary_lines)

print(f"✅ Preprocessing summary written to {summary_path}")


Summary written to ../outputs/profiling/preprocessing_summary.md
