In [None]:
# notebooks/eda.ipynb

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from tqdm import tqdm
import os

# Register tqdm's `progress_apply` variants on pandas objects.
tqdm.pandas()

# Fetch the NLTK resources this notebook asks for.
# NOTE(review): no visible cell uses these resources -- confirm they are still needed.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

In [None]:
# === File Paths ===
# All locations are relative; they resolve against the current working directory.
raw_path = "raw/complaints.csv"  # CSV that main() hands to process_chunks()
filtered_path = "filtered/filtered_complaints.csv"  # file main() writes the filtered rows to
figures_dir = "reports/figures/"  # folder save_plots() stores its PNG files in

In [None]:
# === Product Remapping ===
# Maps the product labels found in the raw data onto the five labels listed in
# `target_products`. Entries are grouped by the label they collapse into.
product_remap = {
    **dict.fromkeys(
        ["Credit card", "Credit card or prepaid card"],
        "Credit card",
    ),
    **dict.fromkeys(
        [
            "Consumer Loan",
            "Payday loan, title loan, personal loan, or advance loan",
            "Payday loan, title loan, or personal loan",
        ],
        "Personal loan",
    ),
    **dict.fromkeys(
        ["Checking or savings account", "Bank account or service"],
        "Savings account",
    ),
    **dict.fromkeys(
        ["Money transfer, virtual currency, or money service", "Money transfers"],
        "Money transfers",
    ),
    **dict.fromkeys(
        ["Other financial service"],
        "Buy Now, Pay Later",
    ),
}

# Only rows whose (remapped) product is in this list are kept by process_chunks().
target_products = [
    "Credit card",
    "Personal loan",
    "Buy Now, Pay Later",
    "Savings account",
    "Money transfers",
]

In [None]:
# === Ensure Directories Exist ===
def setup_directories():
    """Create the output folders used by the pipeline if they do not exist yet.

    Creates the parent directory of the module-level ``filtered_path`` and the
    module-level ``figures_dir``. Existing directories are left untouched.
    """
    # os.path.dirname() returns '' when filtered_path is a bare filename, and
    # os.makedirs('') raises FileNotFoundError -- skip empty components, which
    # simply mean "the current directory".
    output_dir = os.path.dirname(filtered_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    if figures_dir:
        os.makedirs(figures_dir, exist_ok=True)

In [None]:
# === Text Cleaning Function ===
def clean_text(text):
    """Normalize a narrative to lowercase ASCII letters, digits and single spaces.

    Args:
        text: Value to clean. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        The lowercased text with every character other than ``a-z``, ``0-9``
        and whitespace removed, whitespace runs collapsed to one space and the
        ends trimmed; ``""`` for non-string input.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Characters are deleted (not replaced by a space), so "don't" -> "dont".
        alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)
        single_spaced = re.sub(r'\s+', ' ', alnum_only)
        return single_spaced.strip()
    return ""

In [None]:
# === Process Data in Chunks ===
def process_chunks(path, chunksize=10000):
    """Stream the complaints CSV and keep target-product rows with usable text.

    Every chunk goes through the same steps: product names are remapped with
    ``product_remap``, rows outside ``target_products`` are dropped, rows with
    a missing or empty narrative are dropped, the narrative is cleaned with
    ``clean_text`` into a new ``cleaned_narrative`` column, and rows whose
    cleaned narrative is empty are dropped.

    Args:
        path: CSV file handed to ``pd.read_csv``. It must contain the columns
            ``'Product'`` and ``'Consumer complaint narrative'``.
        chunksize: Number of rows read per chunk.

    Returns:
        A tuple ``(df_filtered, product_summary, narrative_lengths)``:
        the concatenated surviving rows (with ``cleaned_narrative``), a Series
        of complaint counts indexed by product, and a list holding the word
        count of every cleaned narrative.

    Raises:
        ValueError: If the file yields no chunks, so there is nothing to
            concatenate.
    """
    narrative_col = 'Consumer complaint narrative'
    df_chunks = pd.read_csv(path, chunksize=chunksize, low_memory=False)

    product_counts = []
    narrative_lengths = []
    filtered_data = []

    for chunk in tqdm(df_chunks, desc="Processing chunks"):
        # Normalize product names, then keep only the products of interest.
        chunk['Product'] = chunk['Product'].replace(product_remap)
        chunk = chunk[chunk['Product'].isin(target_products)]

        # Drop missing/empty narratives. The explicit copy makes the frame
        # independent of the parent chunk, so adding a column below does not
        # trigger pandas' SettingWithCopyWarning.
        narratives = chunk[narrative_col]
        chunk = chunk[narratives.notna() & (narratives != '')].copy()

        # Clean text
        chunk['cleaned_narrative'] = chunk[narrative_col].progress_apply(clean_text)

        # clean_text() always returns a stripped string, so comparing with ''
        # is equivalent to a truthiness check -- and, unlike the .str accessor,
        # it does not raise when a chunk ends up empty with a non-string column.
        chunk = chunk[chunk['cleaned_narrative'] != '']

        # Collect per-chunk results
        product_counts.append(chunk['Product'].value_counts())
        narrative_lengths.extend(
            chunk['cleaned_narrative'].apply(lambda text: len(text.split()))
        )
        filtered_data.append(chunk)

    if not filtered_data:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(f"No data rows could be read from {path!r}")

    # Combine the per-chunk pieces; counts for the same product are summed.
    df_filtered = pd.concat(filtered_data, ignore_index=True)
    product_summary = pd.concat(product_counts, axis=0).groupby(level=0).sum()

    return df_filtered, product_summary, narrative_lengths

In [None]:
# === Plot and Save EDA Charts ===
def _save_figure(fig, filename):
    """Write ``fig`` to ``figures_dir`` under ``filename`` and close it."""
    # bbox_inches='tight' grows the saved area to include rotated tick labels,
    # which the default bounding box can clip.
    fig.savefig(os.path.join(figures_dir, filename), bbox_inches='tight')
    plt.close(fig)


def save_plots(df_filtered, product_counts, narrative_lengths):
    """Render the five EDA charts and save them as PNG files in ``figures_dir``.

    Files written: ``product_distribution.png``,
    ``narrative_length_distribution.png``, ``complaints_trend.png``,
    ``product_issue_heatmap.png`` and ``company_distribution.png``.

    Args:
        df_filtered: DataFrame with the columns ``'Date received'``,
            ``'Product'``, ``'Issue'``, ``'Company'`` and
            ``'cleaned_narrative'``. It is not modified.
        product_counts: Series of complaint counts indexed by product name.
        narrative_lengths: Sequence of per-narrative word counts.
    """
    # 1. Complaint Distribution by Product
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=product_counts.values, y=product_counts.index, ax=ax)
    ax.set(
        title='Complaint Distribution by Product',
        xlabel='Number of Complaints',
        ylabel='Product',
    )
    _save_figure(fig, 'product_distribution.png')

    # 2. Distribution of Narrative Lengths
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(narrative_lengths, bins=50, ax=ax)
    ax.set(
        title='Distribution of Narrative Lengths (Words)',
        xlabel='Word Count',
        ylabel='Frequency',
    )
    _save_figure(fig, 'narrative_length_distribution.png')

    # 3. Complaints Trend Over Time
    # Parse the dates into a local Series instead of overwriting the caller's
    # column, so the DataFrame passed in is left exactly as it was.
    received = pd.to_datetime(df_filtered['Date received'])
    monthly_counts = received.groupby(received.dt.to_period('M')).size()
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(
        monthly_counts.index.to_timestamp(),
        monthly_counts.values,
        label='Monthly Complaints',
    )
    ax.set(
        title='Complaints Trend Over Time',
        xlabel='Date',
        ylabel='Number of Complaints',
    )
    ax.tick_params(axis='x', labelrotation=45)
    ax.legend()
    _save_figure(fig, 'complaints_trend.png')

    # 4. Product vs Issue Type Heatmap
    # Counting 'cleaned_narrative' gives the number of complaints per
    # (product, issue) pair; combinations that never occur are filled with 0.
    pivot_table = df_filtered.pivot_table(
        values='cleaned_narrative',
        index='Product',
        columns='Issue',
        aggfunc='count',
        fill_value=0,
    )
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(pivot_table, annot=True, fmt='d', cmap='YlOrRd', ax=ax)
    ax.set(
        title='Heatmap of Product vs Issue Type',
        xlabel='Issue Type',
        ylabel='Product',
    )
    ax.tick_params(axis='x', labelrotation=45)
    _save_figure(fig, 'product_issue_heatmap.png')

    # 5. Distribution of Complaints per Company
    company_counts = df_filtered['Company'].value_counts().head(10)
    fig, ax = plt.subplots(figsize=(10, 6))
    company_counts.plot(kind='bar', ax=ax)
    ax.set(
        title='Top 10 Companies by Complaint Volume',
        xlabel='Company',
        ylabel='Number of Complaints',
    )
    ax.tick_params(axis='x', labelrotation=45)
    _save_figure(fig, 'company_distribution.png')

In [None]:
# === Main Pipeline ===
def main():
    """Run the whole EDA pipeline and print summary statistics.

    Steps: create the output folders, filter and clean the raw complaints from
    ``raw_path``, write the result to ``filtered_path``, save the charts, then
    print the row count, average narrative length, product distribution and
    the Python types found in ``cleaned_narrative``.
    """
    print("Setting up folders...")
    setup_directories()

    print("Processing data...")
    df_filtered, product_counts, narrative_lengths = process_chunks(raw_path)

    print(f"Saving cleaned dataset to {filtered_path}...")
    # NOTE(review): the output is gzip-compressed although filtered_path ends in
    # '.csv'; readers that infer compression from the extension will not
    # decompress it -- confirm this is intended.
    df_filtered.to_csv(filtered_path, index=False, compression='gzip')

    print("Saving EDA plots...")
    save_plots(df_filtered, product_counts, narrative_lengths)

    # Guard the mean: with no surviving narratives the division would raise
    # ZeroDivisionError instead of printing the summary.
    if narrative_lengths:
        average_length = sum(narrative_lengths) / len(narrative_lengths)
    else:
        average_length = 0.0

    # Summary and type check
    print("\nSummary Stats for Interim Report:")
    print(f"Total complaints after filtering: {len(df_filtered):,}")
    print(f"Average narrative length: {average_length:.2f} words")
    print("Product distribution:")
    print(product_counts)
    print("Narrative type distribution:")
    print(df_filtered['cleaned_narrative'].map(type).value_counts())

In [None]:
# === Run ===
# Run the full pipeline only when this code executes as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()