# CFPB Complaints EDA and Preprocessing Notebook

In [1]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re


In [6]:
def load_data(file_path, chunksize=10000, max_rows=2000000):
    """Load at most ``max_rows`` rows from the CFPB complaint CSV.

    The file is streamed in chunks of ``chunksize`` rows so that a capped
    sample can be taken without parsing the whole file.

    Args:
        file_path: Path to the CSV file (read as UTF-8).
        chunksize: Number of rows parsed per chunk.
        max_rows: Upper bound on the number of rows returned
            (default 2,000,000).

    Returns:
        A DataFrame with a fresh 0..n-1 index holding the first
        ``min(max_rows, rows in file)`` rows. When no rows are selected
        (for example ``max_rows <= 0``) an empty DataFrame that still carries
        the file's columns is returned.
    """
    if max_rows <= 0:
        # Nothing requested: parse only the header so callers still get the
        # column layout instead of a "No objects to concatenate" error.
        return pd.read_csv(file_path, nrows=0, encoding='utf-8')

    chunk_list = []
    total_rows = 0

    reader = pd.read_csv(file_path, chunksize=chunksize, encoding='utf-8', low_memory=False)
    try:
        for chunk in reader:
            remaining = max_rows - total_rows
            chunk_list.append(chunk.iloc[:remaining])
            total_rows += min(len(chunk), remaining)
            if total_rows >= max_rows:
                # Stop before parsing a further chunk that would be discarded.
                break
    finally:
        # Release the underlying file handle even when the loop exits early.
        reader.close()

    if not chunk_list:
        # The reader produced no chunk at all; fall back to a header-only read.
        return pd.read_csv(file_path, nrows=0, encoding='utf-8')

    return pd.concat(chunk_list, ignore_index=True)


In [8]:

def perform_eda(df, output_dir='../reports'):
    """Print summary statistics and save two EDA figures for the dataset.

    Prints the frame's info, per-column missing-value counts, the number of
    distinct products and the number of complaints without a narrative, and
    writes ``product_distribution.png`` and ``word_count_distribution.png``
    into ``output_dir``.

    Args:
        df: DataFrame with at least the ``'Product'`` and
            ``'Consumer complaint narrative'`` columns.
        output_dir: Directory the figures are written to; created if missing.

    Returns:
        The same DataFrame object that was passed in. Note that it is
        modified in place: a ``'word_count'`` column (whitespace-separated
        token count per narrative; missing narratives yield a missing value)
        is added.
    """
    import os  # local import so this cell does not depend on an extra top-level import

    # savefig does not create parent directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    narratives = df['Consumer complaint narrative']

    print("Basic Data Information:")
    # DataFrame.info() prints its report itself and returns None; wrapping it
    # in print() would emit a stray "None" line after the report.
    df.info()

    print("\nMissing Values Count:")
    print(df.isnull().sum())

    print("\nNumber of unique products:", df['Product'].nunique())

    # Distribution of complaints across products
    plt.figure(figsize=(12, 6))
    product_counts = df['Product'].value_counts()
    sns.barplot(x=product_counts.values, y=product_counts.index)
    plt.title('Distribution of Complaints Across Products')
    plt.xlabel('Number of Complaints')
    plt.ylabel('Product')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'product_distribution.png'))
    plt.close()

    # Word count distribution of narratives (added to the caller's frame)
    df['word_count'] = narratives.str.split().str.len()

    plt.figure(figsize=(10, 6))
    sns.histplot(data=df, x='word_count', bins=50)
    plt.title('Word Count Distribution of Complaint Narratives')
    plt.xlabel('Word Count')
    plt.ylabel('Frequency')
    # The x-axis is clipped at 1000 words; longer narratives are not shown.
    plt.xlim(0, 1000)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'word_count_distribution.png'))
    plt.close()

    narratives_missing = narratives.isnull().sum()
    print(f"\nNumber of complaints without narratives: {narratives_missing} out of {len(df)}")

    return df


In [9]:

def clean_text(text):
    """Normalise a complaint narrative to lowercase ASCII letters and single spaces.

    Args:
        text: The raw narrative. Missing scalars (``None``, ``NaN``,
            ``pd.NA``) are treated as empty; any other non-string value is
            converted with ``str()`` before cleaning.

    Returns:
        The cleaned string: lowercased, every character that is not an ASCII
        letter or whitespace replaced by a space, runs of whitespace
        collapsed to one space, and leading/trailing whitespace removed.
        Returns ``""`` for missing input.
    """
    if not isinstance(text, str):
        # pd.isna returns an array for list-likes, which is ambiguous in an
        # ``if``; only scalar values can count as "missing" here.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ""
        # Object columns can hold numbers or other values; coerce instead of
        # failing on the missing ``.lower()`` method.
        text = str(text)

    text = text.lower()
    # Digits, punctuation and non-ASCII letters all become spaces.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [10]:

def preprocess_data(df, target_products=None, min_words=10):
    """Filter the complaints to the target products and add cleaned narratives.

    Steps:
        1. Keep rows whose ``'Product'`` is one of ``target_products``.
        2. Drop rows whose ``'Consumer complaint narrative'`` is null or ``""``.
        3. Add a ``'cleaned_narrative'`` column produced by ``clean_text``.
        4. Keep rows whose cleaned narrative has more than ``min_words`` words.

    Args:
        df: DataFrame with ``'Product'`` and
            ``'Consumer complaint narrative'`` columns. It is not modified.
        target_products: Product names to keep. Defaults to the five project
            categories listed below.
            NOTE(review): matching is exact; confirm these labels against
            ``df['Product'].unique()`` for the dataset version in use.
        min_words: A narrative is kept only when its cleaned text has strictly
            more than this many whitespace-separated words.

    Returns:
        A new DataFrame (original index labels preserved) containing the
        surviving rows plus the ``'cleaned_narrative'`` column.
    """
    if target_products is None:
        target_products = [
            'Credit card',
            'Personal loan',
            'Buy Now Pay Later (BNPL)',
            'Savings account',
            'Money transfer'
        ]

    narrative = df['Consumer complaint narrative']
    keep = df['Product'].isin(target_products) & narrative.notnull() & (narrative != "")

    # Take an explicit copy: assigning a column onto a filtered slice triggers
    # pandas' SettingWithCopyWarning and may not behave as intended.
    filtered = df.loc[keep].copy()

    cleaned = filtered['Consumer complaint narrative'].map(clean_text)
    filtered['cleaned_narrative'] = cleaned

    # Count words with plain str.split so an empty selection (whose column may
    # not have a string dtype) does not trip the ``.str`` accessor.
    word_counts = cleaned.map(lambda value: len(value.split()))
    return filtered[word_counts > min_words]


In [11]:

# Execute the pipeline end to end: load -> explore -> filter -> persist
df = load_data('../data/complaints.csv')
df = perform_eda(df)

df_filtered = preprocess_data(df)
df_filtered.to_csv('../data/filtered_complaints.csv', index=False)

# Write the static EDA summary report in a single call
with open('../reports/eda_summary.txt', 'w') as f:
    f.writelines([
        "Exploratory Data Analysis Summary\n",
        "================================\n\n",
        "Key Findings:\n",
        "- The dataset contains complaints across multiple financial products.\n",
        "- There are significant variations in complaint volumes across different products.\n",
        "- Many complaints lack detailed narratives, which need to be filtered out.\n",
        "- Narrative lengths vary widely, requiring appropriate chunking strategies.\n\n",
        "Data Preparation Steps:\n",
        "- Filtered dataset to include only the five specified product categories.\n",
        "- Removed records with missing or empty complaint narratives.\n",
        "- Cleaned text through lowercasing and removal of special characters.\n",
        "- Filtered out very short narratives that would not provide meaningful context.\n",
    ])

# Report the outcome to the reader
print(f"\nFinal dataset shape: {df_filtered.shape}")
print("Filtered dataset saved to ../data/filtered_complaints.csv")


Basic Data Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 18 columns):
 #   Column                        Dtype 
---  ------                        ----- 
 0   Date received                 object
 1   Product                       object
 2   Sub-product                   object
 3   Issue                         object
 4   Sub-issue                     object
 5   Consumer complaint narrative  object
 6   Company public response       object
 7   Company                       object
 8   State                         object
 9   ZIP code                      object
 10  Tags                          object
 11  Consumer consent provided?    object
 12  Submitted via                 object
 13  Date sent to company          object
 14  Company response to consumer  object
 15  Timely response?              object
 16  Consumer disputed?            object
 17  Complaint ID                  int64 
dtypes: int64(1), objec