# Customer Complaint Text Mining

This notebook demonstrates an end-to-end NLP pipeline for analyzing
customer complaint text data using supervised and unsupervised learning.

Note:
This notebook is designed to run using the sample dataset included
in the GitHub repository (data/complaints_sample.csv).

In [None]:
import os

# pandas is imported in this cell because it runs before the notebook's main
# imports cell; without it, a fresh-kernel "Run All" fails with NameError on `pd`.
import pandas as pd

# Path to the sample dataset, relative to the notebook's working directory.
csv_path = "../data/complaints_sample.csv"

# Fail early with an actionable message instead of pandas' generic error.
if not os.path.exists(csv_path):
    raise FileNotFoundError(
        f"CSV file not found at {csv_path}. "
        "Make sure data/complaints_sample.csv exists."
    )

df = pd.read_csv(csv_path)
print("Dataset loaded:", df.shape)

In [None]:
#IMPORTS & SETUP
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    silhouette_score
)

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

print("All libraries loaded successfully.")


In [None]:
#LOAD DATA
# Read the sample complaints CSV into the working DataFrame, report its
# (rows, columns) shape, and preview the first rows as the cell output.
df = pd.read_csv(filepath_or_buffer="../data/complaints_sample.csv")
print("Dataset loaded:", df.shape)
df.head()

In [None]:
#DATA CLEANING
# Columns retained for the rest of the analysis; everything else is dropped.
keep_cols = [
    'Date received',
    'Product',
    'Issue',
    'Consumer complaint narrative',
    'Company',
    'State',
    'Submitted via',
    'Date sent to company',
    'Company response to consumer',
    'Timely response?',
    'Complaint ID',
]

# One chained pass: select columns, remove exact duplicate rows, then remove
# complaints with no narrative text (the text is the model input).
df = (
    df[keep_cols]
    .drop_duplicates()
    .dropna(subset=['Consumer complaint narrative'])
)

print("Cleaned dataset shape:", df.shape)
df.head()

In [None]:
#Select Top 5 Products
# Product categories kept for the supervised classification task.
top_5_products = [
    'Credit reporting or other personal consumer reports',
    'Credit reporting, credit repair services, or other personal consumer reports',
    'Debt collection',
    'Checking or savings account',
    'Mortgage',
]

# Independent copy so later column additions do not touch `df`.
supervised_df = df.loc[df['Product'].isin(top_5_products)].copy()
print("Filtered complaints:", supervised_df.shape[0])

In [None]:
#Simplify Labels
# Short display names for each retained product category.
# NOTE(review): the two "Credit reporting ..." categories look like the same
# product under different names but are mapped to different labels here —
# confirm this split is intended.
product_mapping = dict([
    ('Credit reporting or other personal consumer reports', 'Credit Reporting'),
    ('Credit reporting, credit repair services, or other personal consumer reports', 'Credit Repair'),
    ('Debt collection', 'Debt Collection'),
    ('Checking or savings account', 'Bank Accounts'),
    ('Mortgage', 'Mortgage'),
])

# Add the simplified label column, then show how many complaints fall in each class.
supervised_df = supervised_df.assign(
    Product_simple=supervised_df['Product'].map(product_mapping)
)
supervised_df['Product_simple'].value_counts()

In [None]:
#BALANCE DATASET
# Upper bound on complaints kept per class; smaller classes keep all their rows.
sample_per_class = 2000

# Sample each class separately and concatenate the pieces.
# Iterating over the groupby (instead of `groupby.apply`) hands each group over
# as a full sub-frame, so the 'Product_simple' column is always preserved.
# `apply` on the grouping column is deprecated in recent pandas and the column
# is dropped from the result in newer versions, which would break the cells
# below. The per-group sampling call is unchanged, so the selected rows are
# the same as before.
balanced_df = pd.concat(
    [
        group.sample(n=min(len(group), sample_per_class), random_state=42)
        for _, group in supervised_df.groupby('Product_simple')
    ]
)

print("Balanced dataset size:", len(balanced_df))
balanced_df['Product_simple'].value_counts()

In [None]:
#TRAIN / TEST SPLIT (UNIFIED)
# Encode the simplified product names as integer class ids; the fitted encoder
# is kept so predictions can be mapped back to names later.
label_encoder = LabelEncoder().fit(balanced_df['Product_simple'])
balanced_df['label'] = label_encoder.transform(balanced_df['Product_simple'])

# A single 80/20 split, stratified on the label, shared by every supervised model.
(
    train_texts,
    test_texts,
    train_labels,
    test_labels,
) = train_test_split(
    balanced_df['Consumer complaint narrative'].tolist(),
    balanced_df['label'].tolist(),
    stratify=balanced_df['label'],
    test_size=0.2,
    random_state=42,
)

print("Train size:", len(train_texts))
print("Test size:", len(test_texts))

In [None]:
#FEATURE ENGINEERING
# TF-IDF features over unigrams and bigrams. The vocabulary is fitted on the
# training texts only and then applied to the test texts.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=5000,
)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_texts)

# Plain bag-of-words counts, fitted the same way.
bow_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=5000,
)
X_train_bow = bow_vectorizer.fit_transform(train_texts)
X_test_bow = bow_vectorizer.transform(test_texts)

print("TF-IDF shape:", X_train_tfidf.shape)
print("BoW shape:", X_train_bow.shape)

In [None]:
#SUPERVISED MODELS (HORSE RACE)
# Accuracy of each supervised model, keyed by a display name; filled in by the
# model cells that follow.
supervised_results = dict()

In [None]:
#Random Forest + BoW
# Fit a 100-tree random forest on the bag-of-words training features.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_bow, train_labels
)

# Score on the held-out split and record the accuracy for the comparison.
rf_pred = rf.predict(X_test_bow)
rf_acc = accuracy_score(y_true=test_labels, y_pred=rf_pred)
supervised_results["Random Forest + BoW"] = rf_acc

rf_acc

In [None]:
#Logistic Regression + TF-IDF
# Fit logistic regression on the TF-IDF training features.
lr = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_tfidf, train_labels
)

# Score on the held-out split and record the accuracy for the comparison.
lr_pred = lr.predict(X_test_tfidf)
lr_acc = accuracy_score(y_true=test_labels, y_pred=lr_pred)
supervised_results["Logistic Regression + TF-IDF"] = lr_acc

lr_acc

In [None]:
#Naive Bayes + TF-IDF
# Fit multinomial naive Bayes on the TF-IDF training features.
nb = MultinomialNB().fit(X_train_tfidf, train_labels)

# Score on the held-out split and record the accuracy for the comparison.
nb_pred = nb.predict(X_test_tfidf)
nb_acc = accuracy_score(y_true=test_labels, y_pred=nb_pred)
supervised_results["Naive Bayes + TF-IDF"] = nb_acc

nb_acc

In [None]:
#UNSUPERVISED LEARNING
#K-Means
# Draw up to 5000 narratives from the cleaned data. The size is capped at the
# number of available rows because `DataFrame.sample` raises ValueError when
# asked for more rows than exist (without replacement), which would happen on
# a small sample dataset.
unsupervised_sample = df.sample(
    n=min(5000, len(df)), random_state=42
)['Consumer complaint narrative'].tolist()

# TF-IDF features for clustering; min_df/max_df drop terms that appear in
# fewer than 2 documents or in more than 95% of documents.
kmeans_vectorizer = TfidfVectorizer(
    max_features=3000,
    stop_words='english',
    min_df=2,
    max_df=0.95
)

X_kmeans = kmeans_vectorizer.fit_transform(unsupervised_sample)

# Cluster into 5 groups with a fixed seed and 10 initialisations.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(X_kmeans)

# Silhouette score of the resulting assignment, shown as the cell output.
kmeans_score = silhouette_score(X_kmeans, kmeans_labels)
kmeans_score

In [None]:
#UNSUPERVISED LEARNING
#LDA
# Raw term counts over the same sample used for K-Means, with the same
# document-frequency pruning.
lda_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=3000,
    max_df=0.95,
    min_df=2,
)
X_lda = lda_vectorizer.fit_transform(unsupervised_sample)

# Fit a 5-topic LDA model with a fixed seed.
lda = LatentDirichletAllocation(n_components=5, random_state=42).fit(X_lda)

# Inverse perplexity, computed on the same matrix the model was fitted on,
# so that a larger value corresponds to a lower perplexity.
lda_score = 1 / lda.perplexity(X_lda)
lda_score

In [None]:
#MODEL COMPARISON PLOT
# Bar chart with one bar per supervised model; bar height is test accuracy.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(supervised_results.keys(), supervised_results.values(), color='skyblue')
ax.set_ylabel("Accuracy")
ax.set_title("Supervised Model Comparison")
plt.xticks(rotation=45)  # model names are long, so tilt them
ax.grid(alpha=0.3)
plt.show()

In [None]:
#CONFUSION MATRIX (BEST MODEL)
# Name of the model with the highest recorded accuracy.
best_model = max(supervised_results, key=lambda name: supervised_results[name])

# Pick the test-set predictions that belong to that model; anything that is
# not the random forest or logistic regression falls through to naive Bayes.
if best_model == "Random Forest + BoW":
    best_preds = rf_pred
elif best_model == "Logistic Regression + TF-IDF":
    best_preds = lr_pred
else:
    best_preds = nb_pred

cm = confusion_matrix(test_labels, best_preds)

# Label the axes with the original class names from the label encoder.
disp = ConfusionMatrixDisplay(
    display_labels=label_encoder.classes_,
    confusion_matrix=cm,
)
disp.plot(xticks_rotation=45, cmap="Blues")
plt.title(f"Confusion Matrix - {best_model}")
plt.show()

In [None]:
#QUICK DEMO (LIVE)
def quick_demo():
    """Print Random Forest and Logistic Regression predictions for sample texts.

    Relies on the notebook-level fitted objects ``bow_vectorizer``,
    ``tfidf_vectorizer``, ``rf``, ``lr`` and ``label_encoder``. For each
    hard-coded example complaint it prints the text followed by the class
    name predicted by each of the two models. Returns nothing.
    """
    samples = (
        "Bank charged overdraft fees without warning.",
        "Credit report error affecting my mortgage.",
        "Debt collector calling multiple times daily.",
    )

    for complaint in samples:
        # Each model is fed the feature representation it was trained on.
        rf_code = rf.predict(bow_vectorizer.transform([complaint]))
        lr_code = lr.predict(tfidf_vectorizer.transform([complaint]))

        # Convert integer class ids back to readable product names.
        rf_cat = label_encoder.inverse_transform(rf_code)[0]
        lr_cat = label_encoder.inverse_transform(lr_code)[0]

        print("\nText:", complaint)
        print("RF:", rf_cat)
        print("LR:", lr_cat)


quick_demo()