In [None]:
# AI-Powered Customer Complaint Classification

### By: Dana Brooks

---

**Project Goal:** This notebook documents the end-to-end process of building a machine learning model to automatically classify customer mortgage complaints into one of 22 distinct categories.

**Process:** The project involves data cleaning, exploratory data analysis (EDA), natural language processing (NLP) with TF-IDF, and an iterative modeling process to find the most effective solution.

**Result:** The final **Logistic Regression model** successfully categorizes complaints with **52% accuracy**, a result that is over **11 times better than random chance** (uniform guessing across 22 categories would be correct about 4.5% of the time), demonstrating a strong and reliable baseline for this business problem.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Widen pandas' display limits so wide frames and long complaint text are
# shown instead of being elided.
for option_name, option_value in {
    'display.max_columns': 50,
    'display.max_colwidth': 200,
}.items():
    pd.set_option(option_name, option_value)

In [None]:
# Path of the complaints CSV.
# NOTE(review): the value looks like a placeholder — point it at the real file before running.
DATA_PATH = r'upload dataset'

# Raw, unmodified dataset; later cells derive their own frames from it.
df = pd.read_csv(DATA_PATH)

In [None]:
# Quick look at the first rows to confirm the file parsed as expected.
print("First 5 rows:", df.head(), sep="\n")


In [None]:
# (rows, columns) of the raw dataset.
print("\nDataset shape:", df.shape, sep="\n")


In [None]:
# df.info() writes its own report to stdout, so only the heading goes through print().
info_heading = "\nDataset info:"
print(info_heading)
df.info()


In [None]:
# Keep only the label column and the free-text column.
# Rows missing either value are dropped here: a complaint without a label
# cannot be used for supervised training, and a missing narrative carries no
# text to learn from. .copy() detaches the result from `df` so later column
# assignments do not trigger chained-assignment warnings.
df_clean = (
    df[['consumer_complaint_code', 'consumer_complaint_narrative']]
    .dropna()
    .copy()
)

In [None]:
# Give the two columns the short working names used by the rest of the notebook.
df_clean = df_clean.rename(columns={
    'consumer_complaint_code': 'issue',
    'consumer_complaint_narrative': 'complaint',
})

In [None]:
# Confirm the trimmed frame has the expected two columns.
print("New focused DataFrame:", df_clean.head(), sep="\n")

In [None]:
# Class-balance check: the 20 most frequent issue labels with their counts.
issue_frequencies = df_clean['issue'].value_counts()
print("\nDistribution of complaints per issue:", issue_frequencies.head(20), sep="\n")

In [None]:
# Horizontal bar chart of the 20 most common issue labels, most frequent first.
top_issue_order = df_clean['issue'].value_counts().index[:20]

fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=df_clean, y='issue', order=top_issue_order, ax=ax)
ax.set_title('Top 20 Mortgage Complaint Issues')
ax.set_xlabel('Number of Complaints')
ax.set_ylabel('Issue Type')
fig.tight_layout()
plt.show()

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string 

In [None]:
# NLTK data packages used by the text-cleaning step:
#   punkt / punkt_tab -> tokenizer models behind word_tokenize
#   stopwords         -> English stop-word list
#   wordnet           -> lexical database behind WordNetLemmatizer
# Newer NLTK releases load 'punkt_tab' instead of the pickled 'punkt' models,
# so both are fetched to keep word_tokenize working on a fresh environment.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
# Build the shared helpers once so the cleaning function does not recreate
# them for every complaint.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)  # set -> constant-time membership test per token
lemmatizer = WordNetLemmatizer()

In [None]:
# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize one complaint narrative for bag-of-words vectorization.

    Steps: lowercase, strip ASCII punctuation, tokenize, drop English stop
    words, lemmatize the remaining tokens and re-join them with spaces.

    Parameters
    ----------
    text : str or missing value
        Raw narrative. Non-string values (``None``, ``NaN``, ...) are treated
        as missing so the function can be applied to a column with gaps.

    Returns
    -------
    str
        Space-separated cleaned tokens; ``""`` when there is nothing to clean.
    """
    # Missing narratives arrive from pandas as None/NaN (a float), which has
    # no .lower(); return empty text instead of raising AttributeError.
    if not isinstance(text, str):
        return ""

    # 1-2. Lowercase, then delete punctuation.
    text = text.lower().translate(_PUNCTUATION_TABLE)
    if not text.strip():
        # Nothing but whitespace left: the tokenizer would yield no tokens.
        return ""

    # 3. Tokenize (split text into words).
    words = word_tokenize(text)

    # 4. Remove stop words and lemmatize what remains.
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    # 5. Join words back into a single string.
    return " ".join(lemmatized_words)

print("Function defined successfully.")

In [None]:
# Run the cleaner over every narrative and store the result next to the raw text.
print("Cleaning text data... this may take a moment.")
raw_complaints = df_clean['complaint']
df_clean['cleaned_complaint'] = raw_complaints.map(preprocess_text)
print("Cleaning complete!")

# Side-by-side sample of raw vs. cleaned text as a sanity check.
comparison_columns = ['complaint', 'cleaned_complaint']
print(df_clean[comparison_columns].head())

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# --- Drop issue labels that occur only once ---
# The split below is stratified on the label, so labels with a single
# complaint are removed first.
issue_counts = df_clean['issue'].value_counts()
issues_to_keep = issue_counts.loc[issue_counts.gt(1)].index
has_enough_samples = df_clean['issue'].isin(issues_to_keep)
df_clean_filtered = df_clean.loc[has_enough_samples]

print(f"Original number of rows: {len(df_clean)}")
print(f"Number of rows after filtering rare categories: {len(df_clean_filtered)}")


# --- Features / target taken from the FILTERED frame ---
y = df_clean_filtered['issue']
X = df_clean_filtered['cleaned_complaint']

# 80/20 split; stratify=y keeps the label proportions equal in both parts,
# random_state pins the shuffle.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


# --- TF-IDF features ---
# The vocabulary (capped at 5000 terms) and IDF weights are learned from the
# training text only; the test text is transformed with that fitted state.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("\nTF-IDF vectors created successfully on filtered data.")
print(f"Shape of training vectors: {X_train_tfidf.shape}")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# 1. Initialize the model.
#    random_state -> reproducible fits.
#    max_iter     -> raised above scikit-learn's default of 100 so the solver
#                    has room to converge on the sparse TF-IDF features rather
#                    than stopping early with a ConvergenceWarning.
model = LogisticRegression(random_state=42, max_iter=1000)

# 2. Train on the TF-IDF vectors of the training split.
print("Training the model...")
model.fit(X_train_tfidf, y_train)
print("Training complete!")

# 3. Predict labels for the held-out test split.
print("\nMaking predictions on the test set...")
y_pred = model.predict(X_test_tfidf)

# 4. Evaluate: overall accuracy, then per-class precision/recall/F1.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.2f}")

print("\nClassification Report:")
# zero_division=0 reports 0 for classes the model never predicts, which is what
# the default does as well, but without emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division=0))