### Import the required packages

In [5]:
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

### Download necessary NLTK data files

In [35]:
# NLTK resources used by this notebook:
#   stopwords -> stopwords.words('english')
#   punkt     -> tokenizer models read by word_tokenize on older NLTK releases
#   punkt_tab -> tokenizer tables read by word_tokenize on newer NLTK releases;
#                without it word_tokenize raises LookupError there
#   wordnet   -> wn.synsets(...)
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\charl\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\charl\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\charl\AppData\Roaming\nltk_data...


True

### Load the dataset, then combine the 'Issue' and 'Sub-issue' columns into a single text field

In [25]:
# Zipped CSV export of the complaints data; the archive path is handed to
# read_csv directly.
# NOTE(review): this is an absolute, machine-specific path — consider making
# the data directory configurable.
complaints_csv_path = "C:/Users/charl/Documents/Complaint Analysis/ComplaintAnalysis-clean/rows.csv.zip"
complaints_df = pd.read_csv(complaints_csv_path, low_memory=False)

# Build one free-text field per complaint from 'Issue' and 'Sub-issue'.
# Missing values are replaced with '' *before* the string cast; casting NaN
# with astype(str) would otherwise inject the literal word "nan" into the text
# (and, later, into the token counts).
issue_parts = complaints_df[['Issue', 'Sub-issue']].fillna('').astype(str)
complaints_df['combined_text'] = (
    issue_parts['Issue'] + ' ' + issue_parts['Sub-issue']
).str.strip()  # strip the dangling separator left when one part is empty

### Preprocess text

In [29]:
def preprocess_text(text: str) -> str:
    """Normalize a piece of complaint text for matching and tokenization.

    Steps, in order: lowercase, drop digit runs, drop every character that is
    neither a word character nor whitespace, collapse whitespace runs to a
    single space, and trim leading/trailing whitespace.

    Parameters
    ----------
    text : str
        Raw text to normalize.

    Returns
    -------
    str
        The normalized text; may be empty.
    """
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # Punctuation is deleted, not replaced by a space, so "didn't" -> "didnt"
    # and "third-party" -> "thirdparty". Underscores count as word characters
    # and are kept.
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    # Collapsing alone leaves a single leading/trailing space behind when the
    # input started or ended with whitespace, digits or punctuation; strip it.
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

# Normalize every combined Issue/Sub-issue string with preprocess_text.
combined_text_column = complaints_df['combined_text']
complaints_df['cleaned_text'] = combined_text_column.map(preprocess_text)

### Synonym expansion using WordNet

In [31]:
def get_synonyms(word):
    """Return the set of lower-cased lemma names across all WordNet synsets of ``word``.

    The result is empty when ``wn.synsets(word)`` yields no synsets.
    """
    return {
        lemma.name().lower()
        for synset in wn.synsets(word)
        for lemma in synset.lemmas()
    }

### Define topic mapping using synonyms

In [32]:
# Canonical topic keyword -> phrases that should be rewritten to that keyword.
# Entry order is preserved on purpose: the replacement step walks this mapping
# in order.
topic_mapping = {
    "debt collection": [
        "debt collection",
        "other debt",
        "medical debt",
    ],
    "credit reporting": [
        "credit reporting",
        "credit repair services",
        "personal consumer reports",
    ],
    "communication tactics": [
        "communication tactics",
        "talked to a third-party about your debt",
        "used obscene, profane, or other abusive language",
        "frequent or repeated calls",
    ],
    "incorrect information": [
        "incorrect information",
        "incorrect information on your report",
        "information belongs to someone else",
    ],
    "legal action": [
        "threatened to take negative or legal action",
        "threatened to sue you for very old debt",
    ],
    "notification issues": [
        "written notification about debt",
        "didn't receive enough information to verify debt",
        "didn't receive notice of right to dispute",
    ],
}

### Replace related words with topic keywords

In [33]:
# Compiled (pattern, topics) pairs keyed by the mapping's contents, so applying
# the function row by row does not rebuild the regex for every row.
_TOPIC_PATTERN_CACHE = {}


def _phrase_to_pattern(phrase):
    """Turn one phrase into a regex fragment whose punctuation is optional."""
    pieces = []
    for token in re.findall(r'\s+|\w+|[^\w\s]', phrase.strip()):
        if token.isspace():
            pieces.append(r'\s+')
        elif re.fullmatch(r'\w+', token):
            pieces.append(re.escape(token))
        else:
            # Punctuation may already have been stripped from the text
            # ("didn't" -> "didnt"), so match it with or without the character.
            pieces.append('(?:' + re.escape(token) + ')?')
    return ''.join(pieces)


def _compile_topic_pattern(entries):
    """Build one alternation over all phrases plus the topic for each group."""
    seen = set()
    candidates = []
    for topic, phrases in entries:
        for phrase in phrases:
            # A phrase without any word character would compile to a pattern
            # that matches the empty string; skip it. Skip duplicates so the
            # first topic listing a phrase wins, as in a sequential pass.
            if phrase in seen or not re.search(r'\w', phrase):
                continue
            seen.add(phrase)
            candidates.append((phrase, topic, _phrase_to_pattern(phrase)))
    if not candidates:
        return None, ()
    # Longest phrase first so a long phrase is preferred over a shorter phrase
    # that is its prefix; the sort is stable, so ties keep mapping order.
    candidates.sort(key=lambda candidate: len(candidate[0]), reverse=True)
    alternation = '|'.join('(' + fragment + ')' for _, _, fragment in candidates)
    pattern = re.compile(r'(?<!\w)(?:' + alternation + r')(?!\w)')
    return pattern, tuple(topic for _, topic, _ in candidates)


def replace_with_topic_keywords(text, topic_mapping):
    """Rewrite every known phrase in ``text`` to its topic keyword.

    Matching is whole-word, and punctuation inside a mapping phrase is
    optional, so phrases such as "third-party" or "didn't receive ..." are
    found both in raw text and in text whose punctuation was removed. All
    phrases are replaced in a single left-to-right pass, so an inserted topic
    keyword is never itself re-matched by another phrase.

    Parameters
    ----------
    text : str
        Text to rewrite.
    topic_mapping : dict
        Maps a topic keyword (str) to an iterable of phrases (str).

    Returns
    -------
    str
        ``text`` with each matched phrase replaced by its topic keyword;
        returned unchanged when the mapping has no usable phrases.
    """
    cache_key = tuple(
        (topic, tuple(phrases)) for topic, phrases in topic_mapping.items()
    )
    compiled = _TOPIC_PATTERN_CACHE.get(cache_key)
    if compiled is None:
        compiled = _compile_topic_pattern(cache_key)
        _TOPIC_PATTERN_CACHE[cache_key] = compiled
    pattern, topics = compiled
    if pattern is None:
        return text
    # Exactly one top-level group participates in each match; its index
    # identifies the phrase and therefore the topic. A callable replacement
    # inserts the topic literally (no backslash-template processing).
    return pattern.sub(lambda match: topics[match.lastindex - 1], text)

# Rewrite known phrases in each cleaned text to their topic keyword;
# topic_mapping is forwarded as the function's second positional argument.
complaints_df['mapped_text'] = complaints_df['cleaned_text'].apply(
    replace_with_topic_keywords, args=(topic_mapping,)
)

### Tokenize and remove stopwords

In [36]:
# English stop words as a set, for constant-time membership tests.
stop_words = set(stopwords.words('english'))


def _without_stop_words(tokens):
    """Return the tokens that are not in ``stop_words``, keeping their order."""
    return [token for token in tokens if token not in stop_words]


# Tokenize the topic-mapped text, then drop stop words from each token list.
complaints_df['tokens'] = complaints_df['mapped_text'].apply(word_tokenize)
complaints_df['filtered_tokens'] = complaints_df['tokens'].apply(_without_stop_words)