# Importing all required packages and exploring the dataset

In [1]:
import re
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from nltk.tokenize import word_tokenize

# Source file and the number of trailing lines dropped from it when loading.
# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
COMPLAINTS_CSV = '/Users/c4741/Downloads/complaints.csv'
FOOTER_ROWS_TO_SKIP = 1496058

# skipfooter is not supported by the C parser, hence engine='python'.
df = pd.read_csv(
    COMPLAINTS_CSV,
    skipfooter=FOOTER_ROWS_TO_SKIP,
    engine='python',
)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\c4741\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### The analysed column: Issue

In [2]:
# Select the single column analysed in this notebook and print a preview of it.
complaints = df.loc[:, 'Issue']
print(complaints)

0                       Attempts to collect debt not owed
1                    Incorrect information on your report
2                    Incorrect information on your report
3                                   Communication tactics
4       Problem with a credit reporting company's inve...
                              ...                        
1996    Problem with a credit reporting company's inve...
1997                          Improper use of your report
1998                          Improper use of your report
1999    Problem with a credit reporting company's inve...
2000                 Incorrect information on your report
Name: Issue, Length: 2001, dtype: object


## Categories of Issues

In [3]:
# Number of complaints per distinct issue label; the bare last expression is displayed.
issue_counts = df['Issue'].value_counts()
issue_counts

Incorrect information on your report                                                729
Problem with a credit reporting company's investigation into an existing problem    249
Attempts to collect debt not owed                                                   178
Improper use of your report                                                         101
Managing an account                                                                  89
                                                                                   ... 
Problem with customer service                                                         1
Identity theft protection or other monitoring services                                1
Identity theft / Fraud / Embezzlement                                                 1
Confusing or missing disclosures                                                      1
Loan modification,collection,foreclosure                                              1
Name: Issue, Length: 63, dtype: 

## Clean the complaints text / Pre-Processing

##### Step 1. All text has been converted to lowercase
##### Step 2. Each word from each row has been tokenized
##### Step 3. The English stop words have been removed
##### Step 4. The words have been stemmed
##### Step 5. The words have been lemmatized

In [4]:
# Steps 1-2: lower-case every issue label, then split each label into word tokens.
lowercased_issues = df['Issue'].str.lower()
df['Issue'] = lowercased_issues.apply(word_tokenize)

# Lazily-filled cache of the NLTK English stop words (loaded on first use only).
_ENGLISH_STOPWORDS = None


def stops_removal(text, stop_words=None):
    """Remove stop words from a token list and return the rest as one string.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single complaint.
    stop_words : container of str, optional
        Words to drop. When omitted, the NLTK English stop-word list is used;
        it is loaded once and cached as a frozenset instead of being re-read
        for every token.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' if nothing remains).
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        if _ENGLISH_STOPWORDS is None:
            # Loop-invariant: build the lookup set once, not once per token.
            _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
        stop_words = _ENGLISH_STOPWORDS
    return ' '.join(token for token in text if token not in stop_words)

# Step 3: drop stop words (stops_removal returns a string), then split back into tokens.
df['Issue'] = df['Issue'].apply(stops_removal).apply(word_tokenize)

# Steps 4-5: stem each token, then lemmatize the resulting stem.
# NOTE(review): the lemmatizer receives stems rather than the original words —
# confirm this order is intended.
stemmer = SnowballStemmer("english")
lmtzr = WordNetLemmatizer()


def _stem_then_lemmatize(tokens):
    """Return the tokens after stemming and then lemmatizing each one."""
    return [lmtzr.lemmatize(stemmer.stem(token)) for token in tokens]


df['Issue'] = df['Issue'].apply(_stem_then_lemmatize)

### The complaints are reformatted in order to build the vocabulary

In [5]:
# One space-joined string per complaint; reused later as input to the sklearn vectorizers.
res = [' '.join(ele) for ele in df['Issue']]

# Flat token list over the whole corpus, used to build the vocabulary.
# (The former row-by-row list build was dead code: its result was overwritten here.)
complaints = word_tokenize(' '.join(res))


### The vocabulary (word set) has been created [each cleaned word from the complaints appears exactly once in the vocabulary]

In [6]:
# Unique tokens in order of first appearance. dict.fromkeys de-duplicates while
# preserving insertion order, avoiding a linear list scan for every token.
vocabulary = list(dict.fromkeys(complaints))
print(vocabulary)

['attempt', 'collect', 'debt', 'owe', 'incorrect', 'inform', 'report', 'communic', 'tactic', 'problem', 'credit', 'compani', "'s", 'investig', 'exist', 'close', 'account', 'fraud', 'scam', 'appli', 'mortgag', 'refinanc', 'written', 'notif', 'end', 'loan', 'lea', 'alert', 'secur', 'freez', 'improp', 'use', 'troubl', 'payment', 'process', 'fals', 'statement', 'represent', 'took', 'threaten', 'take', 'negat', 'legal', 'action', 'deal', 'lender', 'servic', 'purchas', 'shown', 'manag', 'unabl', 'get', 'score', 'struggl', 'pay', 'featur', ',', 'term', 'transact', 'charg', 'fee', 'interest', "n't", 'expect', 'unexpect', 'repay', 'disclosur', 'verif', 'transfer', 'taking/threaten', 'illeg', 'advertis', 'market', 'includ', 'promot', 'offer', 'vehicl', 'damag', 'destroy', 'monitor', 'ident', 'theft', 'protect', 'open', 'caus', 'fund', 'low', 'make', 'issu', 'contact', 'someon', 'share', 'card', 'cont', "'d", 'line', 'limit', 'chang', 'mobil', 'wallet', 'shop', 'receiv', 'unauthor', 'custom', 'pa

### Creating the Bag-of-Words dictionary, which counts how often each word appears in a complaint

In [7]:
def calculateBOW(vocabulary, complaint):
    """Count how often each vocabulary word occurs in one complaint.

    Parameters
    ----------
    vocabulary : iterable of str
        Words that must appear as keys of the result (initialised to 0).
    complaint : iterable of str
        Tokens of a single complaint.

    Returns
    -------
    dict
        Maps each vocabulary word to its number of occurrences in `complaint`.
        A token that is not in `vocabulary` is added as an extra key, after
        the vocabulary keys, with its count.
    """
    from collections import Counter

    tf_diz = dict.fromkeys(vocabulary, 0)
    # Single pass over the tokens instead of calling list.count() per token.
    tf_diz.update(Counter(complaint))
    return tf_diz

# Bag of Words (BoW)

In [8]:
# One bag-of-words dict per complaint, assembled into a frame with one column per word.
bows = [calculateBOW(vocabulary, tokens) for tokens in df['Issue']]
df_bow = pd.DataFrame(bows)
print(df_bow.head())

   attempt  collect  debt  owe  incorrect  inform  report  communic  tactic  \
0        1        1     1    1          0       0       0         0       0   
1        0        0     0    0          1       1       1         0       0   
2        0        0     0    0          1       1       1         0       0   
3        0        0     0    0          0       0       0         1       1   
4        0        0     0    0          0       0       1         0       0   

   problem  ...  receiv  unauthor  custom  payoff  /  embezzl  confus  miss  \
0        0  ...       0         0       0       0  0        0       0     0   
1        0  ...       0         0       0       0  0        0       0     0   
2        0  ...       0         0       0       0  0        0       0     0   
3        0  ...       0         0       0       0  0        0       0     0   
4        2  ...       0         0       0       0  0        0       0     0   

   modif  foreclosur  
0      0           0  
1   

### Creating the Bag of Words using sklearn:

In [9]:
# Same bag-of-words representation, built by sklearn from the joined complaint strings.
vect = CountVectorizer()
term_counts = vect.fit_transform(res)
data = pd.DataFrame(
    term_counts.toarray(),
    columns=vect.get_feature_names_out(),
)
print(data.head())

   account  action  advertis  alert  appli  attempt  card  caus  chang  charg  \
0        0       0         0      0      0        1     0     0      0      0   
1        0       0         0      0      0        0     0     0      0      0   
2        0       0         0      0      0        0     0     0      0      0   
3        0       0         0      0      0        0     0     0      0      0   
4        0       0         0      0      0        0     0     0      0      0   

   ...  transfer  troubl  unabl  unauthor  unexpect  use  vehicl  verif  \
0  ...         0       0      0         0         0    0       0      0   
1  ...         0       0      0         0         0    0       0      0   
2  ...         0       0      0         0         0    0       0      0   
3  ...         0       0      0         0         0    0       0      0   
4  ...         0       0      0         0         0    0       0      0   

   wallet  written  
0       0        0  
1       0        0  

# TF-IDF

In [10]:
# TF-IDF weights for the joined complaint strings; `model` is the document-term matrix.
vectorizer = TfidfVectorizer(min_df=1)
model = vectorizer.fit_transform(res)

tfidf_columns = vectorizer.get_feature_names_out()
data_TF_IDF = pd.DataFrame(model.toarray(), columns=tfidf_columns)
print(data_TF_IDF.head())

   account  action  advertis  alert  appli   attempt  card  caus  chang  \
0      0.0     0.0       0.0    0.0    0.0  0.512721   0.0   0.0    0.0   
1      0.0     0.0       0.0    0.0    0.0  0.000000   0.0   0.0    0.0   
2      0.0     0.0       0.0    0.0    0.0  0.000000   0.0   0.0    0.0   
3      0.0     0.0       0.0    0.0    0.0  0.000000   0.0   0.0    0.0   
4      0.0     0.0       0.0    0.0    0.0  0.000000   0.0   0.0    0.0   

   charg  ...  transfer  troubl  unabl  unauthor  unexpect  use  vehicl  \
0    0.0  ...       0.0     0.0    0.0       0.0       0.0  0.0     0.0   
1    0.0  ...       0.0     0.0    0.0       0.0       0.0  0.0     0.0   
2    0.0  ...       0.0     0.0    0.0       0.0       0.0  0.0     0.0   
3    0.0  ...       0.0     0.0    0.0       0.0       0.0  0.0     0.0   
4    0.0  ...       0.0     0.0    0.0       0.0       0.0  0.0     0.0   

   verif  wallet  written  
0    0.0     0.0      0.0  
1    0.0     0.0      0.0  
2    0.0     0

# LDA

In [11]:
# Fit a 5-topic LDA model on the TF-IDF matrix; the seed is fixed for repeatable output.
# NOTE(review): LDA is usually fit on raw term counts rather than TF-IDF weights —
# confirm that using `model` here is intended.
lda_model = LatentDirichletAllocation(
    n_components=5,
    learning_method='online',
    random_state=42,
)
lda_top = lda_model.fit_transform(model)

# Topic weights of the first complaint only (row 0), printed as percentages.
first_doc_topics = lda_top[0]
print("Topics: ")
for i, topic in enumerate(first_doc_topics):
    print("Topic ",i,": ",topic*100,"%")

Topics: 
Topic  0 :  6.671158166130228 %
Topic  1 :  6.67114775256597 %
Topic  2 :  6.671149884339562 %
Topic  3 :  73.30195980027976 %
Topic  4 :  6.68458439668447 %


## The most important words for each topic

In [12]:
# The LDA model was fit on the matrix produced by `vectorizer` (TF-IDF), so the
# column-to-word mapping must come from that same vectorizer, not from `vect`.
vocab = vectorizer.get_feature_names_out()

# For every topic, print the 10 words with the largest component weight.
for i, comp in enumerate(lda_model.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key=lambda x: x[1], reverse=True)[:10]
    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0],end=" ")
    print("\n")

Topic 0: 
payment troubl process mortgag pay struggl communic tactic make appli 

Topic 1: 
incorrect inform report fals represent secur freez alert fraud statement 

Topic 2: 
problem credit compani investig exist report close account get lender 

Topic 3: 
collect owe attempt debt purchas shown statement problem scam fraud 

Topic 4: 
use improp manag account written notif debt report loan threaten 



# LSA

In [13]:
# Fit a 5-component truncated SVD (LSA) on the TF-IDF matrix.
# The randomized solver is stochastic, so the seed is fixed (same value as the
# LDA cell) to make the printed numbers reproducible between runs.
LSA_model = TruncatedSVD(
    n_components=5,
    algorithm='randomized',
    n_iter=10,
    random_state=42,
)
lsa = LSA_model.fit_transform(model)
l=lsa[0]  # component values of the first complaint only (row 0)

print("Topics :")
for i,topic in enumerate(l):
    print("Topic ",i," : ",topic*100)

Topics :
Topic  0  :  5.831511928406042e-07
Topic  1  :  0.00026915060232286954
Topic  2  :  99.1368983581916
Topic  3  :  -0.0010461751501298026
Topic  4  :  -0.0002617803009388298


## The most important words for each topic

In [14]:
# The LSA model was fit on the matrix produced by `vectorizer` (TF-IDF), so the
# column-to-word mapping must come from that same vectorizer, not from `vect`.
vocab = vectorizer.get_feature_names_out()

# For every component, print the 10 words with the largest (signed) weight.
# NOTE(review): SVD weights can be negative; ranking by signed value ignores
# words with large negative weights — confirm this is the intended ranking.
for i, comp in enumerate(LSA_model.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key=lambda x: x[1], reverse=True)[:10]
    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0],end=" ")
    print("\n")

Topic 0: 
incorrect inform report problem credit investig compani exist improp use 

Topic 1: 
problem investig compani exist credit report purchas shown statement get 

Topic 2: 
debt attempt owe collect notif written cont verif disclosur foreclosur 

Topic 3: 
account manag close open incorrect inform loan charg lender lea 

Topic 4: 
use improp account report manag close troubl card open loan 

