In [27]:
import nltk 
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

For this notebook we will preprocess the data to prepare it for text analytics. The data we will be using was prepared in DataExplore.ipynb and is only data where we have the text of the customer complaint.

In [28]:
data = pd.read_csv('../../student-loan-complaints-data/text_analysis_data.csv')

In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26854 entries, 0 to 26853
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Date received                 26854 non-null  object
 1   Sub-product                   26854 non-null  object
 2   Issue                         26854 non-null  object
 3   Sub-issue                     26854 non-null  object
 4   Consumer complaint narrative  26854 non-null  object
 5   Company                       26854 non-null  object
 6   State                         26854 non-null  object
 7   Tags                          26854 non-null  object
 8   Company response to consumer  26854 non-null  object
 9   Timely response?              26854 non-null  object
 10  Consumer disputed?            11149 non-null  object
 11  month                         26854 non-null  int64 
 12  year                          26854 non-null  int64 
dtypes: int64(2), obj

In [30]:
data.head()

Unnamed: 0,Date received,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company,State,Tags,Company response to consumer,Timely response?,Consumer disputed?,month,year
0,2020-05-19,Private student loan,Dealing with your lender or servicer,Received bad information about your loan,When I was applying for my loan my XXXX accoun...,"Figure Technologies, Inc",NJ,,Closed with explanation,Yes,,5,2020
1,2020-02-06,Federal student loan servicing,Incorrect information on your report,Account status incorrect,I'm on a deferred payment plan t never ; late,"Nelnet, Inc.",TX,,Closed with explanation,Yes,,2,2020
2,2020-02-08,Federal student loan servicing,Dealing with your lender or servicer,Problem with customer service,I have attempted multiple times to contact FED...,AES/PHEAA,KY,,Closed with non-monetary relief,Yes,,2,2020
3,2020-01-21,Federal student loan servicing,Dealing with your lender or servicer,Trouble with how payments are being handled,I was divorced in 2004 and I agreed to take th...,AES/PHEAA,OK,,Closed with explanation,Yes,,1,2020
4,2019-12-04,Federal student loan servicing,Problem with a credit reporting company's inve...,Their investigation did not fix an error on yo...,This particular account situation that is late...,AES/PHEAA,FL,,Closed with explanation,Yes,,12,2019


We will write a function to preprocess the data incase we need to resuse the process later (probably not since we are doing it to all our text data, but it's a good practice to keep).

In [31]:
# A function to preprocess all rows in a dataframe
def preprocess_data(data):
    # Change all text to lowercase
    data = data.apply(lambda x: " ".join(x.lower() for x in x.split()))
    
    # Remove puctuation
    data = data.str.replace("[^\w\s]","")
    
    # Remove stopwords
    from nltk.corpus import stopwords
    stop = stopwords.words("english")
    data = data.apply(lambda x: " ".join(x for x in x.split() if x not in stop))
    
    # Remove common words
    freq = pd.Series(" ".join(data).split()).value_counts()[:10]
    freq = list(freq.index)
    data = data.apply(lambda x: " ".join(x for x in x.split() if x not in freq))
    
    # Lemmatization
    from textblob import Word
    data = data.apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

    
    # Return transformed data
    return data

# Replace customer complaint text with a preprocessed version
data["Consumer complaint narrative"] = preprocess_data(data["Consumer complaint narrative"]).head()

In [32]:
data["Consumer complaint narrative"].head()

0    applying account correctly communicate issue o...
1                          im deferred plan never late
2    attempted multiple time contact fedloan via on...
3    divorced 2004 agreed take school divorce conso...
4    particular account situation lately filing cre...
Name: Consumer complaint narrative, dtype: object

In [33]:
data.head()

Unnamed: 0,Date received,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company,State,Tags,Company response to consumer,Timely response?,Consumer disputed?,month,year
0,2020-05-19,Private student loan,Dealing with your lender or servicer,Received bad information about your loan,applying account correctly communicate issue o...,"Figure Technologies, Inc",NJ,,Closed with explanation,Yes,,5,2020
1,2020-02-06,Federal student loan servicing,Incorrect information on your report,Account status incorrect,im deferred plan never late,"Nelnet, Inc.",TX,,Closed with explanation,Yes,,2,2020
2,2020-02-08,Federal student loan servicing,Dealing with your lender or servicer,Problem with customer service,attempted multiple time contact fedloan via on...,AES/PHEAA,KY,,Closed with non-monetary relief,Yes,,2,2020
3,2020-01-21,Federal student loan servicing,Dealing with your lender or servicer,Trouble with how payments are being handled,divorced 2004 agreed take school divorce conso...,AES/PHEAA,OK,,Closed with explanation,Yes,,1,2020
4,2019-12-04,Federal student loan servicing,Problem with a credit reporting company's inve...,Their investigation did not fix an error on yo...,particular account situation lately filing cre...,AES/PHEAA,FL,,Closed with explanation,Yes,,12,2019


The data has been preprocesses, which we double checked just above. Now we've prepared our data we are going to double check and make sure all the complaints reflect the marked issues. To do this we will perform topic modeling on the data. The algorithms we will use is the latent dirichlet allocation (LDA) model.

In [58]:
import re
from gensim import models, corpora
from nltk import word_tokenize

NUM_TOPICS = 10

# Tokenize text
data["tokenized text"] = data.apply(lambda row: nltk.word_tokenize(str(row["Consumer complaint narrative"])), axis=1)

print(data["tokenized text"])

# # Build a dictionary - association word to numeric id
# dictionary = corpora.Dictionary(data['Consumer complaint narrative'][0])

# # # Transform the collection of texts to a numerical form
# # corpus = [dictionary.doc2bow(text) for text in data['Consumer complaint narrative']]

# # # Look at the document
# # print(corpus[20])

# # # Build the LDA model
# # lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

# # # Build the LSI model
# # lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

0        [applying, account, correctly, communicate, is...
1                        [im, deferred, plan, never, late]
2        [attempted, multiple, time, contact, fedloan, ...
3        [divorced, 2004, agreed, take, school, divorce...
4        [particular, account, situation, lately, filin...
                               ...                        
26849                                                [nan]
26850                                                [nan]
26851                                                [nan]
26852                                                [nan]
26853                                                [nan]
Name: tokenized text, Length: 26854, dtype: object


In [83]:
data["dictionary"] = corpora.Dictionary(data['tokenized text'])

data["dictionary"] = data["dictionary"].apply(lambda x:)

# # Transform the collection of texts to a numerical form
# corpus = [dictionary.doc2bow(text) for text in data['tokenized text']]

# # Look at the document
# print(corpus[20])

# # Build the LDA model
# lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

# # Build the LSI model
# lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

ValueError: Length of values does not match length of index

In [82]:
dictionary[0:10]

TypeError: unhashable type: 'slice'

In [69]:
data["tokenized text"]

0        [applying, account, correctly, communicate, is...
1                        [im, deferred, plan, never, late]
2        [attempted, multiple, time, contact, fedloan, ...
3        [divorced, 2004, agreed, take, school, divorce...
4        [particular, account, situation, lately, filin...
                               ...                        
26849                                                [nan]
26850                                                [nan]
26851                                                [nan]
26852                                                [nan]
26853                                                [nan]
Name: tokenized text, Length: 26854, dtype: object

In [68]:
print(dictionary[0])

025


In [56]:
print("len:", len(data["tokenized text"]))
for word in data["tokenized text"][0]:
    print(word)

len: 26854
applying
account
correctly
communicate
issue
offer
025
rate
deduction
autopay
showing
account
told
go
application
anyway
account
opened
could
add
autopay
receive
discount
way
since
account
opened
called
call
center
least
4
time
trying
receive
autopay
discount
first
3
time
told
going
applied
still
seen
additionally
last
time
called
3
week
ago
asked
speak
manager
told
take
10
day
get
back
still
yet
hear
back
15
business
day
later
told
receiving
autopay
discount
receiving
opened
account
company
lying
rate
going
receive
dont
autopay
initiate
2
autopays
go
far
additional
issue
told
rate
going
based
1
month
libor
rate
published
wsj
month
none
rate
received
thus
far
match
rate
dont
really
know
tried
contacting
many
time
people
phone
seem
helpful
time
talking
nothing
seems
get
done
hang


In [36]:
from nltk.corpus import brown

other_data = []

for fileid in brown.fileids():
    document = ' '.join(brown.words(fileid))
    other_data.append(document)
    
print(len(other_data))
print(other_data[:5])

500


In [39]:
text_data = []

for row in data["Consumer complaint narrative"]:
    text_data.append(data["Consumer complaint narrative"][row])
    
text_data[:5]

KeyError: 'applying account correctly communicate issue offer 025 rate deduction autopay showing account told go application anyway account opened could add autopay receive discount way since account opened called call center least 4 time trying receive autopay discount first 3 time told going applied still seen additionally last time called 3 week ago asked speak manager told take 10 day get back still yet hear back 15 business day later told receiving autopay discount receiving opened account company lying rate going receive dont autopay initiate 2 autopays go far additional issue told rate going based 1 month libor rate published wsj month none rate received thus far match rate dont really know tried contacting many time people phone seem helpful time talking nothing seems get done hang'