In [1]:
import pandas as pd
import os
import json
import re
import numpy as np

from gensim.models import Nmf
from gensim.models.ldamulticore import LdaMulticore
from top2vec import Top2Vec
from bertopic import BERTopic

from gensim.corpora.dictionary import Dictionary

In [2]:
# Load the raw complaint JSON export into a flat DataFrame. The context
# manager closes the file handle as soon as parsing finishes instead of
# leaving it open for the lifetime of the kernel.
with open("complaints-2021-05-14_08_16_.json") as complaints_file:
    raw = pd.json_normalize(json.load(complaints_file))

# Rename the flattened JSON columns positionally (22 names, one per column).
raw.columns = ['index', 'type', 'id', 'score', 'tags', 'zip_code','complaint_id', 'issue', 'date_received',
       'state', 'consumer_disputed', 'product','company_response', 'company', 'submitted_via',
       'date_sent_to_company', 'company_public_response','sub_product', 'timely',
       'complaint_what_happened', 'sub_issue','consumer_consent_provided']

# Keep only the free-text narrative column; every other column is discarded.
raw = raw[['complaint_what_happened']]

# Treat empty / whitespace-only narratives as missing, then drop missing rows.
# The result is assigned back rather than using an in-place replace on a column
# selection, which does not update the parent frame under pandas Copy-on-Write.
# The original row labels are preserved (no index reset).
raw = raw.replace(r'^\s*$', np.nan, regex=True).dropna()

# clear masked values
def remove_mask(text):
    """Remove masked values (runs of ``x`` / ``X``) from ``text``.

    A mask is a run of 2 to 20 ``x``/``X`` characters that starts at a word
    boundary. Each match is replaced with an empty string; surrounding
    whitespace and punctuation are left untouched.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every mask removed.
    """
    # Inside a character class ``|`` is a literal pipe, not alternation, so the
    # class must list only the two letters; otherwise runs of ``|`` are removed too.
    masked_text_pattern = r"\b[xX]{2,20}"
    return re.sub(masked_text_pattern, '', text)

# Apply the mask removal to every complaint narrative, element by element.
removed_mask = raw['complaint_what_happened'].map(remove_mask)

In [3]:
# Read the stage-1 preprocessed CSV and relabel its two columns.
df = pd.read_csv('process_csv_stage_1.csv').set_axis(
    ['old_index', 'preprocessed_text'], axis=1
)
df

Unnamed: 0,old_index,preprocessed_text
0,1,good morning name appreciate could help put st...
1,2,upgraded card tell agent upgrade anniversary d...
2,10,chase card report however fraudulent applicati...
3,11,try book ticket come across offer apply toward...
4,14,grand son give check deposit chase account fun...
...,...,...
21067,78303,chase card customer well decade offer multiple...
21068,78309,wednesday call chas visa credit card provider ...
21069,78310,familiar pay understand great risk provide con...
21070,78311,flawless credit chase credit card chase freedo...


In [4]:
# Preprocessed text of the first row whose old_index equals 42.
df.loc[df['old_index'] == 42, 'preprocessed_text'].iloc[0]

'card miss realize today check email say deposit money take bank account'

In [5]:
# Load the four saved topic models from ./topic_models (relative to the
# current working directory).
# NOTE(review): these loaders presumably deserialize pickled objects - only
# load model files from a trusted source.
topic_models_dir = os.path.join(os.getcwd(), 'topic_models')

load_lda = LdaMulticore.load(os.path.join(topic_models_dir, "final_lda"))
load_nmf = Nmf.load(os.path.join(topic_models_dir, "final_nmf"))
load_t2v = Top2Vec.load(os.path.join(topic_models_dir, 'final_t2v'))
load_bertopic = BERTopic.load(os.path.join(topic_models_dir, 'final_bertopic'))

- Label every document in the df with the topic each model assigns to it.
- Resulting df: preprocessed_text | topic_nmf | topic_lda | topic_t2v | topic_btp
- Also build the index-topic matrix for each model.

In [6]:
# Load the id -> word dictionary saved next to the LDA model.
lda_id2word_path = os.path.join(os.getcwd(), 'topic_models', "final_lda.id2word")
load_lda_id2word = Dictionary.load(lda_id2word_path)

In [7]:
# Print every (token id, token) pair held by the dictionary.
for token_id, token in load_lda_id2word.iteritems():
    print((token_id, token))

(0, 'acceptable')
(1, 'account')
(2, 'advance')
(3, 'appreciate')
(4, 'ask')
(5, 'attempt')
(6, 'bank')
(7, 'card')
(8, 'collect')
(9, 'consumer')
(10, 'could')
(11, 'debt')
(12, 'every')
(13, 'good')
(14, 'help')
(15, 'information')
(16, 'instead')
(17, 'know')
(18, 'mail')
(19, 'member')
(20, 'month')
(21, 'morning')
(22, 'name')
(23, 'put')
(24, 'receive')
(25, 'right')
(26, 'send')
(27, 'service')
(28, 'statement')
(29, 'stop')
(30, 'thanks')
(31, 'validate')
(32, 'verification')
(33, 'write')
(34, 'agent')
(35, 'anniversary')
(36, 'change')
(37, 'consent')
(38, 'date')
(39, 'give')
(40, 'mislead')
(41, 'order')
(42, 'record')
(43, 'tell')
(44, 'turned')
(45, 'upgrade')
(46, 'upgraded')
(47, 'without')
(48, 'would')
(49, 'wrong')
(50, 'applicant')
(51, 'application')
(52, 'credit')
(53, 'extend')
(54, 'fraudulent')
(55, 'fraudulently')
(56, 'however')
(57, 'identity')
(58, 'obtain')
(59, 'report')
(60, 'submit')
(61, 'verify')
(62, 'absolutely')
(63, 'across')
(64, 'ahead')
(65, 'a

In [8]:
# Split each preprocessed document into tokens on single spaces.
texts = df['preprocessed_text'].str.split(' ')

# Turn every token list into a bag-of-words vector using the loaded dictionary.
corpus = list(map(load_lda_id2word.doc2bow, texts))

# Pick one document (by position in the corpus) to inspect.
# NOTE(review): presumably the row with old_index 42 - verify against df.
document_num = 17
bow_doc_x = corpus[document_num]

# Each bag-of-words entry is a (token id, count) pair.
for token_id, token_count in bow_doc_x:
    print('Word {} ("{}") appears {} time.'.format(
        token_id,
        load_lda_id2word[token_id],
        token_count,
    ))

Word 1 ("account") appears 1 time.
Word 6 ("bank") appears 1 time.
Word 7 ("card") appears 1 time.
Word 119 ("check") appears 1 time.
Word 124 ("deposit") appears 1 time.
Word 127 ("money") appears 1 time.
Word 134 ("say") appears 1 time.
Word 136 ("take") appears 1 time.
Word 190 ("realize") appears 1 time.
Word 290 ("email") appears 1 time.
Word 441 ("today") appears 1 time.
Word 463 ("miss") appears 1 time.


# LDA

In [9]:
# Run LDA inference on the sample document; the first element of the result
# is the list of (topic id, score) pairs.
lda_inference = load_lda[bow_doc_x]
lda_inference[0]

[(13, 0.09195252), (22, 0.3652678), (25, 0.31209028), (28, 0.16255431)]

In [10]:
# Rank the document's LDA topics from highest to lowest score and show the top
# five words of each. Inference is re-run here, so the scores can differ
# slightly from an earlier run on the same document.
lda_doc_topics = load_lda[bow_doc_x][0]
for topic_id, topic_score in sorted(lda_doc_topics, key=lambda pair: pair[1], reverse=True):
    top_words = load_lda.print_topic(topic_id, 5)
    print("Score: {}\t Topic {:3d}: {}".format(topic_score, topic_id, top_words))

Score: 0.36522209644317627	 Topic  22: 0.049*"money" + 0.036*"get" + 0.024*"account" + 0.024*"bank" + 0.023*"say"
Score: 0.312234491109848	 Topic  25: 0.118*"check" + 0.048*"deposit" + 0.047*"bank" + 0.043*"account" + 0.032*"fund"
Score: 0.1624535620212555	 Topic  28: 0.074*"card" + 0.048*"debit" + 0.031*"atm" + 0.027*"account" + 0.024*"use"
Score: 0.0919547900557518	 Topic  13: 0.073*"email" + 0.041*"send" + 0.036*"message" + 0.032*"address" + 0.026*"number"


# NMF

In [11]:
# Run NMF inference on the sample bag-of-words document; the result is
# displayed as a list of (topic id, score) pairs.
load_nmf[bow_doc_x]

[(0, 0.025997716189227405),
 (2, 0.04749748742572338),
 (4, 0.1611679735736758),
 (5, 0.2523993759224312),
 (7, 0.07604999347470942),
 (8, 0.08218377258963519),
 (9, 0.04191751718605628),
 (10, 0.08251921967412937),
 (11, 0.12145675534223431),
 (12, 0.013621791630664997),
 (14, 0.08878100936061033)]

In [12]:
# Rank the document's NMF topics from highest to lowest score and show the top
# five words of each. Inference is re-run here, so the scores can differ
# slightly from an earlier run on the same document.
nmf_doc_topics = load_nmf[bow_doc_x]
for topic_id, topic_score in sorted(nmf_doc_topics, key=lambda pair: pair[1], reverse=True):
    top_words = load_nmf.print_topic(topic_id, 5)
    print("Score: {}\t Topic {:3d}: {}".format(topic_score, topic_id, top_words))

Score: 0.23983018601938977	 Topic   5: 0.066*"account" + 0.053*"bank" + 0.041*"fraud" + 0.035*"money" + 0.035*"charge"
Score: 0.15995444541008658	 Topic   4: 0.190*"check" + 0.043*"deposit" + 0.030*"day" + 0.025*"fund" + 0.018*"cash"
Score: 0.11641518338921292	 Topic  11: 0.062*"bank" + 0.034*"send" + 0.032*"would" + 0.029*"tell" + 0.021*"money"
Score: 0.09236939137737156	 Topic  14: 0.224*"account" + 0.026*"close" + 0.019*"open" + 0.015*"fund" + 0.012*"branch"
Score: 0.0869759977276078	 Topic  10: 0.076*"card" + 0.046*"make" + 0.040*"transaction" + 0.038*"balance" + 0.033*"fee"
Score: 0.07848206487675408	 Topic   8: 0.149*"card" + 0.019*"service" + 0.016*"use" + 0.014*"fraud" + 0.014*"charge"
Score: 0.07147723375312962	 Topic   7: 0.114*"account" + 0.112*"credit" + 0.037*"report" + 0.035*"close" + 0.030*"open"
Score: 0.05449411773037099	 Topic   2: 0.043*"call" + 0.043*"would" + 0.042*"get" + 0.037*"say" + 0.026*"tell"
Score: 0.042079191184866936	 Topic   9: 0.056*"claim" + 0.024*"cal

# Top2Vec

In [33]:
# Query Top2Vec with the unmasked complaint text, asking for a single topic.
(
    t2v_topics_words,
    t2v_word_scores,
    t2v_topic_scores,
    t2v_topic_nums,
) = load_t2v.query_topics(removed_mask[42], 1)

# Pair the words of the returned topic with their scores.
t2v_predict_words_df = pd.DataFrame(
    {
        'topics_words': list(t2v_topics_words[0]),
        'word_scores': list(t2v_word_scores[0]),
    }
)

best_topic_num = t2v_topic_nums[0]
best_topic_score = t2v_topic_scores[0]
print("Predicted Topic Number {}\t Score: {}".format(best_topic_num, best_topic_score))
t2v_predict_words_df

Predicted Topic Number 0	 Score: 0.5731476545333862


Unnamed: 0,topics_words,word_scores
0,chexsystems,0.463706
1,cfpb,0.436052
2,autopay,0.426086
3,bofa,0.413042
4,lifelock,0.40083
5,usaa,0.386326
6,citibank,0.385421
7,repoed,0.382002
8,chase,0.378646
9,wamu,0.37788


# BERTopic

In [14]:
# Show the unmasked complaint text stored under index label 42.
removed_mask.loc[42]

'My card went missing and i didnt realize it until today, i checked my email and they said there has been deposits and money taken out again. I had {$400.00} in my bank account and its gone.'

In [15]:
# Predict the BERTopic topic (and its probability) for the sample complaint.
sample_complaint = removed_mask[42]
predicted_topics, predicted_probs = load_bertopic.transform(sample_complaint)

In [16]:
# Topic id(s) returned by the BERTopic transform call for the sample complaint.
predicted_topics

array([0], dtype=int64)

In [17]:
# Probability value(s) returned alongside the predicted topic id(s).
predicted_probs

array([0.5369783], dtype=float32)

In [18]:
# Fetch the BERTopic topic overview table (one row per topic) and display it.
btp_info = load_bertopic.get_topic_info()
btp_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,10960,-1_the_to_and_my,"[the, to, and, my, that, was, chase, of, in, on]",
1,0,5542,0_the_to_and_my,"[the, to, and, my, chase, that, was, on, credi...",
2,1,695,1_fees_overdraft_account_fee,"[fees, overdraft, account, fee, the, to, my, m...",
3,2,349,2_modification_loan_mortgage_the,"[modification, loan, mortgage, the, to, and, m...",
4,3,287,3_amazon_card_credit_the,"[amazon, card, credit, the, to, and, chase, my...",
...,...,...,...,...,...
58,57,15,57_tax_property tax_vehicle_lease,"[tax, property tax, vehicle, lease, property, ...",
59,58,15,58_stimulus_irs_the irs_stimulus check,"[stimulus, irs, the irs, stimulus check, check...",
60,59,15,59_jpmchase_il_my parents_parents,"[jpmchase, il, my parents, parents, jpmchase t...",
61,60,15,60_wamu_the_of_loan,"[wamu, the, of, loan, to, in, homeowner, was, ...",


In [19]:
# Look the predicted topic up by its Topic id instead of by row position
# (topic + 1). The positional form is only right while a -1 row sits at
# position 0 and the rows stay ordered by topic id.
btp_info.loc[btp_info["Topic"] == predicted_topics[0], "Name"].iloc[0]

'0_the_to_and_my'

In [20]:
# Fetch the representation words of the predicted topic by matching on the
# Topic id instead of by row position (topic + 1), which is only right while a
# -1 row sits at position 0 and the rows stay ordered by topic id.
btp_info.loc[btp_info["Topic"] == predicted_topics[0], "Representation"].iloc[0]

['the', 'to', 'and', 'my', 'chase', 'that', 'was', 'on', 'credit', 'of']