In [1]:
import pandas as pd
import os
import json
import re
import numpy as np

from gensim.models import Nmf
from gensim.models.ldamulticore import LdaMulticore
from top2vec import Top2Vec
from bertopic import BERTopic

from gensim.corpora.dictionary import Dictionary

In [2]:
def get_processed_df(csv_path):
    """Read the CSV at ``csv_path`` and return it as a pandas DataFrame.

    Parameters
    ----------
    csv_path : str or file-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(csv_path)

In [3]:
# Load the stage-2 processed CSV; the path is relative to the current working directory.
df = get_processed_df('process_csv_stage_2.csv')
# Bare expression so the notebook renders the frame as this cell's output.
df

Unnamed: 0,old_index,preprocessed_text,topic_lda,topic_nmf,topic_t2v,topic_btp
0,1,"['good', 'morning', 'name', 'appreciate', 'cou...",22,11,0,0
1,2,"['upgraded', 'card', 'tell', 'agent', 'upgrade...",26,10,0,44
2,10,"['chase', 'card', 'report', 'however', 'fraudu...",33,12,0,0
3,11,"['try', 'book', 'ticket', 'come', 'across', 'o...",26,7,0,0
4,14,"['grand', 'son', 'give', 'check', 'deposit', '...",25,4,0,56
...,...,...,...,...,...,...
21067,78303,"['chase', 'card', 'customer', 'well', 'decade'...",26,12,0,0
21068,78309,"['wednesday', 'call', 'chas', 'visa', 'credit'...",2,9,0,26
21069,78310,"['familiar', 'pay', 'understand', 'great', 'ri...",34,5,0,12
21070,78311,"['flawless', 'credit', 'chase', 'credit', 'car...",7,10,0,1


In [21]:
# Number of rows per value of the 'topic_lda' column, largest count first.
df.value_counts('topic_lda')

topic_lda
32    1472
29    1382
22    1381
33    1371
10    1350
25    1298
1      983
9      977
6      915
27     886
7      865
23     731
11     710
26     649
0      610
31     603
21     587
24     506
28     419
8      391
19     318
18     317
15     298
2      290
16     262
17     225
13     218
14     213
30     206
34     152
20     140
4      119
3       96
12      84
5       48
Name: count, dtype: int64

In [22]:
# Number of rows per value of the 'topic_nmf' column, largest count first.
df.value_counts('topic_nmf')

topic_nmf
5     5709
10    2150
7     1842
1     1660
12    1633
13    1600
0     1396
3     1188
2     1021
4      812
11     649
9      635
6      342
8      259
14     176
Name: count, dtype: int64

In [23]:
# Number of rows per value of the 'topic_t2v' column, largest count first.
df.value_counts('topic_t2v')

topic_t2v
0    20234
1      240
2      240
3      125
4      112
5       77
6       44
Name: count, dtype: int64

In [24]:
# Number of rows per value of the 'topic_btp' column, largest count first
# (the displayed output includes a -1 value alongside the non-negative ids).
df.value_counts('topic_btp')

topic_btp
-1     4337
 0     4069
 2     1012
 1      992
 6      644
       ... 
 55      33
 57      32
 59      30
 40      26
 48      22
Name: count, Length: 63, dtype: int64

In [4]:
# Load the four saved topic models from the 'topic_models' folder under the
# current working directory.
_topic_model_dir = os.path.join(os.getcwd(), 'topic_models')

load_lda = LdaMulticore.load(os.path.join(_topic_model_dir, "final_lda"))
load_nmf = Nmf.load(os.path.join(_topic_model_dir, "final_nmf"))
load_t2v = Top2Vec.load(os.path.join(_topic_model_dir, 'final_t2v'))
load_bertopic = BERTopic.load(os.path.join(_topic_model_dir, 'final_bertopic'))

# Dictionary stored next to the LDA model under the 'final_lda.id2word' name.
load_lda_id2word = Dictionary.load(os.path.join(_topic_model_dir, "final_lda.id2word"))

# The following cells build an index -> topic-name mapping DataFrame for each of the 4 topic models.

In [13]:
# Every LDA topic as (topic_id, [(word, weight), ...]) with its 10 top words.
lda_topics = load_lda.show_topics(num_topics=-1, num_words=10, formatted=False)

# Topic ids, in the order returned by the model.
lda_topic_indexes = [topic_id for topic_id, _word_weights in lda_topics]
# A topic's name is its top words joined with underscores.
lda_topic_names = [
    '_'.join(term for term, _weight in word_weights)
    for _topic_id, word_weights in lda_topics
]

lda_topic_df = pd.DataFrame(
    {'topic_id': lda_topic_indexes, 'topic_name': lda_topic_names}
)
lda_topic_df

Unnamed: 0,topic_id,topic_name
0,0,number_call_phone_mail_name_ask_say_informatio...
1,1,receive_letter_call_send_would_state_request_d...
2,2,item_order_return_purchase_receive_deliver_pro...
3,3,travel_benefit_card_ovid_trip_holder_pandemic_...
4,4,transfer_account_complaint_customer_service_de...
5,5,fraud_signature_daughter_case_document_police_...
6,6,account_close_open_bank_reason_check_card_with...
7,7,interest_balance_pay_payment_charge_statement_...
8,8,refund_cancel_ticket_charge_flight_credit_card...
9,9,charge_card_credit_fraudulent_fraud_purchase_m...


In [14]:
# Every NMF topic as (topic_id, [(word, weight), ...]) with its 10 top words.
nmf_topics = load_nmf.show_topics(num_topics=-1, num_words=10, formatted=False)

# Topic ids, in the order returned by the model.
nmf_topic_indexes = [topic_id for topic_id, _word_weights in nmf_topics]
# A topic's name is its top words joined with underscores.
nmf_topic_names = [
    '_'.join(term for term, _weight in word_weights)
    for _topic_id, word_weights in nmf_topics
]

nmf_topic_df = pd.DataFrame(
    {'topic_id': nmf_topic_indexes, 'topic_name': nmf_topic_names}
)
nmf_topic_df

Unnamed: 0,topic_id,topic_name
0,0,payment_bank_mortgage_make_pay_late_month_loan...
1,1,charge_dispute_receive_letter_fee_state_date_p...
2,2,call_would_get_say_tell_back_could_time_pay_ask
3,3,loan_mortgage_home_modification_property_docum...
4,4,check_deposit_day_fund_cash_money_amount_pay_c...
5,5,account_bank_fraud_money_charge_use_customer_f...
6,6,report_consumer_act_information_request_report...
7,7,account_credit_report_close_open_issue_check_l...
8,8,card_service_use_fraud_charge_point_receive_pu...
9,9,claim_call_number_department_transaction_say_f...


In [20]:
# These two lists are initialised but not filled in this cell; they are kept
# in case later cells reference them.
t2v_topic_indexes = []
t2v_topic_names = []

# get_topics() yields three parallel results: words per topic, their scores, and topic numbers.
t2v_topics_words, t2v_word_scores, t2v_topic_nums = load_t2v.get_topics()

# One row per topic: its number and its words joined with underscores.
t2v_topic_df = pd.DataFrame(
    {
        'topic_id': list(t2v_topic_nums),
        'topic_name': list(map('_'.join, t2v_topics_words)),
    }
)
t2v_topic_df

Unnamed: 0,topic_id,topic_name
0,0,chexsystems_cfpb_autopay_bofa_lifelock_usaa_ci...
1,1,experian_dispute_lifelock_transunion_garnishme...
2,2,creditor_autopay_creditors_debtor_debt_indebte...
3,3,transunion_autopay_chexsystems_cfpb_preapprova...
4,4,preapproval_autopay_chexsystems_refi_cfpb_forc...
5,5,verified_unverified_verifications_verifies_cha...
6,6,usc_chexsystems_creditor_transunion_warrents_c...


In [11]:
# Per-topic summary table of the loaded BERTopic model, as returned by get_topic_info().
# NOTE(review): this cell only assigns the result, yet a table is shown as its output;
# a trailing `btp_topic_info` expression may have been dropped — confirm.
btp_topic_info = load_bertopic.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,10960,-1_the_to_and_my,"[the, to, and, my, that, was, chase, of, in, on]",
1,0,5542,0_the_to_and_my,"[the, to, and, my, chase, that, was, on, credi...",
2,1,695,1_fees_overdraft_account_fee,"[fees, overdraft, account, fee, the, to, my, m...",
3,2,349,2_modification_loan_mortgage_the,"[modification, loan, mortgage, the, to, and, m...",
4,3,287,3_amazon_card_credit_the,"[amazon, card, credit, the, to, and, chase, my...",
...,...,...,...,...,...
58,57,15,57_tax_property tax_vehicle_lease,"[tax, property tax, vehicle, lease, property, ...",
59,58,15,58_stimulus_irs_the irs_stimulus check,"[stimulus, irs, the irs, stimulus check, check...",
60,59,15,59_jpmchase_il_my parents_parents,"[jpmchase, il, my parents, parents, jpmchase t...",
61,60,15,60_wamu_the_of_loan,"[wamu, the, of, loan, to, in, homeowner, was, ...",
