In [2]:
import pandas as pd
import os
import json
import re
import numpy as np

from gensim.models import Nmf
from gensim.models.ldamulticore import LdaMulticore
from top2vec import Top2Vec
from bertopic import BERTopic

from gensim.corpora.dictionary import Dictionary

In [3]:
def get_processed_df(csv_path):
    """Read the processed CSV file and return its contents as a DataFrame.

    Args:
        csv_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The ``pandas.DataFrame`` parsed by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(csv_path)

In [4]:
# Load the stage-2 CSV. The displayed frame holds the preprocessed text plus one
# topic-assignment column per model (topic_lda, topic_nmf, topic_t2v, topic_btp).
df = get_processed_df('process_csv_stage_2.csv')
df

Unnamed: 0,old_index,preprocessed_text,topic_lda,topic_nmf,topic_t2v,topic_btp
0,1,good morning name appreciate could help put st...,0,5,0,34
1,2,upgraded card tell agent upgrade anniversary d...,3,3,0,2
2,10,chase card report however fraudulent applicati...,6,2,0,16
3,11,try book ticket come across offer apply toward...,3,7,0,24
4,14,grand son give check deposit chase account fun...,4,4,0,0
...,...,...,...,...,...,...
21067,78303,chase card customer well decade offer multiple...,3,2,0,2
21068,78309,wednesday call chas visa credit card provider ...,8,5,0,39
21069,78310,familiar pay understand great risk provide con...,2,1,0,9
21070,78311,flawless credit chase credit card chase freedo...,7,7,0,6


In [5]:
# Load the four saved topic models (and the LDA dictionary) from ./topic_models.
# NOTE(review): these loaders presumably unpickle the files — only load artifacts you trust.
topic_model_dir = os.path.join(os.getcwd(), 'topic_models')

load_lda = LdaMulticore.load(os.path.join(topic_model_dir, "final_lda"))
load_nmf = Nmf.load(os.path.join(topic_model_dir, "final_nmf"))
load_t2v = Top2Vec.load(os.path.join(topic_model_dir, 'final_t2v'))
load_bertopic = BERTopic.load(os.path.join(topic_model_dir, 'final_bertopic'))

# Vocabulary mapping stored alongside the LDA model.
load_lda_id2word = Dictionary.load(os.path.join(topic_model_dir, "final_lda.id2word"))

# LDA

In [6]:
# Number of rows assigned to each LDA topic, largest group first.
df.value_counts(subset='topic_lda')

topic_lda
0    4154
4    3192
7    2500
6    2435
1    2427
3    2008
8    2002
5    1397
2     957
Name: count, dtype: int64

In [7]:
# Ask for every topic (num_topics=-1) as raw (topic_id, [(word, weight), ...]) pairs.
lda_topics = load_lda.show_topics(num_topics=-1, num_words=10, formatted=False)

# Topic ids, in the order the model returned them.
lda_topic_indexes = [topic_id for topic_id, _terms in lda_topics]
# A topic's name is its top-10 words joined with underscores.
lda_topic_names = [
    '_'.join(term for term, _weight in terms)
    for _topic_id, terms in lda_topics
]

lda_topic_df = pd.DataFrame({'topic_id': lda_topic_indexes, 'topic_name': lda_topic_names})
lda_topic_df

Unnamed: 0,topic_id,topic_name
0,0,call_tell_say_would_get_ask_back_could_time_phone
1,1,loan_mortgage_home_year_modification_property_...
2,2,debt_bank_consumer_complaint_law_state_morgan_...
3,3,card_credit_account_offer_use_apply_close_poin...
4,4,account_bank_check_fund_money_deposit_branch_c...
5,5,card_fraud_charge_transaction_claim_fraudulent...
6,6,credit_report_account_card_information_remove_...
7,7,payment_pay_fee_charge_make_balance_late_inter...
8,8,dispute_charge_receive_provide_refund_email_re...


# NMF

In [8]:
# Number of rows assigned to each NMF topic, largest group first.
df.value_counts(subset='topic_nmf')

topic_nmf
7    3834
5    3632
3    3151
1    2388
6    2204
4    2092
0    1740
2    1652
8     379
Name: count, dtype: int64

In [9]:
# Ask for every topic (num_topics=-1) as raw (topic_id, [(word, weight), ...]) pairs.
nmf_topics = load_nmf.show_topics(num_topics=-1, num_words=10, formatted=False)

# Topic ids, in the order the model returned them.
nmf_topic_indexes = [topic_id for topic_id, _terms in nmf_topics]
# A topic's name is its top-10 words joined with underscores.
nmf_topic_names = [
    '_'.join(term for term, _weight in terms)
    for _topic_id, terms in nmf_topics
]

nmf_topic_df = pd.DataFrame({'topic_id': nmf_topic_indexes, 'topic_name': nmf_topic_names})
nmf_topic_df

Unnamed: 0,topic_id,topic_name
0,0,call_tell_would_get_say_could_time_payment_pay...
1,1,charge_call_bank_dispute_would_fraud_transacti...
2,2,card_credit_charge_purchase_use_report_fraud_t...
3,3,account_close_open_fee_balance_transfer_transa...
4,4,check_account_bank_money_deposit_fund_branch_s...
5,5,receive_letter_information_provide_claim_send_...
6,6,loan_mortgage_card_home_service_document_prope...
7,7,credit_payment_pay_balance_late_month_make_due...
8,8,report_payment_consumer_act_debt_request_repor...


# Top2Vec

In [10]:
# Number of rows assigned to each Top2Vec topic, largest group first.
df.value_counts(subset='topic_t2v')

topic_t2v
0    20234
1      240
2      240
3      125
4      112
5       77
6       44
Name: count, dtype: int64

In [11]:
# get_topics() yields three parallel sequences: the words of each topic, the
# scores of those words, and the topic numbers. The scores are not used here.
t2v_topics_words, t2v_word_scores, t2v_topic_nums = load_t2v.get_topics()

# Keep the id/name lists alongside the frame, mirroring the LDA and NMF cells,
# instead of leaving them as empty placeholders.
t2v_topic_indexes = list(t2v_topic_nums)
# A topic's name is all of its returned words joined with underscores.
t2v_topic_names = ['_'.join(top) for top in t2v_topics_words]

t2v_topic_df = pd.DataFrame({'topic_id': t2v_topic_indexes, 'topic_name': t2v_topic_names})
t2v_topic_df

Unnamed: 0,topic_id,topic_name
0,0,cfpb_autopay_chase_wamu_overdrawn_garnishment_...
1,1,dispute_garnishment_creditor_defrauded_dispute...
2,2,creditor_autopay_creditors_debt_debts_credit_a...
3,3,autopay_cfpb_heloc_credit_refi_lender_refinanc...
4,4,autopay_refi_cfpb_lender_payment_payments_refi...
5,5,verified_unverified_chase_verification_fraudul...
6,6,usc_creditor_chase_garnishment_creditors_burea...


# BERTopic

In [12]:
# Number of rows assigned to each BERTopic topic, largest group first.
df.value_counts(subset='topic_btp')

topic_btp
 0     1923
-1     1044
 8      752
 1      708
 5      690
       ... 
 86      25
 85      23
 80      23
 88      19
 93      17
Name: count, Length: 95, dtype: int64

In [13]:
# Per-topic summary table from the loaded BERTopic model; the displayed columns are
# Topic, Count, Name, Representation and Representative_Docs.
btp_topic_info = load_bertopic.get_topic_info()
btp_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,10199,-1_the_to_and_my,"[the, to, and, my, that, was, chase, of, in, on]",
1,0,1774,0_check_the_account_my,"[check, the, account, my, and, to, was, bank, ...",
2,1,692,1_fees_overdraft_fee_account,"[fees, overdraft, fee, account, 3400, the, my ...",
3,2,687,2_points_card_offer_the,"[points, card, offer, the, bonus, annual, for,...",
4,3,544,3_inquiry_inquiries_credit_my credit,"[inquiry, inquiries, credit, my credit, hard, ...",
...,...,...,...,...,...
90,89,15,89_car_the car_rental_the rental,"[car, the car, rental, the rental, the, charge...",
91,90,15,90_identity_identity theft_theft_information,"[identity, identity theft, theft, information,...",
92,91,15,91_loan_closing_the_and,"[loan, closing, the, and, to, you, documents, ...",
93,92,15,92_tax_property tax_vehicle_lease,"[tax, property tax, vehicle, lease, property, ...",


# Export topics to CSV

In [14]:
# Write each model's topic table to its own CSV file in the working directory,
# omitting the DataFrame index.
topic_exports = (
    ('lda_topic.csv', lda_topic_df),
    ('nmf_topic.csv', nmf_topic_df),
    ('t2v_topic.csv', t2v_topic_df),
    ('btp_topic.csv', btp_topic_info),
)
for csv_name, topic_table in topic_exports:
    topic_table.to_csv(csv_name, index=False)