In [1]:
import pandas as pd
import os
import json
import re
import numpy as np

from gensim.models import Nmf
from gensim.models.ldamulticore import LdaMulticore
from top2vec import Top2Vec
from bertopic import BERTopic

from gensim.corpora.dictionary import Dictionary

In [2]:
def get_processed_df(csv_path):
    """Read a processed CSV file into a DataFrame.

    Parameters
    ----------
    csv_path : str or path-like or file-like
        Source handed directly to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, using pandas' default parsing options.
    """
    return pd.read_csv(csv_path)

In [4]:
# Read the stage-2 CSV (preprocessed tokens plus one topic column per model) and display it.
df = get_processed_df('process_csv_stage_2.csv')
df

Unnamed: 0,old_index,preprocessed_text,topic_lda,topic_nmf,topic_t2v,topic_btp
0,1,"['good', 'morning', 'name', 'appreciate', 'cou...",22,11,0,0
1,2,"['upgraded', 'card', 'tell', 'agent', 'upgrade...",26,10,0,44
2,10,"['chase', 'card', 'report', 'however', 'fraudu...",33,12,0,0
3,11,"['try', 'book', 'ticket', 'come', 'across', 'o...",26,7,0,0
4,14,"['grand', 'son', 'give', 'check', 'deposit', '...",25,4,0,56
...,...,...,...,...,...,...
21067,78303,"['chase', 'card', 'customer', 'well', 'decade'...",26,12,0,0
21068,78309,"['wednesday', 'call', 'chas', 'visa', 'credit'...",2,9,0,26
21069,78310,"['familiar', 'pay', 'understand', 'great', 'ri...",34,5,0,12
21070,78311,"['flawless', 'credit', 'chase', 'credit', 'car...",7,10,0,1


In [5]:
# Load the four saved topic models (LDA, NMF, Top2Vec, BERTopic) plus the
# LDA dictionary from the `topic_models` folder under the current working directory.
_models_dir = os.path.join(os.getcwd(), 'topic_models')

load_lda = LdaMulticore.load(os.path.join(_models_dir, "final_lda"))
load_nmf = Nmf.load(os.path.join(_models_dir, "final_nmf"))
load_t2v = Top2Vec.load(os.path.join(_models_dir, 'final_t2v'))
load_bertopic = BERTopic.load(os.path.join(_models_dir, 'final_bertopic'))

# id -> word mapping stored next to the LDA model
load_lda_id2word = Dictionary.load(os.path.join(_models_dir, "final_lda.id2word"))

In [7]:
# Show every LDA topic (num_topics=-1) with its 10 highest-weighted words,
# as raw (word, weight) tuples instead of formatted strings.
load_lda.show_topics(num_topics=-1, num_words=10,formatted=False)

[(0,
  [('number', 0.045175977),
   ('call', 0.044429593),
   ('phone', 0.022473436),
   ('mail', 0.020355199),
   ('name', 0.019982629),
   ('ask', 0.019960362),
   ('say', 0.019025449),
   ('information', 0.017310657),
   ('address', 0.0172577),
   ('card', 0.01657906)]),
 (1,
  [('receive', 0.042533588),
   ('letter', 0.038389724),
   ('call', 0.033241987),
   ('send', 0.021684574),
   ('would', 0.020890666),
   ('state', 0.01941405),
   ('request', 0.017950587),
   ('day', 0.014303187),
   ('time', 0.014046169),
   ('contact', 0.0129366545)]),
 (2,
  [('item', 0.041874003),
   ('order', 0.03923838),
   ('return', 0.028201785),
   ('purchase', 0.02439643),
   ('receive', 0.017967032),
   ('deliver', 0.014291601),
   ('provide', 0.014221285),
   ('package', 0.013579315),
   ('amazon', 0.0133606),
   ('ship', 0.013053289)]),
 (3,
  [('travel', 0.044437397),
   ('benefit', 0.03801707),
   ('card', 0.033077803),
   ('ovid', 0.020756269),
   ('trip', 0.01773028),
   ('holder', 0.01691246

In [8]:
# Show every NMF topic (num_topics=-1) with its 10 highest-weighted words,
# as raw (word, weight) tuples instead of formatted strings.
load_nmf.show_topics(num_topics=-1, num_words=10,formatted=False)

[(0,
  [('payment', 0.1477028716568463),
   ('bank', 0.04879491467247396),
   ('mortgage', 0.03523791870658215),
   ('make', 0.03174185876243133),
   ('pay', 0.030653416368943486),
   ('late', 0.021119967739925173),
   ('month', 0.02026675839407351),
   ('loan', 0.01554597313773468),
   ('due', 0.014913002712180136),
   ('year', 0.014905893992374684)]),
 (1,
  [('charge', 0.043812247216827575),
   ('dispute', 0.029672381997809125),
   ('receive', 0.02249371929340892),
   ('letter', 0.016614540762213317),
   ('fee', 0.015187347689960197),
   ('state', 0.01496472065369569),
   ('date', 0.01389269686530772),
   ('provide', 0.013642209479140749),
   ('merchant', 0.011904249786874837),
   ('refund', 0.01169346029256579)]),
 (2,
  [('call', 0.043169911781447974),
   ('would', 0.04255644980244704),
   ('get', 0.04159881635216732),
   ('say', 0.03748529547226943),
   ('tell', 0.02560222025607286),
   ('back', 0.019492796477472173),
   ('could', 0.018103046486278335),
   ('time', 0.017256420099

In [9]:
# Unpack the Top2Vec topics into three parallel results:
# the words of each topic, the score of each word, and the topic ids.
t2v_topics_words, t2v_word_scores, t2v_topic_nums = load_t2v.get_topics()

In [10]:
# Words of each Top2Vec topic, one row per topic.
t2v_topics_words

array([['chexsystems', 'cfpb', 'autopay', 'bofa', 'lifelock', 'usaa',
        'citibank', 'repoed', 'chase', 'wamu', 'overdrawn', 'kyc',
        'wellsfargo', 'transunion', 'garnishment', 'preapproval', 'pnc',
        'experian', 'chargebacks', 'echeck', 'churning', 'defaulted',
        'citi', 'overdrafted', 'garnishments', 'overdrafts',
        'chargeback', 'overdraft', 'garnished', 'refinanced', 'bank',
        'defrauded', 'fdic', 'creditor', 'creditors', 'heloc', 'banking',
        'reposession', 'refi', 'insolvent', 'equifax', 'creditcard',
        'debtors', 'amex', 'receivables', 'santander', 'overdraw',
        'fraudsters', 'lender', 'hsbc'],
       ['experian', 'dispute', 'lifelock', 'transunion', 'garnishment',
        'creditor', 'chexsystems', 'equifax', 'defrauded', 'repoed',
        'receivables', 'disputes', 'garnished', 'creditors', 'disputing',
        'litigation', 'garnishing', 'collections', 'fraudulent',
        'bureaus', 'autopay', 'forclosure', 'debtor', 'bai

In [11]:
# Per-word scores returned by get_topics() alongside t2v_topics_words, one row per topic.
t2v_word_scores

array([[0.46370572, 0.43605196, 0.42608595, 0.41304153, 0.4008296 ,
        0.38632601, 0.38542113, 0.38200244, 0.37864625, 0.37788004,
        0.3770043 , 0.37140572, 0.37052795, 0.36970824, 0.3658105 ,
        0.3594699 , 0.35657394, 0.35613358, 0.35557187, 0.35363078,
        0.34866542, 0.34795815, 0.34740597, 0.34513932, 0.34031385,
        0.3396248 , 0.33903915, 0.3389546 , 0.33420798, 0.33093202,
        0.33005804, 0.32784072, 0.3278141 , 0.32628638, 0.32610303,
        0.32451338, 0.32425547, 0.32315218, 0.32305908, 0.3218513 ,
        0.3213454 , 0.32028922, 0.31769252, 0.3176263 , 0.31715724,
        0.3154136 , 0.30872297, 0.30864674, 0.30856234, 0.30735195],
       [0.38854337, 0.36430746, 0.35050845, 0.34649712, 0.3195392 ,
        0.31505477, 0.31088   , 0.30386302, 0.30373734, 0.29891717,
        0.29870796, 0.29715556, 0.29332998, 0.29220495, 0.28834844,
        0.28606564, 0.28340527, 0.2814926 , 0.28024864, 0.279635  ,
        0.2780338 , 0.27792847, 0.27776366, 0.2

In [12]:
# Topic ids returned by get_topics().
t2v_topic_nums

array([0, 1, 2, 3, 4, 5, 6])

In [None]:
# Overview of the topics held by the loaded BERTopic model.
load_bertopic.get_topic_info()