In [1]:
import pandas as pd
import os
import json
import re
import numpy as np

from gensim.models import Nmf
from gensim.models.ldamulticore import LdaMulticore
from top2vec import Top2Vec
from bertopic import BERTopic

from gensim.corpora.dictionary import Dictionary

# Loading the dataset, the topic models, and the id2word (token id -> word) dictionary

In [2]:
def get_raw_df(path="complaints-2021-05-14_08_16_.json"):
    """Load the raw complaint export and return the cleaned complaint narratives.

    Parameters
    ----------
    path : str, optional
        Location of the JSON export. Defaults to the file name this notebook
        has always read, so existing ``get_raw_df()`` calls are unaffected.

    Returns
    -------
    pandas.Series
        The ``complaint_what_happened`` column with blank/null narratives
        removed and masked tokens (runs of 2-20 ``x``/``X`` starting at a word
        boundary) stripped. The original row labels are kept; callers that need
        a 0..n-1 index must reset it themselves.

    Raises
    ------
    ValueError
        If the normalized export does not have exactly the 22 columns named
        below (pandas rejects a column list of the wrong length).
    """
    # Use a context manager so the file handle is closed deterministically;
    # a bare open() inside json.load() leaves it to the garbage collector.
    with open(path, encoding="utf-8") as handle:
        raw = pd.json_normalize(json.load(handle))

    # renaming column
    # NOTE(review): the names are assigned positionally, so this assumes the
    # export always yields these 22 columns in this order - confirm.
    raw.columns = ['index', 'type', 'id', 'score', 'tags', 'zip_code','complaint_id', 'issue', 'date_received',
        'state', 'consumer_disputed', 'product','company_response', 'company', 'submitted_via',
        'date_sent_to_company', 'company_public_response','sub_product', 'timely',
        'complaint_what_happened', 'sub_issue','consumer_consent_provided']

    # Only the narrative is kept; selecting it directly is equivalent to
    # dropping the other 21 columns.
    narratives = raw['complaint_what_happened']

    # Turn empty / whitespace-only narratives into NaN, then drop nulls.
    # The result is re-assigned instead of calling replace(inplace=True) on a
    # column selection: that chained in-place form no longer updates the
    # parent frame under pandas Copy-on-Write.
    narratives = narratives.replace(r'^\s*$', np.nan, regex=True).dropna()

    # clear masked values
    # [xX] rather than [x|X]: inside a character class '|' is a literal pipe,
    # so the old pattern also deleted runs of '|' that followed a word.
    masked_text_pattern = re.compile(r"\b[xX]{2,20}")

    def remove_mask(text):
        return masked_text_pattern.sub('', text)

    return narratives.apply(remove_mask)

In [3]:
def get_processed_df():
    """Read the stage-1 preprocessing output and give its columns fixed names.

    The CSV ``process_csv_stage_1.csv`` is read from the working directory and
    its two columns are relabelled ``old_index`` and ``preprocessed_text``
    (a file with any other number of columns raises ``ValueError``).
    """
    stage_one = pd.read_csv('process_csv_stage_1.csv')
    return stage_one.set_axis(['old_index', 'preprocessed_text'], axis='columns')

In [4]:
# Cleaned complaint narratives, renumbered 0..n-1 (the labels left over from
# the dropped rows are discarded).
raw = get_raw_df().reset_index(drop=True)
raw

0        Good morning my name is   and I appreciate it ...
1        I upgraded my   card in //2018 and was told by...
2        Chase Card was reported on //2019. However, fr...
3        On //2018, while trying to book a     ticket, ...
4        my grand son give me check for {$1600.00} i de...
                               ...                        
21067    After being a Chase Card customer for well ove...
21068    On Wednesday, // I called Chas, my   Visa Cred...
21069    I am not familiar with  pay and did not unders...
21070    I have had flawless credit for 30 yrs. I've ha...
21071    Roughly 10+ years ago I closed out my accounts...
Name: complaint_what_happened, Length: 21072, dtype: object

In [5]:
# Split each preprocessed document on single spaces so it becomes a list of
# tokens, which is the form the doc2bow-based helpers below receive.
process_df = get_processed_df().assign(
    preprocessed_text=lambda frame: frame['preprocessed_text'].str.split(' ')
)
process_df

Unnamed: 0,old_index,preprocessed_text
0,1,"[good, morning, name, appreciate, could, help,..."
1,2,"[upgraded, card, tell, agent, upgrade, anniver..."
2,10,"[chase, card, report, however, fraudulent, app..."
3,11,"[try, book, ticket, come, across, offer, apply..."
4,14,"[grand, son, give, check, deposit, chase, acco..."
...,...,...
21067,78303,"[chase, card, customer, well, decade, offer, m..."
21068,78309,"[wednesday, call, chas, visa, credit, card, pr..."
21069,78310,"[familiar, pay, understand, great, risk, provi..."
21070,78311,"[flawless, credit, chase, credit, card, chase,..."


In [6]:
# load 4 topic models
load_lda = LdaMulticore.load(os.path.join(os.getcwd(),'topic_models',"final_tune_lda"))
load_nmf = Nmf.load(os.path.join(os.getcwd(),'topic_models',"final_tune_nmf"))
load_t2v = Top2Vec.load(os.path.join(os.getcwd(),'topic_models','final_t2v'))
load_bertopic = BERTopic.load(os.path.join(os.getcwd(),'topic_models','final_bertopic'))

In [7]:
# Dictionary stored alongside the LDA model; used below to turn token lists
# into bag-of-words vectors.
load_lda_id2word = Dictionary.load(
    os.path.join(os.getcwd(), 'topic_models', "final_tune_lda.id2word")
)

# Label each ticket's category according to the four topic models

In [8]:
def get_topic_lda(text, load_lda, load_id2word):
    """Return the id of the highest-scoring LDA topic for one tokenised document.

    Parameters
    ----------
    text : list of str
        Tokens of a single document.
    load_lda : object
        Model indexed with a bag-of-words vector; element 0 of what it returns
        is read as a sequence of ``(topic_id, score)`` pairs.
        NOTE(review): presumably the model was saved with per-word topics
        enabled so that indexing yields a tuple - confirm.
    load_id2word : object
        Dictionary providing ``doc2bow``.

    Returns
    -------
    The topic id of the pair with the largest score (the first such pair when
    scores tie).
    """
    bag_of_words = load_id2word.doc2bow(text)
    ranked = list(load_lda[bag_of_words][0])
    # Stable sort on the negated score: best topic first, ties keep model order.
    ranked.sort(key=lambda pair: -1 * pair[1])
    return ranked[0][0]

In [9]:
def get_topic_nmf(text, load_nmf, load_id2word):
    """Return the id of the highest-scoring NMF topic for one tokenised document.

    Parameters
    ----------
    text : list of str
        Tokens of a single document.
    load_nmf : object
        Model indexed with a bag-of-words vector; the result is read as a
        sequence of ``(topic_id, score)`` pairs.
    load_id2word : object
        Dictionary providing ``doc2bow``.

    Returns
    -------
    The topic id of the pair with the largest score (the first such pair when
    scores tie).
    """
    bag_of_words = load_id2word.doc2bow(text)
    ranked = list(load_nmf[bag_of_words])
    # Stable sort on the negated score: best topic first, ties keep model order.
    ranked.sort(key=lambda pair: -1 * pair[1])
    return ranked[0][0]

In [10]:
def get_topic_t2v(text, load_t2v):
    """Return the first topic number Top2Vec reports for a query text.

    ``load_t2v.query_topics(text, 1)`` is expected to return four values; only
    the last one (the topic numbers) is used, and its first entry is returned.
    """
    # Ask for a single topic; words and scores are not needed here.
    _, _, _, topic_numbers = load_t2v.query_topics(text, 1)
    return topic_numbers[0]

In [11]:
def get_topic_btp(text, load_btp):
    """Return the first topic BERTopic predicts for a text.

    ``load_btp.transform(text)`` is expected to return a
    ``(topics, probabilities)`` pair; the first entry of ``topics`` is returned
    and the probabilities are ignored.
    """
    topic_ids, _ = load_btp.transform(text)
    return topic_ids[0]

In [12]:
# LDA label for every ticket, computed from its token list.
process_df['topic_lda'] = process_df['preprocessed_text'].apply(
    lambda tokens: get_topic_lda(tokens, load_lda, load_lda_id2word)
)

In [13]:
# NMF label for every ticket, computed from its token list.
# NOTE(review): this reuses the dictionary loaded for the LDA model - it
# presumes the NMF model was trained with the same id2word mapping; confirm.
process_df['topic_nmf'] = process_df['preprocessed_text'].apply(
    lambda tokens: get_topic_nmf(tokens, load_nmf, load_lda_id2word)
)

In [14]:
# Top2Vec label for every ticket, computed from the raw (unprocessed) narrative.
# NOTE(review): the assignment aligns on index labels, so it presumes row i of
# `raw` and row i of `process_df` describe the same complaint - confirm.
process_df['topic_t2v'] = raw.apply(
    lambda document: get_topic_t2v(document, load_t2v)
)

In [15]:
# BERTopic label for every ticket, computed from the raw (unprocessed) narrative.
# NOTE(review): the assignment aligns on index labels, so it presumes row i of
# `raw` and row i of `process_df` describe the same complaint - confirm.
process_df['topic_btp'] = raw.apply(
    lambda document: get_topic_btp(document, load_bertopic)
)

In [16]:
# Join the token lists back into space-separated strings before display/export.
process_df = process_df.assign(
    preprocessed_text=process_df['preprocessed_text'].str.join(' ')
)
process_df

Unnamed: 0,old_index,preprocessed_text,topic_lda,topic_nmf,topic_t2v,topic_btp
0,1,good morning name appreciate could help put st...,2,2,0,34
1,2,upgraded card tell agent upgrade anniversary d...,5,6,0,2
2,10,chase card report however fraudulent applicati...,2,4,0,16
3,11,try book ticket come across offer apply toward...,5,4,0,24
4,14,grand son give check deposit chase account fun...,6,1,0,0
...,...,...,...,...,...,...
21067,78303,chase card customer well decade offer multiple...,5,6,0,2
21068,78309,wednesday call chas visa credit card provider ...,8,5,0,39
21069,78310,familiar pay understand great risk provide con...,3,2,0,9
21070,78311,flawless credit chase credit card chase freedo...,4,0,0,6


In [17]:
# Write the labelled tickets to the stage-2 CSV; the row index is not saved.
process_df.to_csv(path_or_buf="process_csv_stage_2.csv", index=False)