In [1]:
import pandas as pd
import time, datetime
from pprint import pprint

import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
 
import spacy
 
import pyLDAvis
import pyLDAvis.gensim # don't skip this

from eunjeon import Mecab
    
    
    
# Enable logging for gensim - optional
import logging
# Only records at ERROR level or above are emitted, each prefixed with its timestamp and level name.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
 
import warnings
# Suppress every DeprecationWarning raised from here on.
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
# Read the training split and discard the columns that are not used below.
train = pd.read_csv("train_detoken.csv").drop(columns=['year_month', 'text', 'clean_doc'])
# Read the test split unchanged.
test = pd.read_csv("test_detoken.csv")

In [3]:
# Stack the training rows (without the "smishing" label column) on top of the test rows.
lda_set = pd.concat([train.drop("smishing", axis=1), test], sort=False)
# Remember the ids of the rows that have no text so they can be excluded later.
na_list = lda_set[lda_set['detoken'].isna()]['id']
# Assign the cleaned column back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form warns (and can silently do nothing) on newer pandas.
# regex=False makes the literal replacement explicit, independent of the pandas default.
lda_set['detoken'] = (
    lda_set['detoken']
    .fillna(" ")
    .str.replace("은행", " ", regex=False)  # blank out the literal word "은행" (bank)
    .str.replace("고객", " ", regex=False)  # blank out the literal word "고객" (customer)
)

In [4]:
# Row-count sanity check: (frame-wide isin mask, all rows, rows whose id is not in na_list).
# Indexing with a frame-wide boolean mask blanks cells rather than dropping rows, which is why
# the first count equals the total in the output; only the id-based filter removes rows.
len(lda_set[~lda_set.isin(na_list)]), len(lda_set), len(lda_set[~lda_set['id'].isin(na_list)])

(297571, 297571, 297528)

In [5]:
# Mecab tagger used to pull the nouns out of every message.
tagger = Mecab()

# One list of nouns per row of lda_set['detoken'], in row order.
result = [tagger.nouns(message) for message in lda_set['detoken']]
# Show the nouns extracted from the first message.
result[0]

['성산', '팀장', '행복', '주말']

In [6]:
# Replace the raw text with the extracted noun lists (one entry per row, same order).
lda_set['detoken'] = result
# Assign the filled column back rather than calling fillna(inplace=True) on a column
# selection, which is chained in-place assignment and is not reliable on newer pandas.
lda_set['detoken'] = lda_set['detoken'].fillna(" ")

In [7]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Tokenise every document and drop the tokens found in ``stop_words``.

    Args:
        texts: Iterable of documents. Each one is converted with ``str()``
            and tokenised with ``gensim.utils.simple_preprocess``.

    Returns:
        A list holding one list of kept tokens per input document.

    Note:
        ``stop_words`` is read from the notebook's global namespace and must
        be defined before this function is called.
    """
    # The notebook imports the ``gensim`` package but never the bare name
    # ``simple_preprocess``, so the tokenizer has to be reached via the package.
    tokenize = gensim.utils.simple_preprocess
    return [
        [token for token in tokenize(str(doc)) if token not in stop_words]
        for doc in texts
    ]
 
def make_bigrams(texts):
    """Pass each tokenised document through the global ``bigram_mod``.

    Returns a list with one transformed document per entry of ``texts``.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged
 
def make_trigrams(texts):
    """Pass each tokenised document through ``bigram_mod`` and then ``trigram_mod``.

    Both models are read from the notebook's global namespace. Returns a list
    with one transformed document per entry of ``texts``.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged
 
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise tokenised documents, keeping only tokens with an allowed POS tag.

    Each document is joined with single spaces, parsed by the global ``nlp``
    pipeline, and reduced to the ``lemma_`` of every token whose ``pos_`` is
    in ``allowed_postags``. Tag reference: https://spacy.io/api/annotation

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Container of POS tags to keep. The default is an
            immutable tuple so it cannot be altered between calls.

    Returns:
        A list holding one list of lemmas per input document.

    Note:
        ``nlp`` is read from the notebook's global namespace and must be
        defined before this function is called.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [33]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """Train one LDA Mallet model per topic count and score each with c_v coherence.

    Args:
        dictionary: Id-to-word mapping used both to train every model and to
            score it.
        corpus: Corpus handed to ``LdaMallet``.
        texts: Tokenised documents handed to ``CoherenceModel``.
        limit: Exclusive upper bound on the number of topics.
        start: First number of topics to try.
        step: Increment between successive topic counts.

    Returns:
        ``(model_list, coherence_values)``: two parallel lists with one trained
        model and one coherence score per topic count tried.

    Note:
        ``mallet_path`` is read from the notebook's global namespace. Progress
        and each score are printed as the sweep runs.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Train with the dictionary that was passed in, not the notebook-global
        # ``id2word``, so training and scoring always use the same mapping.
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        print("{0} topics model completed".format(num_topics))
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        # Compute the score once and reuse it for both the result list and the log line.
        coherence = coherencemodel.get_coherence()
        coherence_values.append(coherence)
        print(coherence)
    return model_list, coherence_values

In [9]:
def sent_to_words(sentences):
    """Lazily tokenise each sentence.

    Every item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; one token list is
    yielded per input item, in order.
    """
    for raw in sentences:
        as_text = str(raw)
        tokens = gensim.utils.simple_preprocess(as_text, deacc=True)
        yield tokens

# Keep only the rows whose id is not in na_list (those rows had no text), then tokenise
# the remaining 'detoken' entries and materialise the generator into a list.
data_words = list(sent_to_words(lda_set[~lda_set['id'].isin(na_list)]['detoken']))

# Peek at the first tokenised document.
print(data_words[:1])

[['성산', '팀장', '행복', '주말']]


In [10]:
# Fit the bigram phrase detector on the tokenised documents
# (a higher threshold yields fewer phrases).
bigram = gensim.models.Phrases(data_words, threshold=100, min_count=5)
# Lightweight wrapper for applying the fitted bigram model to a document.
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Fit the trigram detector on the bigram-transformed corpus, then wrap it the same way.
bigram_corpus = bigram[data_words]
trigram = gensim.models.Phrases(bigram_corpus, threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Example: the first document after the bigram and then the trigram transformation.
first_doc = data_words[0]
print(trigram_mod[bigram_mod[first_doc]])

['성산', '팀장', '행복', '주말']


In [11]:
# The stop-word, bigram and spaCy lemmatisation steps below are disabled (commented out);
# the tokenised documents in data_words are used as-is.
# # Remove Stop Words
# data_words_nostops = remove_stopwords(data_words)
 
# # Form Bigrams
# data_words_bigrams = make_bigrams(data_words_nostops)
 
# # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# # python3 -m spacy download en
# nlp = spacy.load('en', disable=['parser', 'ner'])
 
# # Do lemmatization keeping only noun, adj, vb, adv
# data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
# data_lemmatized is simply another name for data_words (same list object, no copy).
data_lemmatized = data_words
# Peek at the first document.
print(data_lemmatized[:1])

[['성산', '팀장', '행복', '주말']]


In [12]:
# Map every distinct token to an integer id.
id2word = corpora.Dictionary(data_lemmatized)

# Alias for the tokenised documents.
texts = data_lemmatized

# Bag-of-words encoding: one doc2bow result per document.
corpus = list(map(id2word.doc2bow, texts))

# Show the first encoded document.
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1)]]


In [26]:
import os
# Point the MALLET_HOME environment variable at the local Mallet installation.
os.environ['MALLET_HOME'] = r'C:/mallet-2.0.8/'

In [37]:
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
mallet_path = 'C:/mallet-2.0.8/bin/mallet' # update this path
# Train a two-topic model through gensim's Mallet wrapper.
ldamallet = gensim.models.wrappers.LdaMallet(
    mallet_path,
    corpus=corpus,
    id2word=id2word,
    num_topics=2,
)

In [38]:
# Convert the Mallet-wrapper model into a gensim LdaModel; the cells below call
# log_perplexity and print_topics on the converted model.
lda_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)

In [39]:
# Compute Perplexity
# log_perplexity returns the per-word likelihood bound rather than the perplexity itself:
# values closer to zero are better, and perplexity is 2 ** (-bound).
# NOTE(review): the recorded output is nan (with numpy warnings) for the converted Mallet model.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))
 
# Compute Coherence Score
# c_v coherence of the converted model over the tokenised documents.
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

  score += np.sum((self.eta - _lambda) * Elogbeta)
  score += np.sum(gammaln(_lambda) - gammaln(self.eta))



Perplexity:  nan

Coherence Score:  0.6606596659164341


In [40]:
# NOTE(review): model_list is only assigned by the compute_coherence_values(...) cell further
# down, so this cell fails on a fresh top-to-bottom run — confirm the intended cell order.
# Convert the second model in model_list to a gensim LdaModel, overwriting lda_model.
lda_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model_list[1])
# Prepare the pyLDAvis data for that model and display it as the cell output.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


PreparedData(topic_coordinates=              x         y  topics  cluster       Freq
topic                                                
3     -0.250740 -0.093264       1        1  31.601609
0     -0.190966  0.092630       2        1  19.620373
1     -0.119304 -0.040280       3        1  18.600403
4      0.317809 -0.274128       4        1  15.455808
2      0.243201  0.315043       5        1  14.721807, topic_info=    Category           Freq Term          Total  loglift  logprob
9    Default  130861.000000   감사  130861.000000  30.0000  30.0000
25   Default  102631.000000   지점  102631.000000  29.0000  29.0000
3    Default   97343.000000   행복   97343.000000  28.0000  28.0000
54   Default   65364.000000   거래   65364.000000  27.0000  27.0000
8    Default   57237.000000   하루   57237.000000  26.0000  26.0000
..       ...            ...  ...            ...      ...      ...
3     Topic5   31260.052905   행복   97343.803065   0.7799  -3.5387
109   Topic5   18881.882577   전화   52703.660083   0

In [None]:
# Display the prepared pyLDAvis data held in `vis` again.
vis

In [18]:
# List the topics of the Mallet model in unformatted (word, weight) form.
pprint(ldamallet.show_topics(formatted=False))

# c_v coherence of the Mallet model over the tokenised documents.
coherence_model_ldamallet = CoherenceModel(
    coherence='c_v',
    dictionary=id2word,
    texts=data_lemmatized,
    model=ldamallet,
)
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

[(0,
  [('주말', 0.04153613272474304),
   ('행복', 0.039115251012509566),
   ('건강', 0.022678045198055236),
   ('팀장', 0.022186122034129396),
   ('마음', 0.021131779630817474),
   ('사랑', 0.01430605522554096),
   ('가을', 0.013468623823645158),
   ('시작', 0.012735774511717842),
   ('사람', 0.011856510273834645),
   ('이번', 0.010820760242072678)]),
 (1,
  [('금리', 0.037383329012185634),
   ('가능', 0.03712683803103819),
   ('상품', 0.03590966332086373),
   ('상담', 0.03360170747064706),
   ('등급', 0.031078002888995888),
   ('대출', 0.02830151116708026),
   ('신용', 0.024999074039779252),
   ('한도', 0.024556002074150894),
   ('통합', 0.01652514907959554),
   ('진행', 0.01584179043668284)]),
 (2,
  [('감사', 0.0957177645490395),
   ('지점', 0.07212239272230883),
   ('하루', 0.052314825984327344),
   ('거래', 0.04676347737161971),
   ('행복', 0.04627784137914396),
   ('동의', 0.03151808721679989),
   ('만족', 0.029589551977449058),
   ('오늘', 0.028760702086444764),
   ('저희', 0.02799255669449994),
   ('부탁', 0.024425778243351886)]),
 (3,

In [19]:
# Print the weighted keywords of each topic in lda_model
# (the recorded output lists five topics, numbered 0-4).
pprint(lda_model.print_topics())
# Apply the model to the whole bag-of-words corpus.
doc_lda = lda_model[corpus]

[(0,
  '0.042*"주말" + 0.039*"행복" + 0.023*"건강" + 0.022*"팀장" + 0.021*"마음" + 0.014*"사랑" '
  '+ 0.013*"가을" + 0.013*"시작" + 0.012*"사람" + 0.011*"이번"'),
 (1,
  '0.037*"금리" + 0.037*"가능" + 0.036*"상품" + 0.034*"상담" + 0.031*"등급" + 0.028*"대출" '
  '+ 0.025*"신용" + 0.025*"한도" + 0.017*"통합" + 0.016*"진행"'),
 (2,
  '0.096*"감사" + 0.072*"지점" + 0.052*"하루" + 0.047*"거래" + 0.046*"행복" + 0.032*"동의" '
  '+ 0.030*"만족" + 0.029*"오늘" + 0.028*"저희" + 0.024*"부탁"'),
 (3,
  '0.027*"금융" + 0.025*"가입" + 0.021*"센터" + 0.020*"혜택" + 0.020*"연금" + 0.018*"종합" '
  '+ 0.018*"무료" + 0.017*"우대" + 0.016*"경우" + 0.016*"리브"'),
 (4,
  '0.037*"대출" + 0.036*"이용" + 0.019*"전화" + 0.018*"지점" + 0.018*"신청" + 0.017*"상품" '
  '+ 0.017*"안녕" + 0.014*"가능" + 0.014*"감사" + 0.014*"기간"')]


In [None]:
# Path handed to the Mallet wrapper inside compute_coherence_values.
mallet_path = 'C:/mallet-2.0.8/bin/mallet' # update this path
# Sweep topic counts 17, 20, 23, 26 and 29 (limit is exclusive), collecting models and scores.
model_list, coherence_values = compute_coherence_values(
    dictionary=id2word,
    corpus=corpus,
    texts=data_lemmatized,
    limit=33,
    start=17,
    step=3,
)

17 topics model completed
0.6845119859827744
20 topics model completed
0.685753961238262
23 topics model completed
0.6728262354568272
26 topics model completed
0.6345309857390331
29 topics model completed
0.646266327529815


In [19]:
# Select the model and print the topics
# NOTE(review): the index is hard-coded, so this always takes the third entry of model_list
# regardless of the coherence scores — confirm it still matches the intended model after
# re-running the sweep.
optimal_model = model_list[2]
# Unformatted topic listing of the selected model.
model_topics = optimal_model.show_topics(formatted=False)
# Pretty-print every topic with its ten highest-weighted words.
pprint(optimal_model.print_topics(num_words=10))

[(0,
  '0.167*"감사" + 0.115*"하루" + 0.100*"시간" + 0.079*"올림" + 0.075*"거래" + 0.051*"행복" '
  '+ 0.046*"오늘" + 0.042*"내점" + 0.035*"저녁" + 0.034*"진심"'),
 (1,
  '0.037*"금리" + 0.033*"상담" + 0.030*"가능" + 0.027*"등급" + 0.026*"진행" + 0.024*"추가" '
  '+ 0.022*"상품" + 0.022*"이상" + 0.020*"통합" + 0.019*"부채"'),
 (2,
  '0.100*"대출" + 0.026*"금리" + 0.017*"신용" + 0.014*"방문" + 0.014*"서류" + 0.012*"자금" '
  '+ 0.012*"사업자" + 0.011*"안내" + 0.011*"증명" + 0.010*"지점"'),
 (3,
  '0.091*"동의" + 0.085*"만족" + 0.082*"부탁" + 0.068*"업무" + 0.059*"전화" + 0.053*"처리" '
  '+ 0.042*"계장" + 0.038*"조사" + 0.036*"추천" + 0.027*"설문"'),
 (4,
  '0.038*"마음" + 0.037*"가을" + 0.036*"사람" + 0.026*"조심" + 0.023*"주말" + 0.022*"사랑" '
  '+ 0.021*"가족" + 0.020*"감기" + 0.019*"바람" + 0.018*"계절"'),
 (5,
  '0.116*"금융" + 0.073*"센터" + 0.050*"종합" + 0.038*"안녕" + 0.037*"연락" + 0.035*"전담" '
  '+ 0.029*"사항" + 0.021*"예금" + 0.021*"확인" + 0.017*"언제"'),
 (6,
  '0.053*"연금" + 0.050*"혜택" + 0.045*"가입" + 0.035*"경우" + 0.032*"공제" + 0.029*"납입" '
  '+ 0.028*"세액" + 0.027*"금액" + 0.027*"퇴직" + 0.025