## Mallet-TM : 
### Extract Topic Model with LdaMallet from KCC News and CIS test data

In [1]:
# -*- coding: utf-8 -*-
"""
Input: "KCC Data/CkipNewsCisTest-KccDict2020.txt"
Output: "KCC Data/NewsCisTest-KccDict2020-MalletNum12.txt"

@author: johnson
"""
# When numpy version = 1.19.2 => gensim version should downgrade to 3.8.3
from gensim import corpora, models, utils
from gensim.models import LdaModel
from gensim.models import CoherenceModel

### read data from txt file, and tokenize words, clean-up text

In [2]:
fList = []
with open("Ly Data/lu_ws.txt","r", encoding='utf-8-sig') as files:
     for file in files:
        #simple_preprocess: tokenize,去除標點 - gensim.utils.simple_preprocess(doc, deacc=False, min_len=2, max_len=15)
        fList.append(list(utils.simple_preprocess(file, deacc=True, min_len=2,)))
print("fList[:2] = ", fList[:2])
print("------------------------------")

fList[:2] =  [['呂委員孫綾', '行政院', '院長', '副院長', '部會', '首長', '同仁', '新北市', '選區', '立法委員', '選區', '包括', '淡水', '三芝', '石門', '八里', '林口', '泰山', '地方', '選區', '發電廠', '請問', '院長', '發電場', '性質', '行政院', '院長', '答復', '張院長善政', '委員', '呂委員孫綾', '電廠', '位於', '性質', '張院長善政', '核能', '電廠', '位在', '金山', '石門', '呂委員孫綾', '張院長善政', '金山', '呂委員孫綾', '金山', '選區', '選區', '林口', '泰山', '淡水', '三芝', '石門', '八里', '張院長善政', '林口', '電廠', '核能', '電廠', '呂委員孫綾', '火力發電廠', '張院長善政', '燃煤', '呂委員孫綾', '發電廠', '居民', '區域', '居民', '影響', '本席', '請教', '二號', '除役', '張院長善政', '經濟部長', '部長', '振中', '核發', '年限', '除役', '呂委員孫綾', '部長', '振中', '主任委員', '源卿', '呂委員孫綾', '號機', '二號機', '除役', '部長', '振中', '原能會', '主委', '指正', '呂委員孫綾', '號機', '二號機', '機會', '提前', '除役', '張院長善政', '目前', '台電', '規劃', '去年', '年底', '送出', '除役', '計畫', '部長', '振中', '對不起', '提前', '除役', '貯存', '料棒', '燃料池', '漸漸', '找不出', '方法', '解決', '外運', '放置', '中期', '貯存槽', '被迫', '除役', '年限', '屆滿', '運轉', '呂委員孫綾', '部長', '面臨', '損害', '變形', '瑕疵', '天災', '災變', '部長', '振中', '說到', '天災', '發生', '運轉', '台電', '注意', '原能會', '監督', '細節', '運作', '嚴密', '計畫', '監督'

### creat dictionary for corpus

In [3]:
# creat a unique id for each word segment
dict = corpora.Dictionary(list(fList))
print("dict = ", dict)
print("------------------------------")

dict =  Dictionary(9855 unique tokens: ['google', '七十多', '三芝', '不利', '不合時宜']...)
------------------------------


### creat corpus need for Topic Modeling

In [4]:
# creat mapping matrix of [word_id, word_frequency] for each word segment
corpus = [dict.doc2bow(text) for text in fList]   
print("corpus[:1] = ", corpus[:1])  
print("================================================")

print("print lists of [word:frequency]")
[[print((dict[id], freq)) for id,freq in cp] for cp in corpus[:1]]
print("======================================================")

corpus[:1] =  [[(0, 4), (1, 1), (2, 3), (3, 1), (4, 1), (5, 4), (6, 3), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 2), (13, 1), (14, 1), (15, 2), (16, 4), (17, 1), (18, 2), (19, 1), (20, 1), (21, 6), (22, 2), (23, 1), (24, 2), (25, 1), (26, 5), (27, 5), (28, 2), (29, 1), (30, 1), (31, 2), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 7), (39, 1), (40, 1), (41, 5), (42, 1), (43, 1), (44, 1), (45, 2), (46, 2), (47, 2), (48, 5), (49, 4), (50, 1), (51, 3), (52, 3), (53, 2), (54, 10), (55, 2), (56, 1), (57, 1), (58, 1), (59, 2), (60, 1), (61, 2), (62, 1), (63, 1), (64, 1), (65, 2), (66, 1), (67, 1), (68, 3), (69, 6), (70, 1), (71, 1), (72, 3), (73, 1), (74, 2), (75, 6), (76, 3), (77, 1), (78, 1), (79, 3), (80, 4), (81, 1), (82, 1), (83, 1), (84, 2), (85, 1), (86, 1), (87, 1), (88, 59), (89, 1), (90, 1), (91, 2), (92, 1), (93, 2), (94, 1), (95, 1), (96, 1), (97, 2), (98, 7), (99, 1), (100, 3), (101, 3), (102, 1), (103, 1), (104, 1), (105, 5), (106, 14), (107, 1), (108, 1), (

### building the Topic Model from MALLET
#### enviroment setting for Mallet LDA: https://programminghistorian.org/en/lessons/topic-modeling-and-mallet#installing-mallet

In [5]:
mallet_path = "/home/deadfate-sky/miniconda3/envs/idp/bin/mallet"
ldamal = models.wrappers.LdaMallet(mallet_path, corpus=corpus, id2word=dict, num_topics=12)
output = open("KCC Data/NewsCisTest-KccDict2020-MalletNum12.txt", 'w', encoding='utf-8-sig')

#print(ldamal.show_topics(formatted=False))
# 列出最重要的前若干个主题
lsTM = ldamal.show_topics(num_topics=12, num_words=20, log=False, formatted=False)
for tupleTM in lsTM:
        str1 = "".join(str(x) for x in tupleTM)
        #print("type(str1) = ", type(str1))
        str1 += "\n"
        print("strTM = ", str1)
        output.write(str1)
output.close()
 
#  print("------------------------------")
#  print("Perplexity = ", ldamal.log_perplexity(corpus))

print("------------------------------")
cm = models.CoherenceModel(model=ldamal, texts=list(fList), dictionary= dict, coherence= 'c_v')
coher_lda = cm.get_coherence()
print("\n Coherence Score: ", coher_lda)


strTM =  0[('委員', 0.04074654509189343), ('中心', 0.020373272545946716), ('關心', 0.017096452486109134), ('包括', 0.01438951417580852), ('目前', 0.014247043738424277), ('同仁', 0.012964809801966092), ('署長', 0.012537398489813363), ('努力', 0.011967516740276393), ('後續', 0.011397634990739421), ('辦理', 0.010970223678586693), ('裡面', 0.010970223678586693), ('需要', 0.010970223678586693), ('部會', 0.010115401054281237), ('想要', 0.009403048867360023), ('方式', 0.009403048867360023), ('預算', 0.009260578429975781), ('政府', 0.009118107992591537), ('國際', 0.008548226243054567), ('環境', 0.007835874056133353), ('衛福部', 0.007550933181364867)]

strTM =  1[('部長', 0.15458358298382266), ('呂委員孫綾', 0.05542240862792091), ('委員', 0.029958058717795086), ('國勇', 0.027112043139604552), ('相關', 0.014230077890952667), ('計畫', 0.013930497303774715), ('文化', 0.012582384661473937), ('同仁', 0.01078490113840623), ('解決', 0.010035949670461354), ('住戶', 0.009286998202516477), ('溝通', 0.009286998202516477), ('預算', 0.009137207908927502), ('本席', 0.008987417

### Finding the optimal number of topics

In [6]:
def compute_coherence_values(dictionary, corpus, texts, limit, start, step):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    # global mallet_path, dict
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dict)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        coher = coherencemodel.get_coherence()
        print("Num_topics: ", num_topics, " Coherence Score= ", coher)
    return model_list, coherence_values

In [7]:
lim=24; sta=3; ste=3;
model_list, coherence_values = compute_coherence_values(dictionary=dict, corpus=corpus, texts=list(fList), limit=lim, start=sta, step=ste)

Num_topics:  3  Coherence Score=  0.37159427035761744
Num_topics:  6  Coherence Score=  0.4398073626584737
Num_topics:  9  Coherence Score=  0.455548194981646
Num_topics:  12  Coherence Score=  0.44167455628456337
Num_topics:  15  Coherence Score=  0.4344170108470281
Num_topics:  18  Coherence Score=  0.4328278099244979
Num_topics:  21  Coherence Score=  0.41231484948267716


In [8]:
# Show graph
import matplotlib.pyplot as plt

limit=lim; start=sta; step=ste;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

<Figure size 640x480 with 1 Axes>

In [9]:
# Print the coherence scores
optNum = 0
maxCV = 0
for m, cv in zip(x, coherence_values):
    # maxCV = lambda cv, maxCV: cv if cv > maxCV else maxCV
    if cv > maxCV:
        optNum = m
        maxCV = cv 
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))
print("Optimal Topic Num =", optNum, " Highest Value =", maxCV)

Num Topics = 3  has Coherence Value of 0.3716
Num Topics = 6  has Coherence Value of 0.4398
Num Topics = 9  has Coherence Value of 0.4555
Num Topics = 12  has Coherence Value of 0.4417
Num Topics = 15  has Coherence Value of 0.4344
Num Topics = 18  has Coherence Value of 0.4328
Num Topics = 21  has Coherence Value of 0.4123
Optimal Topic Num = 9  Highest Value = 0.455548194981646


###  Select the model and print the topics

In [10]:
len(model_list)

7

In [11]:
from pprint import pprint

optimal_model = model_list[6]
# model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=20))

[(19,
  '0.029*"呂委員孫綾" + 0.029*"發生" + 0.025*"訊息" + 0.020*"狀況" + 0.019*"系統" + '
  '0.016*"國人" + 0.015*"有沒有" + 0.012*"測試" + 0.012*"聯繫" + 0.012*"災害" + '
  '0.012*"地方" + 0.011*"影響" + 0.011*"政府" + 0.011*"消息" + 0.011*"防災" + 0.010*"委員" '
  '+ 0.010*"台灣" + 0.010*"建立" + 0.010*"需要" + 0.009*"時間"'),
 (5,
  '0.060*"部長" + 0.049*"國軍" + 0.035*"國防部" + 0.032*"呂委員孫綾" + 0.024*"世寬" + '
  '0.021*"委員" + 0.020*"官兵" + 0.020*"女性" + 0.014*"德發" + 0.012*"部隊" + 0.010*"訓練" '
  '+ 0.010*"營區" + 0.009*"機制" + 0.009*"弟兄" + 0.008*"軍官" + 0.008*"募兵制" + '
  '0.007*"學生" + 0.007*"軍事" + 0.006*"軍人" + 0.006*"提到"'),
 (4,
  '0.070*"委員長" + 0.036*"僑委會" + 0.029*"新興" + 0.027*"海外" + 0.023*"呂委員孫綾" + '
  '0.022*"僑胞" + 0.021*"委員" + 0.021*"服務" + 0.016*"台商" + 0.013*"台灣" + 0.012*"成立" '
  '+ 0.012*"僑界" + 0.010*"僑務" + 0.010*"臺灣" + 0.010*"僑校" + 0.008*"副委員長" + '
  '0.008*"組織" + 0.008*"國家" + 0.008*"中心" + 0.008*"協助"'),
 (7,
  '0.042*"醫療" + 0.029*"需要" + 0.020*"資源" + 0.020*"檢討" + 0.018*"提供" + 0.017*"委員" '
  '+ 0.016*"需求" + 0.014*"政府" + 0.013*"努力" + 0

In [14]:
with open('lu_LDA.txt', 'w') as out:
    pprint(optimal_model.print_topics(num_words=20), stream=out)