## Load packages and data

In [1]:
# Import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from bertopic import BERTopic

# Gensim
import gensim
from gensim.corpora import Dictionary
from gensim import corpora
from gensim.models.ldamodel import LdaModel

# Scipy
import scipy.sparse as sp
from scipy.sparse import csr_matrix


# Sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Visualization
import pyLDAvis
import pyLDAvis.sklearn

2023-03-13 11:08:55.968014: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# 讀入document-term dataframe
res = pd.read_csv("/Users/deankuo/Desktop/python/dissertation_replicate/Catalinac_TDM_new.csv", encoding="UTF-8")
# remove the first column 'Term' from the data frame
res2 = res.iloc[:, 1:].values

# set row names as the values of the first column
row_names = res.iloc[:, 0].values
res2 = pd.DataFrame(res2, index=row_names) # type: ignore
res2 = res2.transpose()
res2

Unnamed: 0,政治,日本,社会,実現,改革,国民,教育,企業,年金,ひと,...,都会,度目,党員,灯,難問,爆発,本質,無法,夜明け,郎
0,1,1,1,0,2,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,0,2,3,0,0,4,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,16,0,3,2,0,4,2,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,2,0,2,4,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,5,0,2,0,5,1,2,0,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7492,3,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7493,0,3,0,0,1,0,7,1,3,1,...,0,0,0,0,0,0,0,0,0,0
7494,0,1,1,2,0,1,1,0,3,2,...,0,0,0,0,0,0,0,0,0,0
7495,0,0,0,2,2,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


## Transfer to applicable data type (sparse matrix)

In [3]:
def create_bow_from_sparse_df(df):
    bow = []
    for i in range(df.shape[0]):
        doc_words = ''
        for j, val in zip(df.columns, df.iloc[i]):
            if val > 0:
                doc_words += (j + ' ') * val
        bow.append(doc_words)
    return bow

df_bow = create_bow_from_sparse_df(res2)
print(df_bow[0]) # 1986 愛知県第１区 田辺広雄 Manifesto

政治 日本 社会 改革 改革 企業 企業 経済 経済 目指す 地域 充実 平和 支援 安心 中小 中小 政策 作り 子供 推進 保障 世界 高齢 整備 確立 産業 地方 豊か 問題 国際 国際 国際 安全 振興 働く 道路 議員 活性 行政 行政 中心 外交 確保 減税 建設 まち まち まち 国政 年間 年間 文化 文化 文化 努力 努力 努力 努力 交通 育てる 育てる 育てる 都市 解決 経験 希望 基盤 住む 住む 住む 住む 住む 住む 持てる 年寄り 地元 生きる 網 網 地場 経営 東京 役割 明るい 生きがい 生きがい 公平 果たす 力強い 立候補 明日 青少年 スポーツ 圏 圏 圏 圏 誇り 情熱 交流 展望 センター 備える 早急 愛する 会議 十分 通じる 広い 大阪 楽しい 役に立つ のびのび 中核 独自 中部 中部 中部 喜び パイプ役 捧げる 空 携わる 併せる グループ お世話 その間 好き 造り 昨今 名古屋 名古屋 名古屋 名古屋 名古屋 名古屋 名古屋 名古屋 名古屋 名古屋 名古屋 名古屋 意義 日頃 


In [4]:
vectorizer = CountVectorizer(analyzer='word')
data_vectorized = vectorizer.fit_transform(df_bow)

In [5]:
# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=69,           # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
lda_model.fit(data_vectorized)
print(lda_model)  # Model attributes

LatentDirichletAllocation(learning_method='online', n_components=69, n_jobs=-1,
                          random_state=100)


In [6]:
# Log Likelihood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))
# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))
# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -8392825.579108145
Perplexity:  930.5941710688556
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 69,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


In [13]:
# Create Document — Topic Matrix
lda_output = lda_model.transform(data_vectorized) # type: ignore
# column names
topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)] # type: ignore
# index names
docnames = [name for name in res.columns[1:]]
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames) # type: ignore
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    color = 'green' if val > .5 else 'black'
    return 'color: {col}'.format(col=color)
def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)
# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,Topic25,Topic26,Topic27,Topic28,Topic29,Topic30,Topic31,Topic32,Topic33,Topic34,Topic35,Topic36,Topic37,Topic38,Topic39,Topic40,Topic41,Topic42,Topic43,Topic44,Topic45,Topic46,Topic47,Topic48,Topic49,Topic50,Topic51,Topic52,Topic53,Topic54,Topic55,Topic56,Topic57,Topic58,Topic59,Topic60,Topic61,Topic62,Topic63,Topic64,Topic65,Topic66,Topic67,Topic68,dominant_topic
X1986.1.愛知県第１区ﾀﾅﾍﾞﾋﾛｵ田辺広雄.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.38,0.0,0.0,0.0,0.1,0.0,0.45,0.0,0.0,0.0,0.0,0.0,63
X1986.10.愛知県第２区ｸｻｶﾜｼｮｳｿﾞｳ草川昭三.txt,0.0,0.11,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.16,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.13,0.0,0.06,0.0,0.0,0.0,0.0,0.0,27
X1986.100.愛媛県第２区ﾑﾗｶﾐｾｲｲﾁﾛｳ村上誠一郎.txt,0.0,0.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.0,0.0,0.0,0.0,0.0,0.44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32
X1986.101.愛媛県第２区ｵﾁｲﾍｲ越智伊平.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.29,0.0,0.35,0.0,0.0,0.0,0.0,0.0,0.29,0.0,0.0,0.0,0.0,0.0,57
X1986.102.愛媛県第２区ｵｵｺｳﾁｲﾁﾛｳ大河内一郎.txt,0.0,0.11,0.0,0.0,0.82,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
X1986.103.愛媛県第２区ﾌｼﾞﾀﾀｶﾄｼ藤田高敏.txt,0.0,0.68,0.0,0.0,0.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.06,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,1
X1986.104.愛媛県第２区ﾓﾘｷﾖｼ森清.txt,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.2,0.0,0.19,0.0,0.0,0.0,0.0,0.01,0.4,0.0,0.0,0.0,0.0,0.0,63
X1986.105.愛媛県第３区ｲﾅｶﾞｷﾄﾖﾋｺいながき豊彦.txt,0.02,0.29,0.0,0.0,0.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
X1986.106.愛媛県第３区ﾆｼﾀﾞﾏﾓﾙ西田司.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.66,0.0,0.0,0.0,0.1,0.0,0.1,0.0,0.0,0.0,0.0,0.0,57
X1986.107.愛媛県第３区ﾀﾅｶﾂﾈﾄｼ田中恒利.txt,0.0,0.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.15,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,1


In [8]:
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,57,1829
1,27,965
2,55,845
3,19,813
4,22,676
5,4,657
6,52,422
7,29,294
8,1,216
9,8,206


In [9]:
# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(lda_model.components_) # type: ignore

# Assign Column and Index
df_topic_keywords.columns = vectorizer.get_feature_names_out()
df_topic_keywords.index = topicnames
# View
df_topic_keywords.head(10)

Unnamed: 0,com,fta,gdp,it,jcp,jr,npo,oda,pko,up,...,高等,高級,高速,高額,高騰,高齢,魅力,鳩山,鹿児島,黒字
Topic0,0.014493,0.014493,0.014493,0.014493,0.014496,0.015964,0.014493,0.014494,0.015357,0.014493,...,0.014493,0.014493,0.014493,0.014493,0.014493,0.014497,0.014493,0.014493,0.014493,0.014493
Topic1,0.014493,0.014493,23.450334,0.014493,0.014493,0.014493,0.014493,0.014498,0.022501,1.770248,...,0.014493,0.014493,0.014877,0.0145,0.014493,13.653132,0.014493,0.014493,0.014493,0.014493
Topic2,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,...,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493
Topic3,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,...,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493
Topic4,0.014493,0.014493,0.014493,0.014493,0.014644,0.014493,0.014493,0.014493,0.014493,0.014493,...,0.014493,0.014493,0.01458,0.014493,0.014493,0.014525,0.014493,0.014493,0.016656,0.014493
Topic5,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,...,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493
Topic6,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,...,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493
Topic7,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014509,0.014493,...,0.014493,0.014493,0.014493,0.014493,0.014493,0.019193,0.014493,0.014493,0.014493,0.014493
Topic8,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014493,0.014502,...,0.014493,0.014493,0.015389,0.014494,0.014493,228.844279,0.014493,0.014493,0.014493,0.014494
Topic9,0.014493,0.014493,0.014493,0.014493,0.014494,0.014493,0.014541,0.014576,0.014493,0.014591,...,0.014493,0.014493,92.225941,0.014493,0.014493,0.016685,0.014493,0.014493,0.014493,0.014493


In [10]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_keywords=15):
    keywords = np.array(vectorizer.get_feature_names_out())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_keywords]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

# Topic - Keywords Dataframe
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_keywords=15)
df_topic_keywords = pd.DataFrame(topic_keywords).transpose()
df_topic_keywords.index = ['Word ' + str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords.columns = ['Topic ' + str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords

Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,...,Topic 59,Topic 60,Topic 61,Topic 62,Topic 63,Topic 64,Topic 65,Topic 66,Topic 67,Topic 68
Word 0,暮らし,政治,郵便,税調,日本,報酬,徴収,知恵,政治,日本海,...,身勝手,消費,作り,増税,努力,使い道,中曽根,プラス,政治,当たり前
Word 1,予算,自民党,契約,弱肉強食,共産党,野放し,政治,先生,禁止,集中,...,おかしい,国民,地域,無駄,選挙,合計,福祉,白紙,行動,山梨
Word 2,政治,政権,障る,自公,国民,諫早,建設,立て直し,企業,新潟,...,切り替える,反対,ひと,事業,郷土,jcp,同日,流域,社会,福岡
Word 3,自民党,選挙,支払,退廃,政治,発達,振興,三位一体,献金,産業,...,出店,自由,まち,公共,対策,金額,政治,北九州,作る,太陽光
Word 4,残業,国民,据え置く,東アジア,守る,かばう,全力,東海,廃止,国土,...,すえおく,政治,子供,税金,企業,資料,選挙,被災,放棄,オリンピック
Word 5,中小,実現,くみ,空き,企業,退廃,目指す,埼玉,国民,移行,...,浪費,廃止,安心,ルール,農林,自公,政策,花開く,もの,風力
Word 6,サービス,減税,新進党,おくれる,自民党,自公,選挙,岐阜,政権,新幹線,...,民主党,民主,作る,財政,問題,逆立ち,社会,空前,解決,欠陥
Word 7,正す,生活,据え置き,得る,平和,マニフェスト,使い道,知育,無料,北陸,...,区長,守る,産業,日本,発展,イラク,年金,欠陥,ひと,実験
Word 8,規制,中曽根,育ち,不安,反対,可動,教育,体育,金権,富山,...,無党派,農業,いかす,作る,つとめる,無償,拡大,つくれる,重要,短縮
Word 9,日本,腐敗,節約,ボランティア,やめる,人件,申し上げる,体力,団体,高速,...,新進党,税率,未来,再建,図る,年余,確立,通用,デフレ,決着


In [11]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
vis