In [17]:
import numpy as np
from gensim import corpora, models, similarities
import jieba
import lda
import time
import warnings

In [18]:
def load_stopword(path='stopword.txt', encoding='utf-8'):
    """Read a stop-word list from a text file, one entry per line.

    Args:
        path: File to read. Defaults to ``'stopword.txt'`` in the current
            working directory, which is what the no-argument call has
            always used.
        encoding: Text encoding of the file. Defaults to UTF-8 so the
            result does not depend on the platform locale; pass ``None``
            to fall back to the platform default.

    Returns:
        A list with one string per line of the file, in file order, with
        surrounding whitespace stripped. Blank lines yield ``''`` entries.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    # The context manager closes the handle even if decoding fails midway.
    with open(path, encoding=encoding) as f_stop:
        return [line.strip() for line in f_stop]

In [19]:
# Fit a gensim LDA topic model on a whitespace-tokenised corpus (one document
# per line of news.dat), then print the topic mixture of ten random documents
# and the highest-weighted terms of every topic.
warnings.filterwarnings('ignore', category=RuntimeWarning)
print('初始化停止词列表 --')
t_start = time.time()
stop_words = load_stopword()
# Set copy for O(1) membership tests; `stop_words` itself stays a list.
stop_word_set = set(stop_words)

print('开始读入语料数据 -- ')
# Alternative corpus noted by the original author: LDA_test.txt
with open('news.dat', encoding='utf-8') as f:
    texts = [[word for word in line.strip().lower().split() if word not in stop_word_set]
             for line in f]
print('读入语料数据完成，用时%.3f秒' % (time.time() - t_start))
M = len(texts)
print('文本数目：%d个' % M)

print('正在建立词典 --')
dictionary = corpora.Dictionary(texts)
V = len(dictionary)
print('词的个数：', V)
print('正在计算文本向量 --')
corpus = [dictionary.doc2bow(text) for text in texts]
print('正在计算文档TF-IDF --')
t_start = time.time()
corpus_tfidf = models.TfidfModel(corpus)[corpus]
print('建立文档TF-IDF完成，用时%.3f秒' % (time.time() - t_start))
print('LDA模型拟合推断 --')
num_topics = 10
t_start = time.time()
# NOTE: the name `lda` rebinds (shadows) the `lda` package imported at the top
# of the notebook. It is kept because later cells may refer to the model by
# this name.
lda = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary,
    alpha=0.01, eta=0.01, minimum_probability=0.001,
    update_every = 1, chunksize = 100, passes = 1)
print('LDA模型完成，训练时间为\t%.3f秒' % (time.time() - t_start))

# Topic distribution of ten randomly chosen documents.
num_show_topic = 10  # how many of the strongest topics to show per document
print('10个文档的主题分布：')
doc_topics = lda.get_document_topics(corpus_tfidf)  # topic distribution of every document
idx = np.arange(M)
np.random.shuffle(idx)
idx = idx[:10]
for i in idx:
    # Rows are (topic_id, probability) pairs. Topics below minimum_probability
    # are omitted, so a row's position is not necessarily its topic id.
    topic = np.array(doc_topics[i])
    print('topic = \t', topic)
    topic_distribute = np.array(topic[:, 1])
    # Row positions of the strongest topics, highest probability first.
    topic_idx = topic_distribute.argsort()[:-num_show_topic-1:-1]
    # Map row positions back to real topic ids before reporting them.
    topic_ids = topic[topic_idx, 0].astype(int)
    print(('第%d个文档的前%d个主题：' % (i, num_show_topic)), topic_ids)
    print(topic_distribute[topic_idx])

# Highest-weighted terms of every topic.
num_show_term = 7   # how many terms to show per topic
print('每个主题的词分布：')
for topic_id in range(num_topics):
    print('主题#%d：\t' % topic_id)
    # Request exactly num_show_term terms instead of slicing the default
    # top-10 result, so values above 10 also work.
    term_distribute_all = lda.get_topic_terms(topicid=topic_id, topn=num_show_term)
    term_distribute = np.array(term_distribute_all)
    # `np.int` was removed in NumPy 1.24; the builtin `int` is the replacement.
    term_id = term_distribute[:, 0].astype(int)
    print('词：\t', end=' ')
    for t in term_id:
        # Dictionary.__getitem__ builds the id -> token map on demand, whereas
        # reading `id2token` directly relies on it having been filled earlier.
        print(dictionary[t], end=' ')
    print()

初始化停止词列表 --
开始读入语料数据 -- 
读入语料数据完成，用时13.506秒
文本数目：2043个
正在建立词典 --
词的个数： 63871
正在计算文本向量 --
正在计算文档TF-IDF --
建立文档TF-IDF完成，用时0.272秒
LDA模型拟合推断 --
LDA模型完成，训练时间为	7.542秒
10个文档的主题分布：
topic = 	 [[0.00000000e+00 1.96805992e-03]
 [1.00000000e+00 3.23084086e-01]
 [2.00000000e+00 1.96805992e-03]
 [3.00000000e+00 5.10830939e-01]
 [4.00000000e+00 1.96805992e-03]
 [5.00000000e+00 1.96805992e-03]
 [6.00000000e+00 1.96805992e-03]
 [7.00000000e+00 1.52308598e-01]
 [8.00000000e+00 1.96805992e-03]
 [9.00000000e+00 1.96805992e-03]]
第601个文档的前10个主题： [3 1 7 9 8 6 5 4 2 0]
[0.51083094 0.32308409 0.1523086  0.00196806 0.00196806 0.00196806
 0.00196806 0.00196806 0.00196806 0.00196806]
topic = 	 [[0.00000000e+00 1.92962878e-03]
 [1.00000000e+00 3.99934411e-01]
 [2.00000000e+00 7.76924491e-02]
 [3.00000000e+00 1.92962878e-03]
 [4.00000000e+00 1.92962878e-03]
 [5.00000000e+00 4.51218300e-02]
 [6.00000000e+00 1.92962878e-03]
 [7.00000000e+00 4.65673536e-01]
 [8.00000000e+00 1.92962878e-03]
 [9.00000000e+00 1.92962878e