In [11]:
# coding=utf-8
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
with open('./语料库/语料库/docs/doc_names.txt', 'r', encoding='utf-8', errors='ignore')as fn:
    names = fn.read().splitlines()
train = []
for name in names:
    # 将分完词的文档加载成符合gensim文格式的输入
    # print(name)
    with open('./语料库/语料库/docs/' + name, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f.readlines():
            line = [wnl.lemmatize(word.strip().lower()) for word in line.split() if word.lower() not in stopwords.words('english')]
            train.append(line)

with open('pre_process.txt', 'w', encoding='utf-8') as f:
    for item in train:
        for it in item:
            f.write(it)

In [12]:
# 构造词典
dictionary = corpora.Dictionary(train)
feature_cnt = len(dictionary.token2id)  # 词典中词的数量
dictionary.save('dict.txt')  # 保存生成的词典,用于以后加载
# dictionary=Dictionary.load('dict.txt')#加载词典

# 基于词典，将【分词列表集】转换成【向量集】，形成【语料库】
corpus = [dictionary.doc2bow(text) for text in train]

# 使用【TF-IDF模型】处理语料库
tfidf_model = models.TfidfModel(corpus)

# 打印模型参数：文档数量与语料库单词数
print(tfidf_model)

# 存储通过tf-idf转化过的文档
with open('tfidf_doc.txt', 'w', encoding='utf-8') as fr:
    for doc in tfidf_model[corpus]:
        fr.write(doc.__str__() + '\n')

TfidfModel(num_docs=91446, num_nnz=2011109)


In [None]:
import gensim
import matplotlib.pyplot as plt
from gensim.models import CoherenceModel

coherence = []
# 假设主题数量在1-20中进行选择
for n in range(1, 21):   
    # Multi-core implementation:
    lda = gensim.models.LdaMulticore(corpus, num_topics = n, id2word = dictionary, passes = 10, workers = 2)
    
    # ompute coherence for each lda model with different number of topics
    cohm = CoherenceModel(model = lda, corpus = corpus, dictionary = dictionary, coherence = 'u_mass')
    coh = cohm.get_coherence()
    coherence.append(coh)

In [None]:
print(coherence)
topic = [i for i in range(1,21)]
plt.plot(topic,coherence)
plt.show()

In [13]:
# 利用lda模型对语料库进行建模，设置模型的主题的个数、迭代次数
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, chunksize=1000, iterations=2000)
# 打印LDA模型参数
print(lda)

# 存储LDA文档结果：存储文档-主题分布
with open('doc_dis.txt', 'w') as f1:
    # for doc in lda_corpus:
    for doc in lda[corpus]:
        f1.write(doc.__str__() + '\n')
        f1.flush()
        
# 存储LDA文档结果：存储主题-词分布
with open('topic_dis.txt', 'w') as f2:
    for topic in lda.get_topics():
        f2.write(topic.__str__() + '\n')
        f2.flush()

LdaModel(num_terms=153269, num_topics=5, decay=0.5, chunksize=1000)


In [5]:
for topic in lda.print_topics(num_topics=5, num_words=30):
    print(topic)

(0, '0.039*"zte" + 0.019*"u" + 0.017*"ban" + 0.015*"deal" + 0.012*"company" + 0.012*"bill" + 0.012*"lawmaker" + 0.012*"senate" + 0.011*"department" + 0.011*"chinese" + 0.010*"trump" + 0.010*"billion" + 0.010*"said" + 0.010*"play" + 0.009*"commerce" + 0.009*"25%" + 0.008*"defense" + 0.008*"company\'s" + 0.008*"share" + 0.008*"want" + 0.008*"lost" + 0.008*"first" + 0.008*"still" + 0.008*"trading" + 0.008*"cornyn" + 0.008*"resumed" + 0.008*"measure" + 0.007*"president" + 0.007*"last" + 0.007*"hong"')
(1, '0.006*"wrong?" + 0.006*"play" + 0.006*"actually" + 0.005*"zte" + 0.005*"u" + 0.005*"deal" + 0.005*"billion" + 0.005*"ban" + 0.005*"first" + 0.005*"chinese" + 0.005*"said" + 0.005*"company" + 0.005*"lost" + 0.005*"department" + 0.005*"president" + 0.005*"bill" + 0.005*"senate" + 0.005*"still" + 0.005*"measure" + 0.005*"trump" + 0.005*"lawmaker" + 0.005*"share" + 0.005*"last" + 0.005*"cornyn" + 0.005*"company\'s" + 0.005*"hong" + 0.005*"commerce" + 0.005*"republican" + 0.005*"25%" + 0.005*

In [14]:
# Visualize the topics
import pyLDAvis.gensim_models
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

vis = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary,mds='mmds')
pyLDAvis.save_html(vis, './vis.html')


  default_term_info = default_term_info.sort_values(
