In [1]:
from gensim.test.utils import common_texts
from gensim.models import TfidfModel, LdaModel, LsiModel
from gensim.corpora import Dictionary

# 一、数据加载

In [2]:
# 构建字典
common_dictionary = Dictionary(common_texts)
# 各个文本对应的词袋法的值
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
print("原始数据:\n{}".format(common_texts))
print("\n词袋法后的值:\n{}".format(common_corpus))
print(f"文本数目:{len(common_texts)}")
print(f"去重后单词数目:{len(common_dictionary)}")

原始数据:
[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]

词袋法后的值:
[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]
文本数目:9
去重后单词数目:12


In [3]:
other_texts = [
    ['computer', 'time', 'graph'],  # 文本1
    ['survey', 'response', 'eps'],  # 文本2
    ['human', 'system', 'computer']  # 文本3
]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
print("测试数据对应的词袋法的值:\n{}".format(other_corpus))

测试数据对应的词袋法的值:
[[(0, 1), (6, 1), (10, 1)], [(3, 1), (4, 1), (8, 1)], [(0, 1), (1, 1), (5, 1)]]


# 二、TF-IDF Model

In [4]:
# 模型构建
model = TfidfModel(corpus=common_corpus)

In [5]:
# 预测
vectors = model[other_corpus]
for vector in vectors:
    print(vector)

[(0, 0.6282580468670046), (6, 0.6282580468670046), (10, 0.45889394536615247)]
[(3, 0.5773502691896257), (4, 0.5773502691896257), (8, 0.5773502691896257)]
[(0, 0.6282580468670046), (1, 0.6282580468670046), (5, 0.45889394536615247)]


# 三、LDA Model

In [6]:
# 模型构建&训练
model = LdaModel(common_corpus, num_topics=4)

In [7]:
# 模型保存
model.save('./datas/lda_model.pkl')

In [8]:
# 模型加载
lda = LdaModel.load('./datas/lda_model.pkl')

In [9]:
# 模型结果获取(文本向量)
vectors = lda[other_corpus]
for vector in vectors:
    print(vector)

[(0, 0.069767214), (1, 0.30470115), (2, 0.55993336), (3, 0.06559832)]
[(0, 0.067352876), (1, 0.068941414), (2, 0.5631249), (3, 0.3005808)]
[(0, 0.0669485), (1, 0.06265688), (2, 0.065133065), (3, 0.80526155)]


In [10]:
# 更新模型（在当前模型基础上继续更新模型参数）
lda.update(other_corpus)

In [11]:
# 更新后模型结果获取(文本向量)
vectors = lda[other_corpus]
for vector in vectors:
    print(vector)

[(0, 0.30812675), (1, 0.064296484), (2, 0.5623311), (3, 0.06524563)]
[(0, 0.06380742), (1, 0.064043365), (2, 0.57102084), (3, 0.30112836)]
[(0, 0.06360766), (1, 0.06259746), (2, 0.06405354), (3, 0.8097414)]


# 四、Other

官网文档：https://radimrehurek.com/gensim/apiref.html

In [12]:
print("各个单词对应的主题向量:")
word_embedding_tabel = model.get_topics().T
print(type(word_embedding_tabel))
print(word_embedding_tabel.shape)
print(word_embedding_tabel)

各个单词对应的主题向量:
<class 'numpy.ndarray'>
(12, 4)
[[0.02045472 0.04305287 0.10296427 0.12342904]
 [0.0964063  0.04281437 0.02170265 0.12337951]
 [0.02024666 0.0424786  0.02147618 0.22152357]
 [0.0201632  0.04322479 0.18441567 0.02626349]
 [0.02011306 0.20281161 0.10390142 0.02570693]
 [0.17394419 0.04368602 0.10411036 0.12415506]
 [0.02017469 0.04318488 0.18489635 0.02569816]
 [0.02039121 0.0436511  0.18560314 0.12395196]
 [0.09606148 0.04251812 0.02205596 0.12357894]
 [0.24436422 0.04705997 0.02380019 0.02820649]
 [0.17133532 0.2025251  0.02327944 0.02805221]
 [0.09634493 0.20299254 0.02179429 0.0260547 ]]


In [13]:
common_dictionary.token2id

{'computer': 0,
 'human': 1,
 'interface': 2,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'user': 7,
 'eps': 8,
 'trees': 9,
 'graph': 10,
 'minors': 11}