In [2]:
from gensim import corpora
from pprint import pprint
# Toy corpus: each entry is one document whose tokens are already
# separated by single spaces, so no word segmenter is needed.
documents = [
    "你好 好的 不错 笨蛋",
    "笨蛋 傻瓜 傻子 哈哈",
    "好的 不错 你好 哈哈",
    "有趣 可以 好的 不错 还行",
    "傻瓜 傻子 二货 还行",
    "可以 好的 不错 哈哈",
    "有趣 有趣 哈哈 哈哈",
]

# Tokenise on whitespace; str.split() already returns a fresh list per
# document, so no inner comprehension is required.
texts = [doc.split() for doc in documents]
pprint(texts)

[['你好', '好的', '不错', '笨蛋'],
 ['笨蛋', '傻瓜', '傻子', '哈哈'],
 ['好的', '不错', '你好', '哈哈'],
 ['有趣', '可以', '好的', '不错', '还行'],
 ['傻瓜', '傻子', '二货', '还行'],
 ['可以', '好的', '不错', '哈哈'],
 ['有趣', '有趣', '哈哈', '哈哈']]


In [3]:
from collections import defaultdict
# Corpus-wide occurrence count for every token (missing keys start at 0).
frequency = defaultdict(int)
for tok in (t for doc_tokens in texts for t in doc_tokens):
    frequency[tok] += 1

# Keep only tokens seen at least twice across the whole corpus; tokens that
# occur a single time are dropped from their document.
# Note: `texts` is rebound here, so later cells see the filtered token lists.
texts = [
    [tok for tok in doc_tokens if frequency[tok] >= 2]
    for doc_tokens in texts
]
pprint(texts)

[['你好', '好的', '不错', '笨蛋'],
 ['笨蛋', '傻瓜', '傻子', '哈哈'],
 ['好的', '不错', '你好', '哈哈'],
 ['有趣', '可以', '好的', '不错', '还行'],
 ['傻瓜', '傻子', '还行'],
 ['可以', '好的', '不错', '哈哈'],
 ['有趣', '有趣', '哈哈', '哈哈']]


In [4]:
# Build the token <-> integer-id mapping from the filtered token lists.
dictionary = corpora.Dictionary(texts)
# Persist the mapping; the relative path means it is written to the
# current working directory.
dictionary.save('sample.dict')
# Printing gives a short summary (number of unique tokens and a sample).
print(dictionary)

Dictionary(10 unique tokens: ['不错', '你好', '好的', '笨蛋', '傻子']...)


In [5]:
# Convert every token list into a sparse bag-of-words vector: a list of
# (token_id, count) pairs, using the ids assigned by `dictionary`.
corpus = [dictionary.doc2bow(text) for text in texts]
# Store the bag-of-words corpus on disk as 'sample.mm' (relative path, so it
# lands in the current working directory).
corpora.MmCorpus.serialize('sample.mm', corpus)
pprint(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(3, 1), (4, 1), (5, 1), (6, 1)],
 [(0, 1), (1, 1), (2, 1), (6, 1)],
 [(0, 1), (2, 1), (7, 1), (8, 1), (9, 1)],
 [(4, 1), (5, 1), (9, 1)],
 [(0, 1), (2, 1), (6, 1), (7, 1)],
 [(6, 2), (8, 2)]]


In [6]:
from gensim import models, similarities
# Fit a TF-IDF model on the bag-of-words corpus.
tf_idf = models.TfidfModel(corpus)
# Hand-written query in bag-of-words form: (token_id, count) pairs.
# NOTE(review): ids 0, 5 and 7 are only meaningful for the current
# `dictionary`; if the corpus or filtering changes they will point at
# different tokens. Consider building the query with dictionary.doc2bow(...).
vec = [(0, 1), (5, 1), (7, 1)]
# Applying the model to a bag-of-words vector returns (token_id, weight) pairs.
print(tf_idf[vec])

[(0, 0.3011997233053068), (5, 0.6742695034927825), (7, 0.6742695034927825)]


In [31]:
# Build a similarity index over the TF-IDF-weighted corpus (cosine
# similarity, per the gensim documentation).
# num_features must match the vocabulary size, so it is taken from the
# dictionary rather than hard-coded; the index then stays valid if the
# corpus or the frequency filter above changes.
index = similarities.SparseMatrixSimilarity(
    tf_idf[corpus],
    num_features=len(dictionary),
)
# Similarity of the TF-IDF-weighted query against every indexed document,
# in corpus order (one score per document).
sims = index[tf_idf[vec]]
print(sims)

[0.17764111 0.5362396  0.18628117 0.13041778 0.3730879  0.26369086
 0.        ]


In [33]:
# Query the index with every corpus document at once: the result is the
# pairwise document-by-document similarity matrix (row i = document i
# compared against all indexed documents).
print(index[tf_idf[corpus]])

[[1.         0.12707432 0.8971963  0.3687666  0.         0.7456068
  0.        ]
 [0.12707432 1.         0.04964618 0.         0.58897054 0.05535543
  0.05321141]
 [0.8971963  0.04964618 0.99999994 0.38670254 0.         0.8478415
  0.06341496]
 [0.3687666  0.         0.38670254 1.         0.55752987 0.45820582
  0.47954306]
 [0.         0.58897054 0.         0.55752987 1.         0.
  0.        ]
 [0.7456068  0.05535543 0.8478415  0.45820582 0.         1.
  0.0707076 ]
 [0.         0.05321141 0.06341496 0.47954306 0.         0.0707076
  1.        ]]
