In [1]:
# 安装方式: pip install gensim -i https://pypi.tuna.tsinghua.edu.cn/simple/
import numpy as np
import gensim
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

### 字典演示

In [2]:
import jieba
# Three short sample documents; the first one contains embedded newlines.
text1 = "我是来自湖南张家界的小明，我喜好大海\n我从事IT相关工作\n我讨厌夏天"
text2 = "计算机视觉和自然语言我比较喜好自然语言的内容"
text3 = "我不想上班，我想出去玩"

# Tokenize every document with jieba after removing the newline characters,
# giving one list of tokens per document.
docs = [jieba.lcut(raw_text.replace('\n', '')) for raw_text in (text1, text2, text3)]

# Build the token <-> id vocabulary from the tokenized documents.
dct = Dictionary(docs)
print(f"去重后单词数目/词典大小:{len(dct)}")

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\HP\AppData\Local\Temp\jieba.cache
Loading model cost 1.917 seconds.
Prefix dict has been built successfully.


去重后单词数目/词典大小:26


In [3]:
# Dump the vocabulary to a text file so it can be inspected by hand.
dct.save_as_text('./datas/a.txt')

# token -> id lookup.
shangban_id = dct.token2id['上班']
print(shangban_id)

# id -> token lookups (indexing the dictionary with an integer id).
print(dct[22])
print(dct[0])

22
上班
IT


In [4]:
# Feature width = vocabulary size + 1: slot 0 is reserved for out-of-vocabulary words.
n = len(dct) + 1
text4 = "我是来自北京的小明"
text4_words = list(jieba.lcut(text4.replace('\n', '')))

# doc2idx marks unknown words with -1; adding 1 to every id moves unknown words
# to slot 0 and known words to 1..len(dct).
# .tolist() yields plain Python ints, so the printed list does not depend on
# NumPy's scalar repr (NumPy >= 2 prints elements as `np.int64(9)`).
result = (np.asarray(dct.doc2idx(text4_words, unknown_word_index=-1)) + 1).tolist()
print(text4_words)
print("序号化结果:")
print(result)

print("OneHot结果:")
# One row per token, one column per (shifted) id.
result2 = [[0] * n for _ in range(len(result))]
for i, _id in enumerate(result):
    # No `-1` guard needed: after the shift every id lies in 0..len(dct),
    # so the original `_id != -1` test could never be false.
    result2[i][_id] = 1
print(result2)

print("词袋法结果:")
# Summing the one-hot rows column-wise gives per-id occurrence counts.
result3 = np.sum(np.asarray(result2), 0).tolist()
print(result3)

['我', '是', '来自', '北京', '的', '小明']
序号化结果:
[9, 10, 11, 0, 13, 6]
OneHot结果:
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
词袋法结果:
[1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [5]:
# Same demo as before but without the reserved unknown slot: unknown words keep
# the index -1 and simply produce an all-zero one-hot row.
text4 = "我是来自北京的小明，我喜好玩游戏"
text4_words = list(jieba.lcut(text4.replace('\n', '')))

# Token -> id for every token; -1 is used for tokens missing from the dictionary.
result = dct.doc2idx(text4_words)
print(text4_words)
print("序号化结果:")
print(result)

print("OneHot结果:")
# One row per token, one column per dictionary id.
result2 = [[0] * len(dct) for _ in range(len(result))]
for i, _id in enumerate(result):
    # Skip unknown tokens; without this guard -1 would set the LAST column.
    if _id != -1:
        result2[i][_id] = 1
print(result2)

print("词袋法结果:")
# Column-wise sum of the one-hot rows = dense bag-of-words counts.
# .tolist() gives plain Python ints so the printed list is independent of
# NumPy's scalar repr (NumPy >= 2 prints elements as `np.int64(1)`).
result3 = np.sum(np.asarray(result2), 0).tolist()
print(result3)

# gensim's built-in sparse bag-of-words for the same tokens, for comparison.
result4 = dct.doc2bow(text4_words)
print(result4)

['我', '是', '来自', '北京', '的', '小明', '，', '我', '喜好', '玩游戏']
序号化结果:
[8, 9, 10, -1, 12, 5, 15, 8, 2, -1]
OneHot结果:
[[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
词袋法结果:
[0, 0, 1, 0, 0, 1, 0, 0, 2, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[(2

In [6]:
# Turn each tokenized sample document into its sparse bag-of-words form.
corpus = list(map(dct.doc2bow, docs))
# Fit a TF-IDF model on the bag-of-words corpus.
model = TfidfModel(corpus=corpus)

# TF-IDF weights of the first document as (token_id, weight) pairs.
first_doc_tfidf = model[corpus[0]]
print("维度大小:{}".format(np.shape(first_doc_tfidf)))
first_doc_tfidf

维度大小:(15, 2)


[(0, 0.28388205020418034),
 (1, 0.28388205020418034),
 (2, 0.10477241822549672),
 (3, 0.28388205020418034),
 (4, 0.28388205020418034),
 (5, 0.28388205020418034),
 (6, 0.28388205020418034),
 (7, 0.28388205020418034),
 (9, 0.28388205020418034),
 (10, 0.28388205020418034),
 (11, 0.28388205020418034),
 (12, 0.10477241822549672),
 (13, 0.28388205020418034),
 (14, 0.28388205020418034),
 (15, 0.10477241822549672)]

# 一、加载数据(数据预处理)

In [7]:
# Read the entire corpus file into memory.
with open('./datas/text8', 'r', encoding='utf-8') as reader:
    content = reader.read()

# Split on single spaces, drop pieces that are empty or whitespace-only,
# and encode every remaining word as UTF-8 bytes.
words = [piece.encode("utf-8") for piece in content.split(" ") if piece.strip()]
total_words = len(words)
print("总单词数目:{}".format(total_words))
print("【前10个单词】:{}".format(words[:10]))

# The dictionary/TF-IDF steps need documents (each a list of words), so
# simulate documents by cutting the word stream into fixed-size chunks.
word_per_doc = 10000
docs = [
    words[start_idx:start_idx + word_per_doc]
    for start_idx in range(0, total_words, word_per_doc)
]
print("总文档数目:{}".format(len(docs)))

总单词数目:17005207
【前10个单词】:[b'anarchism', b'originated', b'as', b'a', b'term', b'of', b'abuse', b'first', b'used', b'against']
总文档数目:1701


# 二、构建词典

In [8]:
# Build the vocabulary.
# `docs` must be a collection of documents, each of which is a sequence of
# individual words, e.g. [['a', 'bv', 'c'], ['a', 'c'], ['d', 'f', ' f']].
dct = Dictionary(documents=docs)

# Both lines report the vocabulary size: once via len() on the dictionary,
# once via its token -> id mapping.
print(f"词典大小:{len(dct)}")
print(f"{len(dct.token2id)}")

词典大小:253854
253854


# 三、BOW词袋法转换

In [9]:
# Bag-of-words conversion: the words known to `dct` are the features and the
# number of occurrences inside each document is the feature value.
corpus = list(map(dct.doc2bow, docs))

# 四、TF-IDF构建

In [10]:
# Fit the TF-IDF model on the bag-of-words corpus.
model = TfidfModel(corpus)

# 五、TF-IDF应用

In [11]:
# TF-IDF representation of the first document: a list of (token_id, weight) pairs.
first_doc_tfidf = model[corpus[0]]
print("维度大小:{}".format(np.shape(first_doc_tfidf)))
first_doc_tfidf

维度大小:(2505, 2)


[(1, 0.006704047545684609),
 (2, 0.0030255603220721273),
 (3, 0.003156168449586299),
 (4, 0.0036673470201144674),
 (5, 0.004575122435127926),
 (6, 0.0028052608258295926),
 (7, 0.004064820137019515),
 (8, 0.00014963587508918375),
 (9, 0.0007492665180478759),
 (10, 0.004142807322609117),
 (11, 0.004149816941645728),
 (12, 0.0077498817493309525),
 (13, 0.00656024165742503),
 (14, 0.003891486499758776),
 (15, 0.005476877392392166),
 (16, 0.0018233938817994433),
 (17, 0.0032209070754237084),
 (18, 0.0017737283389229173),
 (19, 0.0023373507198140124),
 (20, 0.003725514968930464),
 (21, 0.00590342512385848),
 (22, 0.003072401062545206),
 (23, 0.0006668171096292247),
 (24, 0.0017594266221832493),
 (25, 0.004202080158963513),
 (26, 0.002967397324595724),
 (27, 0.004709756138185673),
 (28, 0.0014819657487289912),
 (29, 0.0031562459553171694),
 (30, 0.0031999829254611097),
 (31, 0.001215574949729317),
 (32, 0.003843126241898761),
 (33, 0.006499414537896336),
 (34, 0.004546489373863172),
 (35, 0.0

In [12]:
# Vectorize documents that were not part of the fitted corpus.
others = [
    ['my', 'name', 'name', 'is', 'gerry'],
    ['my', 'name', 'is', 'xiaoming'],
]
# Map the tokens to (token_id, count) pairs using the existing dictionary.
other_corpus = list(map(dct.doc2bow, others))
# Indexing the model with a whole corpus transforms every document in it.
vectors = model[other_corpus]
for vector in vectors:
    print(vector)

[(1215, 0.00015939590460450057), (1480, 0.17345090437798794), (1485, 0.024112029486193044), (19266, 0.9845472910924394)]
[(1215, 0.0009167561950501072), (1480, 0.9975927017705476), (1485, 0.0693394615801007)]
