In [1]:
from IPython.display import display
import pandas as pd

In [2]:
# Toy corpus used throughout the notebook: four short English sentences,
# one document per list entry.
corpus = [
    u'This is the first document.',
    u'This is the second second document.',
    u'And the third one.',
    u'Is this the first document?',
]

# 文本转向量

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit a bag-of-words vocabulary on `corpus` and encode every sentence as a
# row of per-word counts (CountVectorizer is used with its default settings).
cv = CountVectorizer()
res = cv.fit_transform(corpus)

## fit_transform 返回一个压缩稀疏行矩阵CSR（Compressed Sparse Row matrix）

In [4]:
# Inspect the concrete type of the object returned by fit_transform.
print(type(res))

<class 'scipy.sparse.csr.csr_matrix'>


## CSR压缩形式
每条记录的格式为：（文档行号，单词在词表中的序号） 该单词在该文档中出现的次数

In [5]:
# Printing the sparse matrix lists only its stored entries as "(row, column) value".
print(res)

  (0, 1)	1
  (0, 2)	1
  (0, 6)	1
  (0, 3)	1
  (0, 8)	1
  (1, 5)	2
  (1, 1)	1
  (1, 6)	1
  (1, 3)	1
  (1, 8)	1
  (2, 4)	1
  (2, 7)	1
  (2, 0)	1
  (2, 6)	1
  (3, 1)	1
  (3, 2)	1
  (3, 6)	1
  (3, 3)	1
  (3, 8)	1


## 转换为数组形式

In [6]:
# Expand the sparse counts into a dense array so the zero entries are shown as well.
print(res.toarray())

[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]


## 表格形式展现词频向量

In [7]:
# Tabular view of the count matrix: one row per document, one column per vocabulary index.
pd.DataFrame(res.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0,1,1,1,0,0,1,0,1
1,0,1,0,1,0,2,1,0,1
2,1,0,0,0,1,0,1,1,0
3,0,1,1,1,0,0,1,0,1


## 获取特征词

In [8]:
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; use the new accessor when it exists and fall back
# to the old one on older releases. `list(...)` keeps the printed form a plain list.
list_feature_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
print(list(list_feature_names()))

[u'and', u'document', u'first', u'is', u'one', u'second', u'the', u'third', u'this']


## 表格形式展现特征词

In [9]:
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; use the new accessor when it exists and fall back
# to the old one on older releases.
list_feature_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
# One row per vocabulary entry; the row index is the column index used in the count matrix.
pd.DataFrame(list(list_feature_names()))

Unnamed: 0,0
0,and
1,document
2,first
3,is
4,one
5,second
6,the
7,third
8,this


# 文本向量转TF-IDF向量

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw term counts in `res` into TF-IDF scores
# (TfidfTransformer is used with its default settings).
tt = TfidfTransformer()
tt_res = tt.fit_transform(res)

In [11]:
# Function-call form of print: valid on Python 2 and 3, and consistent with
# the earlier cells of this notebook.
print(type(tt_res))

<class 'scipy.sparse.csr.csr_matrix'>


## CSR压缩形式

In [12]:
# Printing the sparse TF-IDF matrix lists its stored entries as "(row, column) weight".
print(tt_res)

  (0, 8)	0.438776742859
  (0, 3)	0.438776742859
  (0, 6)	0.358728738248
  (0, 2)	0.541976569726
  (0, 1)	0.438776742859
  (1, 8)	0.272301467523
  (1, 3)	0.272301467523
  (1, 6)	0.222624292325
  (1, 1)	0.272301467523
  (1, 5)	0.853225736145
  (2, 6)	0.28847674875
  (2, 0)	0.552805319991
  (2, 7)	0.552805319991
  (2, 4)	0.552805319991
  (3, 8)	0.438776742859
  (3, 3)	0.438776742859
  (3, 6)	0.358728738248
  (3, 2)	0.541976569726
  (3, 1)	0.438776742859


## 转换为数组形式

In [13]:
# Dense view of the TF-IDF matrix, including the zero weights.
print(tt_res.toarray())

[[ 0.          0.43877674  0.54197657  0.43877674  0.          0.
   0.35872874  0.          0.43877674]
 [ 0.          0.27230147  0.          0.27230147  0.          0.85322574
   0.22262429  0.          0.27230147]
 [ 0.55280532  0.          0.          0.          0.55280532  0.
   0.28847675  0.55280532  0.        ]
 [ 0.          0.43877674  0.54197657  0.43877674  0.          0.
   0.35872874  0.          0.43877674]]


## 表格形式展现TF-IDF结果

In [14]:
# Tabular view of the TF-IDF matrix: one row per document, one column per vocabulary index.
pd.DataFrame(tt_res.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.0,0.438777,0.541977,0.438777,0.0,0.0,0.358729,0.0,0.438777
1,0.0,0.272301,0.0,0.272301,0.0,0.853226,0.222624,0.0,0.272301
2,0.552805,0.0,0.0,0.0,0.552805,0.0,0.288477,0.552805,0.0
3,0.0,0.438777,0.541977,0.438777,0.0,0.0,0.358729,0.0,0.438777


# 余弦相似性计算

## 计算余弦相似性

In [15]:
from sklearn.metrics.pairwise import cosine_similarity
# Called with a single argument, so every row of `tt_res` is compared with
# every row of the same matrix.
cm = cosine_similarity(tt_res)
# Function-call form of print: valid on Python 2 and 3.
print(cm)

[[ 1.          0.43830038  0.1034849   1.        ]
 [ 0.43830038  1.          0.06422193  0.43830038]
 [ 0.1034849   0.06422193  1.          0.1034849 ]
 [ 1.          0.43830038  0.1034849   1.        ]]


## 表格展示余弦相似性输出

In [16]:
# Render the pairwise similarity matrix as a table; row and column labels are document indices.
pd.DataFrame(cm)

Unnamed: 0,0,1,2,3
0,1.0,0.4383,0.103485,1.0
1,0.4383,1.0,0.064222,0.4383
2,0.103485,0.064222,1.0,0.103485
3,1.0,0.4383,0.103485,1.0


## cosine_similarity返回数据说明
>在不指定 `Y` 参数的情况下，`cosine_similarity` 方法返回一个 N×N 的方阵，其中第 (i, j) 个元素即为第 i 个样本与第 j 个样本之间的余弦相似度。

以上面4个句子的数据为例，结果是一个4×4的矩阵（如上表）：(0,0)表示第0个句子和第0个句子的相似性，(0,1)表示第0个句子和第1个句子的相似性。可以看出，从左上角到右下角的对角线上的数据都是1，且对角线两侧的数据是对称的，因此这部分数据其实是冗余的。

## 普通矩阵转稀疏矩阵

In [17]:
# Import from the public `scipy.sparse` namespace; the `scipy.sparse.csr`
# submodule path is a deprecated private location.
from scipy.sparse import csr_matrix
# Reuse the similarity matrix `cm` computed earlier rather than recomputing it.
sparse_matrix = csr_matrix(cm)
# Function-call form of print: valid on Python 2 and 3. The sparse repr lists
# each stored entry as "(row, column) value", i.e. one line per document pair.
print(sparse_matrix)

  (0, 0)	1.0
  (0, 1)	0.438300384476
  (0, 2)	0.103484900093
  (0, 3)	1.0
  (1, 0)	0.438300384476
  (1, 1)	1.0
  (1, 2)	0.0642219320427
  (1, 3)	0.438300384476
  (2, 0)	0.103484900093
  (2, 1)	0.0642219320427
  (2, 2)	1.0
  (2, 3)	0.103484900093
  (3, 0)	1.0
  (3, 1)	0.438300384476
  (3, 2)	0.103484900093
  (3, 3)	1.0


**稀疏矩阵的输出非常清晰地说明了两两之间相似性的关系**。

# gensim

In [18]:
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import random

In [19]:
def taggedDocIter(corpus):
    """Lazily wrap each entry of ``corpus`` in a ``TaggedDocument``.

    Every entry is split on whitespace to obtain its word list, and the
    entry's zero-based position in ``corpus`` is used as its single tag.
    """
    position = 0
    for sentence in corpus:
        tokens = sentence.split()
        yield TaggedDocument(tokens, [position])
        position += 1

In [20]:
# Materialise the tagged documents once so the same list can be reused for
# vocabulary building, training and later lookups by index.
train_corpus = list(taggedDocIter(corpus))
# `vector_size` / `epochs` are the current names of the deprecated `size` /
# `iter` keywords, which gensim 4 no longer accepts (requires gensim >= 3.3).
# Alternative configuration noted by the author:
#   Doc2Vec(vector_size=100, window=8, min_count=5, workers=4)
model = Doc2Vec(vector_size=50, min_count=2, epochs=55)
model.build_vocab(train_corpus)
# Echo the tagged documents so the tokenisation and tags can be inspected.
train_corpus

[TaggedDocument(words=[u'This', u'is', u'the', u'first', u'document.'], tags=[0]),
 TaggedDocument(words=[u'This', u'is', u'the', u'second', u'second', u'document.'], tags=[1]),
 TaggedDocument(words=[u'And', u'the', u'third', u'one.'], tags=[2]),
 TaggedDocument(words=[u'Is', u'this', u'the', u'first', u'document?'], tags=[3])]

In [21]:
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)

CPU times: user 5.71 ms, sys: 242 µs, total: 5.95 ms
Wall time: 6.5 ms


271

In [22]:
# Pick one training document at random to act as the similarity query.
doc_id = random.randrange(len(train_corpus))
# Function-call form of print: valid on Python 2 and 3.
print(doc_id)

1


In [23]:
# Infer a fresh vector for the chosen document from its word list, rather
# than reading back the vector stored for its tag during training.
inferred_vector = model.infer_vector(train_corpus[doc_id].words)
# Function-call form of print: valid on Python 2 and 3.
print(inferred_vector)

[ 0.00987491  0.00759193 -0.00196348 -0.00213921 -0.00819018 -0.00702452
  0.0046486  -0.00350797  0.00709473 -0.00676888  0.00780597  0.00440009
 -0.00864802  0.0083712   0.00894532 -0.00770911 -0.00255257 -0.00065849
  0.00309537  0.00484167  0.00545639  0.00943009  0.0081901   0.00828644
 -0.0048387  -0.0033142   0.00466975  0.00770591 -0.00731411  0.00593066
 -0.00767057  0.00571141 -0.00755292  0.00237359  0.00531316 -0.00571028
 -0.00480762 -0.0001867  -0.00691046  0.00775944  0.00835648  0.00318755
 -0.0095835   0.00569818  0.00778661  0.00880567  0.00699712 -0.00194759
  0.00317679 -0.00142976]


In [24]:
# Look up the three trained document vectors closest to the inferred vector;
# the result is a list of (tag, score) pairs.
sims = model.docvecs.most_similar([inferred_vector], topn=3)
# Function-call form of print: valid on Python 2 and 3.
print(sims)

[(3, 0.2852686643600464), (2, 0.05049079656600952), (1, 0.04782940819859505)]


In [25]:
# Show the query sentence followed by its matches in the order `sims` lists them.
# Tags are the documents' positions in `train_corpus`, so they index it directly.
print("ORIGIN: {}".format(" ".join(train_corpus[doc_id].words)))
for rank, (tag, _score) in enumerate(sims, start=1):
    print("SIMILAR {}: {}".format(rank, " ".join(train_corpus[tag].words)))

ORIGIN: This is the second second document.
SIMILAR 1: Is this the first document?
SIMILAR 2: And the third one.
SIMILAR 3: This is the second second document.


In [26]:
# Similarity score gensim reports between the trained vectors of documents 0 and 3.
model.docvecs.similarity(0,3)

0.072604476290183906