In [25]:
from IPython.display import display
import pandas as pd

# 文本转向量

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
# Four short example sentences; each string is treated as one document.
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]
# Fit the vectorizer on the corpus and build the per-document term-count matrix.
cv = CountVectorizer()
res = cv.fit_transform(corpus)

## fit_transform 返回一个压缩稀疏行矩阵CSR（Compressed Sparse Row matrix）

In [13]:
# Inspect the type of the object returned by cv.fit_transform.
print(type(res))

<class 'scipy.sparse.csr.csr_matrix'>


## CSR压缩形式
每行输出的格式为：（文档行号，特征词序号） 该词在该文档中出现的次数

In [8]:
# Printing the sparse matrix lists only the stored entries, one per line,
# as "(row, column)  value".
print(res)

  (0, 1)	1
  (0, 2)	1
  (0, 6)	1
  (0, 3)	1
  (0, 8)	1
  (1, 5)	2
  (1, 1)	1
  (1, 6)	1
  (1, 3)	1
  (1, 8)	1
  (2, 4)	1
  (2, 7)	1
  (2, 0)	1
  (2, 6)	1
  (3, 1)	1
  (3, 2)	1
  (3, 6)	1
  (3, 3)	1
  (3, 8)	1


## 转换为数组形式

In [11]:
# Expand the sparse count matrix into a dense array
# (one row per document, one column per feature word).
print(res.toarray())

[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]


## 表格形式展现词频向量

In [28]:
# Wrap the dense count array in a DataFrame so the notebook renders it as a table.
pd.DataFrame(res.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0,1,1,1,0,0,1,0,1
1,0,1,0,1,0,2,1,0,1
2,1,0,0,0,1,0,1,1,0
3,0,1,1,1,0,0,1,0,1


## 获取特征词

In [52]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when the installed release provides it and fall
# back to the old method on older releases.
if hasattr(cv, 'get_feature_names_out'):
    # get_feature_names_out() returns an ndarray; convert to a list so the
    # printed form matches the list returned by the old API.
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = cv.get_feature_names()
print(feature_names)

[u'and', u'document', u'first', u'is', u'one', u'second', u'the', u'third', u'this']


## 表格形式展现特征词

In [51]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when the installed release provides it and fall
# back to the old method on older releases.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = cv.get_feature_names()
# One row per feature word; the row index is the word's column index in the
# count matrix.
pd.DataFrame(feature_names)

Unnamed: 0,0
0,and
1,document
2,first
3,is
4,one
5,second
6,the
7,third
8,this


# 文本向量转TF-IDF向量

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw term counts in `res` into TF-IDF values, using the
# transformer's default settings.
tt = TfidfTransformer()
tt_res = tt.fit_transform(res)

In [20]:
# Inspect the type of the TF-IDF result. The print() call form runs on both
# Python 2 and Python 3; the bare print statement is a SyntaxError on Python 3.
print(type(tt_res))

<class 'scipy.sparse.csr.csr_matrix'>


## CSR压缩形式

In [19]:
# Printing the sparse TF-IDF matrix lists each stored entry as
# "(row, column)  weight".
print(tt_res)

  (0, 8)	0.438776742859
  (0, 3)	0.438776742859
  (0, 6)	0.358728738248
  (0, 2)	0.541976569726
  (0, 1)	0.438776742859
  (1, 8)	0.272301467523
  (1, 3)	0.272301467523
  (1, 6)	0.222624292325
  (1, 1)	0.272301467523
  (1, 5)	0.853225736145
  (2, 6)	0.28847674875
  (2, 0)	0.552805319991
  (2, 7)	0.552805319991
  (2, 4)	0.552805319991
  (3, 8)	0.438776742859
  (3, 3)	0.438776742859
  (3, 6)	0.358728738248
  (3, 2)	0.541976569726
  (3, 1)	0.438776742859


## 转换为数组形式

In [24]:
# Expand the sparse TF-IDF matrix into a dense array
# (one row per document, one column per feature word).
print(tt_res.toarray())

[[ 0.          0.43877674  0.54197657  0.43877674  0.          0.
   0.35872874  0.          0.43877674]
 [ 0.          0.27230147  0.          0.27230147  0.          0.85322574
   0.22262429  0.          0.27230147]
 [ 0.55280532  0.          0.          0.          0.55280532  0.
   0.28847675  0.55280532  0.        ]
 [ 0.          0.43877674  0.54197657  0.43877674  0.          0.
   0.35872874  0.          0.43877674]]


## 表格形式展现TF-IDF结果

In [26]:
# Wrap the dense TF-IDF array in a DataFrame so the notebook renders it as a table.
pd.DataFrame(tt_res.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.0,0.438777,0.541977,0.438777,0.0,0.0,0.358729,0.0,0.438777
1,0.0,0.272301,0.0,0.272301,0.0,0.853226,0.222624,0.0,0.272301
2,0.552805,0.0,0.0,0.0,0.552805,0.0,0.288477,0.552805,0.0
3,0.0,0.438777,0.541977,0.438777,0.0,0.0,0.358729,0.0,0.438777


# 余弦相似性计算

## 计算余弦相似性

In [80]:
from sklearn.metrics.pairwise import cosine_similarity
# With a single argument, every row of tt_res is compared against every other
# row, so cm[i, j] is the cosine similarity between document i and document j.
cm = cosine_similarity(tt_res)
# The print() call form runs on both Python 2 and Python 3; the bare print
# statement is a SyntaxError on Python 3.
print(cm)

[[ 1.          0.43830038  0.1034849   1.        ]
 [ 0.43830038  1.          0.06422193  0.43830038]
 [ 0.1034849   0.06422193  1.          0.1034849 ]
 [ 1.          0.43830038  0.1034849   1.        ]]


## 表格展示余弦相似性输出

In [82]:
# Render the pairwise similarity matrix as a table; row and column labels are
# the document indices.
pd.DataFrame(cm)

Unnamed: 0,0,1,2,3
0,1.0,0.4383,0.103485,1.0
1,0.4383,1.0,0.064222,0.4383
2,0.103485,0.064222,1.0,0.103485
3,1.0,0.4383,0.103485,1.0


## cosine_similarity返回数据说明
>在不指定Y参数的情况下，cosine_similarity方法返回的矩阵是N×N的方阵，每个元素即为对应两个样本之间的余弦相似值。

以上面的4句话的数据为例，即4×4的矩阵，如上表，(0,0)表示第0个句子和第0个句子的相似性，(0,1)表示第0个句子和第1个句子的相似性。可以看出来左上角到右下角对角线的数据都是1，且对角线上下的数据是对称的，这部分数据其实是冗余的。

## 普通矩阵转稀疏矩阵

In [77]:
# Import from the public scipy.sparse namespace; the scipy.sparse.csr submodule
# is a private implementation detail that newer SciPy releases deprecate.
from scipy.sparse import csr_matrix
# Convert the dense similarity matrix to CSR so that printing it lists each
# (row, column) pair together with its similarity value.
sparse_matrix = csr_matrix(cosine_similarity(tt_res))
# The print() call form runs on both Python 2 and Python 3.
print(sparse_matrix)

  (0, 0)	1.0
  (0, 1)	0.438300384476
  (0, 2)	0.103484900093
  (0, 3)	1.0
  (1, 0)	0.438300384476
  (1, 1)	1.0
  (1, 2)	0.0642219320427
  (1, 3)	0.438300384476
  (2, 0)	0.103484900093
  (2, 1)	0.0642219320427
  (2, 2)	1.0
  (2, 3)	0.103484900093
  (3, 0)	1.0
  (3, 1)	0.438300384476
  (3, 2)	0.103484900093
  (3, 3)	1.0


**稀疏矩阵的输出非常清晰地说明了两两之间相似性的关系**。