# Sentence Representation

# 1 BoW (Bag of Words)

<img src="https://image.slidesharecdn.com/vector-space-models-170118145044/95/cs571-vector-space-models-3-638.jpg?cb=1485433004" />

https://en.wikipedia.org/wiki/Bag-of-words_model
https://www.slideshare.net/jchoi7s/cs571-vector-space-models

## 1.1 직접구현

In [1]:
import pandas as pd

In [2]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

### 1) 띄어쓰기 단위로 토큰화

In [3]:
doc_ls = [doc.split() for doc in docs]
doc_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

### 2) 각 고유 토큰에 인덱스(Index)를 지정

In [4]:
from collections import defaultdict
word2id = defaultdict(lambda : len(word2id))
[word2id[token]for doc in doc_ls for token in doc]
word2id

defaultdict(<function __main__.<lambda>()>,
            {'오늘': 0,
             '동물원에서': 1,
             '원숭이를': 2,
             '봤어': 3,
             '코끼리를': 4,
             '원숭이에게': 5,
             '바나나를': 6,
             '줬어': 7})

### 3) BoW 생성

In [5]:
###############################
import numpy as np 

BoW_ls = []

for i ,doc in enumerate(doc_ls):
    bow = np.zeros(len(word2id), dtype=int)
    for word in doc:
        bow[word2id[word]] +=1
    BoW_ls.append(bow.tolist())
BoW_ls

[[1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 1, 0, 0, 0], [0, 1, 0, 0, 0, 1, 2, 1]]

In [6]:
from IPython.core import display as ICD

sorted_vocab = sorted((value, key) for key, value in word2id.items())
sorted_vocab
vocab = [item[1] for item in sorted_vocab]
for i in range(len(docs)):
    print("문서{}:{}".format(i,docs[i]))
    ICD.display(pd.DataFrame([BoW_ls[i]], columns = vocab))
    print()

문서0:오늘 동물원에서 원숭이를 봤어


Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
0,1,1,1,1,0,0,0,0



문서1:오늘 동물원에서 코끼리를 봤어 봤어


Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
0,1,1,0,2,1,0,0,0



문서2:동물원에서 원숭이에게 바나나를 줬어 바나나를


Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
0,0,1,0,0,0,1,2,1







---





## 1.2 단어 순서를 고려하지 않은 BoW

In [7]:
docs = ['나는 양념 치킨을 좋아해 하지만 후라이드 치킨을 싫어해',
        '나는 후라이드 치킨을 좋아해 하지만 양념 치킨을 싫어해']

### 1) 띄어쓰기 단위로 토큰화

In [8]:
doc_ls = [doc.split() for doc in docs]

doc_ls

[['나는', '양념', '치킨을', '좋아해', '하지만', '후라이드', '치킨을', '싫어해'],
 ['나는', '후라이드', '치킨을', '좋아해', '하지만', '양념', '치킨을', '싫어해']]

### 2) 각 고유 토큰에 인덱스(Index)를 지정

In [9]:
from collections import defaultdict

word2id = defaultdict(lambda : len(word2id))
for doc in doc_ls:
    for token in doc:
        word2id[token]
        
# word2id = {}
# for doc in docs_ls:
#     for token in doc:
#         if not token in word2id:
#             word2id[token] = len(word2id)
        
word2id   

defaultdict(<function __main__.<lambda>()>,
            {'나는': 0,
             '양념': 1,
             '치킨을': 2,
             '좋아해': 3,
             '하지만': 4,
             '후라이드': 5,
             '싫어해': 6})

### 3) BoW 생성

In [10]:
import numpy as np 

BoW_ls = []
for i, doc in enumerate(doc_ls):
    bow = np.zeros(len(word2id), dtype=int)
    for word in doc:
        bow[word2id[word]] += 1
    BoW_ls.append(bow.tolist())

BoW_ls

[[1, 1, 2, 1, 1, 1, 1], [1, 1, 2, 1, 1, 1, 1]]

In [11]:
from IPython.core import display as ICD

# bow 의 한계
# 두 문장의 의미는 다르지만완전 똑같은 백터를 나타낸다.



---



https://en.wikipedia.org/wiki/Document-term_matrix

## 1.3 sklearn 활용

In [12]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
#corpus = ['you know I want your love. because I love you.']
vector = CountVectorizer()
print(vector.fit_transform(docs).toarray()) # 코퍼스로부터 각 단어의 빈도 수를 기록한다.
print(vector.vocabulary_) # 각 단어의 인덱스가 어떻게 부여되었는지를 보여준다.





[[1 0 1 1 1 0 0 0]
 [1 0 2 1 0 0 0 1]
 [1 2 0 0 0 1 1 0]]
{'오늘': 3, '동물원에서': 0, '원숭이를': 4, '봤어': 2, '코끼리를': 7, '원숭이에게': 5, '바나나를': 1, '줬어': 6}


In [14]:
from IPython.core import display as ICD

vocab = vector.get_feature_names()
print(vocab)
vector.vocabulary_

for i in range(len(docs)):
    print("문서{} : {}".format(i,docs[i]))
    ICD.display(pd.DataFrame([[]]))

['동물원에서', '바나나를', '봤어', '오늘', '원숭이를', '원숭이에게', '줬어', '코끼리를']
문서0 : 오늘 동물원에서 원숭이를 봤어


0


문서1 : 오늘 동물원에서 코끼리를 봤어 봤어


0


문서2 : 동물원에서 원숭이에게 바나나를 줬어 바나나를


0




---


## 1.4 gensim 활용

In [15]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [16]:
import gensim
from gensim import corpora

doc_ls = [doc.split() for doc in docs]
id2word = corpora.Dictionary(doc_ls)
BoW = [id2word.doc2bow(doc) for doc in doc_ls]
BoW



[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(0, 1), (1, 2), (2, 1), (4, 1)],
 [(0, 1), (5, 2), (6, 1), (7, 1)]]

In [17]:
print(id2word.token2id)
id2word.doc2bow('오늘 동물원에서 원숭이를 봤어'.split())

{'동물원에서': 0, '봤어': 1, '오늘': 2, '원숭이를': 3, '코끼리를': 4, '바나나를': 5, '원숭이에게': 6, '줬어': 7}


[(0, 1), (1, 1), (2, 1), (3, 1)]

In [18]:
from gensim.matutils import sparse2full
from IPython.core import display as ICD

vocab = [id2word[i] for i in id2word.keys()]
for i in range(len(docs)):
  print("문서 {} : {}".format(i, docs[i]))
  ICD.display(pd.DataFrame([sparse2full(BoW[i], len(vocab))], columns=vocab))
  print("\n")

문서 0 : 오늘 동물원에서 원숭이를 봤어


Unnamed: 0,동물원에서,봤어,오늘,원숭이를,코끼리를,바나나를,원숭이에게,줬어
0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0




문서 1 : 오늘 동물원에서 코끼리를 봤어 봤어


Unnamed: 0,동물원에서,봤어,오늘,원숭이를,코끼리를,바나나를,원숭이에게,줬어
0,1.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0




문서 2 : 동물원에서 원숭이에게 바나나를 줬어 바나나를


Unnamed: 0,동물원에서,봤어,오늘,원숭이를,코끼리를,바나나를,원숭이에게,줬어
0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0








---



# 2 TDM(Term-Document Matrix)

## 2.1 직접구현

In [19]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

### 1) 띄어쓰기 단위로 토큰화

In [20]:
doc_ls = [doc.split() for doc in docs]
doc_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

### 2) 각 고유 토큰에 인덱스(Index)를 지정

In [21]:
from collections import defaultdict

word2id = defaultdict(lambda : len(word2id))
[word2id[token] for doc in doc_ls for token in doc]
word2id
      

defaultdict(<function __main__.<lambda>()>,
            {'오늘': 0,
             '동물원에서': 1,
             '원숭이를': 2,
             '봤어': 3,
             '코끼리를': 4,
             '원숭이에게': 5,
             '바나나를': 6,
             '줬어': 7})

### 3) TDM 생성

In [22]:
import numpy as np 
TDM = np.zeros((len(word2id), len(doc_ls)), dtype=int)
for i, doc in enumerate(doc_ls):
    for token in doc:
        TDM[word2id[token],i] += 1

TDM

array([[1, 1, 0],
       [1, 1, 1],
       [1, 0, 0],
       [1, 2, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 2],
       [0, 0, 1]])

In [23]:
import pandas as pd

doc_name = ['문서{}'.format(i) for i in range(len(doc_ls))]

sorted_vocab = sorted((value, key) for key, value in word2id.items())
vocab = [v[1] for v in sorted_vocab]
df_TDM = pd.DataFrame(TDM, columns=doc_name)

df_TDM['단어'] = vocab
df_TDM.set_index('단어')



Unnamed: 0_level_0,문서0,문서1,문서2
단어,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
오늘,1,1,0
동물원에서,1,1,1
원숭이를,1,0,0
봤어,1,2,0
코끼리를,0,1,0
원숭이에게,0,0,1
바나나를,0,0,2
줬어,0,0,1


## 2.2 sklearn 활용

In [24]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
DTM = count_vect.fit_transform(docs)
DTM.toarray()

array([[1, 0, 1, 1, 1, 0, 0, 0],
       [1, 0, 2, 1, 0, 0, 0, 1],
       [1, 2, 0, 0, 0, 1, 1, 0]])

In [26]:
doc_name = ['문서{}'.format(i) for i in range(len(doc_ls))]


vocab = count_vect.get_feature_names()
df_TDM = pd.DataFrame(DTM.toarray().T, columns=doc_name)

df_TDM['단어'] = vocab
df_TDM.set_index('단어')

Unnamed: 0_level_0,문서0,문서1,문서2
단어,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
동물원에서,1,1,1
바나나를,0,0,2
봤어,1,2,0
오늘,1,1,0
원숭이를,1,0,0
원숭이에게,0,0,1
줬어,0,0,1
코끼리를,0,1,0




---


## 2.3 gensim 활용

In [27]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [28]:
import gensim
from gensim import corpora

doc_ls = [doc.split() for doc in docs]
id2word = corpora.Dictionary(doc_ls)
TDM = [id2word.doc2bow(doc) for doc in doc_ls]
TDM

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(0, 1), (1, 2), (2, 1), (4, 1)],
 [(0, 1), (5, 2), (6, 1), (7, 1)]]

In [29]:
doc_name = ['문서{}'.format(i) for i in range(len(doc_ls))]

vocab = [id2word[i] for i in id2word.keys()]
DTM = [sparse2full(doc, len(vocab)).tolist() for doc in TDM]
df_TDM = pd.DataFrame(np.array(DTM, dtype=int).T, columns=doc_name)
df_TDM['단어'] = vocab
df_TDM.set_index('단어')

Unnamed: 0_level_0,문서0,문서1,문서2
단어,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
동물원에서,1,1,1
봤어,1,2,0
오늘,1,1,0
원숭이를,1,0,0
코끼리를,0,1,0
바나나를,0,0,2
원숭이에게,0,0,1
줬어,0,0,1


---

# 3 TF-IDF (Term Frequency-Inverse Document Frequency)

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/10109d0e60cc9d50a1ea2f189bac0ac29a030a00" />



*  TF(단어 빈도, Term Frequency) : 단어가 문서 내에 등장하는 빈도
*  IDF(역문서 빈도, Inverse Document Frequency) : 단어가 여러 문서에 공통적으로 등장하는 빈도
*  한 문서 내에 자주 등장하고 다른 문서에 자주 등장하지 않는 단어를 주요 단어로 판별할 수 있음


https://en.wikipedia.org/wiki/Tf%E2%80%93idf

## 3.1 직접계산하기 1

weighting schema|weight
--|--
tf (term frequency)|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/91699003abf4fe8bdf861bbce08e73e71acf5fd4" />
idf(inverse document frequency) |<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/864fcfdc0c16344c11509f724f1aa7081cf9f657" />

In [31]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

### 1) 띄어쓰기 단위로 토큰화

In [32]:
doc_ls = [doc.split() for doc in docs]
doc_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

### 2) 각 고유 토큰에 인덱스(Index)를 지정

In [33]:
from collections import defaultdict

word2id = defaultdict(lambda: len(word2id))
[word2id[token] for doc in doc_ls for token in doc]
word2id
     

defaultdict(<function __main__.<lambda>()>,
            {'오늘': 0,
             '동물원에서': 1,
             '원숭이를': 2,
             '봤어': 3,
             '코끼리를': 4,
             '원숭이에게': 5,
             '바나나를': 6,
             '줬어': 7})

### 3) DTM 생성

In [34]:
import numpy as np 
DTM = np.zeros((len(doc_ls),len(word2id)),dtype=int)
for i,doc in enumerate(doc_ls):
    for token in doc:
        DTM[i,word2id[token]] += 1

DTM

array([[1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 0, 2, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 2, 1]])

### 4) TF 계산

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/91699003abf4fe8bdf861bbce08e73e71acf5fd4" />

TF = 문서내 토크빈도/ 문서내 전체토큰갯수

In [35]:
def computeTF(DTM):
    doc_len = len(doc_ls)
    word_len = len(word2id)
    tf = np.zeros((doc_len,word_len))
    for doc_idx in range(doc_len):
        for word_idx in range(word_len):
            tf[doc_idx, word_idx] = DTM[doc_idx, word_idx]/DTM[doc_idx].sum()
            
    return tf

computeTF(DTM)

array([[0.25, 0.25, 0.25, 0.25, 0.  , 0.  , 0.  , 0.  ],
       [0.2 , 0.2 , 0.  , 0.4 , 0.2 , 0.  , 0.  , 0.  ],
       [0.  , 0.2 , 0.  , 0.  , 0.  , 0.2 , 0.4 , 0.2 ]])

### 5) IDF  계산

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/864fcfdc0c16344c11509f724f1aa7081cf9f657" />

IDF = log(총문서수/토큰이등장한문서수)

In [36]:
import math

def computeIDF(DTM):
    doc_len = len(DTM)
    word_len = len(DTM[0])
    idf = np.zeros(word_len)
    for i in range(word_len):
        idf[i] = -math.log10(np.count_nonzero(DTM[:,i])/doc_len)
        
    return idf
# np.count_nonzero 사용
      
computeIDF(DTM)    

array([ 0.17609126, -0.        ,  0.47712125,  0.17609126,  0.47712125,
        0.47712125,  0.47712125,  0.47712125])

### 6) TF-IDF 계산

In [37]:
def computeTFIDF(DTM):
    tf = computeTF(DTM)
    idf = computeIDF(DTM)
    
    tfidf = np.zeros(tf.shape)
    for doc_idx in range(tf.shape[0]):
        for word_idx in range(tf.shape[1]):
            tfidf[doc_idx, word_idx] = tf[doc_idx, word_idx]* idf[word_idx]
    return tfidf

  
computeTFIDF(DTM)

array([[ 0.04402281, -0.        ,  0.11928031,  0.04402281,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.03521825, -0.        ,  0.        ,  0.0704365 ,  0.09542425,
         0.        ,  0.        ,  0.        ],
       [ 0.        , -0.        ,  0.        ,  0.        ,  0.        ,
         0.09542425,  0.1908485 ,  0.09542425]])

In [38]:
import pandas as pd

sorted_vocab = sorted((value, key) for key, value in word2id.items())
vocab = [v[1]for v in sorted_vocab]
tfidf = computeTFIDF(DTM)
pd.DataFrame(tfidf, columns=vocab)


Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
0,0.044023,-0.0,0.11928,0.044023,0.0,0.0,0.0,0.0
1,0.035218,-0.0,0.0,0.070437,0.095424,0.0,0.0,0.0
2,0.0,-0.0,0.0,0.0,0.0,0.095424,0.190849,0.095424


## 3.2 직접계산하기2

weighting schema|weight|설명
--|--|--
tf(double normalization 0.5)|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/45badc1c70ec2caa00ed8c21ed75bd9f8d3e650c" />|=0.5 + 0.5(토큰빈도/문서내최빈토큰)
idf(inverse document frequency smooth)|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/25f4d6690acaaef1f15f308d24f6f8a439de971d" />|=log(문서갯수/(1+토큰빈도)) + 1

In [39]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [40]:
D = [doc.split() for doc in docs]
print(D)
print(D[1])
D[1].count('봤어')

[['오늘', '동물원에서', '원숭이를', '봤어'], ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'], ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]
['오늘', '동물원에서', '코끼리를', '봤어', '봤어']


2

In [41]:
from math import log10
import numpy as np
from collections import defaultdict

# document 내 토큰이 등장한 빈도수 계산


def f(t, d):
    return d.count(t)

# tf 계산


def tf(t, d):
    return 0.5 + 0.5 * f(t, d) / max([f(w, d) for w in d])

# idf 계산


def idf(t, D):
    N = len(D)
    nt = len([True for d in D if t in d])
    return 1 + log10(N/(1 + nt))


# tf-idf 계산
def tfidf_score(t, d, D):
    return tf(t, d) * idf(t, D)

# 공백을 기준으로 토큰과


def tokenizer(d):
    return d.split()


# tfidf 계산
def tfidfScorer(D):
    D_ls = [tokenizer(d) for d in D]
    vocab = list(set().union(*D_ls))
    print(vocab)
    print(len(vocab))
    word2id = defaultdict(lambda: len(word2id))
    [word2id[v] for v in vocab]
    print(word2id)
    tfidf = np.zeros((len(D_ls), len(word2id)))
    for i in range(len(D_ls)):
        for t in D_ls[i]:
            tfidf[i, word2id[t]] = tfidf_score(t, D_ls[i], D_ls)

    return tfidf, vocab


tfidfScorer(docs)

['원숭이에게', '원숭이를', '줬어', '코끼리를', '봤어', '오늘', '동물원에서', '바나나를']
8
defaultdict(<function tfidfScorer.<locals>.<lambda> at 0x7f89a024e700>, {'원숭이에게': 0, '원숭이를': 1, '줬어': 2, '코끼리를': 3, '봤어': 4, '오늘': 5, '동물원에서': 6, '바나나를': 7})


(array([[0.        , 1.17609126, 0.        , 0.        , 1.        ,
         1.        , 0.87506126, 0.        ],
        [0.        , 0.        , 0.        , 0.88206844, 1.        ,
         0.75      , 0.65629595, 0.        ],
        [0.88206844, 0.        , 0.88206844, 0.        , 0.        ,
         0.        , 0.65629595, 1.17609126]]),
 ['원숭이에게', '원숭이를', '줬어', '코끼리를', '봤어', '오늘', '동물원에서', '바나나를'])

In [42]:
import pandas as pd
tfidf, vocab = tfidfScorer(docs)

pd.DataFrame(tfidf, columns=vocab)

['원숭이에게', '원숭이를', '줬어', '코끼리를', '봤어', '오늘', '동물원에서', '바나나를']
8
defaultdict(<function tfidfScorer.<locals>.<lambda> at 0x7f89a024ea60>, {'원숭이에게': 0, '원숭이를': 1, '줬어': 2, '코끼리를': 3, '봤어': 4, '오늘': 5, '동물원에서': 6, '바나나를': 7})


Unnamed: 0,원숭이에게,원숭이를,줬어,코끼리를,봤어,오늘,동물원에서,바나나를
0,0.0,1.176091,0.0,0.0,1.0,1.0,0.875061,0.0
1,0.0,0.0,0.0,0.882068,1.0,0.75,0.656296,0.0
2,0.882068,0.0,0.882068,0.0,0.0,0.0,0.656296,1.176091


## 3.3 sklearn 활용

In [44]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
tfidf = tfidf_vect.fit_transform(docs)
tfidf
tfidf.todense()

matrix([[0.37311881, 0.        , 0.4804584 , 0.4804584 , 0.63174505,
         0.        , 0.        , 0.        ],
        [0.28680065, 0.        , 0.73861611, 0.36930805, 0.        ,
         0.        , 0.        , 0.48559571],
        [0.2344005 , 0.79374908, 0.        , 0.        , 0.        ,
         0.39687454, 0.39687454, 0.        ]])

In [48]:
tfidf_vect.get_feature_names()

['동물원에서', '바나나를', '봤어', '오늘', '원숭이를', '원숭이에게', '줬어', '코끼리를']

In [49]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

count_vect = CountVectorizer(max_df=0.85)
DTM = count_vect.fit_transform(docs)
tfidf_trans = TfidfTransformer()
tfidf = tfidf_trans.fit_transform(DTM)
tfidf.todense()

matrix([[0.        , 0.51785612, 0.51785612, 0.68091856, 0.        ,
         0.        , 0.        ],
        [0.        , 0.77100584, 0.38550292, 0.        , 0.        ,
         0.        , 0.50689001],
        [0.81649658, 0.        , 0.        , 0.        , 0.40824829,
         0.40824829, 0.        ]])

In [50]:
vector.get_feature_names()

['동물원에서', '바나나를', '봤어', '오늘', '원숭이를', '원숭이에게', '줬어', '코끼리를']

In [51]:
import pandas as pd
vocab = count_vect.get_feature_names()
pd.DataFrame(tfidf.todense(), columns=vocab)

Unnamed: 0,바나나를,봤어,오늘,원숭이를,원숭이에게,줬어,코끼리를
0,0.0,0.517856,0.517856,0.680919,0.0,0.0,0.0
1,0.0,0.771006,0.385503,0.0,0.0,0.0,0.50689
2,0.816497,0.0,0.0,0.0,0.408248,0.408248,0.0
