# **문장의 표현**

## 1 BoW (Bag of Words)

### 1.1 직접구현

In [1]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

#### 1) 띄어쓰기 단위로 토큰화

In [2]:
doc_ls = [doc.split() for doc in docs]
doc_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

#### 2) 각 고유 토큰에 인덱스를 지정

In [3]:
from collections import defaultdict

word2id = defaultdict(lambda : len(word2id))
[word2id[token] for doc in doc_ls for token in doc]
word2id

defaultdict(<function __main__.<lambda>()>,
            {'오늘': 0,
             '동물원에서': 1,
             '원숭이를': 2,
             '봤어': 3,
             '코끼리를': 4,
             '원숭이에게': 5,
             '바나나를': 6,
             '줬어': 7})

#### 3) BoW 생성

In [4]:
import numpy as np

BoW_ls = []

for i, doc in enumerate(doc_ls):
    bow = np.zeros(len(word2id), dtype=int)
    for token in doc:
        bow[word2id[token]] += 1
    BoW_ls.append(bow.tolist())
BoW_ls

[[1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 1, 0, 0, 0], [0, 1, 0, 0, 0, 1, 2, 1]]

In [5]:
word2id.items()

dict_items([('오늘', 0), ('동물원에서', 1), ('원숭이를', 2), ('봤어', 3), ('코끼리를', 4), ('원숭이에게', 5), ('바나나를', 6), ('줬어', 7)])

In [6]:
from IPython.core import display as ICD
import pandas as pd

sorted_vocab = sorted((value, key) for key, value in word2id.items())
vocab = [v[1] for v in sorted_vocab]
for i in range(len(docs)):
    print("문서{} : {}".format(i, docs[i]))
    ICD.display(pd.DataFrame([BoW_ls[i]], columns=vocab))
    print("\n")

문서0 : 오늘 동물원에서 원숭이를 봤어


Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
0,1,1,1,1,0,0,0,0




문서1 : 오늘 동물원에서 코끼리를 봤어 봤어


Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
0,1,1,0,2,1,0,0,0




문서2 : 동물원에서 원숭이에게 바나나를 줬어 바나나를


Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
0,0,1,0,0,0,1,2,1






### 1.3 sklearn 활용

In [7]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
BoW = count_vect.fit_transform(docs)

BoW.toarray()[0]

array([1, 0, 1, 1, 1, 0, 0, 0], dtype=int64)

In [9]:
from IPython.core import display as ICD
import pandas as pd

vocab = count_vect.get_feature_names()
for i in range(len(docs)):
    print("문서{} : {}".format(i, docs[i]))
    ICD.display(pd.DataFrame([BoW.toarray()[i]], columns=vocab))
    print("\n")

문서0 : 오늘 동물원에서 원숭이를 봤어


Unnamed: 0,동물원에서,바나나를,봤어,오늘,원숭이를,원숭이에게,줬어,코끼리를
0,1,0,1,1,1,0,0,0




문서1 : 오늘 동물원에서 코끼리를 봤어 봤어


Unnamed: 0,동물원에서,바나나를,봤어,오늘,원숭이를,원숭이에게,줬어,코끼리를
0,1,0,2,1,0,0,0,1




문서2 : 동물원에서 원숭이에게 바나나를 줬어 바나나를


Unnamed: 0,동물원에서,바나나를,봤어,오늘,원숭이를,원숭이에게,줬어,코끼리를
0,1,2,0,0,0,1,1,0






### 1.4 gensim 활용

In [10]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [11]:
import gensim
import numpy as np
from gensim import corpora

doc_ls = [doc.split() for doc in docs]
id2word = corpora.Dictionary(doc_ls)
BoW = [id2word.doc2bow(doc) for doc in doc_ls]
BoW[0]

[(0, 1), (1, 1), (2, 1), (3, 1)]

In [12]:
from gensim.matutils import sparse2full
from IPython.core import display as ICD

vocab = [id2word[i] for i in id2word.keys()]
for i in range(len(docs)):
    print("문서{} : {}".format(i, docs[i]))
    ICD.display(pd.DataFrame([sparse2full(BoW[0], len(vocab))], columns=vocab))
    print("\n")

문서0 : 오늘 동물원에서 원숭이를 봤어


Unnamed: 0,동물원에서,봤어,오늘,원숭이를,코끼리를,바나나를,원숭이에게,줬어
0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0




문서1 : 오늘 동물원에서 코끼리를 봤어 봤어


Unnamed: 0,동물원에서,봤어,오늘,원숭이를,코끼리를,바나나를,원숭이에게,줬어
0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0




문서2 : 동물원에서 원숭이에게 바나나를 줬어 바나나를


Unnamed: 0,동물원에서,봤어,오늘,원숭이를,코끼리를,바나나를,원숭이에게,줬어
0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0






## 2 TDM(Term-Document Matrix)

### 2.1 직접구현

In [13]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

#### 1) 띄어쓰기 단위로 토큰화

In [14]:
doc_ls = [doc.split() for doc in docs]
doc_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

#### 2) 각 고유 토큰에 인덱스(index)를 지정

In [15]:
from collections import defaultdict

word2id = defaultdict(lambda : len(word2id))
[word2id[token] for doc in doc_ls for token in doc]
word2id

defaultdict(<function __main__.<lambda>()>,
            {'오늘': 0,
             '동물원에서': 1,
             '원숭이를': 2,
             '봤어': 3,
             '코끼리를': 4,
             '원숭이에게': 5,
             '바나나를': 6,
             '줬어': 7})

#### 3) TDM 생성

In [16]:
import numpy as np

TDM = np.zeros((len(word2id), len(doc_ls)), dtype=int)
for i, doc in enumerate(doc_ls):
    for token in doc:
        TDM[word2id[token], i] += 1
TDM

array([[1, 1, 0],
       [1, 1, 1],
       [1, 0, 0],
       [1, 2, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 2],
       [0, 0, 1]])

In [17]:
import pandas as pd

doc_names = ['문서' + str(i) for i in range(len(doc_ls))]
sorted_vocab = sorted((value, key) for key, value in word2id.items())
vocab = [v[1] for v in sorted_vocab]
df_TDM = pd.DataFrame(TDM, columns=doc_names)
df_TDM['단어'] = vocab
df_TDM.set_index('단어')

Unnamed: 0_level_0,문서0,문서1,문서2
단어,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
오늘,1,1,0
동물원에서,1,1,1
원숭이를,1,0,0
봤어,1,2,0
코끼리를,0,1,0
원숭이에게,0,0,1
바나나를,0,0,2
줬어,0,0,1


### 2.2 sklearn 활용

In [18]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
DTM = count_vect.fit_transform(docs)
DTM.toarray()

array([[1, 0, 1, 1, 1, 0, 0, 0],
       [1, 0, 2, 1, 0, 0, 0, 1],
       [1, 2, 0, 0, 0, 1, 1, 0]], dtype=int64)

In [20]:
import pandas as pd

doc_names = ['문서' + str(i) for i in range(len(doc_ls))]
vocab = count_vect.get_feature_names()
df_TDM = pd.DataFrame(DTM.toarray().T, columns=doc_names)
df_TDM['단어'] = vocab
df_TDM.set_index('단어')

Unnamed: 0_level_0,문서0,문서1,문서2
단어,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
동물원에서,1,1,1
바나나를,0,0,2
봤어,1,2,0
오늘,1,1,0
원숭이를,1,0,0
원숭이에게,0,0,1
줬어,0,0,1
코끼리를,0,1,0


### 2.3 gensim 활용

In [21]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [22]:
import gensim
from gensim import corpora

doc_ls = [doc.split() for doc in docs]
id2word = corpora.Dictionary(doc_ls)
TDM = [id2word.doc2bow(doc) for doc in doc_ls]
TDM

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(0, 1), (1, 2), (2, 1), (4, 1)],
 [(0, 1), (5, 2), (6, 1), (7, 1)]]

In [23]:
import pandas as pd

doc_names = ['문서' + str(i) for i in range(len(doc_ls))]
vocab = [id2word[i] for i in id2word.keys()]
DTM_matrix = [gensim.matutils.sparse2full(doc, len(vocab)).tolist() for doc in TDM]

df_TDM = pd.DataFrame(np.array(DTM_matrix, dtype=int).T, columns=doc_names)
df_TDM['단어'] = vocab
df_TDM.set_index('단어')

Unnamed: 0_level_0,문서0,문서1,문서2
단어,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
동물원에서,1,1,1
봤어,1,2,0
오늘,1,1,0
원숭이를,1,0,0
코끼리를,0,1,0
바나나를,0,0,2
원숭이에게,0,0,1
줬어,0,0,1


## 3 TF-IDF (Term Frequency-Inverse Document Frequency)
- TF(단어빈도,Term Frequency): 단어가 문서 내에 등장하는 빈도
- IDF(역문서 빈도,Inverse Document Frequency): 단어가 여러 문서에 공통적으로 등장하는 빈도
- 한 문서 내에 자주 등장하고 다른 문서에 자중 등장하지 않는 단어를 주요 단어로 판별할 수 있음

### 3.1 직접 계산하기 1

In [24]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

#### 1) 띄어쓰기 단위로 토큰화

In [25]:
doc_ls = [doc.split() for doc in docs]
doc_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

#### 2) 각 고유 토큰에 인덱스(index)를 지정

In [26]:
from collections import defaultdict

word2id = defaultdict(lambda : len(word2id))
[word2id[token] for doc in doc_ls for token in doc]
word2id

defaultdict(<function __main__.<lambda>()>,
            {'오늘': 0,
             '동물원에서': 1,
             '원숭이를': 2,
             '봤어': 3,
             '코끼리를': 4,
             '원숭이에게': 5,
             '바나나를': 6,
             '줬어': 7})

#### 3) TDM 생성

In [27]:
import numpy as np

TDM = np.zeros((len(doc_ls), len(word2id)), dtype=int)
for i, doc in enumerate(doc_ls):
    for token in doc:
        TDM[i, word2id[token]] += 1  # 해당 토큰의 위치
TDM

array([[1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 0, 2, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 2, 1]])

#### 4) TF 계산
TF = 문서내 토큰빈도/문서내 전체토큰갯수

In [28]:
def computeTF(TDM):
    doc_len = len(doc_ls)
    word_len = len(TDM[0])
    
    tf = np.zeros((len(doc_ls),len(word2id)))
    
    for doc_i in range(doc_len):
        for word_i in range(word_len):
            tf[doc_i, word_i] = TDM[doc_i, word_i] / TDM[doc_i].sum()
    return tf

In [29]:
computeTF(TDM)

array([[0.25, 0.25, 0.25, 0.25, 0.  , 0.  , 0.  , 0.  ],
       [0.2 , 0.2 , 0.  , 0.4 , 0.2 , 0.  , 0.  , 0.  ],
       [0.  , 0.2 , 0.  , 0.  , 0.  , 0.2 , 0.4 , 0.2 ]])

#### 5) IDF 계산
IDF = log(총 문서수/토큰이 등장한 문서수) = -log(토큰이 등장한 문서수/총 문서수)

In [30]:
word_len = len(TDM[0])
for i in range(word_len):
    print(TDM[:,i])
    print("=========")

[1 1 0]
[1 1 1]
[1 0 0]
[1 2 0]
[0 1 0]
[0 0 1]
[0 0 2]
[0 0 1]


In [31]:
import numpy as np

import numpy as np

def computeIDF(TDM):
    doc_len = len(TDM)
    word_len = len(TDM[0])
    
    idf = np.zeros(word_len)
    for i in range(word_len):
        idf[i] = -np.log(np.count_nonzero(TDM[:,i])/doc_len)
    
    return idf

In [32]:
computeIDF(TDM)

array([ 0.40546511, -0.        ,  1.09861229,  0.40546511,  1.09861229,
        1.09861229,  1.09861229,  1.09861229])

#### 6) TF-IDF 계산

In [33]:
tf = computeTF(TDM)

tf.shape

(3, 8)

In [34]:
def computeTFIDF(TDM):
    tf = computeTF(TDM)
    idf = computeIDF(TDM)
    
    tfidf = np.zeros(tf.shape)
    for doc_i in range(tf.shape[0]):
        for word_i in range(tf.shape[1]):
            tfidf[doc_i, word_i] = tf[doc_i, word_i] * idf[word_i]
    return tfidf

In [35]:
computeTFIDF(TDM)

array([[ 0.10136628, -0.        ,  0.27465307,  0.10136628,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.08109302, -0.        ,  0.        ,  0.16218604,  0.21972246,
         0.        ,  0.        ,  0.        ],
       [ 0.        , -0.        ,  0.        ,  0.        ,  0.        ,
         0.21972246,  0.43944492,  0.21972246]])

In [36]:
import pandas as pd

sorted_vocab = sorted((value, key) for key, value in word2id.items())
vocab = [v[1] for v in sorted_vocab]
tfidf = computeTFIDF(TDM)
pd.DataFrame(tfidf, columns = vocab)

Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
0,0.101366,-0.0,0.274653,0.101366,0.0,0.0,0.0,0.0
1,0.081093,-0.0,0.0,0.162186,0.219722,0.0,0.0,0.0
2,0.0,-0.0,0.0,0.0,0.0,0.219722,0.439445,0.219722


### 3.2 직접 계산하기 2
 - tf = 0.5 + 0.5(토큰빈도/문서내 최빈토큰)
 - idf = log(문서갯수/(1+토큰빈도)

In [37]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [38]:
D_ls = [d.split() for d in docs]
vocab = list(set().union(*D_ls))
vocab

['봤어', '줬어', '바나나를', '오늘', '원숭이를', '동물원에서', '원숭이에게', '코끼리를']

In [39]:
import numpy as np
from collections import defaultdict

# document내 토큰이 등장한 빈도수 계산
def f(t, d):
    return d.count(t)

# tf 계산
def tf(t, d):
    return 0.5 + 0.5*(f(t,d)/max([f(w,d) for w in d]))

# idf 계산
def idf(t, D):
    numerator = len(D)
    denominator = 1 + len([True for d in D if t in d])
    return np.log(numerator/denominator)

# tf-idf 계산
def tfidf_score(t, d, D):
    return tf(t,d)*idf(t,D)

# 공백을 기준으로 토큰화
def tfidfScorer(D):
    D_ls = [d.split() for d in D]
    vocab = list(set().union(*D_ls))
    word2id = defaultdict(lambda : len(word2id))
    [word2id[v] for v in vocab]
    
    tfidf = np.zeros((len(D_ls), len(vocab)))
    
    for i in range(len(D_ls)):
        for t in D_ls[i]:
            tfidf[i, word2id[t]] = tfidf_score(t, D_ls[i], D_ls)
    return tfidf, vocab

In [40]:
tfidfScorer(docs)

(array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.40546511,
         -0.28768207,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         -0.21576155,  0.        ,  0.30409883],
        [ 0.        ,  0.30409883,  0.40546511,  0.        ,  0.        ,
         -0.21576155,  0.30409883,  0.        ]]),
 ['봤어', '줬어', '바나나를', '오늘', '원숭이를', '동물원에서', '원숭이에게', '코끼리를'])

In [41]:
import pandas as pd
tfidf, vocab = tfidfScorer(docs)
pd.DataFrame(tfidf, columns=vocab)

Unnamed: 0,봤어,줬어,바나나를,오늘,원숭이를,동물원에서,원숭이에게,코끼리를
0,0.0,0.0,0.0,0.0,0.405465,-0.287682,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,-0.215762,0.0,0.304099
2,0.0,0.304099,0.405465,0.0,0.0,-0.215762,0.304099,0.0


### 3.3 sklearn 활용

In [42]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
tfidf = tfidf_vect.fit_transform(docs)
tfidf.todense()

matrix([[0.37311881, 0.        , 0.4804584 , 0.4804584 , 0.63174505,
         0.        , 0.        , 0.        ],
        [0.28680065, 0.        , 0.73861611, 0.36930805, 0.        ,
         0.        , 0.        , 0.48559571],
        [0.2344005 , 0.79374908, 0.        , 0.        , 0.        ,
         0.39687454, 0.39687454, 0.        ]])

In [44]:
import pandas as pd

vocab = tfidf_vect.get_feature_names()
pd.DataFrame(tfidf.todense(), columns=vocab)

Unnamed: 0,동물원에서,바나나를,봤어,오늘,원숭이를,원숭이에게,줬어,코끼리를
0,0.373119,0.0,0.480458,0.480458,0.631745,0.0,0.0,0.0
1,0.286801,0.0,0.738616,0.369308,0.0,0.0,0.0,0.485596
2,0.2344,0.793749,0.0,0.0,0.0,0.396875,0.396875,0.0


### 3.4 gensim 활용

In [45]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [46]:
import gensim
from gensim import corpora
from gensim.models import TfidfModel

doc_ls = [doc.split() for doc in docs]  # 공백으로 토큰화
id2word = corpora.Dictionary(doc_ls)
TDM = [id2word.doc2bow(doc) for doc in doc_ls]
model = TfidfModel(TDM)
tfidf = model[TDM] 
tfidf[0]

[(1, 0.32718457421365993), (2, 0.32718457421365993), (3, 0.8865102981879297)]

In [47]:
from gensim.matutils import sparse2full

vocab = [id2word[i] for i in id2word.keys()]
TDM_matrix = [sparse2full(doc, len(vocab)).tolist() for doc in tfidf]
pd.DataFrame(TDM_matrix, columns=vocab)

Unnamed: 0,동물원에서,봤어,오늘,원숭이를,코끼리를,바나나를,원숭이에게,줬어
0,0.0,0.327185,0.327185,0.88651,0.0,0.0,0.0,0.0
1,0.0,0.569307,0.284654,0.0,0.771272,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.816497,0.408248,0.408248
