In [1]:
# Toy corpus: two short Korean sentences consumed by the bag-of-words cells below.
docs = [
    "나는 양념 치킨을 좋아해 하지만 후라이드 치킨을 싫어해",
    "나는 후라이드 치킨을 좋아해 하지만 양념 치킨을 좋아해",
]

In [2]:
# Tokenize every document on whitespace; the bare name on the last line displays the result.
doc_ls = list(map(str.split, docs))
doc_ls

[['나는', '양념', '치킨을', '좋아해', '하지만', '후라이드', '치킨을', '싫어해'],
 ['나는', '후라이드', '치킨을', '좋아해', '하지만', '양념', '치킨을', '좋아해']]

In [3]:
from collections import defaultdict

# Looking up an unseen token assigns it the next free id (the current vocabulary size).
word2id = defaultdict(lambda : len(word2id))

# Touch every token once, in reading order, so ids follow order of first appearance.
for tokens in doc_ls:
    for token in tokens:
        word2id[token]
word2id

defaultdict(<function __main__.<lambda>()>,
            {'나는': 0,
             '양념': 1,
             '치킨을': 2,
             '좋아해': 3,
             '하지만': 4,
             '후라이드': 5,
             '싫어해': 6})

In [5]:
import numpy as np

# One dense count vector per document: position k holds how often the word
# with id k occurs in that document. minlength pads to the full vocabulary size.
BoW_ls = [
    np.bincount([word2id[token] for token in doc], minlength=len(word2id)).tolist()
    for doc in doc_ls
]

BoW_ls

[[1, 1, 2, 1, 1, 1, 1], [1, 1, 2, 2, 1, 1, 0]]

In [7]:
import pandas as pd
# Import from the public IPython.display module instead of the internal
# IPython.core path. The ICD alias is kept because a later cell also calls
# ICD.display.
from IPython import display as ICD

# Vocabulary ordered by word id so the column labels line up with the BoW vectors.
vocab = sorted(word2id, key=word2id.get)
for i, text in enumerate(docs):
    print("문서 {} : {}".format(i, text))
    # Render the count vector as a one-row table with words as column headers.
    ICD.display(pd.DataFrame([BoW_ls[i]], columns=vocab))
    print("\n")

문서 0 : 나는 양념 치킨을 좋아해 하지만 후라이드 치킨을 싫어해


Unnamed: 0,나는,양념,치킨을,좋아해,하지만,후라이드,싫어해
0,1,1,2,1,1,1,1




문서 1 : 나는 후라이드 치킨을 좋아해 하지만 양념 치킨을 좋아해


Unnamed: 0,나는,양념,치킨을,좋아해,하지만,후라이드,싫어해
0,1,1,2,2,1,1,0






In [8]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.1.2-cp37-cp37m-win_amd64.whl (24.0 MB)
Collecting Cython==0.29.23
  Downloading Cython-0.29.23-cp37-cp37m-win_amd64.whl (1.6 MB)
Installing collected packages: Cython, gensim
Successfully installed Cython-0.29.23 gensim-4.1.2


In [10]:
import gensim
from gensim import corpora

# The two-sentence corpus is declared again here, so this cell only needs its own imports.
docs = [
    "나는 양념 치킨을 좋아해 하지만 후라이드 치킨을 싫어해",
    "나는 후라이드 치킨을 좋아해 하지만 양념 치킨을 좋아해",
]

# Whitespace tokenization.
doc_ls = list(map(str.split, docs))

# The Dictionary assigns an integer id to every distinct token.
id2word = corpora.Dictionary(doc_ls)
# Each document becomes a sparse list of (token id, count) pairs.
BoW = list(map(id2word.doc2bow, doc_ls))
BoW

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1)],
 [(0, 1), (2, 1), (3, 2), (4, 2), (5, 1), (6, 1)]]

In [11]:
# Show the integer ids known to the gensim dictionary.
id2word.keys()

[0, 1, 2, 3, 4, 5, 6]

In [13]:
# Print every id next to the token it stands for.
for token_id in id2word.keys():
    token = id2word[token_id]
    print("{} : {}".format(token_id, token))

0 : 나는
1 : 싫어해
2 : 양념
3 : 좋아해
4 : 치킨을
5 : 하지만
6 : 후라이드


In [16]:
from gensim.matutils import sparse2full

# Expand the sparse (id, count) pairs of document 0 into a dense vector.
# The length is taken from the gensim dictionary itself rather than from `vocab`,
# which at this point still holds the vocabulary built from the word2id mapping
# and matches in size only by coincidence.
sparse2full(BoW[0], len(id2word))

array([1., 1., 1., 1., 2., 1., 1.], dtype=float32)

In [18]:

# Column labels in gensim id order.
vocab = [id2word[token_id] for token_id in id2word.keys()]
n_terms = len(vocab)
for i, text in enumerate(docs):
    print("문서 {} : {}".format(i, text))
    # Densify the sparse BoW so it can be shown as a one-row table.
    dense_row = sparse2full(BoW[i], n_terms)
    ICD.display(pd.DataFrame([dense_row], columns=vocab))
    print("\n")

문서 0 : 나는 양념 치킨을 좋아해 하지만 후라이드 치킨을 싫어해


Unnamed: 0,나는,싫어해,양념,좋아해,치킨을,하지만,후라이드
0,1.0,1.0,1.0,1.0,2.0,1.0,1.0




문서 1 : 나는 후라이드 치킨을 좋아해 하지만 양념 치킨을 좋아해


Unnamed: 0,나는,싫어해,양념,좋아해,치킨을,하지만,후라이드
0,1.0,0.0,1.0,2.0,2.0,1.0,1.0






In [20]:
# Term-document matrix: one row per vocabulary word, one column per document.
tdm_shape = (len(word2id), len(doc_ls))
TDM = np.zeros(tdm_shape, dtype=int)
TDM

array([[0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0]])

In [21]:
# Start from zero so that re-running this cell does not add the counts a second time.
TDM.fill(0)
# Row = word id, column = document index; each occurrence adds one.
for doc_idx, tokens in enumerate(doc_ls):
    for token in tokens:
        TDM[word2id[token], doc_idx] += 1
TDM

array([[1, 1],
       [1, 1],
       [2, 2],
       [1, 2],
       [1, 1],
       [1, 1],
       [1, 0]])

In [27]:
# Column labels: one per document.
doc_names = ['문서'+str(i) for i in range(len(doc_ls))]
# Row labels: vocabulary words ordered by id, matching the row order of TDM.
vocab = sorted(word2id, key=word2id.get)
# Build the labelled table in one step instead of adding a helper column and
# moving it into the index with an in-place set_index.
df_TDM = pd.DataFrame(TDM, index=pd.Index(vocab, name='단어'), columns=doc_names)
df_TDM

Unnamed: 0_level_0,문서0,문서1
단어,Unnamed: 1_level_1,Unnamed: 2_level_1
나는,1,1
양념,1,1
치킨을,2,2
좋아해,1,2
하지만,1,1
후라이드,1,1
싫어해,1,0


In [28]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a CountVectorizer on the raw document strings and keep the resulting count matrix.
count_vect = CountVectorizer()
DTM = count_vect.fit_transform(docs)
# Convert to a dense array for display (one row per document).
DTM.toarray()

array([[1, 1, 1, 1, 2, 1, 1],
       [1, 0, 1, 2, 2, 1, 1]], dtype=int64)

In [29]:
# Column labels: one per document.
doc_names = ['문서'+str(i) for i in range(len(doc_ls))]
# Row labels: the vectorizer's feature names, in its own column order.
vocab = count_vect.get_feature_names_out()
# Transpose the document-term counts so words become rows, and attach the named
# index directly instead of adding a helper column and an in-place set_index.
df_TDM = pd.DataFrame(DTM.toarray().T, index=pd.Index(vocab, name='단어'), columns=doc_names)
df_TDM

Unnamed: 0_level_0,문서0,문서1
단어,Unnamed: 1_level_1,Unnamed: 2_level_1
나는,1,1
싫어해,1,0
양념,1,1
좋아해,1,2
치킨을,2,2
하지만,1,1
후라이드,1,1


In [30]:
# Second toy corpus: three short Korean sentences used for the TF-IDF cells below.
docs = [
    "오늘 동물원에서 원숭이를 봤어",
    "오늘 동물원에서 코끼리를 봤어 봤어",
    "동물원에서 원숭이에게 바나나를 줬어 바나나를",
]

In [32]:
# Tokenize every document on whitespace and display the token lists.
doc_ls = list(map(str.split, docs))
doc_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

In [34]:
from collections import defaultdict

# Fresh mapping for the new corpus: an unseen token gets the next free id
# (the current vocabulary size) on first lookup.
word2id = defaultdict(lambda : len(word2id))
# Touch every token once, in reading order, so ids follow order of first appearance.
for tokens in doc_ls:
    for token in tokens:
        word2id[token]
word2id

defaultdict(<function __main__.<lambda>()>,
            {'오늘': 0,
             '동물원에서': 1,
             '원숭이를': 2,
             '봤어': 3,
             '코끼리를': 4,
             '원숭이에게': 5,
             '바나나를': 6,
             '줬어': 7})

In [36]:
# Document-term matrix: one row per document, one column per vocabulary word.
dtm_shape = (len(doc_ls), len(word2id))
DTM = np.zeros(dtm_shape, dtype=int)
DTM

array([[0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0]])

In [37]:
# Start from zero so that re-running this cell does not add the counts a second time.
DTM.fill(0)
# Row = document index, column = word id; each occurrence adds one.
for doc_idx, tokens in enumerate(doc_ls):
    for token in tokens:
        DTM[doc_idx, word2id[token]] += 1
DTM

array([[1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 0, 2, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 2, 1]])

In [39]:
# Sum of row 0 of the DTM, i.e. the total number of tokens counted in document 0.
DTM[0].sum()

4

In [45]:
# Sizes reused by the IDF and TF-IDF cells.
doc_len = len(doc_ls)
word_len = len(word2id)

# Term frequency: every DTM row divided by that document's total token count.
row_totals = DTM.sum(axis=1, keepdims=True)
TF = DTM / row_totals.astype(float)

TF

array([[0.25, 0.25, 0.25, 0.25, 0.  , 0.  , 0.  , 0.  ],
       [0.2 , 0.2 , 0.  , 0.4 , 0.2 , 0.  , 0.  , 0.  ],
       [0.  , 0.2 , 0.  , 0.  , 0.  , 0.2 , 0.4 , 0.2 ]])

In [49]:
# Column 3 of the DTM: the count of the word with id 3 in each document.
DTM[:, 3]

array([1, 2, 0])

In [51]:
import math

# Document frequency: in how many documents each word occurs at least once.
doc_freq = np.count_nonzero(DTM, axis=0)
# IDF = -log10(df / N). A word that occurs in every document gives -log10(1),
# which shows up as -0.0 in the result.
IDF = np.array([-math.log10(df / doc_len) for df in doc_freq])

IDF


array([ 0.17609126, -0.        ,  0.47712125,  0.17609126,  0.47712125,
        0.47712125,  0.47712125,  0.47712125])

In [52]:
# Allocate the TF-IDF matrix with the same shape and dtype as TF, initially all zeros.
TFIDF = np.zeros_like(TF)
TFIDF

array([[0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.]])

In [53]:
# TF-IDF: scale each TF column by its word's IDF (IDF broadcasts across the rows),
# writing the products into the already allocated TFIDF array.
np.multiply(TF, IDF, out=TFIDF)

TFIDF

array([[ 0.04402281, -0.        ,  0.11928031,  0.04402281,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.03521825, -0.        ,  0.        ,  0.0704365 ,  0.09542425,
         0.        ,  0.        ,  0.        ],
       [ 0.        , -0.        ,  0.        ,  0.        ,  0.        ,
         0.09542425,  0.1908485 ,  0.09542425]])