# BoW

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict

In [2]:
docs = ['오늘 동물원에서 원숭이를 봤어',
       '오늘 동물원에서 코끼리를 봤어 봤어',
       '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [3]:
doc_ls = [doc.split() for doc in docs]
word2id = defaultdict(lambda : len(word2id))
[word2id[token] for doc in doc_ls for token in doc]

print(word2id)

BoW = []
for i, doc in enumerate(doc_ls):
    bow = np.zeros(len(word2id), dtype=int)
    for token in doc:
        bow[word2id[token]] += 1
    BoW.append(bow)

defaultdict(<function <lambda> at 0x11b8158c0>, {'오늘': 0, '동물원에서': 1, '원숭이를': 2, '봤어': 3, '코끼리를': 4, '원숭이에게': 5, '바나나를': 6, '줬어': 7})


In [4]:
BoW

[array([1, 1, 1, 1, 0, 0, 0, 0]),
 array([1, 1, 0, 2, 1, 0, 0, 0]),
 array([0, 1, 0, 0, 0, 1, 2, 1])]

In [5]:
sorted_vocab = sorted((value, key) for key, value in word2id.items())
vocab = [v[1] for v in sorted_vocab]

for i in range(len(docs)):
    print(pd.DataFrame([BoW[i]], columns=vocab))

   오늘  동물원에서  원숭이를  봤어  코끼리를  원숭이에게  바나나를  줬어
0   1      1     1   1     0      0     0   0
   오늘  동물원에서  원숭이를  봤어  코끼리를  원숭이에게  바나나를  줬어
0   1      1     0   2     1      0     0   0
   오늘  동물원에서  원숭이를  봤어  코끼리를  원숭이에게  바나나를  줬어
0   0      1     0   0     0      1     2   1


# Scikit-Learn CountVectorizer

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
cv = CountVectorizer()
BoW_cv = cv.fit_transform(docs)

In [8]:
colnames = cv.get_feature_names()
data = BoW_cv.toarray()

In [9]:
data

array([[1, 0, 1, 1, 1, 0, 0, 0],
       [1, 0, 2, 1, 0, 0, 0, 1],
       [1, 2, 0, 0, 0, 1, 1, 0]])

In [10]:
colnames

['동물원에서', '바나나를', '봤어', '오늘', '원숭이를', '원숭이에게', '줬어', '코끼리를']

### DTM = Document Term Matrix

In [11]:
# DTM
pd.DataFrame(data.T, colnames).T

Unnamed: 0,동물원에서,바나나를,봤어,오늘,원숭이를,원숭이에게,줬어,코끼리를
0,1,0,1,1,1,0,0,0
1,1,0,2,1,0,0,0,1
2,1,2,0,0,0,1,1,0


### TDM = Term Document Matrix

In [12]:
# DTM
pd.DataFrame(data.T, colnames)

Unnamed: 0,0,1,2
동물원에서,1,1,1
바나나를,0,0,2
봤어,1,2,0
오늘,1,1,0
원숭이를,1,0,0
원숭이에게,0,0,1
줬어,0,0,1
코끼리를,0,1,0


# gensim corpora

In [13]:
import gensim
from gensim import corpora

doc_ls = [doc.split() for doc in docs]
id2word = corpora.Dictionary(doc_ls)

[id2word.doc2bow(doc) for doc in doc_ls]

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(0, 1), (1, 2), (2, 1), (4, 1)],
 [(0, 1), (5, 2), (6, 1), (7, 1)]]

In [14]:
doc_ls = [doc.split() for doc in docs]
id2word = corpora.Dictionary(doc_ls)
BoW_corpora = [id2word.doc2bow(doc) for doc in doc_ls]

BoW_corpora

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(0, 1), (1, 2), (2, 1), (4, 1)],
 [(0, 1), (5, 2), (6, 1), (7, 1)]]

In [15]:
print(list(id2word.keys()), list(id2word.values()))

[0, 1, 2, 3, 4, 5, 6, 7] ['동물원에서', '봤어', '오늘', '원숭이를', '코끼리를', '바나나를', '원숭이에게', '줬어']


In [16]:
BoW[0]

array([1, 1, 1, 1, 0, 0, 0, 0])

In [17]:
vocab

['오늘', '동물원에서', '원숭이를', '봤어', '코끼리를', '원숭이에게', '바나나를', '줬어']

In [18]:
from gensim.matutils import sparse2full
sparse2full(BoW[0], length=len(vocab))

TypeError: cannot unpack non-iterable numpy.int64 object

# Practice
TF-IDF 구하기

In [19]:
# 사전 생성

docs = ['오늘 동물원에서 원숭이를 봤어',
       '오늘 동물원에서 코끼리를 봤어 봤어',
       '동물원에서 원숭이에게 바나나를 줬어 바나나를']

def create_word2id(docs):
    doc_ls = [doc.split() for doc in docs]
    word2id = defaultdict(lambda : len(word2id))
    [word2id[token] for doc in doc_ls for token in doc]
    return word2id

word2id = create_word2id(docs)
word2id

defaultdict(<function __main__.create_word2id.<locals>.<lambda>()>,
            {'오늘': 0,
             '동물원에서': 1,
             '원숭이를': 2,
             '봤어': 3,
             '코끼리를': 4,
             '원숭이에게': 5,
             '바나나를': 6,
             '줬어': 7})

In [20]:
# TF 계산하기
# 개별 빈도수 / 문서내 전체 빈도수
def calc_tf(doc, word2id, log_norm = False, double_norm=False, double_norm_K=0.5):
    doc_sp = doc.split()
    tot_freq = len(doc_sp)
    container = np.zeros(len(word2id))
    
    for token in doc_sp:
        container[word2id[token]] += 1
        
    if double_norm:
        container = double_norm_K + (double_norm_K * container / max(container))
    if log_norm:
        container = np.log(1+container)
    else:
        container /= tot_freq
        
    return container

In [21]:
[calc_tf(doc, word2id) for doc in docs]

[array([0.25, 0.25, 0.25, 0.25, 0.  , 0.  , 0.  , 0.  ]),
 array([0.2, 0.2, 0. , 0.4, 0.2, 0. , 0. , 0. ]),
 array([0. , 0.2, 0. , 0. , 0. , 0.2, 0.4, 0.2])]

In [22]:
[calc_tf(doc, word2id, double_norm=1) for doc in docs]

[array([0.25 , 0.25 , 0.25 , 0.25 , 0.125, 0.125, 0.125, 0.125]),
 array([0.15, 0.15, 0.1 , 0.2 , 0.15, 0.1 , 0.1 , 0.1 ]),
 array([0.1 , 0.15, 0.1 , 0.1 , 0.1 , 0.15, 0.2 , 0.15])]

In [23]:
[calc_tf(doc, word2id, double_norm=1, double_norm_K=0.3) for doc in docs]

[array([0.15 , 0.15 , 0.15 , 0.15 , 0.075, 0.075, 0.075, 0.075]),
 array([0.09, 0.09, 0.06, 0.12, 0.09, 0.06, 0.06, 0.06]),
 array([0.06, 0.09, 0.06, 0.06, 0.06, 0.09, 0.12, 0.09])]

In [24]:
[calc_tf(doc, word2id, log_norm=1) for doc in docs]

[array([0.69314718, 0.69314718, 0.69314718, 0.69314718, 0.        ,
        0.        , 0.        , 0.        ]),
 array([0.69314718, 0.69314718, 0.        , 1.09861229, 0.69314718,
        0.        , 0.        , 0.        ]),
 array([0.        , 0.69314718, 0.        , 0.        , 0.        ,
        0.69314718, 1.09861229, 0.69314718])]

In [25]:
word2id

defaultdict(<function __main__.create_word2id.<locals>.<lambda>()>,
            {'오늘': 0,
             '동물원에서': 1,
             '원숭이를': 2,
             '봤어': 3,
             '코끼리를': 4,
             '원숭이에게': 5,
             '바나나를': 6,
             '줬어': 7})

In [26]:
list(word2id.keys())

['오늘', '동물원에서', '원숭이를', '봤어', '코끼리를', '원숭이에게', '바나나를', '줬어']

In [27]:
# IDF 계산하기
# log(문서 수/토큰이 등장한 문서 수)

def calc_idf(docs, word2id, idf_smooth=False, idf_max=False, prob_idf=False):
    assert (idf_smooth + idf_max + prob_idf) <= 1, "invalid parameter"
    
    num_docs = len(docs)
    tokens = list(word2id.keys())
    container = np.zeros(len(word2id))
    
    for token in tokens:
        count = 0
        for doc in docs:
            if token in doc:
                count += 1
        container[word2id[token]] = count
    
#     print([(w, n) for (w, n) in zip(tokens, container)])
        
    if idf_smooth:
        container = -1 * np.log((1 + container) / num_docs)
    elif idf_max:
        container = -1 * np.log((1 + container) / max(container))
    elif prob_idf:
        container = np.log((num_docs - container)/container)
    else:
        container = -1*np.log(container/num_docs)

    return container     

In [28]:
docs

['오늘 동물원에서 원숭이를 봤어', '오늘 동물원에서 코끼리를 봤어 봤어', '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [29]:
calc_idf(docs, word2id)

array([ 0.40546511, -0.        ,  1.09861229,  0.40546511,  1.09861229,
        1.09861229,  1.09861229,  1.09861229])

In [30]:
calc_idf(docs, word2id, idf_smooth=True)

array([-0.        , -0.28768207,  0.40546511, -0.        ,  0.40546511,
        0.40546511,  0.40546511,  0.40546511])

In [31]:
calc_idf(docs, word2id, idf_max=True)

array([-0.        , -0.28768207,  0.40546511, -0.        ,  0.40546511,
        0.40546511,  0.40546511,  0.40546511])

In [32]:
calc_idf(docs, word2id, prob_idf=True)



array([-0.69314718,        -inf,  0.69314718, -0.69314718,  0.69314718,
        0.69314718,  0.69314718,  0.69314718])

In [33]:
tfs = [calc_tf(doc, word2id) for doc in docs]
idf = calc_idf(docs, word2id)

In [34]:
def calc_tfidf(tfs, idf):
    return [tf*idf for tf in tfs]

In [35]:
tfidf = calc_tfidf(tfs, idf)
tfidf

[array([ 0.10136628, -0.        ,  0.27465307,  0.10136628,  0.        ,
         0.        ,  0.        ,  0.        ]),
 array([ 0.08109302, -0.        ,  0.        ,  0.16218604,  0.21972246,
         0.        ,  0.        ,  0.        ]),
 array([ 0.        , -0.        ,  0.        ,  0.        ,  0.        ,
         0.21972246,  0.43944492,  0.21972246])]

In [36]:
np.array(tfidf)

array([[ 0.10136628, -0.        ,  0.27465307,  0.10136628,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.08109302, -0.        ,  0.        ,  0.16218604,  0.21972246,
         0.        ,  0.        ,  0.        ],
       [ 0.        , -0.        ,  0.        ,  0.        ,  0.        ,
         0.21972246,  0.43944492,  0.21972246]])

In [37]:
res_df = pd.DataFrame(np.array(tfidf).T, list(word2id.keys())).T
res_df

Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
0,0.101366,-0.0,0.274653,0.101366,0.0,0.0,0.0,0.0
1,0.081093,-0.0,0.0,0.162186,0.219722,0.0,0.0,0.0
2,0.0,-0.0,0.0,0.0,0.0,0.219722,0.439445,0.219722


### Summary!

In [44]:
docs2 = ["The cat sat on my face I hate a cat",
        "The dog sat on my bed I love a dog"]

def get_tfidf_from_docs(docs, verbose=0):
    word2id = create_word2id(docs)
    tfs = [calc_tf(doc, word2id) for doc in docs]
    idf = calc_idf(docs, word2id)
    tfidf = calc_tfidf(tfs, idf)
    
    if verbose:
        print(pd.DataFrame(np.array(tfidf).T, list(word2id.keys())).T)
    
    return tfidf, word2id

tfidf2, word2id_2 = get_tfidf_from_docs(docs2, verbose=1)
tfidf2

   The       cat  sat   on   my      face    I      hate    a       dog  \
0 -0.0  0.138629 -0.0 -0.0 -0.0  0.069315 -0.0  0.069315 -0.0  0.000000   
1 -0.0  0.000000 -0.0 -0.0 -0.0  0.000000 -0.0  0.000000 -0.0  0.138629   

        bed      love  
0  0.000000  0.000000  
1  0.069315  0.069315  


[array([-0.        ,  0.13862944, -0.        , -0.        , -0.        ,
         0.06931472, -0.        ,  0.06931472, -0.        ,  0.        ,
         0.        ,  0.        ]),
 array([-0.        ,  0.        , -0.        , -0.        , -0.        ,
         0.        , -0.        ,  0.        , -0.        ,  0.13862944,
         0.06931472,  0.06931472])]

In [45]:
res_df2 = pd.DataFrame(np.array(tfidf2).T, list(word2id_2.keys())).T
res_df2

Unnamed: 0,The,cat,sat,on,my,face,I,hate,a,dog,bed,love
0,-0.0,0.138629,-0.0,-0.0,-0.0,0.069315,-0.0,0.069315,-0.0,0.0,0.0,0.0
1,-0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.138629,0.069315,0.069315


In [46]:
best_tfidf_index = np.array(tfdif2).argmax(axis=1)
res_df2.iloc[:, best_tfidf_index]

Unnamed: 0,cat,dog
0,0.138629,0.0
1,0.0,0.138629


# Scikit-learn TFidVectorizer

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [49]:
docs

['오늘 동물원에서 원숭이를 봤어', '오늘 동물원에서 코끼리를 봤어 봤어', '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [78]:
# smooth -> False
tfidf_vectorizer = TfidfVectorizer(smooth_idf=False)
tfidf = tfidf_vectorizer.fit_transform(docs)

pd.DataFrame(tfidf.todense(),
             columns=tfidf_vectorizer.get_feature_names())

Unnamed: 0,동물원에서,바나나를,봤어,오늘,원숭이를,원숭이에게,줬어,코끼리를
0,0.32695,0.0,0.459517,0.459517,0.686142,0.0,0.0,0.0
1,0.255815,0.0,0.719079,0.359539,0.0,0.0,0.0,0.536857
2,0.190953,0.801472,0.0,0.0,0.0,0.400736,0.400736,0.0


In [79]:
# smooth -> True

tfidf_vectorizer = TfidfVectorizer(smooth_idf=True)
tfidf = tfidf_vectorizer.fit_transform(docs)

pd.DataFrame(tfidf.todense(),
             columns=tfidf_vectorizer.get_feature_names())

Unnamed: 0,동물원에서,바나나를,봤어,오늘,원숭이를,원숭이에게,줬어,코끼리를
0,0.373119,0.0,0.480458,0.480458,0.631745,0.0,0.0,0.0
1,0.286801,0.0,0.738616,0.369308,0.0,0.0,0.0,0.485596
2,0.2344,0.793749,0.0,0.0,0.0,0.396875,0.396875,0.0


In [56]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

count_vectorizer = CountVectorizer()
DTM = count_vectorizer.fit_transform(docs)
DTM.data

array([1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1])

In [57]:
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(DTM)
tfidf.todense()

matrix([[0.37311881, 0.        , 0.4804584 , 0.4804584 , 0.63174505,
         0.        , 0.        , 0.        ],
        [0.28680065, 0.        , 0.73861611, 0.36930805, 0.        ,
         0.        , 0.        , 0.48559571],
        [0.2344005 , 0.79374908, 0.        , 0.        , 0.        ,
         0.39687454, 0.39687454, 0.        ]])

In [58]:
count_vectorizer.get_feature_names()

['동물원에서', '바나나를', '봤어', '오늘', '원숭이를', '원숭이에게', '줬어', '코끼리를']

In [60]:
tfidf_vectorizer.vocabulary_

{'오늘': 3,
 '동물원에서': 0,
 '원숭이를': 4,
 '봤어': 2,
 '코끼리를': 7,
 '원숭이에게': 5,
 '바나나를': 1,
 '줬어': 6}

In [61]:
vocab = tfidf_vectorizer.get_feature_names()
vocab

['동물원에서', '바나나를', '봤어', '오늘', '원숭이를', '원숭이에게', '줬어', '코끼리를']

In [65]:
print(docs)

['오늘 동물원에서 원숭이를 봤어', '오늘 동물원에서 코끼리를 봤어 봤어', '동물원에서 원숭이에게 바나나를 줬어 바나나를']


In [63]:
pd.DataFrame(tfidf.todense(), columns=vocab)

Unnamed: 0,동물원에서,바나나를,봤어,오늘,원숭이를,원숭이에게,줬어,코끼리를
0,0.373119,0.0,0.480458,0.480458,0.631745,0.0,0.0,0.0
1,0.286801,0.0,0.738616,0.369308,0.0,0.0,0.0,0.485596
2,0.2344,0.793749,0.0,0.0,0.0,0.396875,0.396875,0.0


# gensim Tfidf

In [70]:
import gensim
from gensim import corpora
from gensim.models import TfidfModel

In [71]:
doc_ls = [doc.split() for doc in docs]
id2word = corpora.Dictionary(doc_ls)
DTM = [id2word.doc2bow(doc) for doc in doc_ls]
model = TfidfModel(DTM)
tfidf = model[DTM]
tfidf

<gensim.interfaces.TransformedCorpus at 0x1a32df9510>

In [80]:
from gensim.matutils import sparse2full

vocab = [id2word[i] for i in id2word.keys()]
DTM_matrix = [sparse2full(doc, len(vocab)).tolist() for doc in tfidf]
pd.DataFrame(DTM_matrix, columns=vocab)

ValueError: not enough values to unpack (expected 2, got 1)