# Term Frequency - Inverse Document Frequency (TF-IDF)
https://wikidocs.net/31698


In [18]:
# import
import pandas as pd 
from math import log

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Calculating TF-IDF Manually

In [19]:
# data preprocessing
docs = [
    '먹고 싶은 사과',
    '먹고 싶은 바나나',
    '길고 노란 바나나 바나나',
    '저는 과일이 좋아요'
]

vocab = list(set(w for doc in docs for w in doc.split())) # using set to remove duplication
print(vocab)

['저는', '좋아요', '사과', '노란', '바나나', '싶은', '길고', '먹고', '과일이']


# nested list comprehension
## 'for doc in docs': Outer loop. It iterates over each document in the docs list. Each doc is assumed to be a string.
## 'for w in doc.split()': Inner loop. After splitting each doc into words, this loop iterates over each word, which is represented by w.

In [6]:
# functions
def tf(t, d):
    """
    tf(t, d): 특정 문서 d에서 특정 단어 t의 등장 횟수.
    """
    return d.count(t)

def df(t):
    """
    df(t, D): 특정 단어 t가 등장한 문서의 수.
    """
    df = 0
    for doc in docs:
        df += t in doc # increments the variable df by 1 if t exists in the current doc. (True == 1 in arithmetic operations in Python)
    return df

def idf(t):
    """
    idf(t, D): df(t, D)에 반비례 하는 수.
    """
    return log(N / (df(t) + 1)) # log(N / (df(t) + 1))

def tfidf(t, d):
    """
    tfidf(t, d): tf * idf
    """
    return tf(t, d) * idf(t)

In [16]:
# calculation for Document-Term Matrix (DTM)
result = []

for doc in docs:
    result.append([])
    for t in vocab:
        result[-1].append(tf(t, doc))
        
dtm = pd.DataFrame(result, columns=vocab)
print(dtm.shape)
print(dtm)

(4, 9)
   저는  좋아요  사과  노란  바나나  싶은  길고  먹고  과일이
0   0    0   1   0    0   1   0   1    0
1   0    0   0   0    1   1   0   1    0
2   0    0   0   1    2   0   1   0    0
3   1    1   0   0    0   0   0   0    1


In [15]:
# calculation for Inverse Document Frequency (IDF)

result = []
for t in vocab:
    result.append(idf(t))
    
idf_ = pd.DataFrame(result, index=vocab, columns=['IDF'])
print(idf_)

          IDF
저는   0.693147
좋아요  0.693147
사과   0.693147
노란   0.693147
바나나  0.287682
싶은   0.287682
길고   0.693147
먹고   0.287682
과일이  0.693147


In [17]:
# calculation for TF-IDF
result = []

for doc in docs:
    result.append([])
    for t in vocab:
        result[-1].append(tfidf(t, doc))

tfidf_ = pd.DataFrame(result, index=docs, columns=vocab)
print(tfidf_)

                     저는       좋아요        사과        노란       바나나        싶은  \
먹고 싶은 사과       0.000000  0.000000  0.693147  0.000000  0.000000  0.287682   
먹고 싶은 바나나      0.000000  0.000000  0.000000  0.000000  0.287682  0.287682   
길고 노란 바나나 바나나  0.000000  0.000000  0.000000  0.693147  0.575364  0.000000   
저는 과일이 좋아요     0.693147  0.693147  0.000000  0.000000  0.000000  0.000000   

                     길고        먹고       과일이  
먹고 싶은 사과       0.000000  0.287682  0.000000  
먹고 싶은 바나나      0.000000  0.287682  0.000000  
길고 노란 바나나 바나나  0.693147  0.000000  0.000000  
저는 과일이 좋아요     0.000000  0.000000  0.693147  


# Calculating TF-IDF using scikit-learn

In [31]:
# calculating DTM using CountVectorizer
corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',
]

vector = CountVectorizer()
vector.fit_transform(corpus)

print(vector.transform(corpus).toarray())
print(vector.vocabulary_)

[[0 1 0 1 0 1 0 1 1]
 [0 0 1 0 0 0 0 1 0]
 [1 0 0 0 1 0 1 0 0]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [32]:
# calculating TF-IDF using TfidfVectorizer
tfidfv = TfidfVectorizer()
tfidfv.fit_transform(corpus)

print(tfidfv.transform(corpus).toarray())
print(tfidfv.vocabulary_)

[[0.         0.46735098 0.         0.46735098 0.         0.46735098
  0.         0.35543247 0.46735098]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.60534851 0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.         0.        ]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}
