## TF-IDF 실습
- 사이킷런 활용

1) 한글 문장으로 TF-IDF 코사인 유사도 구하기 `CountVectorizer` 만 이용<br>
2) 영어 문장으로 코사인 우사도 구하기 `TfidfVectorizer` 이용

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

corpus = [
  '먹고 싶은 사과',
  '먹고 싶은 바나나',
  '길고 노란 바나나 바나나',
  '저는 과일이 좋아요'
] 

vect = CountVectorizer()
document_term_matrix = vect.fit_transform(corpus)       # 문서-단어 행렬 

tf = pd.DataFrame(document_term_matrix.toarray(), columns=vect.get_feature_names())  
                                             # TF (Term Frequency)
D = len(tf)
df = tf.astype(bool).sum(axis=0)
idf = np.log((D+1) / (df+1)) + 1             # IDF (Inverse Document Frequency)

# TF-IDF (Term Frequency-Inverse Document Frequency)
tfidf = tf * idf                      
tfidf = tfidf / np.linalg.norm(tfidf, axis=1, keepdims=True)

In [2]:
tf

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


In [3]:
tfidf

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.526405,0.0,0.667679,0.526405,0.0,0.0
1,0.0,0.0,0.0,0.57735,0.57735,0.0,0.57735,0.0,0.0
2,0.0,0.47212,0.47212,0.0,0.74445,0.0,0.0,0.0,0.0
3,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735


In [7]:
tfidf.loc[0]

과일이    0.000000
길고     0.000000
노란     0.000000
먹고     0.526405
바나나    0.000000
사과     0.667679
싶은     0.526405
저는     0.000000
좋아요    0.000000
Name: 0, dtype: float64

In [10]:
from numpy.linalg import norm
def cos_sim(A, B):
    return np.dot(A, B)/(norm(A)*norm(B))
cos_sim(tfidf.loc[1],tfidf.loc[2])

0.4298082367242732

In [21]:
sim_with_2nd=[]
for i in range(4):
  sim_with_2nd.append(cos_sim(tfidf.loc[1],tfidf.loc[i]))
sim_with_2nd

[0.7345619527913894, 0.9999999999999999, 0.36728097639569474, 0.0]

In [23]:
tfidf['similarity with 1'] = sim_with_2nd
tfidf

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요,similarity with 0,similarity with 1
0,0.0,0.0,0.0,0.526405,0.0,0.667679,0.526405,0.0,0.0,1.0,0.734562
1,0.0,0.0,0.0,0.57735,0.57735,0.0,0.57735,0.0,0.0,0.607841,1.0
2,0.0,0.47212,0.47212,0.0,0.74445,0.0,0.0,0.0,0.0,0.0,0.367281
3,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735,0.0,0.0


## 영어 문장으로 실습

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
  'John likes to watch movies and Mary likes movies too',
  'James likes to watch TV',
  'Mary also likes to watch football games',  
]

tfidfv = TfidfVectorizer().fit(corpus)
vocab = list(tfidfv.vocabulary_.keys()) # 단어장을 리스트로 저장
vocab.sort() # 단어장을 알파벳 순으로 정렬

# TF-IDF 행렬에 단어장을 데이터프레임의 열로 지정하여 데이터프레임 생성
tfidf_ = pd.DataFrame(tfidfv.transform(corpus).toarray(), columns = vocab)
tfidf_

Unnamed: 0,also,and,football,games,james,john,likes,mary,movies,to,too,tv,watch
0,0.0,0.321556,0.0,0.0,0.0,0.321556,0.379832,0.244551,0.643111,0.189916,0.321556,0.0,0.189916
1,0.0,0.0,0.0,0.0,0.572929,0.0,0.338381,0.0,0.0,0.338381,0.0,0.572929,0.338381
2,0.464997,0.0,0.464997,0.464997,0.0,0.0,0.274634,0.353642,0.0,0.274634,0.0,0.0,0.274634


In [18]:
len(tfidf_)

3

In [19]:
sim_with_1=[]
for i in range(len(tfidf_)):
  sim_with_1.append(cos_sim(tfidf.loc[0],tfidf.loc[i]))
sim_with_1

[1.0, 0.7345619527913894, 0.0]

In [20]:
tfidf_['simiarity with 1st']=sim_with_1
tfidf_

Unnamed: 0,also,and,football,games,james,john,likes,mary,movies,to,too,tv,watch,simiarity with 1st
0,0.0,0.321556,0.0,0.0,0.0,0.321556,0.379832,0.244551,0.643111,0.189916,0.321556,0.0,0.189916,1.0
1,0.0,0.0,0.0,0.0,0.572929,0.0,0.338381,0.0,0.0,0.338381,0.0,0.572929,0.338381,0.734562
2,0.464997,0.0,0.464997,0.464997,0.0,0.0,0.274634,0.353642,0.0,0.274634,0.0,0.0,0.274634,0.0
