# ch7.TF-IDF

## TF-IDF 이론

여러 문서로 이루어진 문서 집합에서, 어떤 문서의 핵심 단어는 그 문서 안에서는 자주 등장하지만 다른 문서들에서는 잘 나타나지 않는 경향이 있다.

**TF(단어 빈도) × IDF(역문서 빈도) = TF-IDF** (이 값이 클수록 해당 문서에서 그 단어가 중요한 경향이 있다)

**TF (문서 j, 단어 i)** = 문서 j에서 단어 i가 나타나는 빈도수 / 문서 j에 나타나는 모든 단어의 빈도수 합

**IDF (단어 i)** = log(전체 문서의 수 / 단어 i가 포함된 문서의 수) — IDF는 특정 문서 j와 무관하게 단어 i에 대해서만 정해진다.

## TF-IDF (영어 문장)
주요 단어 추출 알고리즘

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer # TfidfVectorizer: assigns TF-IDF weights to the words of each document

# Toy corpus: four English sentences, each treated as one document.
corpus = [
    'Data Science is an overlap between Arts and Science',
    'Generally, Arts graduates are right-brained and Science graduates are left-brained',
    'Excelling in both Arts and Sciences at a time becomes difficult',
    'Natural Language Processing is a part of Data Science'
]

In [4]:
# Declare the model; it is fitted on the corpus in the next statement.
tfidf_model = TfidfVectorizer()
# fit_transform learns the vocabulary and computes the TF-IDF weights as a
# sparse matrix; toarray() expands it into a plain dense ndarray for printing
# (todense() would return the discouraged numpy.matrix type instead).
print(tfidf_model.fit_transform(corpus).toarray())

# Each sentence becomes one row of numbers; the row length equals the size of
# the word dictionary (vocabulary).

[[0.38669033 0.24681943 0.         0.24681943 0.         0.
  0.38669033 0.         0.         0.3048711  0.         0.
  0.         0.         0.         0.3048711  0.         0.
  0.         0.         0.38669033 0.         0.         0.
  0.49363886 0.         0.        ]
 [0.         0.15847499 0.4965634  0.15847499 0.         0.
  0.         0.         0.4965634  0.         0.         0.
  0.2482817  0.4965634  0.         0.         0.         0.2482817
  0.         0.         0.         0.         0.         0.2482817
  0.15847499 0.         0.        ]
 [0.         0.21498556 0.         0.21498556 0.33681642 0.33681642
  0.         0.33681642 0.         0.         0.33681642 0.33681642
  0.         0.         0.33681642 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.33681642 0.33681642]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.30571917 0.         0.
  0.         0.       

In [5]:
# Build a DataFrame with one row per sentence and one column per vocabulary term.
# vocabulary_ maps term -> column index, so ordering the terms by that index
# labels every column with the word it actually holds (this coincides with
# alphabetical order here, but does not rely on it).
tfidf_df = pd.DataFrame(
    tfidf_model.fit_transform(corpus).toarray(),  # dense ndarray rather than numpy.matrix
    columns=sorted(tfidf_model.vocabulary_, key=tfidf_model.vocabulary_.get),
)
tfidf_df.head()

Unnamed: 0,an,and,are,arts,at,becomes,between,both,brained,data,...,left,natural,of,overlap,part,processing,right,science,sciences,time
0,0.38669,0.246819,0.0,0.246819,0.0,0.0,0.38669,0.0,0.0,0.304871,...,0.0,0.0,0.0,0.38669,0.0,0.0,0.0,0.493639,0.0,0.0
1,0.0,0.158475,0.496563,0.158475,0.0,0.0,0.0,0.0,0.496563,0.0,...,0.248282,0.0,0.0,0.0,0.0,0.0,0.248282,0.158475,0.0,0.0
2,0.0,0.214986,0.0,0.214986,0.336816,0.336816,0.0,0.336816,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.336816,0.336816
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.305719,...,0.0,0.387766,0.387766,0.0,0.387766,0.387766,0.0,0.247506,0.0,0.0


In [6]:
tfidf_df.columns
# Show only the column labels -> this is the word dictionary (vocabulary)

Index(['an', 'and', 'are', 'arts', 'at', 'becomes', 'between', 'both',
       'brained', 'data', 'difficult', 'excelling', 'generally', 'graduates',
       'in', 'is', 'language', 'left', 'natural', 'of', 'overlap', 'part',
       'processing', 'right', 'science', 'sciences', 'time'],
      dtype='object')

In [7]:
# The full vocabulary is too wide to read, so cap it at 10 terms:
# max_features keeps the terms with the highest frequency across the corpus.
tfidf_model_small = TfidfVectorizer(max_features=10)
# Columns are labelled by ordering the terms by their stored column index, so
# each label is guaranteed to match the data in that column.
tfidf_df_small = pd.DataFrame(
    tfidf_model_small.fit_transform(corpus).toarray(),  # dense ndarray rather than numpy.matrix
    columns=sorted(tfidf_model_small.vocabulary_, key=tfidf_model_small.vocabulary_.get),
)
tfidf_df_small.head()
# The weights show which words matter most in the first sentence, which in the
# second, and so on: the importance of each word is computed automatically.

Unnamed: 0,and,are,arts,brained,data,graduates,is,processing,right,science
0,0.332385,0.0,0.332385,0.0,0.410562,0.0,0.410562,0.0,0.0,0.66477
1,0.169251,0.53033,0.169251,0.53033,0.0,0.53033,0.0,0.0,0.265165,0.169251
2,0.707107,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.484263,0.0,0.484263,0.614226,0.0,0.392053


## TFIDF_신문