# Bag of Words

## 1. Count Vectorizer

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
# Single-document corpus used for the CountVectorizer examples that follow.
corpus = ['you know I want your love. because I love you.']

In [3]:
# Create a CountVectorizer without passing any options.
cvector = CountVectorizer()

In [4]:
cvector.fit(corpus) # .fit: the learning (training) step
cvector.transform(corpus).toarray() # .transform: the conversion step; call .toarray() to view the contents

array([[1, 1, 2, 1, 2, 1]])

In [6]:
cvector.vocabulary_ # .vocabulary_: show what was learned (a word -> index mapping)

{'because': 0, 'know': 1, 'love': 2, 'want': 3, 'you': 4, 'your': 5}

- `fit()`: 학습
- 학습 후 `predict()`로 예측하거나 `transform()`으로 변환

In [8]:
output = cvector.fit_transform(corpus).toarray()  # learn and convert in a single call
output

array([[1, 1, 2, 1, 2, 1]])

In [11]:
output.shape # the shape shows that output is 2-dimensional

(1, 6)

### 불용어를 제거한 BoW

In [12]:
# User-defined stop words: the listed words are excluded from the vocabulary.
text = ["Family is not an important thing. It's everything."]
custom_stop_words = ["the", "a", "an", "is", "not"]
vect = CountVectorizer(stop_words=custom_stop_words)

bow = vect.fit_transform(text)  # learn the vocabulary and count in one call
print(bow.toarray())
print(vect.vocabulary_)

[[1 1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 4, 'it': 3, 'everything': 0}


In [13]:
# Stop words provided by scikit-learn (selected with stop_words="english").
text = ["Family is not an important thing. It's everything."]
vect = CountVectorizer(stop_words="english")

bow = vect.fit_transform(text)
print(bow.toarray())
print(vect.vocabulary_)

[[1 1 1]]
{'family': 0, 'important': 1, 'thing': 2}


In [14]:
import nltk
# Download the NLTK 'stopwords' package; a later cell loads it via nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
# Stop words provided by NLTK.
from nltk.corpus import stopwords

text = ["Family is not an important thing. It's everything."]
sw = stopwords.words("english")  # NLTK's English stop-word list
vect = CountVectorizer(stop_words=sw)

bow = vect.fit_transform(text)
print(bow.toarray())
print(vect.vocabulary_)

[[1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 3, 'everything': 0}


In [16]:
# Reverse lookup: convert a vocabulary index back into its word.
def get_word(index, voca):
  """Return the first key of ``voca`` whose value equals ``index``.

  Args:
    index: The value to search for.
    voca: A mapping of key -> value, e.g. a fitted vectorizer's
      ``vocabulary_`` (word -> index).

  Returns:
    The first matching key in the mapping's iteration order, or ``None``
    when no value equals ``index``.
  """
  return next((word for word, idx in voca.items() if idx == index), None)


In [18]:
# Look up which word is stored under index 2 in the fitted vocabulary.
get_word(2, vect.vocabulary_)

'important'

- N-gram

In [19]:
# Two sample sentences for the n-gram examples below.
# NOTE(review): 'learnig' in text2 looks like a typo for 'learning'; the recorded
# outputs in this notebook were produced with this spelling, so it is left unchanged.
text1 = ['Machine learning is fun and not boring.']
text2 = ['Machine is boring and learnig is not fun.']

In [21]:
# No options passed: count the words of the first sentence.
vect = CountVectorizer()
counts = vect.fit_transform(text1)
print(counts.toarray())
print(vect.vocabulary_)

[[1 1 1 1 1 1 1]]
{'machine': 5, 'learning': 4, 'is': 3, 'fun': 2, 'and': 0, 'not': 6, 'boring': 1}


In [22]:
# No options passed: count the words of the second sentence.
vect = CountVectorizer()
counts = vect.fit_transform(text2)
print(counts.toarray())
print(vect.vocabulary_)

[[1 1 1 2 1 1 1]]
{'machine': 5, 'is': 3, 'boring': 1, 'and': 0, 'learnig': 4, 'not': 6, 'fun': 2}


In [23]:
# ngram_range=(1, 2): allow both unigrams and bigrams.
vect = CountVectorizer(ngram_range=(1, 2))
counts = vect.fit_transform(text1)
print(counts.toarray())
print(vect.vocabulary_)

[[1 1 1 1 1 1 1 1 1 1 1 1 1]]
{'machine': 9, 'learning': 7, 'is': 5, 'fun': 3, 'and': 0, 'not': 11, 'boring': 2, 'machine learning': 10, 'learning is': 8, 'is fun': 6, 'fun and': 4, 'and not': 1, 'not boring': 12}


In [24]:
# Re-fit the existing `vect` (created in the previous cell) on the second sentence.
counts = vect.fit_transform(text2)
print(counts.toarray())
print(vect.vocabulary_)

[[1 1 1 1 1 2 1 1 1 1 1 1 1 1]]
{'machine': 10, 'is': 5, 'boring': 2, 'and': 0, 'learnig': 8, 'not': 12, 'fun': 4, 'machine is': 11, 'is boring': 6, 'boring and': 3, 'and learnig': 1, 'learnig is': 9, 'is not': 7, 'not fun': 13}


In [25]:
# ngram_range=(1, 3): allow unigrams, bigrams and trigrams, with stop_words='english'.
vect = CountVectorizer(ngram_range=(1, 3), stop_words='english')
counts = vect.fit_transform(text1)
print(counts.toarray())
print(vect.vocabulary_)

[[1 1 1 1 1 1 1 1 1]]
{'machine': 6, 'learning': 3, 'fun': 1, 'boring': 0, 'machine learning': 7, 'learning fun': 4, 'fun boring': 2, 'machine learning fun': 8, 'learning fun boring': 5}


- Hyper parameter

In [27]:
vect.get_params()   # inspect the CountVectorizer's settings

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 3),
 'preprocessor': None,
 'stop_words': 'english',
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

- DTM

In [29]:
# With several sentences the array is split into one row per sentence; each row
# represents that sentence's words against the whole group of words.
corpus = [
    'you know I want your love',
    'I like you',
    'wat should I do',
]
vect = CountVectorizer()
dtm = vect.fit_transform(corpus)
print(dtm.toarray())
print(vect.vocabulary_)

[[0 1 0 1 0 1 0 1 1]
 [0 0 1 0 0 0 0 1 0]
 [1 0 0 0 1 0 1 0 0]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'wat': 6, 'should': 4, 'do': 0}


## 2. TF-IDF Vectorizer
- 동일 문서 내에서 자주 반복되는 단어일수록 중요도 점수가 높아짐
- 여러 문서에 공통으로 등장하는 단어일수록 중요도 점수가 낮아짐

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights for the corpus, rounded to two decimals for display.
tvect = TfidfVectorizer()
tfidf = tvect.fit_transform(corpus)
print(tfidf.toarray().round(2))
print(tvect.vocabulary_)

[[0.   0.47 0.   0.47 0.   0.47 0.   0.36 0.47]
 [0.   0.   0.8  0.   0.   0.   0.   0.61 0.  ]
 [0.58 0.   0.   0.   0.58 0.   0.58 0.   0.  ]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'wat': 6, 'should': 4, 'do': 0}


In [33]:
# TF-IDF with ngram_range=(1, 2) (unigrams and bigrams) and stop_words='english'.
tvect = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
tfidf = tvect.fit_transform(corpus)
print(tfidf.toarray().round(2))
print(tvect.vocabulary_)

[[0.45 0.45 0.   0.45 0.45 0.45 0.  ]
 [0.   0.   1.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   1.  ]]
{'know': 0, 'want': 4, 'love': 3, 'know want': 1, 'want love': 5, 'like': 2, 'wat': 6}


In [34]:
# Inspect the TfidfVectorizer's settings.
tvect.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 2),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': 'english',
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}