## Bag of Words

### 1. Count Vectorizer

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
# A one-document corpus; the words "you" and "love" are repeated so the counts are not all 1.
corpus = ['You know I want you love. Because I love you.']

In [5]:
# Build a bag-of-words model with default settings: learn the vocabulary from
# the corpus and count how often each term occurs in each document.
cv = CountVectorizer()
output = cv.fit_transform(corpus)
# Convert the result to a dense array for display (one row per document,
# one column per vocabulary term).
output.toarray()

array([[1, 1, 2, 1, 3]])

In [6]:
# Mapping of each learned (lowercased) term to its column index in the count matrix.
# The single-character word "I" does not appear in it.
cv.vocabulary_

{'you': 4, 'know': 1, 'want': 3, 'love': 2, 'because': 0}

- 불용어를 제거한 BoW

In [7]:
# Remove stop words using our own hand-written list.
# Of the listed words only "is" and "an" occur in this sentence, so they are the
# only ones dropped; "not", "it" and "everything" are kept.
text = ["Family is not an important thing. It's everything."]
cv = CountVectorizer(stop_words = ['the', 'a', 'an', 'is'])
print(cv.fit_transform(text).toarray())
# "It's" contributes the term "it"; the lone "s" is not kept as a term.
print(cv.vocabulary_)

[[1 1 1 1 1 1]]
{'family': 1, 'not': 4, 'important': 2, 'thing': 5, 'it': 3, 'everything': 0}


- Scikit-Learn에서 제공하는 불용어 사용

In [8]:
# Use scikit-learn's built-in English stop-word list instead of a custom one.
# On this sentence it is more aggressive than the hand-written list: "not", "it"
# and "everything" are removed as well, leaving only three terms.
cv = CountVectorizer(stop_words = 'english')
print(cv.fit_transform(text).toarray())
print(cv.vocabulary_)

[[1 1 1]]
{'family': 0, 'important': 1, 'thing': 2}


- NLTK에서 제공하는 불용어 사용

In [9]:
from nltk.corpus import stopwords
# Load NLTK's English stop-word list as a plain Python list of strings.
# NOTE(review): this needs the NLTK "stopwords" corpus to be available locally
# (typically fetched once with nltk.download('stopwords')) -- confirm for a fresh environment.
sw = stopwords.words('english')
# Number of stop words in the list.
len(sw)

179

In [10]:
# Pass the NLTK list as a custom stop-word list.
# Unlike scikit-learn's built-in list, this one keeps "everything" for this sentence,
# so four terms remain instead of three.
cv = CountVectorizer(stop_words = sw)
print(cv.fit_transform(text).toarray())
print(cv.vocabulary_)

[[1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 3, 'everything': 0}


In [17]:
def get_word(index, voca):
    """Reverse-lookup a term by its column index in a vocabulary mapping.

    Parameters
    ----------
    index
        The value (column index) to search for.
    voca
        A mapping of term -> index, such as a fitted vectorizer's ``vocabulary_``.

    Returns
    -------
    The first key, in the mapping's iteration order, whose value equals
    ``index``; ``None`` when no value matches.
    """
    # Lazily scan the (term, position) pairs and stop at the first hit.
    matching_terms = (term for term, position in voca.items() if position == index)
    return next(matching_terms, None)

In [18]:
# Find which term is mapped to column index 3 in the current vectorizer's vocabulary.
get_word(3, cv.vocabulary_)

'thing'

- N-gram

In [19]:
# Baseline for the n-gram examples: default settings produce single-word terms only.
text = ["Machine learning is fun and not boring"]
cv = CountVectorizer()
print(cv.fit_transform(text).toarray())
print(cv.vocabulary_)

[[1 1 1 1 1 1 1]]
{'machine': 5, 'learning': 4, 'is': 3, 'fun': 2, 'and': 0, 'not': 6, 'boring': 1}


In [20]:
# ngram_range=(1, 2) keeps the single words and adds every pair of adjacent words,
# e.g. "machine learning" and "not boring" (7 single words + 6 pairs = 13 features).
cv = CountVectorizer(ngram_range = (1, 2))
print(cv.fit_transform(text).toarray())
print(cv.vocabulary_)

[[1 1 1 1 1 1 1 1 1 1 1 1 1]]
{'machine': 9, 'learning': 7, 'is': 5, 'fun': 3, 'and': 0, 'not': 11, 'boring': 2, 'machine learning': 10, 'learning is': 8, 'is fun': 6, 'fun and': 4, 'and not': 1, 'not boring': 12}


In [21]:
# Combine n-grams of length 1 to 3 with the built-in English stop-word list.
# The n-grams are formed from the words that remain after stop-word removal, which is
# why "learning fun" shows up even though "is" stood between the two words in the text.
cv = CountVectorizer(ngram_range = (1, 3), stop_words = 'english')
print(cv.fit_transform(text).toarray())
print(cv.vocabulary_)

[[1 1 1 1 1 1 1 1 1]]
{'machine': 6, 'learning': 3, 'fun': 1, 'boring': 0, 'machine learning': 7, 'learning fun': 4, 'fun boring': 2, 'machine learning fun': 8, 'learning fun boring': 5}


- Hyperparameters

In [23]:
# List every constructor parameter of the current vectorizer with its value,
# including the defaults that were not set explicitly.
cv.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 3),
 'preprocessor': None,
 'stop_words': 'english',
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

### 2. TF-IDF Vectorizer

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Three short documents; "you" occurs in the first two, every other kept word in only one.
corpus = [
    "you know i want your love",
    "i like you",
    "what should i do"
]

In [26]:
# Raw term counts for the new corpus, as a reference point for the TF-IDF weights.
# The single-character word "i" is not part of the vocabulary.
cv = CountVectorizer()
print(cv.fit_transform(corpus).toarray())
print(cv.vocabulary_)

[[0 1 0 1 0 1 0 1 1]
 [0 0 1 0 0 0 0 1 0]
 [1 0 0 0 1 0 1 0 0]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [27]:
# Same corpus and same vocabulary as the count baseline, but with TF-IDF weights.
# "you" (column 7) occurs in two documents and therefore gets a lower weight than
# the terms that occur in only one document.
tv = TfidfVectorizer()
print(tv.fit_transform(corpus).toarray())
print(tv.vocabulary_)

[[0.         0.46735098 0.         0.46735098 0.         0.46735098
  0.         0.35543247 0.46735098]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.60534851 0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.         0.        ]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [28]:
# TF-IDF over single words and adjacent pairs, with the built-in English stop-word list.
# Caveat: no term of the third document ("what should i do") survives the filtering,
# so its row is all zeros; "you" and "your" are dropped from the other documents too.
tv = TfidfVectorizer(ngram_range = (1, 2), stop_words = 'english')
print(tv.fit_transform(corpus).toarray())
print(tv.vocabulary_)

[[0.4472136 0.4472136 0.        0.4472136 0.4472136 0.4472136]
 [0.        0.        1.        0.        0.        0.       ]
 [0.        0.        0.        0.        0.        0.       ]]
{'know': 0, 'want': 4, 'love': 3, 'know want': 1, 'want love': 5, 'like': 2}


In [29]:
# List every constructor parameter of the TF-IDF vectorizer with its value; compared with
# CountVectorizer it additionally reports norm, smooth_idf, sublinear_tf and use_idf.
tv.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 2),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': 'english',
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}