<a href="https://colab.research.google.com/github/dk-wei/nlp-algo-implementation/blob/main/BOW_Vectorizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# `CountVectorizer`

In [135]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
  
document = ["One Geek helps Two Geeks",
            "Two Geeks help Four Geeks at GeeksforGeeks",
            "Each Geek helps many other Geeks at GeeksforGeeks"]
  
# Create a Vectorizer Object
vectorizer = CountVectorizer(binary = True,   #加了binary之后，就没有frequency了，只有出现和不出现
                             ngram_range=(1, 2),  #unigram, bigram...ngram
                             lowercase=True,
                             dtype=np.int32
                             )

vectorizer.fit(document)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int32'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [136]:
# summarize
print(vectorizer.get_feature_names())

['at', 'at geeksforgeeks', 'each', 'each geek', 'four', 'four geeks', 'geek', 'geek helps', 'geeks', 'geeks at', 'geeks help', 'geeksforgeeks', 'help', 'help four', 'helps', 'helps many', 'helps two', 'many', 'many other', 'one', 'one geek', 'other', 'other geeks', 'two', 'two geeks']


In [137]:
# Printing the identified Unique words along with their indices
print("Vocabulary: ", vectorizer.vocabulary_)
  
# Encode the Document
vector = vectorizer.transform(document)
  
# Summarizing the Encoded Texts
print("Encoded Document is:")
print(vector.shape)
print(vector.toarray())

Vocabulary:  {'one': 19, 'geek': 6, 'helps': 14, 'two': 23, 'geeks': 8, 'one geek': 20, 'geek helps': 7, 'helps two': 16, 'two geeks': 24, 'help': 12, 'four': 4, 'at': 0, 'geeksforgeeks': 11, 'geeks help': 10, 'help four': 13, 'four geeks': 5, 'geeks at': 9, 'at geeksforgeeks': 1, 'each': 2, 'many': 17, 'other': 21, 'each geek': 3, 'helps many': 15, 'many other': 18, 'other geeks': 22}
Encoded Document is:
(3, 25)
[[0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 1 0 0 1 1 0 0 1 1]
 [1 1 0 0 1 1 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1]
 [1 1 1 1 0 0 1 1 1 1 0 1 0 0 1 1 0 1 1 0 0 1 1 0 0]]


In [138]:
# encode another new document
text2 = ["the puppy is a Geek"]
vector = vectorizer.transform(text2)

print(vector.toarray())

[[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


可以看到，很遗憾只有geek在vocabulary中，这也是CountVectorizer很大一个弊端，未出现的token就fit不出来。

# `TfidfVectorizer`

In [123]:
from sklearn.feature_extraction.text import TfidfVectorizer
# list of text documents
text = ["One Geek helps Two Geeks",
            "Two Geeks help Four Geeks",
            "Each Geek helps many other Geeks at GeeksforGeeks"]
# create the transform
vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                             smooth_idf=True,
                             use_idf=True,
                             lowercase=True,
                             dtype=np.int32
                             #binary = True
                             )
# tokenize and build vocab
vectorizer.fit(text)



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int32'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [126]:
# summarize
print(vectorizer.get_feature_names())

['at', 'at geeksforgeeks', 'each', 'each geek', 'four', 'four geeks', 'geek', 'geek helps', 'geeks', 'geeks at', 'geeks help', 'geeksforgeeks', 'help', 'help four', 'helps', 'helps many', 'helps two', 'many', 'many other', 'one', 'one geek', 'other', 'other geeks', 'two', 'two geeks']


In [115]:
# summarize
print(vectorizer.vocabulary_)

{'one': 19, 'geek': 6, 'helps': 14, 'two': 23, 'geeks': 8, 'one geek': 20, 'geek helps': 7, 'helps two': 16, 'two geeks': 24, 'help': 12, 'four': 4, 'geeks help': 10, 'help four': 13, 'four geeks': 5, 'each': 2, 'many': 17, 'other': 21, 'at': 0, 'geeksforgeeks': 11, 'each geek': 3, 'helps many': 15, 'many other': 18, 'other geeks': 22, 'geeks at': 9, 'at geeksforgeeks': 1}


In [116]:
print(vectorizer.idf_)

[1.69314718 1.69314718 1.69314718 1.69314718 1.69314718 1.69314718
 1.28768207 1.28768207 1.         1.69314718 1.69314718 1.69314718
 1.69314718 1.69314718 1.28768207 1.69314718 1.69314718 1.69314718
 1.69314718 1.69314718 1.69314718 1.69314718 1.69314718 1.28768207
 1.28768207]


In [117]:
# encode document
vector = vectorizer.transform(text)
# summarize encoded vector
print(vector.shape)
print(vector.toarray())

(3, 25)
[[0.         0.         0.         0.         0.         0.
  0.30443385 0.30443385 0.23642005 0.         0.         0.
  0.         0.         0.30443385 0.         0.40029393 0.
  0.         0.40029393 0.40029393 0.         0.         0.30443385
  0.30443385]
 [0.         0.         0.         0.         0.36388646 0.36388646
  0.         0.         0.42983441 0.         0.36388646 0.
  0.36388646 0.36388646 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.27674503
  0.27674503]
 [0.27645809 0.27645809 0.27645809 0.27645809 0.         0.
  0.2102535  0.2102535  0.1632806  0.27645809 0.         0.27645809
  0.         0.         0.2102535  0.27645809 0.         0.27645809
  0.27645809 0.         0.         0.27645809 0.27645809 0.
  0.        ]]


不同的doc/sent中，每个token的位置出现的不是freq，而是tf-idf值

In [118]:
# encode another new document
text2 = ["the puppy is a Geek Geek at GeeksforGeeks"]
vector = vectorizer.transform(text2)
print(vector.toarray())

[[0.43381609 0.43381609 0.         0.         0.         0.
  0.65985664 0.         0.         0.         0.         0.43381609
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.        ]]


同样的问题也是new token的值为0

# `HashingVectorizer`

In [111]:
from sklearn.feature_extraction.text import HashingVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog."]
# create the transform
vectorizer = HashingVectorizer(n_features=20)
# encode document
vector = vectorizer.transform(text)

In [112]:
# summarize encoded vector
print(vector.shape)
print(vector.toarray())

(1, 20)
[[ 0.          0.          0.          0.          0.          0.33333333
   0.         -0.33333333  0.33333333  0.          0.          0.33333333
   0.          0.          0.         -0.33333333  0.          0.
  -0.66666667  0.        ]]
