In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# Toy corpus: four short documents shared by every vectorizer demo in this notebook.
corpus = [
    'This is the first document',
    'This is the second document',
    'Third document. document number three.',
    'Number four. To repeat, number four',
]

In [13]:
# Plain term-count model: learn the vocabulary from the corpus and encode each
# document as one row of token counts (sparse matrix, documents x vocabulary terms).
vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit_transform(raw_documents=corpus)
bag_of_words  # bare last expression -> the cell displays the sparse-matrix summary

<4x12 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [14]:
# print() on the sparse matrix lists each stored entry as "(row, column)  value";
# here row = document index, column = vocabulary id, value = token count.
print(bag_of_words)

  (0, 0)	1
  (0, 1)	1
  (0, 7)	1
  (0, 3)	1
  (0, 9)	1
  (1, 6)	1
  (1, 0)	1
  (1, 7)	1
  (1, 3)	1
  (1, 9)	1
  (2, 10)	1
  (2, 4)	1
  (2, 8)	1
  (2, 0)	2
  (3, 5)	1
  (3, 11)	1
  (3, 2)	2
  (3, 4)	2


In [15]:
# vocabulary_ is a dict mapping each token to its column index in the matrix,
# so a token's id is an ordinary dict lookup (done here with .get).
vocabulary = vectorizer.vocabulary_
print("Here's the vocabulary: ", vocabulary)
print("This is the id of the word 'document': ", vocabulary.get('document'))

Here's the vocabulary:  {'this': 9, 'is': 3, 'the': 7, 'first': 1, 'document': 0, 'second': 6, 'third': 8, 'number': 4, 'three': 10, 'four': 2, 'to': 11, 'repeat': 5}
This is the id of the word 'document':  0


In [16]:
import pandas as pd

In [17]:
# Densify the count matrix and label its columns with the vocabulary terms so the
# bag-of-words table is readable (one row per document, one column per token).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns the terms in column order.
pd.DataFrame(bag_of_words.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,document,first,four,is,number,repeat,second,the,third,this,three,to
0,1,1,0,1,0,0,0,1,0,1,0,0
1,1,0,0,1,0,0,1,1,0,1,0,0
2,2,0,0,0,1,0,0,0,1,0,1,0
3,0,0,2,0,2,1,0,0,0,0,0,1


In [18]:
# TF-IDF variant: token counts are re-weighted by inverse document frequency,
# so terms that occur in many documents contribute less than rare ones.
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE: both names below rebind the objects created in the count-based cell;
# re-running earlier cells after this one will show TF-IDF values instead of counts.
vectorizer = TfidfVectorizer()
bag_of_words = vectorizer.fit_transform(raw_documents=corpus)

In [19]:
# print() on the sparse matrix lists each stored entry as "(row, column)  value";
# bag_of_words now comes from TfidfVectorizer, so the values are TF-IDF weights.
print(bag_of_words)

  (0, 9)	0.43584673254990375
  (0, 3)	0.43584673254990375
  (0, 7)	0.43584673254990375
  (0, 1)	0.5528163151092931
  (0, 0)	0.3528554929793508
  (1, 9)	0.43584673254990375
  (1, 3)	0.43584673254990375
  (1, 7)	0.43584673254990375
  (1, 0)	0.3528554929793508
  (1, 6)	0.5528163151092931
  (2, 0)	0.6191395067937654
  (2, 8)	0.4850008395708102
  (2, 4)	0.3823802326982809
  (2, 10)	0.4850008395708102
  (3, 4)	0.5412799489419371
  (3, 2)	0.6865449812276998
  (3, 11)	0.3432724906138499
  (3, 5)	0.3432724906138499


In [20]:
# Densify the TF-IDF matrix and label its columns with the vocabulary terms
# (one row per document, one column per token).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns the terms in column order.
pd.DataFrame(bag_of_words.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,document,first,four,is,number,repeat,second,the,third,this,three,to
0,0.352855,0.552816,0.0,0.435847,0.0,0.0,0.0,0.435847,0.0,0.435847,0.0,0.0
1,0.352855,0.0,0.0,0.435847,0.0,0.0,0.552816,0.435847,0.0,0.435847,0.0,0.0
2,0.61914,0.0,0.0,0.0,0.38238,0.0,0.0,0.0,0.485001,0.0,0.485001,0.0
3,0.0,0.0,0.686545,0.0,0.54128,0.343272,0.0,0.0,0.0,0.0,0.0,0.343272


In [21]:
# Feature hashing: each token is hashed straight to one of `n_features` columns,
# so no vocabulary is learned or stored and the vectorizer needs no fitting.
from sklearn.feature_extraction.text import HashingVectorizer

# Only 8 buckets, so distinct tokens collide. With the library's default signed
# hashing, values can be negative and colliding tokens may cancel to a stored 0.0;
# rows are L2-normalised by default.
vectorizer = HashingVectorizer(n_features=8)
feature_vector = vectorizer.transform(corpus)  # stateless: transform alone is enough
print(feature_vector)

  (0, 0)	-0.8944271909999159
  (0, 5)	0.4472135954999579
  (0, 6)	0.0
  (1, 0)	-0.5773502691896258
  (1, 3)	0.5773502691896258
  (1, 5)	0.5773502691896258
  (1, 6)	0.0
  (2, 0)	-0.7559289460184544
  (2, 3)	0.3779644730092272
  (2, 5)	0.3779644730092272
  (2, 7)	0.3779644730092272
  (3, 0)	0.31622776601683794
  (3, 3)	0.31622776601683794
  (3, 5)	0.6324555320336759
  (3, 7)	0.6324555320336759
