In [1]:
# import CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# A tiny two-document corpus used throughout the notebook.

proverb = "The early bird gets the worm."
film_title = "Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb."

# Document order matters: row i of the encoded matrix corresponds to corpus[i].
corpus = [proverb, film_title]
corpus

['The early bird gets the worm.',
 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb.']

In [3]:
# Create a CountVectorizer with its default settings (no arguments passed).
# In the run captured in this notebook the defaults include lowercase=True and
# a token_pattern that only keeps tokens of two or more word characters, which
# is why all features are lower-case and the one-letter word "I" is dropped.

cv = CountVectorizer()
cv

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [4]:
# Fit the vectorizer on the corpus so it learns its vocabulary.
# fit() returns the vectorizer itself, so the cell displays its repr.

cv.fit(corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [5]:
# What are the features (tokens) that were found? They are listed in column
# order, i.e. position i is the token stored in column i of the encoding.
#
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and only fall back on older releases.
# .tolist() converts the returned NumPy array to a plain list of str so the
# displayed value looks the same on every version.

if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()
feature_names

['and',
 'bird',
 'bomb',
 'dr',
 'early',
 'gets',
 'how',
 'learned',
 'love',
 'or',
 'stop',
 'strangelove',
 'the',
 'to',
 'worm',
 'worrying']

In [6]:
# vocabulary_ maps each feature (token) to its integer column index
# in the encoded matrix, e.g. 'the' -> 12.

cv.vocabulary_

{'the': 12,
 'early': 4,
 'bird': 1,
 'gets': 5,
 'worm': 14,
 'dr': 3,
 'strangelove': 11,
 'or': 9,
 'how': 6,
 'learned': 7,
 'to': 13,
 'stop': 10,
 'worrying': 15,
 'and': 0,
 'love': 8,
 'bomb': 2}

In [7]:
# Look up the column index of a single feature. The keys are the lower-cased
# tokens; dict.get() returns None (rather than raising) for an unknown token.

cv.vocabulary_.get("strangelove")

11

In [8]:
# Now create the bag-of-words encoding: transform() returns a sparse
# document-term matrix with one row per document and one column per feature.

tv = cv.transform(corpus)
tv

<2x16 sparse matrix of type '<class 'numpy.int64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [9]:
# Shape of the encoding: (number of documents, number of features).

tv.shape

(2, 16)

In [10]:
# Printing the sparse matrix lists only its non-zero entries, one per line, as
# (document index, feature index) followed by the token count. print() is used
# on purpose: a bare `tv` would show only the summary repr.

print(tv)

(0, 1)	1
  (0, 4)	1
  (0, 5)	1
  (0, 12)	2
  (0, 14)	1
  (1, 0)	1
  (1, 2)	1
  (1, 3)	1
  (1, 6)	1
  (1, 7)	1
  (1, 8)	1
  (1, 9)	1
  (1, 10)	1
  (1, 11)	1
  (1, 12)	1
  (1, 13)	1
  (1, 15)	1


In [11]:
# Here is the encoding as a dense array: row i is document i and column j holds
# the count of feature j. Densifying is fine for a tiny example like this one.

tv.toarray()

array([[0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0],
       [1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1]], dtype=int64)