In [35]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [36]:
# Toy corpus: four short documents used throughout the notebook.
corpus = [
    "this is the first document",
    "this is the second document or second sentence",
    "and this is the third one",
    "is this the first document or document one",
]

In [37]:
# Instantiate CountVectorizer with no arguments, i.e. all default settings.
cv = CountVectorizer()
# Bare expression: display the estimator's repr as the cell output.
cv

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [38]:
# Fit the vectorizer on the corpus and generate the word-count matrix.
word_count_vect = cv.fit_transform(corpus)
# Print the non-zero entries of the sparse matrix.
print(word_count_vect)
# The output below lists the count for each (x, y) pair:
#   x is the index of a document and y is the index of a specific word/feature;
#   the value is the number of times word y occurs in document x.

  (0, 10)	1
  (0, 3)	1
  (0, 8)	1
  (0, 2)	1
  (0, 1)	1
  (1, 10)	1
  (1, 3)	1
  (1, 8)	1
  (1, 1)	1
  (1, 6)	2
  (1, 5)	1
  (1, 7)	1
  (2, 10)	1
  (2, 3)	1
  (2, 8)	1
  (2, 0)	1
  (2, 9)	1
  (2, 4)	1
  (3, 10)	1
  (3, 3)	1
  (3, 8)	1
  (3, 2)	1
  (3, 1)	2
  (3, 5)	1
  (3, 4)	1


In [39]:
# Shape of the count matrix: (number of documents, number of features).
print(word_count_vect.shape) # 4 docs, 11 features

(4, 11)


In [40]:
# Convert the sparse count matrix to a dense array so that every cell,
# including the zeros, is visible.
cv_matrix = word_count_vect.toarray()
cv_matrix

array([[0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 1, 2, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1],
       [0, 2, 1, 1, 1, 1, 0, 0, 1, 0, 1]])

In [41]:
# Get all unique words (features) in the corpus, in column order.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older installs.
if hasattr(cv, "get_feature_names_out"):
    # The new API returns an ndarray; convert to a plain list of str so that
    # `vocab` keeps the same type and display as with the legacy method.
    vocab = cv.get_feature_names_out().tolist()
else:
    vocab = cv.get_feature_names()
vocab

['and',
 'document',
 'first',
 'is',
 'one',
 'or',
 'second',
 'sentence',
 'the',
 'third',
 'this']

In [42]:
# Bag-of-words document feature vectors: one row per document, one column
# per vocabulary word, each cell holding the raw count.
pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,and,document,first,is,one,or,second,sentence,the,third,this
0,0,1,1,1,0,0,0,0,1,0,1
1,0,1,0,1,0,1,2,1,1,0,1
2,1,0,0,1,1,0,0,0,1,1,1
3,0,2,1,1,1,1,0,0,1,0,1


In [43]:
# Compute TF-IDF weights from the raw counts with TfidfTransformer.
tf_idf = TfidfTransformer(use_idf=True)
# Fit and transform in one step, then densify right away so that
# `tf_idf_matrix` is only ever bound to the dense array.
tf_idf_matrix = tf_idf.fit_transform(cv_matrix).toarray()
tf_idf_matrix

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.        , 0.        , 0.38408524, 0.        ,
        0.38408524],
       [0.        , 0.24394892, 0.        , 0.19944423, 0.        ,
        0.30132545, 0.76438624, 0.38219312, 0.19944423, 0.        ,
        0.19944423],
       [0.53927767, 0.        , 0.        , 0.28141746, 0.42517271,
        0.        , 0.        , 0.        , 0.28141746, 0.53927767,
        0.28141746],
       [0.        , 0.61480604, 0.37970389, 0.25132211, 0.37970389,
        0.37970389, 0.        , 0.        , 0.25132211, 0.        ,
        0.25132211]])

In [44]:
# TF-IDF document feature vectors, labelled by vocabulary word and rounded
# to 4 decimal places for display.
pd.DataFrame(tf_idf_matrix, columns=vocab).round(4)

Unnamed: 0,and,document,first,is,one,or,second,sentence,the,third,this
0,0.0,0.4698,0.5803,0.3841,0.0,0.0,0.0,0.0,0.3841,0.0,0.3841
1,0.0,0.2439,0.0,0.1994,0.0,0.3013,0.7644,0.3822,0.1994,0.0,0.1994
2,0.5393,0.0,0.0,0.2814,0.4252,0.0,0.0,0.0,0.2814,0.5393,0.2814
3,0.0,0.6148,0.3797,0.2513,0.3797,0.3797,0.0,0.0,0.2513,0.0,0.2513
