In [1]:
# import CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# create the corpus

# six short English idioms; each string is one document of the toy corpus
corpus = [
    "The early bird gets the worm.",
    "In the heat of the moment",
    "Cross that bridge when you come to it",
    "Don't put all your eggs in one basket",
    "The ball is in your court",
    "Your guess is as good as mine",
]
# bare expression so the notebook displays the list
corpus

['The early bird gets the worm.',
 'In the heat of the moment',
 'Cross that bridge when you come to it',
 "Don't put all your eggs in one basket",
 'The ball is in your court',
 'Your guess is as good as mine']

In [3]:
# create the count vectorizer, learn the vocabulary from the corpus,
# and encode each document as a row of term counts
count_vectorizer = CountVectorizer()
freq_term_matrix = count_vectorizer.fit_transform(corpus)

In [4]:
# examine the vocabulary: it maps each lower-cased token to its column index
# in the count matrix (note that "Don't" is tokenized to just 'don')

count_vectorizer.vocabulary_

{'the': 25,
 'early': 10,
 'bird': 4,
 'gets': 12,
 'worm': 28,
 'in': 16,
 'heat': 15,
 'of': 21,
 'moment': 20,
 'cross': 8,
 'that': 24,
 'bridge': 5,
 'when': 27,
 'you': 29,
 'come': 6,
 'to': 26,
 'it': 18,
 'don': 9,
 'put': 23,
 'all': 0,
 'your': 30,
 'eggs': 11,
 'one': 22,
 'basket': 3,
 'ball': 2,
 'is': 17,
 'court': 7,
 'guess': 14,
 'as': 1,
 'good': 13,
 'mine': 19}

In [5]:
# one row per document in the corpus, one column per vocabulary term
freq_term_matrix.shape

(6, 31)

In [6]:
# and here is the bag-of-words encoding as a dense array:
# row i holds the term counts for corpus[i], columns follow vocabulary_ indices

freq_term_matrix.toarray()

array([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 2, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
        0, 0, 0, 2, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 0, 1, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 1],
       [0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=int64)

In [7]:
# now let's transform this to tf-idf

from sklearn.feature_extraction.text import TfidfTransformer

In [8]:
# create the TF-IDF transformer (no arguments passed, so library defaults apply)

tfidf_transformer = TfidfTransformer()

In [9]:
# fit the transformer on the term-count matrix and convert the counts to TF-IDF weights

tfidf_vector = tfidf_transformer.fit_transform(freq_term_matrix)

In [10]:
# same dimensions as the count matrix: one row per document, one column per term
tfidf_vector.shape

(6, 31)

In [11]:
# dense view of the TF-IDF matrix; the integer counts are now float weights
tfidf_vector.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.41109519,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.41109519, 0.        , 0.41109519, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.56921261, 0.        , 0.        , 0.41109519, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.43047171, 0.29802091, 0.        , 0.        , 0.        ,
        0.43047171, 0.43047171, 0.        , 0.        , 0.        ,
        0.59604182, 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.35355339, 0.35355339, 0.        , 0.35355339, 0.        ,
      

In [12]:
# this class is equivalent to CountVectorizer + TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

In [13]:
# fit TfidfVectorizer on the raw text in one step (this rebinds tfidf_vector);
# the vocabulary displayed here matches the CountVectorizer vocabulary shown earlier
tfidf_vector = tfidf_vectorizer.fit_transform(corpus)
tfidf_vectorizer.vocabulary_

{'the': 25,
 'early': 10,
 'bird': 4,
 'gets': 12,
 'worm': 28,
 'in': 16,
 'heat': 15,
 'of': 21,
 'moment': 20,
 'cross': 8,
 'that': 24,
 'bridge': 5,
 'when': 27,
 'you': 29,
 'come': 6,
 'to': 26,
 'it': 18,
 'don': 9,
 'put': 23,
 'all': 0,
 'your': 30,
 'eggs': 11,
 'one': 22,
 'basket': 3,
 'ball': 2,
 'is': 17,
 'court': 7,
 'guess': 14,
 'as': 1,
 'good': 13,
 'mine': 19}

In [14]:
# same (documents, terms) shape as the TfidfTransformer result earlier

tfidf_vector.shape

(6, 31)

In [15]:
# idf values for the vocabulary: one weight per term column, in column-index order

tfidf_vectorizer.idf_

array([2.25276297, 2.25276297, 2.25276297, 2.25276297, 2.25276297,
       2.25276297, 2.25276297, 2.25276297, 2.25276297, 2.25276297,
       2.25276297, 2.25276297, 2.25276297, 2.25276297, 2.25276297,
       2.25276297, 1.55961579, 1.84729786, 2.25276297, 2.25276297,
       2.25276297, 2.25276297, 2.25276297, 2.25276297, 2.25276297,
       1.55961579, 2.25276297, 2.25276297, 2.25276297, 2.25276297,
       1.55961579])

In [16]:
# pair each vocabulary term with its IDF weight using a zip;
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use get_feature_names_out(), which returns the terms in column order
# (the same order as idf_)

word_idf = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_))
word_idf

{'all': 2.252762968495368,
 'as': 2.252762968495368,
 'ball': 2.252762968495368,
 'basket': 2.252762968495368,
 'bird': 2.252762968495368,
 'bridge': 2.252762968495368,
 'come': 2.252762968495368,
 'court': 2.252762968495368,
 'cross': 2.252762968495368,
 'don': 2.252762968495368,
 'early': 2.252762968495368,
 'eggs': 2.252762968495368,
 'gets': 2.252762968495368,
 'good': 2.252762968495368,
 'guess': 2.252762968495368,
 'heat': 2.252762968495368,
 'in': 1.5596157879354227,
 'is': 1.8472978603872037,
 'it': 2.252762968495368,
 'mine': 2.252762968495368,
 'moment': 2.252762968495368,
 'of': 2.252762968495368,
 'one': 2.252762968495368,
 'put': 2.252762968495368,
 'that': 2.252762968495368,
 'the': 1.5596157879354227,
 'to': 2.252762968495368,
 'when': 2.252762968495368,
 'worm': 2.252762968495368,
 'you': 2.252762968495368,
 'your': 1.5596157879354227}

In [17]:
# order the (word, idf) pairs from the lowest IDF weight to the highest,
# breaking ties alphabetically by word; these are IDF weights, not TF-IDF scores

sorted(word_idf.items(), key=lambda pair: pair[::-1])  # reversed pair == (idf, word)

[('in', 1.5596157879354227),
 ('the', 1.5596157879354227),
 ('your', 1.5596157879354227),
 ('is', 1.8472978603872037),
 ('all', 2.252762968495368),
 ('as', 2.252762968495368),
 ('ball', 2.252762968495368),
 ('basket', 2.252762968495368),
 ('bird', 2.252762968495368),
 ('bridge', 2.252762968495368),
 ('come', 2.252762968495368),
 ('court', 2.252762968495368),
 ('cross', 2.252762968495368),
 ('don', 2.252762968495368),
 ('early', 2.252762968495368),
 ('eggs', 2.252762968495368),
 ('gets', 2.252762968495368),
 ('good', 2.252762968495368),
 ('guess', 2.252762968495368),
 ('heat', 2.252762968495368),
 ('it', 2.252762968495368),
 ('mine', 2.252762968495368),
 ('moment', 2.252762968495368),
 ('of', 2.252762968495368),
 ('one', 2.252762968495368),
 ('put', 2.252762968495368),
 ('that', 2.252762968495368),
 ('to', 2.252762968495368),
 ('when', 2.252762968495368),
 ('worm', 2.252762968495368),
 ('you', 2.252762968495368)]

In [18]:
# dense TF-IDF matrix from TfidfVectorizer; the rows shown match the
# TfidfTransformer output displayed earlier
tfidf_vector.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.41109519,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.41109519, 0.        , 0.41109519, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.56921261, 0.        , 0.        , 0.41109519, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.43047171, 0.29802091, 0.        , 0.        , 0.        ,
        0.43047171, 0.43047171, 0.        , 0.        , 0.        ,
        0.59604182, 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.35355339, 0.35355339, 0.        , 0.35355339, 0.        ,
      