In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

## Count specific terms (*vocabulary*) in documents

In [4]:
# Count only the listed terms; the resulting columns follow this order.
cv1 = CountVectorizer(
    vocabulary=['hot', 'cold', 'winter'],
)

In [9]:
# `fit_transform` yields a sparse matrix;
# `toarray` converts it into a dense numpy array (one row per document).
docs = [
    'hot hot hot',
    'i am cold at winter',
    'i am cold and hot',
]
term_frequency_per_doc = cv1.fit_transform(docs).toarray()
term_frequency_per_doc

array([[3, 0, 0],
       [0, 1, 1],
       [1, 1, 0]])

In [10]:
# Label each count column with its vocabulary term.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is the supported replacement.
df1 = pd.DataFrame(term_frequency_per_doc, columns=cv1.get_feature_names_out())
df1

Unnamed: 0,hot,cold,winter
0,3,0,0
1,0,1,1
2,1,1,0


## Find the most frequent words in text

In [11]:
# Sample text whose word frequencies are counted below
text = "one one four two two two three"

In [12]:
# No vocabulary is passed here, so the terms are taken from the fitted text
cv2 = CountVectorizer()

In [13]:
# Treat the text as a one-document corpus, then densify the sparse counts
sparse_counts = cv2.fit_transform([text])
tf = sparse_counts.toarray()
tf

array([[1, 2, 1, 3]], dtype=int64)

In [14]:
# Display the words found in the text.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` returns a numpy array, so convert it to a plain list.
feature_names = cv2.get_feature_names_out().tolist()
feature_names

['four', 'one', 'three', 'two']

In [15]:
# Single-row frame: one column per word, the row holds that word's frequency
count = pd.DataFrame(
    tf,
    index=['word freq'],
    columns=feature_names,
)
count

Unnamed: 0,four,one,three,two
word freq,1,2,1,3


In [16]:
# Flip words into rows, then order them from most to least frequent
count.transpose().sort_values(by='word freq', ascending=False)

Unnamed: 0,word freq
two,3
one,2
four,1
three,1
