In [4]:
# Transforming words into feature vectors 

# Bag-of-words model: allows us to represent text as numerical feature vectors.
# 1. Create a vocabulary of unique tokens (words) from the entire set of documents.
# 2. Construct a feature vector for each document that contains the counts of how
#    often each word occurs in that particular document.

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer 
# Three example documents. The third one is deliberately a single string:
# it is written as one literal (including the space after the comma) so it
# cannot be mistaken for two list items with a forgotten comma between them.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two',
])

# Learn the vocabulary from the three documents and transform them into a
# sparse matrix of word counts (one row per document, one column per token).
count = CountVectorizer()
bag = count.fit_transform(docs)

# vocabulary_ maps each learned token to its column index in `bag`.
print(count.vocabulary_)


{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [5]:
# Convert the sparse count matrix to a dense array so that the raw
# per-document word counts can be inspected row by row.
dense_counts = bag.toarray()
print(dense_counts)

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [6]:
#Assessing word relevancy via term frequency-inverse document frequency

from sklearn.feature_extraction.text import TfidfTransformer
# norm='l2' applies L2-normalization, i.e. every document's tf-idf vector is
# rescaled to (Euclidean) length 1.
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)

# Print arrays with two decimals. NOTE: this is a global NumPy setting and
# stays in effect for everything printed after this point.
np.set_printoptions(precision=2)

# Reuse the count matrix `bag` computed from `docs` earlier instead of
# re-fitting the CountVectorizer a second time on the same documents.
tfidf_matrix = tfidf.fit_transform(bag)
print(tfidf_matrix.toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]
