In [6]:
from sklearn import feature_extraction

In [11]:


# Toy corpus: seven short documents (sentences lifted from the scikit-learn
# vectorizer documentation). Each list element is treated as one document.
corpus = [
    # 1 - what a count vectorizer does
    'Convert a collection of text documents to a matrix of token occurrences',
    # 2 - what a hashing vectorizer does
    'It turns a collection of text documents into a scipy.sparse matrix holding token occurrence counts (or binary occurrence information), possibly normalized as token frequencies if norm=’l1’ or projected on the euclidean unit sphere if norm=’l2’.',
    # 3 - the hashing trick
    'This text vectorizer implementation uses the hashing trick to find the token string name to feature integer index mapping.',
    # 4-7 - advantages of the hashing strategy
    'This strategy has several advantages:',
    'it is very low memory scalable to large datasets as there is no need to store a vocabulary dictionary in memory',
    'it is fast to pickle and un-pickle as it holds no state besides the constructor parameters',
    'it can be used in a streaming (partial fit) or parallel pipeline as there is no state computed during fit.',
]

# Report how many documents are about to be vectorized.
n_documents = len(corpus)
print('Processing corpus: {} documents'.format(n_documents))

# --- CountVectorizer: bag-of-words backed by an explicit, learned vocabulary ---
print('Count Vectorizer:\n')
vectorizer = feature_extraction.text.CountVectorizer()

# fit_transform learns the vocabulary from the corpus and returns the sparse
# document-term count matrix in a single step.
X = vectorizer.fit_transform(corpus)

# The fitted vectorizer keeps a dictionary mapping every token it has seen to
# the column index that token occupies in the matrix.
print(vectorizer.vocabulary_)

# Rows are documents, columns are vocabulary entries.
print('Resulting matrix has {} data points and {} features.\n'.format(*X.shape))

# Densify only the first row so the per-token counts can be read directly.
first_document_counts = X[0].toarray()
print('Document 1: \n{}'.format(first_document_counts))
# Caveat: as the number of distinct words increases, this dictionary (and the
# number of columns) has to grow with it.


print('Hashing Vectorizer:\n')

# HashingVectorizer maps each token to a column by hashing it instead of
# looking it up in a learned vocabulary, so nothing has to be stored.
#
# norm=None means we don't normalize the values: they stay raw term counts.
# alternate_sign=False disables the sign flipping that the hasher otherwise
#   applies to some features (it exists to approximately conserve the inner
#   product in the hashed space); with it off every stored value is >= 0.
#
# NOTE: this rebinds `vectorizer` and `X`, so the CountVectorizer objects
#   created earlier in this cell are no longer reachable after this point.
vectorizer = feature_extraction.text.HashingVectorizer(
    norm=None, alternate_sign=False)
# transform, not fit_transform: the hashing vectorizer is stateless, so there
# is nothing to learn from the corpus first.
X = vectorizer.transform(corpus)

# Safety net for the settings above: without sign alternation no entry of
# the matrix may be negative.
assert X.min() >= 0, 'alternate_sign=False should yield non-negative values'

print('Resulting matrix has {} data points and {} features.\n'.format(
    X.shape[0], X.shape[1]))

# > Resulting matrix has 7 data points and 1048576 features.
#   (1048576 == 2 ** 20, the default n_features; it does not depend on the
#   corpus size.)

print('Document 1: \n{}'.format(X[0]))

# Expected output with norm=None and alternate_sign=False (raw, non-negative
# counts; feature indices as recorded in an earlier run of this cell):
# Document 1: 
#   (0, 22468)	1.0
#   (0, 124863)	1.0
#   (0, 164975)	1.0
#   (0, 174171)	1.0
#   (0, 264705)	1.0
#   (0, 479532)	2.0
#   (0, 548700)	1.0
#   (0, 676585)	1.0
#   (0, 741852)	1.0
#   Read the above as:
#   (document_index, feature_index)	value
#   The single 2.0 is the one token that occurs twice in document 1 ('of').
#   With the default settings (norm='l2', alternate_sign=True) the same
#   entries would instead be signed, L2-normalized floats such as
#   0.2886751345948129 / -0.2886751345948129 / 0.5773502691896258.


Processing corpus: 7 documents
Count Vectorizer:

{'convert': 10, 'collection': 7, 'of': 49, 'text': 69, 'documents': 14, 'to': 73, 'matrix': 40, 'token': 74, 'occurrences': 48, 'it': 34, 'turns': 76, 'into': 32, 'scipy': 60, 'sparse': 62, 'holding': 24, 'occurrence': 47, 'counts': 11, 'or': 51, 'binary': 5, 'information': 30, 'possibly': 57, 'normalized': 46, 'as': 2, 'frequencies': 21, 'if': 26, 'norm': 45, 'l1': 35, 'projected': 58, 'on': 50, 'the': 70, 'euclidean': 16, 'unit': 78, 'sphere': 63, 'l2': 36, 'this': 72, 'vectorizer': 81, 'implementation': 27, 'uses': 80, 'hashing': 23, 'trick': 75, 'find': 19, 'string': 68, 'name': 42, 'feature': 18, 'integer': 31, 'index': 29, 'mapping': 39, 'strategy': 66, 'has': 22, 'several': 61, 'advantages': 0, 'is': 33, 'very': 82, 'low': 38, 'memory': 41, 'scalable': 59, 'large': 37, 'datasets': 12, 'there': 71, 'no': 44, 'need': 43, 'store': 65, 'vocabulary': 83, 'dictionary': 13, 'in': 28, 'fast': 17, 'pickle': 55, 'and': 1, 'un': 77, 'holds'