# Count Vectorizer basics

In [1]:
import sys
import re

import numpy as np
import scipy
import scipy.sparse as sp
import sklearn.linear_model
import sklearn.model_selection
import sklearn.pipeline
import sklearn.feature_extraction
import sklearn.datasets

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
print(f'sklearn.__version__={sklearn.__version__}')

sklearn.__version__=1.0.2


In [3]:
X_train = ['this is a set', 'of data to train a feature vector vectorizer']

In [4]:
simple_count_vectorizer = CountVectorizer()
simple_count_vectorizer.fit(X_train)

CountVectorizer()

In [5]:
A = ["this is a feature vector for this sentence"]
x = simple_count_vectorizer.transform(A)
x

<1x10 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [9]:
# An empty sentence produces an all-zero row: the saved output for this cell has
# 2 rows but still only 4 stored elements, the same as for the single sentence.
A = ["this is a feature vector for this sentence", ""]
x = simple_count_vectorizer.transform(A)
x

<2x10 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [25]:
# NOTE(review): the saved output of this cell reports 130107 columns, while the
# earlier transforms with the same vectorizer report 10. Presumably
# `simple_count_vectorizer` was refit after `X_train` was rebound to the
# 20 newsgroups data further down — re-run top to bottom to confirm.
A = ["this is a feature vector, for this sentence", "This is another sentence"]
x = simple_count_vectorizer.transform(A)
x

<2x130107 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [None]:
simple_count_vectorizer.vocabulary_

In [27]:
tokenizer_func = simple_count_vectorizer.build_tokenizer()
print('input string:', A[0])
print('tokenized string:', tokenizer_func(A[0]))

input string: this is a feature vector, for this sentence
tokenized string: ['this', 'is', 'feature', 'vector', 'for', 'this', 'sentence']


In [32]:
words_in_vocab = [w for w in tokenizer_func(A[0]) if w in simple_count_vectorizer.vocabulary_]
words_in_vocab

['this', 'is', 'feature', 'vector', 'for', 'this', 'sentence']

In [33]:
len(words_in_vocab), len(set(words_in_vocab))

(7, 6)

In [101]:
words_notin_vocab = [w for w in tokenizer_func(A[0]) if w not in simple_count_vectorizer.vocabulary_]
words_notin_vocab

[]

## Dataset loading

In [4]:
# Fetch each subset exactly once and reuse the returned Bunch, instead of
# calling fetch_20newsgroups again for every attribute we need.
newsgroups_train = sklearn.datasets.fetch_20newsgroups(subset="train")
newsgroups_test = sklearn.datasets.fetch_20newsgroups(subset="test")

# `X` is kept because a later cell reads `X.target_names`.
# fetch_20newsgroups() without arguments loads the "train" subset, so this is
# the same data the original bare call returned.
X = newsgroups_train

X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

#y_train = y_train.reshape(-1,1)
#y_test = y_test.reshape(-1,1)

In [5]:
x = X_train[0]

In [6]:
np.unique(y_train)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [46]:
X.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## Count Vectorizer hyperparameters

In [75]:
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,1))
count_vectorizer.fit(X_train)

CountVectorizer()

Note that many ngrams from the vocabulary might be quite meaningless

In [90]:
# Vocabulary terms in lexicographic order; display the first 20.
vocabulary = sorted(count_vectorizer.vocabulary_)
vocabulary[:20]

['00',
 '000',
 '0000',
 '00000',
 '000000',
 '00000000',
 '0000000004',
 '0000000005',
 '00000000b',
 '00000001',
 '00000001b',
 '0000000667',
 '00000010',
 '00000010b',
 '00000011',
 '00000011b',
 '0000001200',
 '00000074',
 '00000093',
 '000000e5']

We can clean the vocabulary by removing features that are too rare.

In [77]:
len(count_vectorizer.vocabulary_)

130107

#### `min_df`: control the minimum document frequency required to add a word to the vocabulary

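With `min_df` we can control the minimum number of documents a word has to appear in (its document frequency) in order to be included in the vocabulary. An integer value is an absolute document count; a float in `[0.0, 1.0]` is a proportion of the documents.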

In [87]:
count_vectorizer_min_df = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,1), min_df= 10)
count_vectorizer_min_df.fit(X_train)

CountVectorizer(min_df=10)

In [88]:
len(count_vectorizer_min_df.vocabulary_)

15593

In [91]:
# Vocabulary terms in lexicographic order; display the first 20.
vocabulary = sorted(count_vectorizer_min_df.vocabulary_)
vocabulary[:20]

['00',
 '000',
 '0002',
 '001',
 '0062',
 '01',
 '0111',
 '02',
 '0200',
 '02115',
 '02139',
 '02238',
 '03',
 '030',
 '0358',
 '0366',
 '04',
 '040',
 '0400',
 '05']

#### `max_features`: control max amount of features in the vocabulary


In [96]:
count_vectorizer_max_features = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,1), max_features= 10000)
count_vectorizer_max_features.fit(X_train)

CountVectorizer(max_features=10000)

In [97]:
len(count_vectorizer_max_features.vocabulary_)

10000

In [98]:
# Vocabulary terms in lexicographic order; display the first 20.
vocabulary = sorted(count_vectorizer_max_features.vocabulary_)
vocabulary[:20]

['00',
 '000',
 '005',
 '01',
 '02',
 '02238',
 '02p',
 '03',
 '030',
 '0358',
 '04',
 '040',
 '0400',
 '05',
 '06',
 '07',
 '08',
 '09',
 '0b',
 '0c']

## Count Vectorizer Options


In [129]:
corpus = ["I like chocolate, a lot!",
          "I like 1080p panels",
          "I love build-in speakers",
          "Z-Edge UG24 24-inch Curved Gaming Monitor 180Hz Refresh Rate, 1ms MPRT, FHD 1080p Gaming Monitor, R1650 Curved"]


The default `CountVectorizer` might not be doing exactly what we want.

A default `CountVectorizer` using the previous `corpus` will show the following properties

- (I) Words joined by '-' such as "build-in", "24-inch" or "Z-Edge" are split into two tokens.
- (II) Single-character words such as `I` are removed.
    

### Customising Vectoriser classes

- **preprocessor**: a callable that takes an entire document as input (as a single string), and returns a possibly transformed version of the document, still as an entire string. This can be used to remove HTML tags, lowercase the entire document, etc.


- **tokenizer**: a callable that takes the output from the preprocessor and splits it into tokens, then returns a list of these.


- **analyzer**: a callable that replaces the preprocessor and tokenizer. The default analyzers all call the preprocessor and tokenizer, but custom analyzers will skip this. N-gram extraction and stop word filtering take place at the analyzer level, so a custom analyzer may have to reproduce these steps.

#### `token_pattern`: control the regex to create tokens

We can control how tokens are generated with `token_pattern`

In [130]:
count_vectorizer = CountVectorizer()
count_vectorizer.fit(corpus)
count_vectorizer.vocabulary_

{'like': 12,
 'chocolate': 5,
 'lot': 13,
 '1080p': 0,
 'panels': 17,
 'love': 14,
 'build': 4,
 'in': 10,
 'speakers': 21,
 'edge': 7,
 'ug24': 22,
 '24': 3,
 'inch': 11,
 'curved': 6,
 'gaming': 9,
 'monitor': 15,
 '180hz': 1,
 'refresh': 20,
 'rate': 19,
 '1ms': 2,
 'mprt': 16,
 'fhd': 8,
 'r1650': 18}

In [131]:
token_pattern = r"\w+[\-\'\%\+\.\"]?\w*[\-\'\%\+\.\"]?\w*"
count_vectorizer = CountVectorizer(token_pattern=token_pattern)
count_vectorizer.fit(corpus)
count_vectorizer.vocabulary_

{'i': 10,
 'like': 11,
 'chocolate': 6,
 'a': 4,
 'lot': 12,
 '1080p': 0,
 'panels': 16,
 'love': 13,
 'build-in': 5,
 'speakers': 20,
 'z-edge': 22,
 'ug24': 21,
 '24-inch': 3,
 'curved': 7,
 'gaming': 9,
 'monitor': 14,
 '180hz': 1,
 'refresh': 19,
 'rate': 18,
 '1ms': 2,
 'mprt': 15,
 'fhd': 8,
 'r1650': 17}

#### `tokenizer`: control the tokenization process directly

We can control how tokens are generated by passing a custom `tokenizer` function.

- **Warning**: This tokenizer is really simple and generates tokens such as `lot!`.

In [137]:
def my_tokenizer(s):
    """Split ``s`` on runs of whitespace and return the pieces as a list.

    Nothing else is stripped, so punctuation stays attached to the
    neighbouring word (e.g. ``"lot!"``).
    """
    tokens = s.split()
    return tokens

In [140]:
count_vectorizer = CountVectorizer(tokenizer=my_tokenizer)
count_vectorizer.fit(corpus)
count_vectorizer.vocabulary_

{'i': 10,
 'like': 11,
 'chocolate,': 6,
 'a': 4,
 'lot!': 12,
 '1080p': 0,
 'panels': 17,
 'love': 13,
 'build-in': 5,
 'speakers': 21,
 'z-edge': 23,
 'ug24': 22,
 '24-inch': 3,
 'curved': 7,
 'gaming': 9,
 'monitor': 14,
 '180hz': 1,
 'refresh': 20,
 'rate,': 19,
 '1ms': 2,
 'mprt,': 16,
 'fhd': 8,
 'monitor,': 15,
 'r1650': 18}

#### `analyzer`: control how the ngrams are generated from an input string


In [4]:
count_vectorizer = CountVectorizer(ngram_range=(1,2))
tokenizer = count_vectorizer.build_tokenizer()
tokenizer('hello there I want')

['hello', 'there', 'want']

In [5]:
s = 'bosh drill, high quality, low vibration, bosh trust'
token_pattern = r"\w+[\-\'\%\+\.\"]?\w*[\-\'\%\+\.\"]?\w*"
count_vectorizer = CountVectorizer(ngram_range=(1,2), token_pattern = token_pattern)
analyzer = count_vectorizer.build_analyzer()
analyzer(s)

['bosh',
 'drill',
 'high',
 'quality',
 'low',
 'vibration',
 'bosh',
 'trust',
 'bosh drill',
 'drill high',
 'high quality',
 'quality low',
 'low vibration',
 'vibration bosh',
 'bosh trust']

In [6]:
def custom_analyzer(s, analyzer_from_count_vectorizer):
    """
    Split ``s`` into subsentences and analyze each one on its own.

    A subsentence boundary is one of ``. | , :`` or the full-width comma,
    provided the next character is not a digit (so numbers such as ``1.5``
    or ``1,000`` are not broken apart).

    Parameters
    ----------
    s : str
        The raw document.
    analyzer_from_count_vectorizer : callable
        Called once per subsentence; its return value is stored unchanged.

    Returns
    -------
    list
        One entry per subsentence, each being whatever the analyzer returned
        for it. Because subsentences are analyzed independently, the same
        feature can appear in several entries.
    """
    # The non-digit check is a lookahead so that the character after the
    # separator is NOT consumed: with a plain `\D`, "a,b" would lose the "b".
    subsentences = re.split(r'[\.\|\,\:\，](?=\D)', s)
    return [analyzer_from_count_vectorizer(subsentence) for subsentence in subsentences]

custom_analyzer(s, analyzer)

[['bosh', 'drill', 'bosh drill'],
 ['high', 'quality', 'high quality'],
 ['low', 'vibration', 'low vibration'],
 ['bosh', 'trust', 'bosh trust']]

In [7]:
%timeit  custom_analyzer(s, analyzer)

10.6 µs ± 43.5 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [8]:
s = '[great drill]: bosh drill, high quality, low vibration, bosh trust'

token_pattern = r"\w+[\-\'\%\+\.\"]?\w*[\-\'\%\+\.\"]?\w*"
count_vectorizer = CountVectorizer(ngram_range=(1,2), token_pattern = token_pattern)
# Build the analyzer from the vectorizer configured in this cell, so the
# `custom_analyzer(s, analyzer)` call does not silently depend on an
# `analyzer` variable left over from an earlier cell.
analyzer = count_vectorizer.build_analyzer()
# Subsentence boundary: one separator character followed by one whitespace character.
token_pattern_subsentences = r'[\，\,\|\:\/,\.\-\：\–]\s'
token_pattern_subsentences_compiled = re.compile(token_pattern_subsentences)

def custom_analyzer(s, analyzer_from_count_vectorizer):
    """Analyze every subsentence of ``s`` and return the features as one flat list.

    ``s`` is split with the module-level ``token_pattern_subsentences_compiled``
    regex; each piece is passed to ``analyzer_from_count_vectorizer`` and the
    items it yields are concatenated in order. Since pieces are analyzed
    independently, a feature may occur more than once in the result.
    """
    features = []
    for chunk in token_pattern_subsentences_compiled.split(s):
        features.extend(analyzer_from_count_vectorizer(chunk))
    return features

custom_analyzer(s, analyzer)

['great',
 'drill',
 'great drill',
 'bosh',
 'drill',
 'bosh drill',
 'high',
 'quality',
 'high quality',
 'low',
 'vibration',
 'low vibration',
 'bosh',
 'trust',
 'bosh trust']

In [9]:
%timeit custom_analyzer(s, analyzer)

13.7 µs ± 53.3 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
