In [1]:
%load_ext lab_black

In [2]:
import re

### Preprocessing

In [3]:
# Toy corpus: four short English sentences, one string per document.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

In [4]:
# Remove every character that is neither a word character nor whitespace
# (i.e. punctuation), then lowercase what is left of each document.
formatter_pattern = re.compile(r"[^\w\s]")
formatted_corpus = [
    formatter_pattern.sub("", text).lower() for text in corpus
]
formatted_corpus

['this is the first document',
 'this document is the second document',
 'and this is the third one',
 'is this the first document']

### Tokenization

In [5]:
# A token is a run of two or more word characters delimited by word
# boundaries, so single-character words are not kept.
tokenizer_pattern = re.compile(r"\b\w\w+\b")
tokenized_corpus = list(map(tokenizer_pattern.findall, formatted_corpus))
tokenized_corpus

[['this', 'is', 'the', 'first', 'document'],
 ['this', 'document', 'is', 'the', 'second', 'document'],
 ['and', 'this', 'is', 'the', 'third', 'one'],
 ['is', 'this', 'the', 'first', 'document']]

### Vocabulary

In [6]:
def get_vocabulary(tokenized_corpus):
    """Assign an integer index to every distinct token, in first-seen order.

    Parameters
    ----------
    tokenized_corpus : iterable of iterables
        One inner iterable of hashable tokens per document.

    Returns
    -------
    dict
        Maps token -> index. Indices start at 0 and grow by one each time a
        token is encountered for the first time.
    """
    token_to_index = {}
    for document_tokens in tokenized_corpus:
        for word in document_tokens:
            # len() is evaluated before the insertion, so it is exactly the
            # next unused index; tokens already present are left untouched.
            token_to_index.setdefault(word, len(token_to_index))
    return token_to_index

In [7]:
# Build the token -> index mapping from the tokenized documents and display it.
vocabulary = get_vocabulary(tokenized_corpus)
vocabulary

{'this': 0,
 'is': 1,
 'the': 2,
 'first': 3,
 'document': 4,
 'second': 5,
 'and': 6,
 'third': 7,
 'one': 8}

### Hashing

In [8]:
def hash_with_vocabulary(tokenized_corpus, vocabulary):
    """Replace each token with the index stored for it in ``vocabulary``.

    Parameters
    ----------
    tokenized_corpus : iterable of iterables
        One inner iterable of tokens per document.
    vocabulary : mapping
        Maps token -> index.

    Returns
    -------
    list of list
        One list of indices per document, in the original token order.

    Raises
    ------
    KeyError
        If a token is not a key of ``vocabulary``.
    """
    return [
        [vocabulary[word] for word in document_tokens]
        for document_tokens in tokenized_corpus
    ]

In [9]:
# Replace each token by its vocabulary index, document by document.
hashed_corpus = hash_with_vocabulary(tokenized_corpus, vocabulary)
hashed_corpus

[[0, 1, 2, 3, 4], [0, 4, 1, 2, 5, 4], [6, 0, 1, 2, 7, 8], [1, 0, 2, 3, 4]]

### Vectorization

In [10]:
def vectorize(hashed_corpus, max_idx):
    """Turn index-encoded documents into rows of per-index occurrence counts.

    Parameters
    ----------
    hashed_corpus : iterable of iterables of int
        One inner iterable of token indices per document.
    max_idx : int
        Largest valid token index; every row has ``max_idx + 1`` entries.

    Returns
    -------
    list of list of int
        One row per document; entry ``i`` is how many times index ``i``
        occurs in that document.

    Raises
    ------
    IndexError
        If a token index is negative or greater than ``max_idx``.
    """
    width = max_idx + 1
    matrix = []

    for hashed_document in hashed_corpus:
        vector = [0] * width

        for hashed_token in hashed_document:
            # Python list indexing accepts negative positions, so without this
            # guard an index such as -1 would be counted in the last column
            # instead of being rejected like any other out-of-range index.
            if hashed_token < 0:
                raise IndexError(
                    "token index {} is negative".format(hashed_token)
                )
            vector[hashed_token] += 1

        matrix.append(vector)

    return matrix

In [11]:
# Highest index stored in the vocabulary; vectorize() sizes each row as
# max_idx + 1. default=-1 keeps an empty vocabulary from raising ValueError
# in max() and results in zero-width rows instead.
max_idx = max(vocabulary.values(), default=-1)
max_idx

8

In [12]:
# Count matrix: one row per document, one column per vocabulary index.
X = vectorize(hashed_corpus, max_idx)
X

[[1, 1, 1, 1, 1, 0, 0, 0, 0],
 [1, 1, 1, 0, 2, 1, 0, 0, 0],
 [1, 1, 1, 0, 0, 0, 1, 1, 1],
 [1, 1, 1, 1, 1, 0, 0, 0, 0]]