In [1]:
%load_ext lab_black

In [2]:
import re
from functools import reduce

### Preprocessing

In [3]:
# Toy corpus: each string in the list is one document.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

In [4]:
# Strip every character that is neither a word character nor whitespace,
# then lowercase what remains of each document.
formatter_pattern = re.compile(r"[^\w\s]")
formatted_corpus = [
    formatter_pattern.sub("", document).lower() for document in corpus
]
formatted_corpus

['this is the first document',
 'this document is the second document',
 'and this is the third one',
 'is this the first document']

### Tokenization

In [5]:
# A token is a run of two or more word characters between word boundaries,
# so single-character words are not kept.
tokenizer_pattern = re.compile(r"\b\w\w+\b")
tokenized_corpus = list(map(tokenizer_pattern.findall, formatted_corpus))
tokenized_corpus

[['this', 'is', 'the', 'first', 'document'],
 ['this', 'document', 'is', 'the', 'second', 'document'],
 ['and', 'this', 'is', 'the', 'third', 'one'],
 ['is', 'this', 'the', 'first', 'document']]

### Vocabulary Alphabetized

In [6]:
def get_vocabulary_alphabetized(tokenized_corpus):
    """Build a token-to-index vocabulary numbered in sorted token order.

    Parameters
    ----------
    tokenized_corpus : iterable of iterables
        One iterable of tokens per document. Tokens must be hashable and
        mutually comparable, since they are de-duplicated and sorted.

    Returns
    -------
    dict
        Maps each distinct token to its 0-based position in the sorted
        sequence of distinct tokens. Empty when the corpus has no tokens.
    """
    # One union call adds every document's tokens to a single set, instead of
    # rebuilding the accumulated set once per document.
    unique_tokens = set().union(*tokenized_corpus)

    return {token: idx for idx, token in enumerate(sorted(unique_tokens))}

In [7]:
# Map each distinct token of the tokenized corpus to its index in sorted order.
vocabulary = get_vocabulary_alphabetized(tokenized_corpus)
vocabulary

{'and': 0,
 'document': 1,
 'first': 2,
 'is': 3,
 'one': 4,
 'second': 5,
 'the': 6,
 'third': 7,
 'this': 8}

### Hashing (token → vocabulary index lookup)

In [8]:
def hash_with_vocabulary(tokenized_corpus, vocabulary):
    """Replace every token with the value stored for it in ``vocabulary``.

    Parameters
    ----------
    tokenized_corpus : iterable of iterables
        One iterable of tokens per document.
    vocabulary : mapping
        Looked up as ``vocabulary[token]`` for every token, so a token that
        is missing from a plain dict raises ``KeyError``.

    Returns
    -------
    list of list
        One list per document, in corpus order, holding the looked-up value
        of each token in its original position.
    """
    return [
        [vocabulary[word] for word in document]
        for document in tokenized_corpus
    ]

In [9]:
# Replace every token with its vocabulary index, document by document.
hashed_corpus = hash_with_vocabulary(tokenized_corpus, vocabulary)
hashed_corpus

[[8, 3, 6, 2, 1], [8, 1, 3, 6, 5, 1], [0, 8, 3, 6, 7, 4], [3, 8, 6, 2, 1]]

### Vectorization

In [10]:
def vectorize(hashed_corpus, max_idx):
    """Turn documents of token indices into per-document count vectors.

    Parameters
    ----------
    hashed_corpus : iterable of iterables of int
        One iterable of token indices per document.
    max_idx : int
        Every vector has ``max_idx + 1`` slots; an index above ``max_idx``
        raises ``IndexError``.

    Returns
    -------
    list of list of int
        One vector per document, in corpus order. Slot ``i`` holds how many
        times index ``i`` occurs in that document.
    """

    def count_indices(indices):
        # Fresh zero vector per document, then one increment per occurrence.
        counts = [0 for _ in range(max_idx + 1)]
        for index in indices:
            counts[index] += 1
        return counts

    return [count_indices(indices) for indices in hashed_corpus]

In [11]:
# Largest index in the vocabulary; vectorize() sizes each vector as max_idx + 1.
max_idx = max(vocabulary.values())
max_idx

8

In [12]:
# Count matrix: one row per document, one column per vocabulary index.
X = vectorize(hashed_corpus, max_idx)
X

[[0, 1, 1, 1, 0, 0, 1, 0, 1],
 [0, 2, 0, 1, 0, 1, 1, 0, 1],
 [1, 0, 0, 1, 1, 0, 1, 1, 1],
 [0, 1, 1, 1, 0, 0, 1, 0, 1]]