In [3]:
import string

import numpy as np
import matplotlib.pyplot as plt

In [10]:
# Sample corpus: a short passage whose phrases repeat several times, so the
# same words show up in more than one context window. The leading and
# trailing newlines of the literal are harmless because tokenization splits
# on whitespace.
corpus = """
The cat and her kittens, they put on their mittens,
To eat a Christmas pie. The cat and her kittens, they put on their mittens,
To eat a Christmas pie. The cat and her kittens, they put on their mittens.
"""

In [67]:
def tokenize(corpus):
    """Split raw text into lowercase word tokens with punctuation removed.

    Parameters
    ----------
    corpus : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Whitespace-separated tokens, lowercased, with every ASCII
        punctuation character (``string.punctuation``) deleted. Empty or
        whitespace-only input yields ``[]``.
    """
    # Delete *all* ASCII punctuation, not only ',' and '.', so that e.g.
    # "pie!" and "pie" end up as the same vocabulary entry.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    # lower-case, drop punctuation, then split on any run of whitespace
    words = corpus.lower().translate(strip_punctuation).split()
    return words

# tokenize the corpus
tokens = tokenize(corpus)
# vocabulary (set of unique words)
vocab = set(tokens)
# vocabulary size
vocab_size = len(vocab)

# Assign ids in sorted order. Iterating the set directly follows hash order,
# and string hashing is randomized per process, so the token <-> id mapping
# (and therefore every one-hot vector) would change on each kernel restart.
id2token = dict(enumerate(sorted(vocab)))
token2id = {t: i for i, t in id2token.items()}

def encode(token, token2id):
    """Return the one-hot vector for ``token``.

    Parameters
    ----------
    token : hashable
        Token to encode; must be a key of ``token2id``.
    token2id : dict
        Mapping from token to integer index. The indices are expected to
        lie in ``range(len(token2id))``.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``len(token2id)`` holding 1 at the
        token's index and 0 everywhere else.

    Raises
    ------
    KeyError
        If ``token`` is not present in ``token2id``.
    """
    # Size the vector from the mapping that was passed in rather than from a
    # notebook-level global, so the function only depends on its arguments.
    vector = np.zeros(len(token2id), dtype=int)
    vector[token2id[token]] = 1
    return vector


# report how many distinct tokens the corpus contains
print(f"vocab size: {vocab_size}")

vocab size: 15


```text
    objective:
        given the context words, predict the center word.
        training the neural net on this objective should, as a by-product,
        build word representations (word embeddings).

    preparing the training data:
        [0, 0, 1, 0, ..., 0] -> 'context word(i-2)'
        [0, 1, 0, 0, ..., 0] -> 'context word(i-1)'
        [1, 0, 0, 0, ..., 0] -> 'context word(i+1)'
        [0, 0, 0, 0, ..., 1] -> 'context word(i+2)'

        combined vectors (element-wise sum of the context vectors above):
        [1, 1, 1, 0, ..., 1] -> 'context words' # this is going to be the input

        center word (one-hot, like each context word):
        [0, 0, 0, 1, ..., 0] -> 'center word' # to be predicted

        NOTE: values in vectors just for demo

```

In [57]:
def build_data(tokens, window_size=2):
    """Build CBOW training pairs (context vector, center vector) from tokens.

    For every position ``i`` the center word is one-hot encoded, and the
    one-hot vectors of the words at most ``window_size`` positions to the
    left and right (excluding position ``i`` itself) are summed into a
    single context vector. A word that occurs twice inside one window
    therefore contributes 2 at its index.

    Relies on the notebook-level ``token2id`` mapping and ``encode`` helper.

    Parameters
    ----------
    tokens : sequence of str
        Tokenized corpus; every token must be a key of ``token2id``.
    window_size : int, default 2
        Number of neighbours taken on each side of the center word.

    Returns
    -------
    contexts : numpy.ndarray
        Integer array of shape ``(len(tokens), len(token2id))``.
    centers : numpy.ndarray
        Integer array of the same shape, one one-hot row per token.
    """
    n_tokens = len(tokens)
    vocab_len = len(token2id)

    # Preallocate both outputs. A center word with no neighbours (one-token
    # corpus, window_size=0) then keeps an all-zero row of the right width,
    # instead of the scalar 0.0 that np.sum([]) would produce.
    contexts = np.zeros((n_tokens, vocab_len), dtype=int)
    centers = np.zeros((n_tokens, vocab_len), dtype=int)

    # encode every token exactly once; the rows double as context one-hots
    for i in range(n_tokens):
        centers[i] = encode(tokens[i], token2id)

    for i in range(n_tokens):
        # window bounds clipped to the corpus edges
        lo = max(0, i - window_size)
        hi = min(n_tokens, i + window_size + 1)
        for j in range(lo, hi):
            if j != i:
                # combine context words into a single vector
                contexts[i] += centers[j]

    return contexts, centers

In [68]:
# run the data builder over the tokenized corpus, then report array shapes
contexts, centers = build_data(tokens)
print(f"contexts: {contexts.shape}")
print(f"centers: {centers.shape}")

contexts: (40, 15)
centers: (40, 15)


In [None]:
class CBOWModel:
    """Two-layer linear CBOW network trained with full-batch gradient descent.

    The forward pass is ``softmax(contexts @ w1 @ w2)``; rows of ``w1`` act
    as the word embeddings. Weights are drawn from NumPy's global random
    state, so call ``np.random.seed(...)`` before constructing the model if
    reproducible runs are needed.

    Attributes set by the code below:
        w1 -- array of shape (vocab_size, embedding_size)
        w2 -- array of shape (embedding_size, vocab_size)
        x, y -- the inputs and targets of the most recent ``_forward`` call
        a1, a2, z -- hidden activations, logits and softmax probabilities
                     of the most recent ``_forward`` call
    """

    def __init__(self, vocab_size, embedding_size):
        """Create the two weight matrices with standard-normal entries."""
        # initialize the weights
        self.w1 = np.random.randn(vocab_size, embedding_size)
        self.w2 = np.random.randn(embedding_size, vocab_size)
        assert self.w1.shape[1] == self.w2.shape[0]

    def train(self, contexts, centers, epochs=3, lr=0.025):
        """Run ``epochs`` full-batch gradient-descent steps.

        Parameters
        ----------
        contexts : numpy.ndarray, shape (N, vocab_size)
            Combined context vectors (model input).
        centers : numpy.ndarray, shape (N, vocab_size)
            One-hot center-word vectors (targets).
        epochs : int, default 3
            Number of passes over the whole data set.
        lr : float, default 0.025
            Learning rate.

        Returns
        -------
        list of float
            Mean cross-entropy loss measured before each weight update.
        """
        losses = []
        for _ in range(epochs):
            self._forward(contexts, centers)
            losses.append(self._cross_entropy(centers, self.z))
            self._backward(lr)
        return losses

    def _forward(self, contexts, centers):
        """Compute predictions and cache everything ``_backward`` needs.

        Returns the softmax probabilities, shape (N, vocab_size); the same
        array is stored as ``self.z``.
        """
        assert contexts.shape[1] == self.w1.shape[0]
        # keep inputs and targets: _backward only receives the learning rate
        self.x = contexts
        self.y = centers
        self.a1 = np.dot(contexts, self.w1) # out shape: (N, embed_size)
        self.a2 = np.dot(self.a1, self.w2) # out shape: (N, vocab)
        self.z = self._softmax(self.a2)
        return self.z

    def _backward(self, lr):
        """Apply one gradient-descent step for the mean cross-entropy loss.

        Uses the values cached by the latest ``_forward`` call, so it must
        be called after ``_forward``. Updates ``w1`` and ``w2`` in place.
        """
        n_samples = self.x.shape[0]
        # For softmax + cross-entropy with one-hot targets the gradient with
        # respect to the logits is (probabilities - targets), averaged over N.
        grad_a2 = (self.z - self.y) / n_samples
        grad_w2 = np.dot(self.a1.T, grad_a2)
        # back-propagate through w2 *before* w2 is modified
        grad_a1 = np.dot(grad_a2, self.w2.T)
        grad_w1 = np.dot(self.x.T, grad_a1)
        self.w1 -= lr * grad_w1
        self.w2 -= lr * grad_w2

    def _softmax(self, logits):
        """Row-wise softmax over the last axis."""
        # subtracting the row maximum leaves the result unchanged but keeps
        # np.exp from overflowing on large logits
        shifted = logits - np.max(logits, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=-1, keepdims=True)

    def _cross_entropy(self, actual, prediction):
        """Mean cross-entropy between target rows and predicted probabilities.

        ``actual`` and ``prediction`` have matching shapes, with classes on
        the last axis. Returns a Python float.
        """
        # clip so that a predicted probability of exactly 0 cannot give log(0)
        safe_prediction = np.clip(prediction, 1e-12, 1.0)
        per_sample = -np.sum(actual * np.log(safe_prediction), axis=-1)
        return float(np.mean(per_sample))