In [2]:
sentences = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells."""

In [3]:
import re
# remove special characters
sentences = re.sub('[^A-Za-z0-9]+', ' ', sentences)
# remove 1 letter words
sentences = re.sub(r'(?:^| )\w(?:$| )', ' ', sentences).strip()
# lower all characters
sentences = sentences.lower()

In [4]:
sentences

'we are about to study the idea of computational process computational processes are abstract beings that inhabit computers as they evolve processes manipulate other abstract things called data the evolution of process is directed by pattern of rules called program people create programs to direct processes in effect we conjure the spirits of the computer with our spells'

In [5]:
words = sentences.split()
# words

In [6]:
vocab = set(words)
# vocab

In [7]:
embed_dim = 10
context_size = 2
vocab_size = len(vocab)
vocab_size

43

In [8]:
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for i, word in enumerate(vocab)}

In [9]:
# data - [(context), target]

data = []
for i in range(2, len(words) - 2):
    context = [words[i - 2], words[i - 1], words[i + 1], words[i + 2]]
    target = words[i]
    data.append((context, target))
print(data[:5])

[(['we', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'computational'], 'idea')]


In [10]:
def cbow_model(data, total_vocab, window_size=1):
    cbow = []
    total_length = window_size*2
    for text in data:
        text_len = len(text)
        for idx, word in enumerate(text):
            context_word = []
            target   = []            
            begin = idx - window_size
            end = idx + window_size + 1
            context_word.append([text[i] for i in range(begin, end) if 0 <= i < text_len and i != idx])
            target.append(word)
        cbow.append((context_word[0][0], target))
    return cbow

In [11]:
sentences

'we are about to study the idea of computational process computational processes are abstract beings that inhabit computers as they evolve processes manipulate other abstract things called data the evolution of process is directed by pattern of rules called program people create programs to direct processes in effect we conjure the spirits of the computer with our spells'

In [12]:
cbow_model(data, window_size=2, total_vocab=vocab_size)

[(['we', 'are', 'to', 'study'], ['about']),
 (['are', 'about', 'study', 'the'], ['to']),
 (['about', 'to', 'the', 'idea'], ['study']),
 (['to', 'study', 'idea', 'of'], ['the']),
 (['study', 'the', 'of', 'computational'], ['idea']),
 (['the', 'idea', 'computational', 'process'], ['of']),
 (['idea', 'of', 'process', 'computational'], ['computational']),
 (['of', 'computational', 'computational', 'processes'], ['process']),
 (['computational', 'process', 'processes', 'are'], ['computational']),
 (['process', 'computational', 'are', 'abstract'], ['processes']),
 (['computational', 'processes', 'abstract', 'beings'], ['are']),
 (['processes', 'are', 'beings', 'that'], ['abstract']),
 (['are', 'abstract', 'that', 'inhabit'], ['beings']),
 (['abstract', 'beings', 'inhabit', 'computers'], ['that']),
 (['beings', 'that', 'computers', 'as'], ['inhabit']),
 (['that', 'inhabit', 'as', 'they'], ['computers']),
 (['inhabit', 'computers', 'they', 'evolve'], ['as']),
 (['computers', 'as', 'evolve', 'p

In [19]:
import numpy as np
embeddings =  np.random.random_sample((vocab_size, embed_dim))

In [20]:
def linear(m, theta):
    w = theta
    return m.dot(w)
def log_softmax(x):
    e_x = np.exp(x - np.max(x))
    return np.log(e_x / e_x.sum())

In [21]:
def forward(context_idxs, theta):
    m = embeddings[context_idxs].reshape(1, -1)
    n = linear(m, theta)
    o = log_softmax(n)
    
    return m, n, o

In [22]:
theta = np.random.uniform(-1, 1, (2 * context_size * embed_dim, vocab_size))

In [23]:
import numpy as np
for context, target in data:
        context_idxs = np.array([word_to_ix[w] for w in context])
        
        preds = forward(context_idxs, theta)
        break

In [14]:
context_idxs

array([22, 23, 27, 24])

In [26]:
preds[0].shape

(1, 40)