In [None]:
import numpy as np
import pandas as pd


import re

import matplotlib.pyplot as plt


import nltk
from nltk.corpus import stopwords

In [None]:
# Load the corpus as a list of sentences, one per non-blank line.
# 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark if present;
# pinning the encoding keeps the result independent of the platform locale.
with open('data/corpus.txt', 'r', encoding='utf-8-sig') as file:
    # Strip line terminators and surrounding whitespace so they do not end up
    # attached to the last word of each sentence; skip lines that are blank.
    content = [line.strip() for line in file if line.strip()]

In [None]:
def create_vocabulary(training_data):
    """Build the sorted list of unique lowercase tokens found in the corpus.

    Every sentence is tokenized with the same rule ``encode_training_data``
    applies: remove each character that is neither a word character nor
    whitespace, lowercase the result, then split on single spaces. Sharing
    the rule guarantees that every token the encoder looks up is present in
    the vocabulary, whatever punctuation the corpus contains.

    Args:
        training_data: iterable of sentence strings.

    Returns:
        list[str]: the distinct tokens in ascending order (empty when
        ``training_data`` is empty).
    """
    vocab = set()
    for sentence in training_data:
        # The pattern also removes a byte-order mark (U+FEFF), which is
        # neither a word character nor whitespace.
        cleaned = re.sub(r'[^\w\s]', '', sentence).lower()
        # Split on a single space, exactly like the encoder, so that both
        # functions produce the same tokens for the same sentence.
        vocab.update(cleaned.split(' '))
    return sorted(vocab)

In [None]:
def one_hot(word, vocab, vocab_size):
    """Return the one-hot encoding of ``word`` as an integer NumPy array.

    Args:
        word: token to encode; it must be an element of ``vocab``.
        vocab: list of tokens; the position of ``word`` in this list is the
            index that gets set to 1.
        vocab_size: length of the returned vector.

    Returns:
        numpy.ndarray: vector of ``vocab_size`` integers, all 0 except a
        single 1 at ``vocab.index(word)``.

    Raises:
        ValueError: if ``word`` is not in ``vocab``.
    """
    encoding = np.zeros(vocab_size, dtype=int)
    encoding[vocab.index(word)] = 1
    return encoding

In [None]:
def create_vector_word_map(vocab, vocab_size):
    """Map the string form of each word's one-hot vector back to the word.

    Args:
        vocab: list of tokens.
        vocab_size: length of the one-hot vectors.

    Returns:
        dict[str, str]: ``str(one_hot(word, vocab, vocab_size))`` -> ``word``
        for every word in ``vocab``.
    """
    # NOTE(review): the keys depend on NumPy's array-to-string formatting,
    # which may abbreviate long arrays with "..." (default print threshold);
    # for large vocabularies distinct vectors could then share a key.
    # Confirm the vocabulary sizes this is used with.
    vec2word = {}
    for word in vocab:
        key = str(one_hot(word, vocab, vocab_size))
        vec2word[key] = word
    return vec2word

In [None]:
# The raw corpus lines are the training sentences.
training_data = content

# Derive the vocabulary once and keep its size for the encoding helpers.
vocab = create_vocabulary(training_data)
vocab_size = len(vocab)

# Print every word next to its one-hot vector; words shorter than six
# characters are padded with spaces so the vectors line up.
for word in vocab:
    padding = " " * (6 - len(word))
    vector = one_hot(word, vocab, vocab_size)
    print(f'{word}:{padding} {vector}')

In [None]:
def encode_training_data(training_data, vocab, vocab_size, window_size):
    """Turn sentences into (center word, context words) one-hot pairs.

    Each sentence has every character that is neither a word character nor
    whitespace removed, is lowercased and is split on single spaces. For
    every token, the tokens at most ``window_size`` positions to its left
    and right (within the same sentence) form its context.

    Args:
        training_data: iterable of sentence strings.
        vocab: list of tokens used to build the one-hot vectors.
        vocab_size: length of each one-hot vector.
        window_size: number of neighbours considered on each side.

    Returns:
        list: one ``[center_vector, [context_vector, ...]]`` entry per token,
        in corpus order. The context list is empty for a token that has no
        neighbours inside the window.
    """
    samples = []
    for sentence in training_data:
        words = re.sub(r'[^\w\s]', '', sentence).lower().split(' ')
        n_words = len(words)
        for center_idx, center in enumerate(words):
            center_vec = one_hot(center, vocab, vocab_size)
            # Clamp the window to the sentence boundaries.
            first = max(0, center_idx - window_size)
            last = min(n_words, center_idx + window_size + 1)
            context_vecs = [
                one_hot(words[ctx_idx], vocab, vocab_size)
                for ctx_idx in range(first, last)
                if ctx_idx != center_idx
            ]
            samples.append([center_vec, context_vecs])
    return samples


In [None]:
# Encode from the raw corpus lines in `content` instead of from
# `training_data` itself. Reading and overwriting the same name made this
# cell non-idempotent: a second run would pass the already encoded pairs
# back into the encoder. `training_data` was assigned from `content`, so the
# first-run result is unchanged.
training_data = encode_training_data(
    training_data=content, vocab=vocab,
    vocab_size=vocab_size, window_size=2
)

In [None]:
def softmax(x):
    """Return the softmax of ``x``, normalized over all of its elements.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps ``exp`` from overflowing to ``inf``
    (and the quotient from becoming ``nan``) when the scores are large.

    Args:
        x: array-like of scores.

    Returns:
        numpy.ndarray: non-negative values of the same shape as ``x`` that
        sum to 1. An empty input yields an empty array.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalize; np.max would raise on an empty array.
        return np.exp(x)
    exp_shifted = np.exp(x - np.max(x))
    return exp_shifted / np.sum(exp_shifted)

In [None]:
def calculate_loss(outside_words, y_pred):
    """Negative log-likelihood of the context words under ``y_pred``.

    Computes ``-sum_w count_w * log(y_pred[w])`` where ``count_w`` is how
    many times word ``w`` occurs among ``outside_words``. A word that occurs
    several times in the window therefore contributes once per occurrence,
    and context words are selected by their counts rather than by a non-zero
    product, so a predicted probability of exactly 0 is not silently
    ignored (it yields an infinite loss).

    Args:
        outside_words: sequence of one-hot vectors, one per context word.
        y_pred: predicted probability for every vocabulary entry.

    Returns:
        float: the summed loss; 0 when ``outside_words`` is empty.
    """
    y_pred = np.asarray(y_pred)
    # atleast_2d keeps axis 0 as the "one row per context word" axis even
    # for an empty list or a single bare vector.
    counts = np.sum(np.atleast_2d(outside_words), axis=0)
    context_idx = np.nonzero(counts)[0]
    log_probs = np.log(y_pred[context_idx])
    return -np.sum(counts[context_idx] * log_probs)

In [None]:
# Size of each word embedding.
embedding_dim = 3

# Seed NumPy's global generator so the random initial weights, and hence the
# whole training run, are reproducible after a kernel restart.
SEED = 42
np.random.seed(SEED)

# One embedding row per vocabulary word (center role) and one column per
# vocabulary word (outside/context role), initialized uniformly in [0, 1).
w_center = np.random.rand(vocab_size, embedding_dim)
w_outside = np.random.rand(embedding_dim, vocab_size)

# Number of passes over the training data.
epochs = 1000

# Summed loss of each epoch, appended to by the training loop.
losses = []
# Gradient-descent step size.
lr = 0.01

In [None]:
# Train with plain per-sample gradient descent. The number of passes comes
# from `epochs` rather than a repeated literal, so changing the setting
# actually changes the run.
for epoch in range(epochs):
    loss = 0

    for x, outside_word in training_data:
        # A token without neighbours in its window has nothing to predict.
        # Skipping it also avoids a shape error: summing an empty list gives
        # a scalar error term whose outer products no longer match the
        # weight matrices.
        if len(outside_word) == 0:
            continue

        # Forward pass: embedding of the center word, scores for every
        # vocabulary word, then probabilities.
        h = np.dot(x, w_center)
        u = np.dot(h, w_outside)
        y_pred = softmax(u)
        loss += calculate_loss(outside_word, y_pred)

        # Prediction error summed over all context words.
        e = np.sum([y_pred - ow for ow in outside_word], axis = 0)

        # Both gradients are computed from the current weights before
        # either matrix is updated.
        grad_w_outside = np.outer(h, e)
        grad_w_center = np.outer(x, np.dot(w_outside, e))
        w_outside = w_outside - (lr * grad_w_outside)
        w_center = w_center - (lr * grad_w_center)
    print(f'epoch: {epoch} loss: {loss}')
    losses.append(loss)

In [None]:
# Training curve: the summed loss recorded at the end of every epoch.
# Axis labels and a title let the figure stand on its own.
fig, ax = plt.subplots()
ax.plot(losses)
ax.set_xlabel('epoch')
ax.set_ylabel('summed loss')
ax.set_title('Training loss per epoch')
plt.show()

In [None]:

# 3D scatter of the learned center-word embeddings, one labelled point per
# vocabulary word.
if np.shape(w_center)[1] != 3:
    # The plot unpacks exactly three coordinates per word.
    raise ValueError('the 3D embedding plot requires 3-dimensional embeddings')

fig = plt.figure(figsize = (10, 10))
ax = fig.add_subplot(projection = '3d')

# Row i of w_center belongs to vocab[i]. Dedicated loop names keep the
# notebook-level `x` from the training cell intact.
for word, (dim_0, dim_1, dim_2) in zip(vocab, w_center):
    ax.scatter(dim_0, dim_1, dim_2)
    # Offset the label slightly so it does not sit on top of the marker.
    ax.text(dim_0 + 0.01, dim_1 + 0.01, dim_2,  word, size = 13)

ax.set_xlabel('dimension 0')
ax.set_ylabel('dimension 1')
ax.set_zlabel('dimension 2')
ax.set_title('Center-word embeddings')
plt.show()