Based on this tutorial: https://towardsdatascience.com/an-implementation-guide-to-word2vec-using-numpy-and-google-sheets-13445eebd281

In [15]:
import numpy as np
import numba

# 1. Data Preparation

In [2]:
text = "natural language processing and machine learning are fun and exciting"
corpus = np.array([[w.lower() for w in text.split()]])

# 2. Hyperparameters

In [3]:
settings = {
    "window_size": 2,
    "n": 10,
    "epochs": 50,
    "eta": 0.01
}

# 3. Define word2vec Model and Generate Training Data

In [26]:
class word2vec():
    
    def __init__(self, settings):
        self.n = settings["n"]
        self.eta = settings["eta"]
        self.epochs = settings["epochs"]
        self.window_size = settings["window_size"]
        self.vocab_size = 0
        self.vocab = []
        self.word2index = {}
        self.index2word = {}
    
    def generate_training_data(self, corpus):
        assert type(corpus) == np.ndarray
        self.vocab = np.array(list(set(corpus.flatten())))
        self.vocab_size = len(self.vocab)
        for i, wd in enumerate(self.vocab):
            self.word2index[wd] = i
            self.index2word[i] = wd
        
        training_data = []
        
        for sentence in corpus:
            sentence_len = len(sentence)
            for i, wd in enumerate(sentence):
                target = self.word2one_hot(wd)
                context = []
                
                for j in range(i - self.window_size, i + self.window_size + 1):
                    if j >= 0 and j != i and j < sentence_len:
                        context.append(self.word2one_hot(sentence[j]))
                training_data.append([target, np.array(context)])
        return np.array(training_data)
                
                
            
    
    def word2one_hot(self, word):
        one_hot = np.zeros(shape=(self.vocab_size,), dtype=np.int32)
        one_hot[self.word2index[word]] = 1
        return one_hot

In [30]:
w2v = word2vec(settings)
X_train = w2v.generate_training_data(corpus)
X_train[0]

array([array([0, 0, 0, 0, 0, 1, 0, 0, 0]),
       array([[0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0]])], dtype=object)