<a href="https://colab.research.google.com/github/erfan-kalandi/Word2Vec-From-Scratch/blob/main/Word2Vec_From_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Word2Vec Implementation from Scratch

Word2Vec is a popular technique for learning word embeddings: it captures the meaning of words by placing them in a continuous vector space, so that words used in similar contexts end up close together.
In this notebook, we implement Word2Vec (the skip-gram variant, which predicts the surrounding context words from a target word) using only NumPy.
Each word is represented as a one-hot vector, meaning each word in the vocabulary is mapped to a unique binary vector with exactly one active (1) position.

In [None]:
import numpy as np
from collections import defaultdict

In [None]:
# Hyper-parameters read by word2vec.__init__.
settings = dict(
    window_size=2,       # context window size used when pairing a target with its neighbours
    n=10,                # dimensionality of the learned word vectors
    epochs=200,          # number of full passes over the training data
    learning_rate=0.01,  # step size of the gradient-descent weight updates
)

In [None]:
class word2vec:
    """Minimal skip-gram Word2Vec model implemented with NumPy.

    Words are fed to the network as one-hot vectors. The network has a single
    hidden layer of size ``n`` (weights ``w1``) and a softmax output layer over
    the vocabulary (weights ``w2``). After training, row ``i`` of ``w1`` is the
    embedding of the word with index ``i``.
    """

    def __init__(self, settings):
        """Store the hyper-parameters.

        Args:
            settings: Mapping with the keys ``'n'`` (embedding size),
                ``'learning_rate'``, ``'epochs'`` and ``'window_size'``.
        """
        self.n = settings['n']
        self.lr = settings['learning_rate']
        self.epochs = settings['epochs']
        self.window = settings['window_size']

    def generate_training_data(self, corpus):
        """Build the vocabulary and the (target, context) training pairs.

        Sets ``v_count``, ``words_list``, ``word_index`` and ``index_word`` on
        the instance as a side effect.

        Args:
            corpus: Iterable of sentences, each a sequence of word tokens.

        Returns:
            A list with one ``[target_onehot, context_onehots]`` entry per
            word position that has at least one neighbour inside the window.
            ``context_onehots`` is a 2-D array holding one one-hot row per
            context word.
        """
        word_counts = defaultdict(int)
        for sentence in corpus:
            for word in sentence:
                word_counts[word] += 1

        # Vocabulary order follows first appearance in the corpus.
        self.v_count = len(word_counts)
        self.words_list = list(word_counts)
        self.word_index = {word: i for i, word in enumerate(self.words_list)}
        self.index_word = dict(enumerate(self.words_list))

        training_data = []
        for sentence in corpus:
            sentence_len = len(sentence)
            for i, target in enumerate(sentence):
                # Symmetric window: `window` words on each side of the target,
                # clipped to the sentence boundaries (hence the +1 on the
                # exclusive upper bound).
                first = max(0, i - self.window)
                last = min(sentence_len, i + self.window + 1)
                context = [
                    self.word2onehot(sentence[j])
                    for j in range(first, last)
                    if j != i
                ]
                if not context:
                    # A target without neighbours carries no training signal
                    # and would break the shape logic in backprop.
                    continue
                training_data.append(
                    [self.word2onehot(target), np.array(context)]
                )

        return training_data

    def word2onehot(self, word):
        """Return the one-hot vector (length ``v_count``) for *word*.

        Raises:
            KeyError: If *word* is not in the vocabulary.
        """
        word_vec = np.zeros(self.v_count)
        word_vec[self.word_index[word]] = 1
        return word_vec

    def train(self, training_data):
        """Train the model with per-sample gradient descent.

        Re-initialises ``w1`` and ``w2`` uniformly in [-1, 1), then runs
        ``epochs`` passes over *training_data*, printing the summed
        cross-entropy loss after each pass.

        Args:
            training_data: Output of :meth:`generate_training_data`.
        """
        self.w1 = np.random.uniform(-1, 1, (self.v_count, self.n))
        self.w2 = np.random.uniform(-1, 1, (self.n, self.v_count))
        for epoch in range(self.epochs):
            self.loss = 0
            for target, context in training_data:
                y_c, h, _ = self.forward_pass(target)
                # Summed prediction error over all context words.
                error = np.sum(
                    [np.subtract(y_c, word) for word in context], axis=0
                )

                self.backprop(error, h, target)

                # Loss is measured with the prediction made before the update.
                loss_for_context = 0
                for word in context:
                    true_word_index = np.argmax(word)
                    loss_for_context += -np.log(y_c[true_word_index])
                self.loss += loss_for_context

            print('Epoch:', epoch, "Loss:", self.loss)

    def softmax(self, x):
        """Return the softmax of *x*, shifted by its max for numerical stability."""
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def forward_pass(self, x):
        """Run one-hot input *x* through the network.

        Returns:
            Tuple ``(y_c, h, u)``: softmax output, hidden-layer activation and
            raw output scores.
        """
        h = np.dot(x, self.w1)
        u = np.dot(h, self.w2)
        y_c = self.softmax(u)

        return y_c, h, u

    def backprop(self, e, h, x):
        """Apply one gradient-descent step to ``w1`` and ``w2``.

        Args:
            e: Summed output error for the sample (length ``v_count``).
            h: Hidden-layer activation from the forward pass.
            x: One-hot input vector of the target word.
        """
        dl_dw2 = np.outer(h, e)
        # Uses w2 from before the update so both gradients refer to the same
        # set of weights.
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))
        self.w1 = self.w1 - (self.lr * dl_dw1)
        self.w2 = self.w2 - (self.lr * dl_dw2)

    def word_vec(self, word):
        """Return the learned embedding (a row of ``w1``) for *word*.

        Raises:
            KeyError: If *word* is not in the vocabulary.
        """
        w_index = self.word_index[word]
        v_w = self.w1[w_index]
        return v_w

    def vec_sim(self, word, top_n):
        """Print the *top_n* vocabulary words most cosine-similar to *word*.

        The query word itself is part of the ranking (with similarity 1).
        """
        v_w1 = self.word_vec(word)
        norm_w1 = np.linalg.norm(v_w1)
        word_sim = {}

        for i in range(self.v_count):
            v_w2 = self.w1[i]
            theta_sum = np.dot(v_w1, v_w2)
            theta_den = norm_w1 * np.linalg.norm(v_w2)
            word_sim[self.index_word[i]] = theta_sum / theta_den

        words_sorted = sorted(
            word_sim.items(), key=lambda kv: kv[1], reverse=True
        )

        for other, sim in words_sorted[:top_n]:
            print(other, sim)


In [None]:
# Toy one-sentence corpus used to demonstrate training.
text = (
    "Natural language processing and machine learning open up fascinating "
    "possibilities, allowing machines to analyze, understand, and respond to "
    "human language in ways that were once thought impossible."
)

In [None]:
# The corpus is a list of sentences; here a single sentence of lower-cased,
# whitespace-separated tokens (punctuation stays attached to the tokens).
corpus = [[token.lower() for token in text.split()]]

# Build the model from the hyper-parameters, derive the (target, context)
# training pairs from the corpus, then fit the weights.
w2v = word2vec(settings)
training_data = w2v.generate_training_data(corpus)
w2v.train(training_data)

In [None]:
# Show the learned embedding of one vocabulary word ...
word = "machine"
vec = w2v.word_vec(word)
print(word, vec)

# ... and print the 5 vocabulary words most cosine-similar to it.
w2v.vec_sim(word, 5)