<a href="https://colab.research.google.com/github/fininsight/nlp-deeplearning-tutorial/blob/master/02_Prac_1_Word2Vec_skipgram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word Embedding

# 1 Word2Vec

## 1.1 Skip-gram 직접 구현

In [80]:
# Toy corpus: two whitespace-delimited sentences; the second is a prefix of the first.
docs = [
    "natural language processing and machine learning is fun and exciting",
    "natural language processing and machine learning",
]

In [108]:
from collections import defaultdict
import numpy as np 

class OneHotEncoder() :
  """One-hot encoder over a fixed vocabulary built from tokenized documents.

  Attributes:
    w2i: dict mapping each unique word to its index, in first-seen order.
    i2w: dict mapping each index back to its word.
  """

  def __init__(self, docs) :
    """Build the word <-> index dictionaries.

    Args:
      docs: iterable of documents, each an iterable of hashable tokens.
    """
    # Create the dictionary that matches each unique word with an index.
    # A plain dict is used instead of an auto-growing defaultdict so that the
    # vocabulary stays frozen after construction: looking up an unknown word
    # later must fail instead of silently adding a new entry.
    self.w2i = {}
    for d in docs :
      for w in d :
        if w not in self.w2i :
          self.w2i[w] = len(self.w2i)
    self.i2w = {v:k for k,v in self.w2i.items()}

  def _get_one_hot_vector(self, w):
    """Return the one-hot list for word `w`.

    Raises:
      KeyError: if `w` is not part of the vocabulary.
    """
    # Resolve the index first so an unknown word fails before anything is built.
    idx = self.w2i[w]
    v = [0]*len(self.w2i)
    v[idx] = 1
    return v

  def encode(self, docs) :
    """Encode every word of every document as a one-hot list.

    Args:
      docs: iterable of tokenized documents.

    Returns:
      A list with one entry per document, each a list of one-hot lists.

    Raises:
      KeyError: if any word is not part of the vocabulary.
    """
    return [[self._get_one_hot_vector(w) for w in d] for d in docs]

  def decode(self, v) :
    """Return the word whose index is the position of the first 1 in `v`.

    Args:
      v: one-hot sequence (a list, tuple or 1-D array).

    Raises:
      ValueError: if `v` contains no element equal to 1.
    """
    # list(...) lets sequences without an .index method (e.g. arrays) be decoded.
    return self.i2w[list(v).index(1)]

# Split each sentence on whitespace, fit the encoder on the resulting tokens,
# then one-hot encode the same documents.
tokenized_docs = [sentence.split() for sentence in docs]
ohe = OneHotEncoder(tokenized_docs)
encoded_docs = ohe.encode(tokenized_docs)
# Bare expression: displays the encoded documents as the notebook cell output.
encoded_docs

[[[1, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 1, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 1, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 1, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 1, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 1, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 1, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 1, 0],
  [0, 0, 0, 1, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 1]],
 [[1, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 1, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 1, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 1, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 1, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 1, 0, 0, 0]]]

In [131]:
def slide_window(encoded_docs, win_size=2) :
  """Pair every word of every document with its surrounding context words.

  Args:
    encoded_docs: iterable of documents, each a sequence of encoded words.
    win_size: maximum number of neighbours taken on each side of the center
      word (default 2). The window is clipped at the document boundaries.

  Returns:
    A list with one entry per document; each entry is a list of
    (center, context) tuples where `context` is the list of neighbouring
    words in document order, excluding the center word itself.

  Raises:
    ValueError: if `win_size` is negative.
  """
  if win_size < 0 :
    raise ValueError("win_size must be non-negative")

  ret = []
  for d in encoded_docs :
    win_doc = []
    for i, center in enumerate(d) :
      # Clip the window to the document boundaries.
      s, e = max(0, i - win_size), min(len(d), i + win_size + 1)
      # Neighbours to the left and to the right of the center word.
      context = list(d[s:i]) + list(d[i+1:e])
      win_doc.append((center, context))
    ret.append(win_doc)
  return ret

![대체 텍스트](https://miro.medium.com/max/1400/1*uuVrJhSF89KGJPltvJ4xhg.png)

In [138]:
def softmax(output) :
  """Return the softmax of `output`, normalized over all of its elements.

  The maximum is subtracted before exponentiating; this leaves the result
  mathematically unchanged but keeps np.exp from overflowing on large inputs.

  Args:
    output: array-like of scores.

  Returns:
    A numpy array with the same shape as `output` whose elements sum to 1
    (an empty array when `output` is empty).
  """
  output = np.asarray(output)
  if output.size == 0 :
    # Nothing to normalize; np.max would raise on an empty array.
    return np.exp(output)
  exps = np.exp(output - np.max(output))
  return exps / np.sum(exps)

# SGD step that backpropagates the skip-gram prediction error
def backpropagation(W1, W2, hidden, predict, center, context, learning_rate) :
    """Apply one gradient-descent update to both weight matrices.

    Args:
      W1: input-to-hidden weights; updated with an outer product of `center`
        and a hidden-sized vector, so its shape is (len(center), len(hidden)).
      W2: hidden-to-output weights; updated with an outer product of `hidden`
        and the output error, so its shape is (len(hidden), len(predict)).
      hidden: hidden-layer activation for the center word.
      predict: output-layer scores; softmax is applied to them here.
      center: vector of the center word (presumably one-hot — confirm with caller).
      context: sequence of target vectors, one per context word, each the
        same length as `predict`.
      learning_rate: step size of the update.

    Returns:
      The tuple (W1, W2) after the update. With an empty `context` the
      inputs are returned unchanged.
    """
    # With no context words the summed error is zero, so there is nothing to
    # update; returning early also avoids a broadcasting error on an empty list.
    if len(context) == 0 :
        return W1, W2

    # Prediction error summed over every context word.
    err = (softmax(predict) - np.asarray(context)).sum(axis=0)

    # Gradients with respect to each weight matrix.
    delta_W2 = np.outer(hidden, err)
    delta_W1 = np.outer(center, np.dot(W2, err))

    W1 = W1 - learning_rate * delta_W1
    W2 = W2 - learning_rate * delta_W2

    return W1, W2

In [140]:
# Hyper-parameters; the vocabulary size is read from the length of one encoded word.
onehot_size = len(encoded_docs[0][0])
embedding_size = 5
learning_rate = 0.001
epoch = 2

# Randomly initialised weights: input->hidden (W1) and hidden->output (W2).
W1 = np.random.rand(onehot_size, embedding_size)
W2 = np.random.rand(embedding_size, onehot_size)

for i in range(epoch) :
  # Loss accumulated over every (center, context) pair of this epoch.
  loss = 0
  for d in slide_window(encoded_docs) :
    for center, context in d : 
      # Forward pass: hidden layer, then output scores (before softmax).
      hidden = np.dot(center, W1)
      predict = np.dot(hidden, W2)
      W1, W2 = backpropagation(W1, W2, hidden, predict, center, context, learning_rate)

      # Loss for this pair, computed from the scores of the forward pass:
      #   -(sum of the scores at the context-word positions)
      #   + (number of context words) * log(sum(exp(scores)))
      log_norm = np.log(np.sum(np.exp(predict)))
      loss += -np.sum([predict[word.index(1)] for word in context]) + len(context) * log_norm

      print(W1)
  print('Epoch:', i, "Loss:", loss)



[[0.26041504 0.92153019 0.43489931 0.1739126  0.07203218]
 [0.49973765 0.40210404 0.95331045 0.832181   0.20565733]
 [0.82901255 0.88172228 0.02798924 0.15746927 0.92429208]
 [0.2057222  0.67240718 0.70000297 0.09461725 0.13704983]
 [0.04341349 0.94100868 0.55460107 0.75780731 0.00860506]
 [0.97816839 0.60889983 0.87136236 0.81359383 0.57016172]
 [0.29852799 0.55126335 0.35412346 0.72496239 0.10452159]
 [0.38638445 0.61889755 0.75788279 0.70098857 0.40555984]
 [0.43624882 0.39230823 0.39652264 0.81799001 0.9861178 ]]
[[0.26041504 0.92153019 0.43489931 0.1739126  0.07203218]
 [0.49903593 0.40291537 0.95332648 0.83176616 0.20560173]
 [0.82901255 0.88172228 0.02798924 0.15746927 0.92429208]
 [0.2057222  0.67240718 0.70000297 0.09461725 0.13704983]
 [0.04341349 0.94100868 0.55460107 0.75780731 0.00860506]
 [0.97816839 0.60889983 0.87136236 0.81359383 0.57016172]
 [0.29852799 0.55126335 0.35412346 0.72496239 0.10452159]
 [0.38638445 0.61889755 0.75788279 0.70098857 0.40555984]
 [0.43624882 

In [None]:
    # Forward-pass fragment: `self` and `x` are not defined in this cell.
    # Run through first matrix (w1) to get hidden layer - 10x9 dot 9x1 gives us 10x1
    # (shapes presumably assume a 9-word vocabulary and a hidden size of 10 — TODO confirm)
    h = np.dot(self.w1.T, x)
    # Dot product hidden layer with second matrix (w2) - 9x10 dot 10x1 gives us 9x1
    u = np.dot(self.w2.T, h)
    # Run u through softmax, presumably to force each element into the range [0, 1]
    # NOTE(review): the earlier comment here said "1x9 ... 1x8", which disagrees
    # with the 9x1 shape stated for u above — confirm the intended output shape.
    y_c = self.softmax(u)

In [None]:
  # Training-loop fragment: `self` and `training_data` come from outside this cell.
  for i in range(self.epochs):
    # Restart the loss accumulator at the beginning of every epoch
    self.loss = 0
    for w_t, w_c in training_data:
      # NOTE(review): y_pred, h and u are never assigned in this cell —
      # presumably a forward pass on w_t is meant to run here; confirm.
      # Sum the element-wise differences (y_pred - word) over every word in w_c
      EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)
      self.backprop(EI, h, w_t)

      # Calculate loss
      # There are 2 parts to the loss function
      # Part 1: negative sum of the entries of u at the context-word positions +
      # Part 2: number of context words * log of the sum of exp(u) over all elements of u
      # Note: word.index(1) returns the first index in the context word vector with value 1
      # Note: u[word.index(1)] is the entry of u at that index (the original note
      #       describes u as the output layer before softmax)
      self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))
    print('Epoch:', i, "Loss:", self.loss)