<a href="https://colab.research.google.com/github/fininsight/nlp-deeplearning-tutorial/blob/master/02_Prac_1_Word2Vec_skipgram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Word Embedding

#1 Word2Vec

##1.1 Skip-gram 직접구현

In [141]:
docs = ["natural language processing and machine learning is fun and exciting",
        "natural language processing and machine learning"]

In [142]:
from collections import defaultdict
import numpy as np 

class OneHotEncoder() :
  def __init__(self, docs) :
    # 고유 단어와 인덱스를 매칭시켜주는 사전 생성
    self.w2i = defaultdict(lambda:len(self.w2i))
    [self.w2i[w] for d in docs for w in d]
    self.i2w = {v:k for k,v in self.w2i.items()}

  def _get_one_hot_vector(self, w):
    v = [0]*len(self.w2i)
    v[self.w2i[w]] = 1
    return v

  def encode(self, docs) :
    ret = []
    for d in docs :
      tmp = []
      for w in d :
        tmp.append(self._get_one_hot_vector(w))
      ret.append(tmp)
    return ret

  def decode(self, v) :
    return self.i2w[v.index(1)]

![대체 텍스트](https://miro.medium.com/max/1400/1*uuVrJhSF89KGJPltvJ4xhg.png)

In [198]:
from tqdm.notebook import tqdm as tqdm

class Word2Vec() :
  def __iniit__(self):
    pass

  def _slide_window(self, encoded_docs) :
    win_size = 2
    ret = []

    for d in encoded_docs : 
      win_doc = []
      for i, w in enumerate(d) :
        s, e = max(0, i - win_size), min(len(d), i + win_size)
        context = d[s:e+1]
        center = context.pop(i-s)
        win_doc.append((center, context))
      ret.append(win_doc)
    return ret

  def _softmax(self, output) :
    return np.exp(output) / np.sum(np.exp(output))

  #SGD to backpropagate errors 
  def _backpropagation(self, W1, W2, hidden, predict, center, context, learning_rate) :
      err = (self._softmax(predict) - context).sum(axis=0)

      delta_W2 = np.outer(hidden, err)
      delta_W1 = np.outer(center, np.dot(W2, err))

      W1 = W1 - learning_rate * delta_W1
      W2 = W2 - learning_rate * delta_W2

      return W1, W2

  def skipgram(self, docs, embedding_size = 5, learning_rate = 0.001, epoch = 10) :
    tokenized_docs = [d.split() for d in docs]
    ohe = OneHotEncoder(tokenized_docs)
    encoded_docs = ohe.encode(tokenized_docs)
    
    onehot_size = len(encoded_docs[0][0])
    W1 = np.random.rand(onehot_size, embedding_size)
    W2 = np.random.rand(embedding_size, onehot_size)

    sliding_docs = self._slide_window(encoded_docs)
    for i in tqdm(range(epoch), desc='word embedding') :
      for d in  sliding_docs:
        for center, context in d : 
          hidden = np.dot(center, W1)
          predict = np.dot(hidden, W2)
          W1, W2 = self._backpropagation(W1, W2, hidden, predict, center, context, learning_rate)
    
    self.i2w = ohe.i2w
    self.wv_ = {self.i2w[i]:list(we) for i, we in enumerate(W1)}
    self.word_vectors = W1

    return self.wv_

  def most_similar(self, word, n = 3) :
    v = self.wv_[word]
    similarity = np.dot(v, self.word_vectors.T)
    return [(self.i2w[i], similarity[i], self.word_vectors[i]) for i in similarity.argsort()[::-1][1:n+1]]



In [199]:
sg = Word2Vec()
sg.skipgram(docs, epoch=1000)
print(sg.wv_['natural'])
sg.most_similar('natural')

HBox(children=(FloatProgress(value=0.0, description='word embedding', max=1000.0, style=ProgressStyle(descript…


[0.03396338642025993, 0.7732812789567359, 1.668441363166191, 0.712571167105562, -0.7448306217179556]


[('processing',
  2.746783796570668,
  array([0.41944852, 0.5124866 , 1.57543873, 0.39955891, 0.77467237])),
 ('language',
  2.7204295776280984,
  array([0.19363369, 1.73634297, 1.08111746, 0.33134178, 0.89780691])),
 ('and',
  0.8495381237043158,
  array([-0.10656914, -0.37016593,  0.06767088,  1.15044729, -0.27753893]))]