In [2]:
import numpy as np

In [65]:
class word2vec():
  '''Converts words to vectors using the skip-gram method.

  A network with one linear hidden layer is trained to predict the words that
  surround each target word. After training, row k of the input weight matrix
  ``w1`` is the vector of the k-th vocabulary word.

  Tokenisation is lower-casing followed by whitespace splitting only, so
  punctuation stays attached to its word (e.g. 'marathi.' and 'marathi' are
  two different tokens).
  '''
  def __init__(self,corpus,window,epochs,alpha,dim=10):
    '''Input training data and parameters.

    corpus: training text (a single string)
    window: number of context positions taken on each side of the target word
    epochs: number of passes over the corpus
    alpha:  learning rate
    dim:    size of each word vector (defaults to 10)
    '''
    self.corpus = corpus
    self.window = window
    self.epochs = epochs
    self.alpha = alpha
    self.dim = dim

  def preprocess(self,corpus=None):
    '''Lower-cases and whitespace-splits the text.

    corpus: text to process; when omitted, the corpus given to the constructor is used.
    Returns (tokens, vocab) where vocab holds the unique tokens in first-seen order.
    '''
    if corpus is None:
      corpus = self.corpus
    #Convert to lower case and split on whitespace
    tokens = corpus.lower().split()
    #Build vocabulary of unique words; the set makes each membership test O(1)
    seen = set()
    vocab = []
    for token in tokens:
      if token not in seen:
        seen.add(token)
        vocab.append(token)
    #Return preprocessed corpus and vocabulary of unique words
    return tokens, vocab

  def onehot(self):
    '''Creates one hot vectors for every unique word in the vocabulary.

    Returns a list with one 1-D array per vocabulary word, in vocabulary order.
    '''
    vocab = self.preprocess(self.corpus)[1]
    #Row k of the identity matrix is the one hot vector of word k
    return list(np.eye(len(vocab)))

  def train(self):
    '''Trains the model and returns the final weights (word vectors).

    Prints the summed loss after every epoch. Initial weights are drawn from
    NumPy's global random state, so call np.random.seed(...) beforehand for
    reproducible results.
    Returns w1, an array of shape (vocabulary size, dim).
    '''
    #Preprocess once; the corpus does not change during training
    corpus, vocab = self.preprocess(self.corpus)
    vocab_size = len(vocab)
    word_to_index = {word: k for k, word in enumerate(vocab)}
    token_ids = [word_to_index[word] for word in corpus]
    #One hot vectors for the whole vocabulary (row k belongs to word k)
    onehots = np.eye(vocab_size)
    #Create random weight vectors to initiate. Every word is expressed using a dim-dimensional vector
    self.w1 = np.random.uniform(-1,1,(vocab_size,self.dim))
    self.w2 = np.random.uniform(-1,1,(self.dim,vocab_size))
    #Initiate training
    for i in range(self.epochs):
      #Initiate loss variable
      loss = 0
      #Slide through each window depending on the window size and increment by the target word
      for j, target_index in enumerate(token_ids):
        #Clip the window to the corpus boundaries
        first = max(j-self.window, 0)
        last = min(j+self.window+1, len(token_ids))
        #Context = every position in the window except the target position itself.
        #Selecting by position (not by string comparison) keeps repeated words and
        #words that happen to be substrings of the target.
        context_ids = [token_ids[k] for k in range(first,last) if k != j]
        target_vec = onehots[target_index]
        context_vecs = [onehots[k] for k in context_ids]

        ########################################  FORWARD PASS #########################################################
        h,u,y_pred = self.forwardpass(target_vec,self.w1,self.w2)

        ######################################## BACKPROP ################################################################
        error,d1_w1,d1_w2 = self.backprop(target_vec,context_vecs,h,u,y_pred,self.w2)

        ######################################## ADJUSTING WEIGHTS ####################################################
        self.w1 = self.w1 - self.alpha * d1_w1
        self.w2 = self.w2 - self.alpha * d1_w2

        ######################################## CALCULATE LOSS FUNCTION ##############################################
        #loss = -sum(u[c] for c in context) + len(context) * log(sum(exp(u)))
        #The max is subtracted before exponentiating so large scores cannot overflow
        u_max = np.max(u)
        log_norm = u_max + np.log(np.sum(np.exp(u - u_max)))
        loss += -np.sum(u[np.asarray(context_ids, dtype=int)]) + len(context_ids) * log_norm

      # Print loss after every epoch
      print('Epoch',i,'loss:',loss)
    return (self.w1)

  def forwardpass(self, target_vec, w1, w2):
    '''Performs the forward pass.

    Returns (h, u, y_pred): hidden layer, raw output scores and their softmax.
    '''
    h = np.dot(target_vec,w1)   #Hidden layer
    u = np.dot(h,w2)     #Output layer
    y_pred = self.softmax(u)     #Softmax output
    return (h,u,y_pred)

  def softmax(self,u):
    '''Outputs the softmax function for a given vector'''
    #Subtracting the max leaves the result unchanged but avoids overflow in exp
    e = np.exp(u - np.max(u))
    soft_fun = e/e.sum(axis=0)
    return (soft_fun)

  def backprop(self,target_vec,context_vecs,h,u,y_pred,w2):
    '''Performs backpropagation.

    context_vecs: sequence of one hot vectors, one per context word (may be empty).
    Returns (error, d1_w1, d1_w2): the prediction error summed over all context
    words and the gradients for w1 and w2.
    '''
    if len(context_vecs) == 0:
      #No context words: nothing to learn from this position, so the gradients are zero
      error = np.zeros_like(np.asarray(y_pred, dtype=float))
    else:
      #Sum of (prediction - one hot) over every context word
      error = np.sum(y_pred - np.asarray(context_vecs), axis=0)
    d1_w2 = np.outer (h,error)   #gradient for w2
    d1_w1 = np.outer (target_vec,np.dot(w2, error))  #gradient for w1
    return(error,d1_w1,d1_w2)

  def word_vec(self,word):
    '''Outputs the word vector for a given word.

    The lookup is case-insensitive because the vocabulary is lower-cased.
    Raises AttributeError if train() has not been called and ValueError if the
    word is not in the vocabulary.
    '''
    if not hasattr(self,'w1'):
      raise AttributeError('word2vec model has no weights yet; call train() first')
    vocab = self.preprocess(self.corpus)[1]
    token = word.lower()
    if token not in vocab:
      raise ValueError('%r is not in the vocabulary' % (word,))
    return self.w1[vocab.index(token)]

In [66]:
# Toy training text. The model only lower-cases and splits on whitespace,
# so the trailing period stays attached to its word ('marathi.').
corpus = (
    "He speaks good marathi. "
    "He is proficient in marathi."
)

In [67]:
# Build the skip-gram model: 2 context positions on each side of the target,
# 50 passes over the corpus, learning rate 0.01.
word2vec_mod = word2vec(
    corpus,
    window=2,
    epochs=50,
    alpha=0.01,
)

In [68]:
# train() draws its initial weights from NumPy's global random state, so fix
# the seed first; otherwise every run prints different losses and vectors.
np.random.seed(42)
# Train the model; prints the loss after every epoch and returns the word-vector matrix.
weights = word2vec_mod.train()

Epoch 0 loss: 78.13317516963546
Epoch 1 loss: 74.86642028848966
Epoch 2 loss: 72.1666897516697
Epoch 3 loss: 69.89479294729816
Epoch 4 loss: 67.95247757888289
Epoch 5 loss: 66.2693997466885
Epoch 6 loss: 64.79419869970351
Epoch 7 loss: 63.48857588541102
Epoch 8 loss: 62.323383108856625
Epoch 9 loss: 61.276013621501754
Epoch 10 loss: 60.328634131940795
Epoch 11 loss: 59.466963790391254
Epoch 12 loss: 58.679414005898195
Epoch 13 loss: 57.95647043398482
Epoch 14 loss: 57.29024053222467
Epoch 15 loss: 56.674116456183214
Epoch 16 loss: 56.102519798570064
Epoch 17 loss: 55.57070544049834
Epoch 18 loss: 55.07460882159977
Epoch 19 loss: 54.610725611471004
Epoch 20 loss: 54.17601592153372
Epoch 21 loss: 53.7678273607602
Epoch 22 loss: 53.38383274490188
Epoch 23 loss: 53.02197933187329
Epoch 24 loss: 52.68044721632512
Epoch 25 loss: 52.35761506745395
Epoch 26 loss: 52.05203179854349
Epoch 27 loss: 51.762393057480125
Epoch 28 loss: 51.487521654145205
Epoch 29 loss: 51.22635121382343
Epoch 30 loss

In [70]:
# Display the learned vector for one vocabulary word (its row of the trained w1 matrix).
word2vec_mod.word_vec("good")

array([ 0.99304862, -0.24612498, -0.34696777, -0.51852856,  0.27075523,
       -0.06650622, -0.38466629, -0.51190019, -0.27699964, -0.50457998])