In [1]:
import numpy as np
from collections import defaultdict
import math
import random

In [118]:
class Word2Vec(): 
    def __init__(self, wordSettings): 
        self.n = wordSettings['n']  #size of hidden layer(dimension of word embedding)
        self.windowSize = wordSettings['windowSize']
        self.epochs = wordSettings['epochs']
        self.learningRate = wordSettings['learningRate']
    
    def generateTrainingData(self, wordSettings, corpus): 
        wordCount = defaultdict(int)
        
        for row in corpus:
            for word in row: 
                wordCount[word] += 1
        
#         totalWords = sum([freq**(3/4) for freq in wordCount.values()])
#         wordProb = {word:(freq/totalWords)**(3/4) for word, freq in wordCount.items()}
        
#         wordProb = sorted(wordProb.items(), key=lambda kv: kv[1], reverse=True)
        
#         print(wordProb)
        
        self.vocabCount = len(wordCount.keys())  #length of the vocabulary
        self.wordList = list(wordCount.keys())   #list of words
        self.wordIndex = dict((word, i) for i, word in enumerate(self.wordList))  #list of word index
        self.indexWord = dict((i, word) for i, word in enumerate(self.wordList))  #list of index word
        
        trainingData = []   # for each target word, it will hold all the context words
        
        for sentence in corpus: 
            sentenceLength = len(sentence)
            
            for targetWordIndex, word in enumerate(sentence):
                

                wordContext = []
                
                for contextWordIndex in range(targetWordIndex - self.windowSize, targetWordIndex + self.windowSize+1):
                    if contextWordIndex != targetWordIndex and contextWordIndex <= sentenceLength - 1 and contextWordIndex >=0:
                        wordContext.append(contextWordIndex)
                trainingData.append([targetWordIndex, wordContext])
                
        return np.array(trainingData)

    
    def findWordindex(self, word): 
        return self.wordIndex[word]
    
    def train(self, trainingData): 
        self.weightToHidden = np.random.uniform(-1,1, (self.vocabCount+1, self.n))
        self.weightToOutput = np.random.uniform(-1,1, (self.n, self.vocabCount+1))

        for i in range(self.epochs): 
            self.loss = 0
            
            for wordTarget, wordContext in trainingData: 
                error = np.zeros(self.vocabCount+1)
                hiddenMat = self.weightToHidden[wordTarget,:]
                w_c = []
                randomSample = []
                for wc in wordContext: 
                    randomSample = self.generateRandomSample(wordContext, wordTarget)
                    randomSample.append(wc)
                    randomSample = np.sort(randomSample)
                    
                    wt = np.zeros(self.vocabCount + 1)
                    wt[wc] = 1
                    w_c.append(wt)    #should have used one hot vector instead of index while generating training data
                    
                outputMatrix = np.zeros(self.vocabCount+1)
                    
                for w in randomSample:
                    outputMatrix[w] = self.sigmoid(np.dot(hiddenMat , self.weightToOutput[:,w]))
                
            error = np.sum([np.subtract(outputMatrix, word) for word in w_c], axis=0)
            self.backpropagate(error, hiddenMat, wt)
#         print(self.weightToHidden)
                            
    
    def backpropagate(self, error, hiddenMat, wordTarget):
        for i,e in enumerate(error):
            self.weightToHidden[i,:] -= e * self.learningRate
            self.weightToOutput[:,i] -= e * self.learningRate
                            
                        
                        
    
    def generateRandomSample(self, wordContext, wt):
        count = 0
        randSample = []
        while count < 3: 
            index = random.randint(0, (self.vocabCount))
            if index != wt and index not in wordContext and index not in randSample: 
                randSample.append(index)
                count += 1
        return randSample
    

    
    def sigmoid(self, x):
        try:
            return 1 / (1 + math.exp(-x))
        except OverflowError:
            return 0
        
    
    def printWordVector(self, word): 
        wordIndex = self.wordIndex[word]
        print(word, self.weightToHidden[wordIndex])

In [128]:
settings = {
    'windowSize': 2,
    'n': 7,                    # dimensions of word embeddings, also refer to size of hidden layer
    'epochs': 50,               # number of training epochs
    'learningRate': 0.1        # learning rate
}

text = "natural language processing and machine learning is fun and exciting"

corpus = [[word.lower() for word in text.split()]]


w2v = Word2Vec(settings)

trainingData = w2v.generateTrainingData(settings, corpus)
w2v.train(trainingData)
print(trainingData)
w2v.printWordVector('machine')

[[0 list([1, 2])]
 [1 list([0, 2, 3])]
 [2 list([0, 1, 3, 4])]
 [3 list([1, 2, 4, 5])]
 [4 list([2, 3, 5, 6])]
 [5 list([3, 4, 6, 7])]
 [6 list([4, 5, 7, 8])]
 [7 list([5, 6, 8, 9])]
 [8 list([6, 7, 9])]
 [9 list([7, 8])]]
machine [-1.08555227  0.35426376 -1.20919577 -0.70604796 -0.78444256  0.21721254
  0.19047188]


  return np.array(trainingData)
