In [46]:
import numpy as np
from collections import defaultdict

In [65]:
class Word2Vec(): 
    def __init__(self, wordSettings): 
        self.n = wordSettings['n']  #size of hidden layer(dimension of word embedding)
        self.windowSize = wordSettings['windowSize']
        self.epochs = wordSettings['epochs']
        self.learningRate = wordSettings['learningRate']
        
    def generateTrainingData(self, wordSettings, corpus): 
        wordCount = defaultdict(int)
        
        for row in corpus:
            for word in row: 
                wordCount[word] += 1
        
        totalWords = sum([freq**(3/4) for freq in wordCount.values()])
        self.rob = {word:(freq/totalWords)**(3/4) for word, freq in wordCount.items()}
        
        self.vocabCount = len(wordCount.keys())  #length of the vocabulary
        self.wordList = list(wordCount.keys())   #list of words
        self.wordIndex = dict((word, i) for i, word in enumerate(self.wordList))  #list of word index
        self.indexWord = dict((i, word) for i, word in enumerate(self.wordList))  #list of index word
        
        trainingData = []   # for each target word, it will hold all the context words
        
        for sentence in corpus: 
            sentenceLength = len(sentence)
            
            for targetWordIndex, word in enumerate(sentence):
                
                wordTarget = self.wordToOneHotVector(sentence[targetWordIndex])

                wordContext = []
                
                for contextWordIndex in range(targetWordIndex - self.windowSize, targetWordIndex + self.windowSize+1):
                    if contextWordIndex != targetWordIndex and contextWordIndex <= sentenceLength - 1 and contextWordIndex >=0:
                        wordContext.append(self.wordToOneHotVector(sentence[contextWordIndex]))
                trainingData.append([wordTarget, wordContext])
                
        return np.array(trainingData)
                        
    def wordToOneHotVector(self, word): 
        wordVec = np.zeros(self.vocabCount)
        wordIndex = self.wordIndex[word]
        wordVec[wordIndex] = 1
        return wordVec
        
    def train(self, trainingData): 
        self.weightToHidden = np.random.uniform(-1,1, (self.vocabCount, self.n))
        self.weightToOutput = np.random.uniform(-1,1, (self.n, self.vocabCount))

        for i in range(self.epochs): 
            self.loss = 0

            for wordTarget, wordContext in trainingData: 
                predMat, hiddenMat, outputMat = self.forwardPass(wordTarget)

                error = np.sum([np.subtract(predMat, word) for word in wordContext], axis=0)
                self.backpropagate(error,hiddenMat, wordTarget)
                    

    def forwardPass(self, wordTarget):
        wordTarget = [float(i) for i in wordTarget]
        hiddenMat = np.dot(wordTarget, self.weightToHidden)
        outputMat = np.dot(hiddenMat, self.weightToOutput)

        predMat = self.softmax(outputMat)

        return predMat, hiddenMat, outputMat

    def backpropagate(self, e, hiddenMat, wordTarget):
        dl_dweightOutput = np.outer(hiddenMat, e)                                           #Given two vectors, a = [a0, a1, ..., aM] and
        dl_dweightHidden = np.outer(wordTarget, np.dot(self.weightToOutput, e.T))           #b = [b0, b1, ..., bN], the outer product [1] is:
        # Update weights
        self.weightToHidden = self.weightToHidden - (self.learningRate * dl_dweightHidden)  #[[a0*b0  a0*b1 ... a0*bN ]
        self.weightToOutput = self.weightToOutput - (self.learningRate * dl_dweightOutput)  #[a1*b0    .
                                                                                            #[ ...          .
                                                                                            #[aM*b0            aM*bN ]]                                                                                                                                 
    def softmax(self, x): 
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)
    
    def getWordVec(self, word): 
        return self.weightToHidden[self.wordIndex[word]]
    
    def getSimiliarWords(self, word, top_n): 
        wordVectorGiven = self.getWordVec(word)
        givenWordIndex = self.wordIndex[word]
        similiarWords = {}

        for i in range(self.vocabCount):
            if i == givenWordIndex: 
                continue
            # Find the similary score for each word in vocab
            otherWordVector = self.weightToHidden[i]
            thetaSum = np.dot(wordVectorGiven, otherWordVector)
            thetaDen = np.linalg.norm(wordVectorGiven) * np.linalg.norm(otherWordVector)
            theta = thetaSum / thetaDen

            word = self.indexWord[i]
            similiarWords[word] = theta

        wordsSorted = sorted(similiarWords.items(), key=lambda kv: kv[1], reverse=True)

        for word, sim in wordsSorted[:top_n]:
            print(word, sim)


In [66]:
settings = {
    'windowSize': 2,
    'n': 10,                    # dimensions of word embeddings, also refer to size of hidden layer
    'epochs': 50,               # number of training epochs
    'learningRate': 0.01        # learning rate
}

text = "natural language processing and machine learning is fun and exciting"

corpus = [[word.lower() for word in text.split()]]


corpus = [[word.lower() for word in text.split()]]


w2v = Word2Vec(settings)

# Numpy ndarray with one-hot representation for [target_word, context_words]
trainingData = w2v.generateTrainingData(settings, corpus)

# Training
w2v.train(trainingData)

# Get vector for word
word = "processing"
vec = w2v.getWordVec(word)
print(word, vec)

# Find similar words
w2v.getSimiliarWords("processing", 3)

processing [-0.40259596  0.51464336 -1.03026994  0.09661151  0.52305125 -0.50828159
 -0.77421495  0.60269466 -0.28745338 -0.74410212]
natural 0.44053207004399
and 0.24343499016209455
learning 0.12260107657049182


  return np.array(trainingData)
