In [2]:
import numpy as np 
import string 
from nltk.corpus import stopwords 

def softmax(x): 
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x)) 
    return e_x / e_x.sum() 

class word2vec(object): 
    def __init__(self): 
        self.N = 10
        self.X_train = [] 
        self.y_train = [] 
        self.window_size = 2
        self.alpha = 0.001
        self.words = [] 
        self.word_index = {} 

    def initialize(self,V,data): 
        self.V = V 
        self.W = np.random.uniform(-0.8, 0.8, (self.V, self.N)) 
        self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V)) 
        
        self.words = data 
        for i in range(len(data)): 
            self.word_index[data[i]] = i 


    def feed_forward(self,X): 
        self.h = np.dot(self.W.T,X).reshape(self.N,1) 
        self.u = np.dot(self.W1.T,self.h) 
        #print(self.u) 
        self.y = softmax(self.u) 
        return self.y 
        
    def backpropagate(self,x,t): 
        e = self.y - np.asarray(t).reshape(self.V,1) 
        # e.shape is V x 1 
        dLdW1 = np.dot(self.h,e.T) 
        X = np.array(x).reshape(self.V,1) 
        dLdW = np.dot(X, np.dot(self.W1,e).T) 
        self.W1 = self.W1 - self.alpha*dLdW1 
        self.W = self.W - self.alpha*dLdW 
        
    def train(self,epochs): 
        for x in range(1,epochs):
            self.loss = 0
            for j in range(len(self.X_train)): 
                self.feed_forward(self.X_train[j]) 
                self.backpropagate(self.X_train[j],self.y_train[j]) 
                C = 0
                for m in range(self.V): 
                    if(self.y_train[j][m]): 
                        self.loss += -1*self.u[m][0] 
                        C += 1
                self.loss += C*np.log(np.sum(np.exp(self.u))) 
            print("epoch ",x, " loss = ",self.loss) 
            self.alpha *= 1/( (1+self.alpha*x) ) 
            
    def predict(self,word,number_of_predictions): 
        if word in self.words: 
            index = self.word_index[word] 
            X = [0 for i in range(self.V)] 
            X[index] = 1
            prediction = self.feed_forward(X) 
            output = {} 
            for i in range(self.V): 
                output[prediction[i][0]] = i 
            
            top_context_words = [] 
            for k in sorted(output,reverse=True): 
                top_context_words.append(self.words[output[k]]) 
                if(len(top_context_words)>=number_of_predictions): 
                    break
    
            return top_context_words 
        else: 
            print("Word not found in dicitonary")



ModuleNotFoundError: No module named 'nltk'

In [2]:
def preprocessing(corpus): 
    stop_words = set(stopwords.words('english'))
    training_data = [] 
    sentences = corpus.split(".") 
    for i in range(len(sentences)): 
        sentences[i] = sentences[i].strip() 
        sentence = sentences[i].split() 
        x = [word.strip(string.punctuation) for word in sentence 
                                    if word not in stop_words] 
        x = [word.lower() for word in x] 
        training_data.append(x) 
    return training_data 
    

def prepare_data_for_training(sentences,w2v): 
    data = {} 
    for sentence in sentences: 
        for word in sentence: 
            if word not in data: 
                data[word] = 1
            else: 
                data[word] += 1
    V = len(data) 
    data = sorted(list(data.keys())) 
    vocab = {} 
    for i in range(len(data)): 
        vocab[data[i]] = i 
    
    #for i in range(len(words)): 
    for sentence in sentences: 
        for i in range(len(sentence)): 
            center_word = [0 for x in range(V)] 
            center_word[vocab[sentence[i]]] = 1
            context = [0 for x in range(V)] 
            
            for j in range(i-w2v.window_size,i+w2v.window_size): 
                if i!=j and j>=0 and j<len(sentence): 
                    context[vocab[sentence[j]]] += 1
            w2v.X_train.append(center_word) 
            w2v.y_train.append(context) 
    w2v.initialize(V,data) 

    return w2v.X_train,w2v.y_train 


In [5]:
corpus = "" 
corpus += "The earth revolves around the sun. The moon revolves around the earth"
epochs = 1000

training_data = preprocessing(corpus) 
w2v = word2vec() 

prepare_data_for_training(training_data,w2v) 
w2v.train(epochs) 

print(w2v.predict("revolves",3))


epoch  1  loss =  44.840167573247605
epoch  2  loss =  44.73607780191064
epoch  3  loss =  44.632909019106364
epoch  4  loss =  44.53075030724695
epoch  5  loss =  44.42968657511885
epoch  6  loss =  44.329798142910995
epoch  7  loss =  44.23116038085036
epoch  8  loss =  44.13384340507927
epoch  9  loss =  44.03791183313798
epoch  10  loss =  43.943424600136204
epoch  11  loss =  43.850434835453626
epoch  12  loss =  43.75898979864179
epoch  13  loss =  43.669130872139
epoch  14  loss =  43.580893607486814
epoch  15  loss =  43.49430782096532
epoch  16  loss =  43.40939773396017
epoch  17  loss =  43.32618215293691
epoch  18  loss =  43.24467468362654
epoch  19  loss =  43.16488397390941
epoch  20  loss =  43.08681397990933
epoch  21  loss =  43.010464249957636
epoch  22  loss =  42.93583022133955
epoch  23  loss =  42.86290352506988
epoch  24  loss =  42.79167229434352
epoch  25  loss =  42.72212147274652
epoch  26  loss =  42.654233118779324
epoch  27  loss =  42.58798670371782
epoc

epoch  246  loss =  39.60891013247807
epoch  247  loss =  39.60672358679348
epoch  248  loss =  39.60455459866512
epoch  249  loss =  39.60240295963513
epoch  250  loss =  39.60026846449874
epoch  251  loss =  39.59815091124174
epoch  252  loss =  39.5960501009793
epoch  253  loss =  39.59396583789633
epoch  254  loss =  39.59189792918896
epoch  255  loss =  39.589846185007566
epoch  256  loss =  39.587810418400885
epoch  257  loss =  39.585790445261516
epoch  258  loss =  39.58378608427257
epoch  259  loss =  39.58179715685552
epoch  260  loss =  39.57982348711912
epoch  261  loss =  39.5778649018097
epoch  262  loss =  39.575921230262175
epoch  263  loss =  39.573992304352494
epoch  264  loss =  39.572077958450926
epoch  265  loss =  39.57017802937634
epoch  266  loss =  39.56829235635161
epoch  267  loss =  39.5664207809599
epoch  268  loss =  39.564563147101886
epoch  269  loss =  39.56271930095391
epoch  270  loss =  39.56088909092704
epoch  271  loss =  39.55907236762703
epoch  2

epoch  530  loss =  39.31970460588878
epoch  531  loss =  39.31923460215587
epoch  532  loss =  39.318766375811485
epoch  533  loss =  39.318299916815675
epoch  534  loss =  39.31783521520376
epoch  535  loss =  39.31737226108569
epoch  536  loss =  39.316911044645245
epoch  537  loss =  39.31645155613949
epoch  538  loss =  39.31599378589801
epoch  539  loss =  39.31553772432225
epoch  540  loss =  39.31508336188486
epoch  541  loss =  39.314630689129025
epoch  542  loss =  39.31417969666782
epoch  543  loss =  39.3137303751836
epoch  544  loss =  39.313282715427285
epoch  545  loss =  39.312836708217816
epoch  546  loss =  39.312392344441484
epoch  547  loss =  39.31194961505132
epoch  548  loss =  39.311508511066506
epoch  549  loss =  39.31106902357175
epoch  550  loss =  39.310631143716726
epoch  551  loss =  39.310194862715406
epoch  552  loss =  39.30976017184558
epoch  553  loss =  39.309327062448226
epoch  554  loss =  39.30889552592691
epoch  555  loss =  39.30846555374731
ep

epoch  800  loss =  39.23566384733419
epoch  801  loss =  39.23545841675292
epoch  802  loss =  39.23525350128079
epoch  803  loss =  39.23504909898478
epoch  804  loss =  39.234845207941504
epoch  805  loss =  39.23464182623719
epoch  806  loss =  39.234438951967576
epoch  807  loss =  39.2342365832379
epoch  808  loss =  39.23403471816278
epoch  809  loss =  39.233833354866235
epoch  810  loss =  39.23363249148151
epoch  811  loss =  39.233432126151136
epoch  812  loss =  39.23323225702682
epoch  813  loss =  39.23303288226937
epoch  814  loss =  39.23283400004873
epoch  815  loss =  39.232635608543774
epoch  816  loss =  39.23243770594239
epoch  817  loss =  39.23224029044134
epoch  818  loss =  39.23204336024623
epoch  819  loss =  39.23184691357149
epoch  820  loss =  39.231650948640265
epoch  821  loss =  39.23145546368441
epoch  822  loss =  39.23126045694438
epoch  823  loss =  39.23106592666926
epoch  824  loss =  39.230871871116605
epoch  825  loss =  39.230678288552475
epoch