In [1]:
import numpy as np
from collections import defaultdict

In [2]:
settings = {
    'window_size': 2,
    'n': 10,
    'epochs': 50,
    'learning_rate': 0.01
}

In [3]:
class word2vec():
    #иницилизация гиперпараметров, а также текста и размера матриц, который зависит от текста
    def __init__(self):
        self.n = settings['n']
        self.lr = settings['learning_rate']
        self.epochs = settings['epochs']
        self.window = settings['window_size']
        self.your_text = your_text
        self.output_size = len(your_text.split())
        self.input_size = len(your_text.split()) + 1
        self.getW1 = np.array([[np.random.normal(loc=0, scale=0.01) for i in range(self.input_size)]
                               for x in range(self.output_size)])
        self.getW2 = np.array([[np.random.normal(loc=0, scale=0.01) for i in range(self.output_size)]
                               for x in range(self.input_size)])
        
    #генерируем тренировочную дату
    def generate_training_data(self, settings, corpus):        
        word_counts = defaultdict(int)
        for row in corpus:
            for word in row:
                word_counts[word] += 1
                
        self.v_count = len(word_counts.keys())
                           
        self.words_list = list(word_counts.keys())
                           
        self.word_index = dict((word, i) for i, word in enumerate(self.words_list))
                           
        self.index_word = dict((i, word) for i, word in enumerate(self.words_list))
                           
        training_data = []
        
        corpus = [[word.lower() for word in self.your_text.split()]]
        
        for sentence in corpus:
            sent_len = len(sentence)
            
            for i, word in enumerate(sentence):
                w_target = self.word2onehot(sentence[i])

                w_context = []  
                
                for j in range(i - self.window, i + self.window+1):
                    if j != i and j <= sent_len-1 and j >= 0:
                        w_context.append(self.word2onehot(sentence[j]))
                training_data.append([w_target, w_context])
        return np.array(training_data)
      
                           
    def word2onehot(self, word):
        word_vec = [0 for i in range(0, self.v_count)]
        word_index = self.word_index[word]
        word_vec[word_index] = 1
        return word_vec
                           
    def train(self, training_data):
        self.w1 = np.array(self.getW1)
        self.w2 = np.array(self.getW2)
                           
        
        for i in range(self.epochs):
            self.loss = 0
            for w_t, w_c in training_data:
                y_pred, h, u = self.forward_pass(w_t)
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)
                           
                self.backprop(EI, h, w_t)
                
                self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c)
                           
            print('Epoch:', i, "Loss:", self.loss)
            
    def forward_pass(self, x):
        h = np.dot(x, self.w1)
        u = np.dot(h, self.w2)
        y_c = self.softmax(u)
        return y_c, h, u
        
    def softmax(self, x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)
                           
    def backprop(self, e, h, x):
        dl_dw2 = np.outer(h, e)
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))
                           
        self.w1 = self.w1 - (self.lr * dl_dw1)
        self.w2 = self.w2 - (self.lr * dl_dw2)
                           
    def word_vec(self, word):
        w_index = self.word_index[word]
        v_w = self.w1[w_index]
        return v_w
                      

In [4]:
your_text = 'one two free' #наш текст
def train(your_text): #функция для возвращения словаря
    w2v = word2vec()
    corpus = [[word.lower() for word in your_text.split()]]
    training_data = w2v.generate_training_data(settings, corpus)
    w2v.train(training_data)
    w2v_dict = {word: w2v.word_vec(word) for word in your_text.split()} #создаём словарь
    return w2v_dict

In [5]:
train(your_text) #пример1

Epoch: 0 Loss: 5.999238985756092
Epoch: 1 Loss: 5.999232035150097
Epoch: 2 Loss: 5.99922497457178
Epoch: 3 Loss: 5.999217801214104
Epoch: 4 Loss: 5.9992105122255115
Epoch: 5 Loss: 5.99920310470878
Epoch: 6 Loss: 5.9991955757198845
Epoch: 7 Loss: 5.999187922266826
Epoch: 8 Loss: 5.999180141308451
Epoch: 9 Loss: 5.999172229753244
Epoch: 10 Loss: 5.999164184458104
Epoch: 11 Loss: 5.999156002227105
Epoch: 12 Loss: 5.999147679810226
Epoch: 13 Loss: 5.9991392139020645
Epoch: 14 Loss: 5.999130601140537
Epoch: 15 Loss: 5.999121838105539
Epoch: 16 Loss: 5.999112921317598
Epoch: 17 Loss: 5.999103847236496
Epoch: 18 Loss: 5.999094612259868
Epoch: 19 Loss: 5.999085212721781
Epoch: 20 Loss: 5.999075644891283
Epoch: 21 Loss: 5.9990659049709265
Epoch: 22 Loss: 5.9990559890952735
Epoch: 23 Loss: 5.999045893329359
Epoch: 24 Loss: 5.999035613667143
Epoch: 25 Loss: 5.999025146029928
Epoch: 26 Loss: 5.999014486264743
Epoch: 27 Loss: 5.999003630142708
Epoch: 28 Loss: 5.998992573357359
Epoch: 29 Loss: 5.998

{'one': array([ 0.01200067,  0.00646814, -0.00113942,  0.00237128]),
 'two': array([-0.00333234,  0.00654017, -0.01049422,  0.01786332]),
 'free': array([-0.01690989, -0.00951925, -0.01364383,  0.00172942])}

In [6]:
your_text = 'one two free four five'
train(your_text) #пример 2

Epoch: 0 Loss: 13.99930315898227
Epoch: 1 Loss: 13.999249838134842
Epoch: 2 Loss: 13.999196126860456
Epoch: 3 Loss: 13.999141982889308
Epoch: 4 Loss: 13.999087363450775
Epoch: 5 Loss: 13.999032225231431
Epoch: 6 Loss: 13.998976524332509
Epoch: 7 Loss: 13.998920216226733
Epoch: 8 Loss: 13.998863255714522
Epoch: 9 Loss: 13.998805596879505
Epoch: 10 Loss: 13.998747193043279
Epoch: 11 Loss: 13.998687996719411
Epoch: 12 Loss: 13.998627959566587
Epoch: 13 Loss: 13.998567032340894
Epoch: 14 Loss: 13.998505164847195
Epoch: 15 Loss: 13.9984423058895
Epoch: 16 Loss: 13.998378403220329
Epoch: 17 Loss: 13.998313403489018
Epoch: 18 Loss: 13.99824725218887
Epoch: 19 Loss: 13.99817989360315
Epoch: 20 Loss: 13.998111270749828
Epoch: 21 Loss: 13.998041325325062
Epoch: 22 Loss: 13.997969997645308
Epoch: 23 Loss: 13.997897226588046
Epoch: 24 Loss: 13.997822949531031
Epoch: 25 Loss: 13.997747102290043
Epoch: 26 Loss: 13.997669619055028
Epoch: 27 Loss: 13.997590432324618
Epoch: 28 Loss: 13.997509472838905


{'one': array([ 0.00550657,  0.00587883,  0.0065588 , -0.01594234, -0.02094437,
        -0.01457493]),
 'two': array([ 0.01289824, -0.01709041,  0.01038659, -0.00306691,  0.02860904,
         0.00128626]),
 'free': array([ 0.00376701, -0.00985002,  0.01274947, -0.00328491, -0.00874226,
        -0.00328365]),
 'four': array([-0.02284178,  0.01747276,  0.00507223,  0.00632609, -0.02170538,
         0.00569176]),
 'five': array([-0.00652117, -0.00286432, -0.00620127,  0.01358764, -0.00085349,
         0.00171116])}

In [7]:
your_text = 'one two free four five six'
train(your_text) #пример 3

Epoch: 0 Loss: 17.999083488915865
Epoch: 1 Loss: 17.998945584038708
Epoch: 2 Loss: 17.998806590007636
Epoch: 3 Loss: 17.99866636922765
Epoch: 4 Loss: 17.998524782908664
Epoch: 5 Loss: 17.9983816909126
Epoch: 6 Loss: 17.998236951599047
Epoch: 7 Loss: 17.998090421669403
Epoch: 8 Loss: 17.997941956009225
Epoch: 9 Loss: 17.99779140752867
Epoch: 10 Loss: 17.99763862700085
Epoch: 11 Loss: 17.997483462897872
Epoch: 12 Loss: 17.99732576122441
Epoch: 13 Loss: 17.997165365348657
Epoch: 14 Loss: 17.99700211583037
Epoch: 15 Loss: 17.99683585024593
Epoch: 16 Loss: 17.99666640301013
Epoch: 17 Loss: 17.99649360519457
Epoch: 18 Loss: 17.996317284342354
Epoch: 19 Loss: 17.99613726427903
Epoch: 20 Loss: 17.995953364919412
Epoch: 21 Loss: 17.995765402070163
Epoch: 22 Loss: 17.995573187227915
Epoch: 23 Loss: 17.99537652737266
Epoch: 24 Loss: 17.995175224756217
Epoch: 25 Loss: 17.99496907668556
Epoch: 26 Loss: 17.994757875300714
Epoch: 27 Loss: 17.994541407347036
Epoch: 28 Loss: 17.99431945394159
Epoch: 29

{'one': array([ 0.0004641 ,  0.0114827 ,  0.01128569, -0.01921965,  0.01070365,
        -0.01269281,  0.00998191]),
 'two': array([ 0.00847985, -0.03666456, -0.01834598, -0.00728984,  0.01102393,
         0.01159885,  0.00655206]),
 'free': array([ 0.0090774 , -0.00022874, -0.0002884 , -0.01166282,  0.01693539,
        -0.00220819, -0.00258049]),
 'four': array([-0.01992802,  0.03628376,  0.03707087, -0.00669192, -0.00273951,
        -0.00586849,  0.00305702]),
 'five': array([ 0.01718522, -0.00797481,  0.00859062, -0.00890876, -0.00921552,
         0.02401479, -0.0046938 ]),
 'six': array([ 0.00414751, -0.02942722,  0.00739562,  0.0001866 , -0.00153738,
         0.01957432,  0.01100757])}

In [8]:
your_text = 'hey guys'
train(your_text) #пример 4

Epoch: 0 Loss: 1.9996057562251783
Epoch: 1 Loss: 1.999598035621276
Epoch: 2 Loss: 1.9995901986522846
Epoch: 3 Loss: 1.9995822422016172
Epoch: 4 Loss: 1.9995741631054753
Epoch: 5 Loss: 1.9995659581515968
Epoch: 6 Loss: 1.9995576240779802
Epoch: 7 Loss: 1.9995491575715947
Epoch: 8 Loss: 1.999540555267066
Epoch: 9 Loss: 1.9995318137453437
Epoch: 10 Loss: 1.9995229295323482
Epoch: 11 Loss: 1.9995138990975936
Epoch: 12 Loss: 1.9995047188527906
Epoch: 13 Loss: 1.9994953851504267
Epoch: 14 Loss: 1.9994858942823213
Epoch: 15 Loss: 1.9994762424781594
Epoch: 16 Loss: 1.999466425904
Epoch: 17 Loss: 1.9994564406607576
Epoch: 18 Loss: 1.999446282782662
Epoch: 19 Loss: 1.9994359482356896
Epoch: 20 Loss: 1.999425432915967
Epoch: 21 Loss: 1.9994147326481508
Epoch: 22 Loss: 1.9994038431837744
Epoch: 23 Loss: 1.9993927601995716
Epoch: 24 Loss: 1.9993814792957674
Epoch: 25 Loss: 1.9993699959943383
Epoch: 26 Loss: 1.9993583057372455
Epoch: 27 Loss: 1.9993464038846338
Epoch: 28 Loss: 1.9993342857129992
Epo

{'hey': array([0.00856688, 0.02418843, 0.01080571]),
 'guys': array([ 0.02058638, -0.01435757,  0.00654123])}