In [41]:
with open('Grokking-Deep-Learning/reviews.txt', encoding='utf-8') as file:
    raw_reviews = file.readlines()                    # отзывы
    
with open('Grokking-Deep-Learning/labels.txt', encoding='utf-8') as file:
    raw_labels = file.readlines()                    # метка отзыва (positive/negative)
    
tokens = list(map(lambda x: set(x.split(' ')), raw_reviews))   # список множеств содержащих слова из каждого отзыва

vocab = set()
for sent in tokens:               # попробовать сделать через объединение множеств
    for word in sent:
        if len(word) > 0:
            vocab.add(word)

vocab = list(vocab)               # список слов встретившихся во всех отзывах

word2index = {}                   # словарь: слово --> индекс (уникальный номер в словаре)

for i, word in enumerate(vocab):
    word2index[word] = i
    
input_dataset = []

for sent in tokens:
    sent_indices = []
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            ''
    input_dataset.append(list(set(sent_indices)))  # список отзывов закодированных числами (цифровая копия tokens)
    
target_dataset = []                                # список меток отзывов: positive - 1, negative - 0

for label in raw_labels:
    if label == 'positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)
        

In [45]:
import numpy as np
import sys
np.random.seed(1)

def sigmoid(x):
    return 1/(1 + np.exp(-x))

alpha, iterations = (0.01, 2)
hidden_size = 100

weights_0_1 = 0.2*np.random.random((len(vocab),hidden_size)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size,1)) - 0.1

correct,total = (0,0)
for iter in range(iterations):
    
    # train on first 24,000
    for i in range(len(input_dataset)-1000):

        x,y = (input_dataset[i],target_dataset[i])
        layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0)) #embed + sigmoid
        layer_2 = sigmoid(np.dot(layer_1,weights_1_2)) # linear + softmax

        layer_2_delta = layer_2 - y # compare pred with truth
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) #backprop

        weights_0_1[x] -= layer_1_delta * alpha
        weights_1_2 -= np.outer(layer_1,layer_2_delta) * alpha

        if(np.abs(layer_2_delta) < 0.5):
            correct += 1
        total += 1
        if(i % 10 == 9):
            progress = str(i/float(len(input_dataset)))
            
            sys.stdout.write('\rIter:'+str(iter)\
                             +' Progress:'+progress[2:4]\
                             +'.'+progress[4:6]\
                             +'% Training Accuracy:'\
                             + str(correct/float(total)) + '%')
    print()
correct,total = (0,0)
for i in range(len(input_dataset)-1000,len(input_dataset)):

    x = input_dataset[i]
    y = target_dataset[i]

    layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0))
    layer_2 = sigmoid(np.dot(layer_1,weights_1_2))
    
    if(np.abs(layer_2 - y) < 0.5):
        correct += 1
    total += 1
print("Test Accuracy:" + str(correct / float(total)))

Iter:0 Progress:95.99% Training Accuracy:0.8346666666666667%%
Iter:1 Progress:95.99% Training Accuracy:0.8675625%84538445%
Test Accuracy:0.845


In [49]:
from collections import Counter
import math 

def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))

    return scores.most_common(10)

In [50]:
print(similar('beautiful'))

[('beautiful', -0.0), ('magic', -0.6860003876370212), ('sent', -0.7380023111650409), ('friendship', -0.7547921878321959), ('superbly', -0.7616060024858399), ('awesome', -0.7713603021761793), ('hilarious', -0.7757668469644036), ('ride', -0.788171403293333), ('thank', -0.7972649245469486), ('episodes', -0.8056853550657797)]


In [52]:
len(input_dataset[0])

93

In [55]:
weights_0_1[[0, 4]]

array([[-0.00622563,  0.04655057, -0.0966767 , -0.03780697, -0.06259451,
        -0.08430637, -0.05914263, -0.03093009, -0.02492853,  0.00841967,
        -0.01241428,  0.04135099, -0.05392465,  0.08402988, -0.09835892,
         0.03405671, -0.00946867,  0.00774975, -0.06476643, -0.05362676,
         0.05133336,  0.09338443, -0.04657866,  0.03560874,  0.07532658,
         0.07067494, -0.07568862, -0.08889746, -0.06371385,  0.07126973,
        -0.07935867, -0.01156049,  0.09445451,  0.01021923,  0.03060122,
        -0.03250844,  0.04234055,  0.06529588, -0.10472489,  0.04446472,
         0.09264704,  0.0533342 , -0.04234498,  0.0519071 , -0.08466537,
        -0.00213631,  0.08944431, -0.0347825 , -0.04872845, -0.08538279,
        -0.09868626,  0.03672784, -0.05941113, -0.05532201, -0.00049988,
        -0.09480089,  0.01126013, -0.08154934,  0.01407197,  0.04803115,
        -0.07249587, -0.02082742,  0.04027172, -0.01620586, -0.08247723,
         0.00130591,  0.02782423,  0.00101702,  0.0