# SECTION 1: load db

In [1]:
with open('../part11/labels.txt') as labels:
    target_dataset = [1 if lbl[0] == 'p' else 0 for lbl in labels.readlines()]

with open('../part11/reviews.txt') as reviews:
    text = [line.upper() for line in reviews.readlines()]

vocab = set(' '.join(text).split(' '))  # 74075
vocab.remove('')

word2index = {word: i for i, word in enumerate(vocab)}  # 74075

input_dataset = [[word2index[word] for word in review.split(' ') if word != ''] for review in text]

In [2]:
from sklearn.utils import shuffle
input_dataset_shuffled, target_dataset_shuffled = shuffle(input_dataset, target_dataset)
print('Length dataset samples {}'.format(len(input_dataset_shuffled)))
print('Length positive samples {}'.format(len([t for t in target_dataset_shuffled if t == 1])))
print('Length negative samples {}'.format(len([t for t in target_dataset_shuffled if t == 0])))

Length dataset samples 25000
Length positive samples 12500
Length negative samples 12500


# SECTION 2: fit & predict

In [3]:
import numpy as np
np.random.seed(1)

In [4]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [5]:
ALPHA = 0.01
ITERATIONS = 5
HIDDEN_SIZE = 100

In [6]:
weights_0_1 = 0.2 * np.random.random((len(vocab), HIDDEN_SIZE)) - 0.1
weights_1_2 = 0.2 * np.random.random((HIDDEN_SIZE, 1)) - 0.1

In [7]:
def fit(x_train, y_train):
    correct = 0
    total = 0
    
    global weights_0_1, weights_1_2
    
    for iter in range(ITERATIONS):
        for i in range(len(x_train)):
            x = x_train[i]
            y = y_train[i]
            
            layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
            layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
            layer_2_delta = layer_2 - y
            layer_1_delta = layer_2_delta.dot(weights_1_2.T)
            weights_0_1[x] -= layer_1_delta * ALPHA
            weights_1_2 -= np.outer(layer_1,layer_2_delta) * ALPHA
            
            if(np.abs(layer_2_delta) < 0.5):
                correct += 1
            total += 1
        
        print("iter: {}, train: {}".format(iter, correct/total))

In [8]:
fit(input_dataset_shuffled[:-1000], target_dataset_shuffled[:-1000])

iter: 0, train: 0.82025
iter: 1, train: 0.8485625
iter: 2, train: 0.8620833333333333
iter: 3, train: 0.8705833333333334
iter: 4, train: 0.8765166666666667


In [9]:
def predict(x_test, y_test):
    correct = 0
    total = 0
    
    global weights_0_1, weights_1_2
    
    for i in range(len(x_test)):
        x = x_test[i]
        y = y_test[i]

        layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))

        if(np.abs(layer_2 - y) < 0.5):
            correct += 1
        total += 1

    print("test: {}".format(correct/total))    

In [10]:
predict(input_dataset_shuffled[-1000:], target_dataset_shuffled[-1000:])

test: 0.866


# SECTION 3: similar reviews

In [11]:
from collections import Counter

In [12]:
tokens = list(map(set, [line.split(' ') for line in text]))
len(tokens[0])

94

In [13]:
len(input_dataset[0])

168

In [14]:
norms = np.sum(weights_0_1 * weights_0_1, axis=1)

In [15]:
norms.shape

(74074,)

In [16]:
norms.resize(norms.shape[0], 1)

In [17]:
normed_weights = weights_0_1 * norms

In [18]:
def make_sent_vect(words):
    words = [w.upper() for w in words]
    indices = [word2index[word] for word in words if word in word2index]
    return np.mean(normed_weights[indices], axis=0)

In [19]:
reviews2vectors = np.array([make_sent_vect(review) for review in tokens])

In [20]:
def most_similar_reviews(review):
    v = make_sent_vect(review)
    scores = Counter()
    for i, val in enumerate(reviews2vectors.dot(v)):
        scores[i] = val
    most_similar = list()
    for idx, score in scores.most_common(3):
        most_similar.append(text[idx][:100])
    return most_similar

In [21]:
most_similar_reviews(['boring','awful'])

['THIS IS WITHOUT A DOUBT THE WORST MOVIE I HAVE EVER SEEN . IT IS NOT FUNNY . IT IS NOT INTERESTING A',
 'THIS MOVIE IS SO BAD  IT CAN ONLY BE COMPARED TO THE ALL  TIME WORST  COMEDY   POLICE ACADEMY  . NO ',
 'I  VE SEEN ABOUT    MOVIES RELEASED BETWEEN         AND THE INFORMER IS THE WORST MAJOR RELEASE I  V']

In [22]:
most_similar_reviews(['great','amazing'])

['ADRIAN PASDAR IS EXCELLENT IS THIS FILM . HE MAKES A FASCINATING WOMAN .  \n',
 'EXCELLENT EPISODE MOVIE ALA PULP FICTION .  DAYS   SUICIDES . IT DOESNT GET MORE DEPRESSING THAN THI',
 'BRILLIANT EXECUTION IN DISPLAYING ONCE AND FOR ALL  THIS TIME IN THE VENUE OF POLITICS  OF HOW  GOOD']

# SECTION 4: identity

In [23]:
a = np.array([1, 2, 3])
b = np.array([0.1, 0.2, 0.3])
c = np.array([-1, -0.5, 0])
d = np.array([0, 0, 0])
identity = np.eye(3)

In [24]:
print(identity)

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [25]:
print(a.dot(identity))
print(b.dot(identity))
print(c.dot(identity))
print(d.dot(identity))

[1. 2. 3.]
[0.1 0.2 0.3]
[-1.  -0.5  0. ]
[0. 0. 0.]


# SECTION 5: forward prop

In [27]:
def softmax(x_):
    x = np.atleast_2d(x_)
    temp = np.exp(x)
    return temp / np.sum(temp, axis=1, keepdims=True)

In [31]:
word_vects = {}
word_vects['yankees'] = np.array([[0., 0., 0.]])
word_vects['bears'] = np.array([[0., 0., 0.]])
word_vects['braves'] = np.array([[0., 0., 0.]])
word_vects['red'] = np.array([[0., 0., 0.]])
word_vects['socks'] = np.array([[0., 0., 0.]])
word_vects['lose'] = np.array([[0., 0., 0.]])
word_vects['defeat'] = np.array([[0., 0., 0.]])
word_vects['beat'] = np.array([[0., 0., 0.]])
word_vects['tie'] = np.array([[0., 0., 0.]])

sent2output = np.random.rand(3, len(word_vects))
identity = np.eye(3)

layer_0 = word_vects['red']
layer_1 = layer_0.dot(identity) + word_vects['socks']
layer_2 = layer_1.dot(identity) + word_vects['defeat']

pred = softmax(np.dot(layer_2, sent2output))

# SECTION 6: back prop

In [37]:
y = np.array([1, 0, 0, 0, 0, 0, 0, 0, 0])
pred_delta = pred - y
layer_2_delta = pred_delta.dot(sent2output.T)
defeat_delta = layer_2_delta * 1
layer_1_delta = layer_2_delta.dot(identity.T)
sox_delta = layer_1_delta * 1
layer_0_delta = layer_1_delta.dot(identity.T)
alpha = 0.01
word_vects['red'] -= layer_0_delta * alpha
word_vects['socks'] -= sox_delta * alpha
word_vects['defeat'] -= defeat_delta * alpha
identity -= np.outer(layer_0, layer_1_delta) * alpha
identity -= np.outer(layer_1, layer_2_delta) * alpha
sent2output -= np.outer(layer_2, pred_delta) * alpha

# SECTION 7: babi

In [38]:
! wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-1.tar.gz
! tar -xvf tasks_1-20_v1-1.tar.gz

--2021-09-21 04:20:37--  http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-1.tar.gz
Распознаётся www.thespermwhale.com (www.thespermwhale.com)… 50.31.160.191
Подключение к www.thespermwhale.com (www.thespermwhale.com)|50.31.160.191|:80... соединение установлено.
HTTP-запрос отправлен. Ожидание ответа… 200 OK
Длина: 1282454 (1,2M) [application/x-gzip]
Сохранение в: «tasks_1-20_v1-1.tar.gz»


2021-09-21 04:20:39 (1,12 MB/s) - «tasks_1-20_v1-1.tar.gz» сохранён [1282454/1282454]

tar: Игнорируется неизвестное ключевое слово расширенного заголовка «LIBARCHIVE.creationtime»
tar: Игнорируется неизвестное ключевое слово расширенного заголовка «SCHILY.dev»
tar: Игнорируется неизвестное ключевое слово расширенного заголовка «SCHILY.ino»
tar: Игнорируется неизвестное ключевое слово расширенного заголовка «SCHILY.nlink»
tasksv11/
tar: Игнорируется неизвестное ключевое слово расширенного заголовка «SCHILY.dev»
tar: Игнорируется неизвестное ключевое слово расширенного заголовка «SCHILY.ino»

tar: Игнорируется неизвестное ключевое слово расширенного заголовка «SCHILY.dev»
tar: Игнорируется неизвестное ключевое слово расширенного заголовка «SCHILY.ino»
tar: Игнорируется неизвестное ключевое слово расширенного заголовка «SCHILY.nlink»
tasksv11/en/qa10_indefinite-knowledge_train.txt
tar: Игнорируется неизвестное ключевое слово расширенного заголовка «SCHILY.dev»
tar: Игнорируется неизвестное ключевое слово расширенного заголовка «SCHILY.ino»
tar: Игнорируется неизвестное ключевое слово расширенного заголовка «SCHILY.nlink»
tasksv11/en/qa11_basic-coreference_test.txt
tar: Игнорируется неизвестное ключевое слово расширенного заголовка «SCHILY.dev»
tar: Игнорируется неизвестное ключевое слово расширенного заголовка «SCHILY.ino»
tar: Игнорируется неизвестное ключевое слово расширенного заголовка «SCHILY.nlink»
tasksv11/en/qa11_basic-coreference_train.txt
tar: Игнорируется неизвестное ключевое слово расширенного заголовка «SCHILY.dev»
tar: Игнорируется неизвестное ключ

In [89]:
import re

with open('tasksv11/en/qa1_single-supporting-fact_train.txt') as f:
    raw = f.readlines()

tokens = list()
for line in raw[:1000]:
    opt = re.sub(r'[^\w\s]', '', line.lower())
    tokens.append(opt.replace('\n', '').replace('\t', ' ').split(' ')[1:])

In [90]:
raw[:3]

['1 Mary moved to the bathroom.\n',
 '2 John went to the hallway.\n',
 '3 Where is Mary? \tbathroom\t1\n']

In [91]:
tokens[:3]

[['mary', 'moved', 'to', 'the', 'bathroom'],
 ['john', 'went', 'to', 'the', 'hallway'],
 ['where', 'is', 'mary', '', 'bathroom', '1']]

In [92]:
vocab = {word for sent in tokens for word in sent}
vocab = list(vocab)

In [93]:
word2index = {word: i for i, word in enumerate(vocab)}

In [94]:
def word2indices(sentence):
    return [word2index[word] for word in sentence]

In [95]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [96]:
np.random.seed(1)

In [97]:
EMBED_SIZE = 10

In [98]:
embed = (np.random.rand(len(vocab), EMBED_SIZE) - 0.5) * 0.1
recurrent = np.eye(EMBED_SIZE)
start = np.zeros(EMBED_SIZE)
decoder = (np.random.rand(EMBED_SIZE, len(vocab)) - 0.5) * 0.1
one_hot = np.eye(len(vocab))

In [99]:
def predict(sent):
    layer = dict()
    layer['hidden'] = start
    layers = list()
    layers.append(layer)
    
    loss = 0
    preds = list()
    for target_i in range(len(sent)):
        layer = dict()
        layer['pred'] = softmax(layers[-1]['hidden'].dot(decoder))
        loss += -np.log(layer['pred'][sent[target_i]])
        layer['hidden'] = layers[-1]['hidden'].dot(recurrent) + embed[sent[target_i]]
        layers.append(layer)
    
    return layers, loss

In [100]:
for iter in range(30000):
    ALPHA = 0.001
    sent = word2indices(tokens[iter % len(tokens)][1:])
    layers, loss = predict(sent)
    
    for layer_idx in reversed(range(len(layers))):
        layer = layers[layer_idx]
        target = sent[layer_idx - 1]
        if(layer_idx > 0):
            layer['output_delta'] = layer['pred'] - one_hot[target]
            new_hidden_delta = layer['output_delta'].dot(decoder.T)
            if(layer_idx == len(layers)-1):
                layer['hidden_delta'] = new_hidden_delta
            else:
                layer['hidden_delta'] = new_hidden_delta + layers[layer_idx+1]['hidden_delta'].dot(recurrent.T)
        else:
            layer['hidden_delta'] = layers[layer_idx+1]['hidden_delta'].dot(recurrent.T)
            
    start -= layers[0]['hidden_delta'] * ALPHA / float(len(sent))
    for layer_idx,layer in enumerate(layers[1:]):
        decoder -= np.outer(layers[layer_idx]['hidden'],\
        layer['output_delta']) * ALPHA / float(len(sent))
        embed_idx = sent[layer_idx]
        embed[embed_idx] -= layers[layer_idx]['hidden_delta'] * ALPHA / float(len(sent))
        recurrent -= np.outer(layers[layer_idx]['hidden'], layer['hidden_delta']) * ALPHA / float(len(sent))
        if(iter % 1000 == 0):
            print("Perplexity:" + str(np.exp(loss/len(sent))))            

Perplexity:30.038921571864865
Perplexity:30.038921571864865
Perplexity:30.038921571864865
Perplexity:30.038921571864865
Perplexity:29.975890308811596
Perplexity:29.975890308811596
Perplexity:29.975890308811596
Perplexity:29.975890308811596
Perplexity:29.906466931910458
Perplexity:29.906466931910458
Perplexity:29.906466931910458
Perplexity:29.906466931910458
Perplexity:29.809777852122263
Perplexity:29.809777852122263
Perplexity:29.809777852122263
Perplexity:29.809777852122263
Perplexity:29.653315202674566
Perplexity:29.653315202674566
Perplexity:29.653315202674566
Perplexity:29.653315202674566
Perplexity:29.37624181809044
Perplexity:29.37624181809044
Perplexity:29.37624181809044
Perplexity:29.37624181809044
Perplexity:28.846207331927058
Perplexity:28.846207331927058
Perplexity:28.846207331927058
Perplexity:28.846207331927058
Perplexity:27.714714925195725
Perplexity:27.714714925195725
Perplexity:27.714714925195725
Perplexity:27.714714925195725
Perplexity:24.799951472333888
Perplexity:24.

In [102]:
sent_index = 4
l,_ = predict(word2indices(tokens[sent_index]))
print(tokens[sent_index])
for i,each_layer in enumerate(l[1:-1]):
    input = tokens[sent_index][i]
    true = tokens[sent_index][i+1]
    pred = vocab[each_layer['pred'].argmax()]
    print("Prev Input:" + input + (' ' * (12 - len(input))) +\
    "True:" + true + (" " * (15 - len(true))) + "Pred:" + pred)

['sandra', 'moved', 'to', 'the', 'garden']
Prev Input:sandra      True:moved          Pred:to
Prev Input:moved       True:to             Pred:to
Prev Input:to          True:the            Pred:the
Prev Input:the         True:garden         Pred:bedroom
