# SECTION 1: load db

In [1]:
with open('../part11/labels.txt') as labels:
    target_dataset = [1 if lbl[0] == 'p' else 0 for lbl in labels.readlines()]

with open('../part11/reviews.txt') as reviews:
    text = [line.upper() for line in reviews.readlines()]

vocab = set(' '.join(text).split(' '))  # 74075
vocab.remove('')

word2index = {word: i for i, word in enumerate(vocab)}  # 74075

input_dataset = [[word2index[word] for word in review.split(' ') if word != ''] for review in text]

In [2]:
from sklearn.utils import shuffle
input_dataset_shuffled, target_dataset_shuffled = shuffle(input_dataset, target_dataset)
print('Length dataset samples {}'.format(len(input_dataset_shuffled)))
print('Length positive samples {}'.format(len([t for t in target_dataset_shuffled if t == 1])))
print('Length negative samples {}'.format(len([t for t in target_dataset_shuffled if t == 0])))

Length dataset samples 25000
Length positive samples 12500
Length negative samples 12500


# SECTION 2: fit & predict

In [3]:
import numpy as np
np.random.seed(1)

In [4]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [5]:
ALPHA = 0.01
ITERATIONS = 5
HIDDEN_SIZE = 100

In [6]:
weights_0_1 = 0.2 * np.random.random((len(vocab), HIDDEN_SIZE)) - 0.1
weights_1_2 = 0.2 * np.random.random((HIDDEN_SIZE, 1)) - 0.1

In [7]:
def fit(x_train, y_train):
    correct = 0
    total = 0
    
    global weights_0_1, weights_1_2
    
    for iter in range(ITERATIONS):
        for i in range(len(x_train)):
            x = x_train[i]
            y = y_train[i]
            
            layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
            layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
            layer_2_delta = layer_2 - y
            layer_1_delta = layer_2_delta.dot(weights_1_2.T)
            weights_0_1[x] -= layer_1_delta * ALPHA
            weights_1_2 -= np.outer(layer_1,layer_2_delta) * ALPHA
            
            if(np.abs(layer_2_delta) < 0.5):
                correct += 1
            total += 1
        
        print("iter: {}, train: {}".format(iter, correct/total))

In [8]:
fit(input_dataset_shuffled[:-1000], target_dataset_shuffled[:-1000])

iter: 0, train: 0.82025
iter: 1, train: 0.8485625
iter: 2, train: 0.8620833333333333
iter: 3, train: 0.8705833333333334
iter: 4, train: 0.8765166666666667


In [9]:
def predict(x_test, y_test):
    correct = 0
    total = 0
    
    global weights_0_1, weights_1_2
    
    for i in range(len(x_test)):
        x = x_test[i]
        y = y_test[i]

        layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))

        if(np.abs(layer_2 - y) < 0.5):
            correct += 1
        total += 1

    print("test: {}".format(correct/total))    

In [10]:
predict(input_dataset_shuffled[-1000:], target_dataset_shuffled[-1000:])

test: 0.866


# SECTION 3: similar reviews

In [11]:
from collections import Counter

In [12]:
tokens = list(map(set, [line.split(' ') for line in text]))
len(tokens[0])

94

In [13]:
len(input_dataset[0])

168

In [14]:
norms = np.sum(weights_0_1 * weights_0_1, axis=1)

In [15]:
norms.shape

(74074,)

In [16]:
norms.resize(norms.shape[0], 1)

In [17]:
normed_weights = weights_0_1 * norms

In [18]:
def make_sent_vect(words):
    words = [w.upper() for w in words]
    indices = [word2index[word] for word in words if word in word2index]
    return np.mean(normed_weights[indices], axis=0)

In [19]:
reviews2vectors = np.array([make_sent_vect(review) for review in tokens])

In [20]:
def most_similar_reviews(review):
    v = make_sent_vect(review)
    scores = Counter()
    for i, val in enumerate(reviews2vectors.dot(v)):
        scores[i] = val
    most_similar = list()
    for idx, score in scores.most_common(3):
        most_similar.append(text[idx][:100])
    return most_similar

In [21]:
most_similar_reviews(['boring','awful'])

['THIS IS WITHOUT A DOUBT THE WORST MOVIE I HAVE EVER SEEN . IT IS NOT FUNNY . IT IS NOT INTERESTING A',
 'THIS MOVIE IS SO BAD  IT CAN ONLY BE COMPARED TO THE ALL  TIME WORST  COMEDY   POLICE ACADEMY  . NO ',
 'I  VE SEEN ABOUT    MOVIES RELEASED BETWEEN         AND THE INFORMER IS THE WORST MAJOR RELEASE I  V']

In [22]:
most_similar_reviews(['great','amazing'])

['ADRIAN PASDAR IS EXCELLENT IS THIS FILM . HE MAKES A FASCINATING WOMAN .  \n',
 'EXCELLENT EPISODE MOVIE ALA PULP FICTION .  DAYS   SUICIDES . IT DOESNT GET MORE DEPRESSING THAN THI',
 'BRILLIANT EXECUTION IN DISPLAYING ONCE AND FOR ALL  THIS TIME IN THE VENUE OF POLITICS  OF HOW  GOOD']

# SECTION 4: identity

In [23]:
a = np.array([1, 2, 3])
b = np.array([0.1, 0.2, 0.3])
c = np.array([-1, -0.5, 0])
d = np.array([0, 0, 0])
identity = np.eye(3)

In [24]:
print(identity)

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [25]:
print(a.dot(identity))
print(b.dot(identity))
print(c.dot(identity))
print(d.dot(identity))

[1. 2. 3.]
[0.1 0.2 0.3]
[-1.  -0.5  0. ]
[0. 0. 0.]


# SECTION 5: forward prop

In [27]:
def softmax(x_):
    x = np.atleast_2d(x_)
    temp = np.exp(x)
    return temp / np.sum(temp, axis=1, keepdims=True)

In [31]:
word_vects = {}
word_vects['yankees'] = np.array([[0., 0., 0.]])
word_vects['bears'] = np.array([[0., 0., 0.]])
word_vects['braves'] = np.array([[0., 0., 0.]])
word_vects['red'] = np.array([[0., 0., 0.]])
word_vects['socks'] = np.array([[0., 0., 0.]])
word_vects['lose'] = np.array([[0., 0., 0.]])
word_vects['defeat'] = np.array([[0., 0., 0.]])
word_vects['beat'] = np.array([[0., 0., 0.]])
word_vects['tie'] = np.array([[0., 0., 0.]])

sent2output = np.random.rand(3, len(word_vects))
identity = np.eye(3)

layer_0 = word_vects['red']
layer_1 = layer_0.dot(identity) + word_vects['socks']
layer_2 = layer_1.dot(identity) + word_vects['defeat']

pred = softmax(np.dot(layer_2, sent2output))

In [33]:
sent2output

array([[0.07569479, 0.6977489 , 0.18699438, 0.69077155, 0.51039448,
        0.0973266 , 0.05715437, 0.46879061, 0.34204848],
       [0.22403682, 0.41088015, 0.2084076 , 0.36949591, 0.27988442,
        0.16890045, 0.60158824, 0.74187254, 0.66526819],
       [0.3791373 , 0.38514617, 0.05521076, 0.42665401, 0.84015619,
        0.31561853, 0.43855738, 0.5145177 , 0.43275748]])

In [34]:
identity

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [36]:
np.dot(layer_2, sent2output)

array([[0., 0., 0., 0., 0., 0., 0., 0., 0.]])