# SECTION 1: load db

In [3]:
with open('../part11/labels.txt') as labels:
    target_dataset = [1 if lbl[0] == 'p' else 0 for lbl in labels.readlines()]

with open('../part11/reviews.txt') as reviews:
    text = [line.upper() for line in reviews.readlines()]

vocab = set(' '.join(text).split(' '))  # 74075
vocab.remove('')

word2index = {word: i for i, word in enumerate(vocab)}  # 74075

input_dataset = [[word2index[word] for word in review.split(' ') if word != ''] for review in text]

In [5]:
from sklearn.utils import shuffle
input_dataset_shuffled, target_dataset_shuffled = shuffle(input_dataset, target_dataset)
print('Length dataset samples {}'.format(len(input_dataset_shuffled)))
print('Length positive samples {}'.format(len([t for t in target_dataset_shuffled if t == 1])))
print('Length negative samples {}'.format(len([t for t in target_dataset_shuffled if t == 0])))

Length dataset samples 25000
Length positive samples 12500
Length negative samples 12500


# SECTION 2: fit & predict

In [8]:
import numpy as np
np.random.seed(1)

In [9]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [10]:
ALPHA = 0.01
ITERATIONS = 5
HIDDEN_SIZE = 100

In [11]:
weights_0_1 = 0.2 * np.random.random((len(vocab), HIDDEN_SIZE)) - 0.1
weights_1_2 = 0.2 * np.random.random((HIDDEN_SIZE, 1)) - 0.1

In [14]:
def fit(x_train, y_train):
    correct = 0
    total = 0
    
    global weights_0_1, weights_1_2
    
    for iter in range(ITERATIONS):
        for i in range(len(x_train)):
            x = x_train[i]
            y = y_train[i]
            
            layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
            layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
            layer_2_delta = layer_2 - y
            layer_1_delta = layer_2_delta.dot(weights_1_2.T)
            weights_0_1[x] -= layer_1_delta * ALPHA
            weights_1_2 -= np.outer(layer_1,layer_2_delta) * ALPHA
            
            if(np.abs(layer_2_delta) < 0.5):
                correct += 1
            total += 1
        
        print("iter: {}, train: {}".format(iter, correct/total))

In [15]:
fit(input_dataset_shuffled[:-1000], target_dataset_shuffled[:-1000])

iter: 0, train: 0.817875
iter: 1, train: 0.8469583333333334
iter: 2, train: 0.8614166666666667
iter: 3, train: 0.87021875
iter: 4, train: 0.8765


In [16]:
def predict(x_test, y_test):
    correct = 0
    total = 0
    
    global weights_0_1, weights_1_2
    
    for i in range(len(x_test)):
        x = x_test[i]
        y = y_test[i]

        layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))

        if(np.abs(layer_2 - y) < 0.5):
            correct += 1
        total += 1

    print("test: {}".format(correct/total))    

In [17]:
predict(input_dataset_shuffled[-1000:], target_dataset_shuffled[-1000:])

test: 0.865


# SECTION 3: similar reviews

In [60]:
from collections import Counter

In [56]:
tokens = list(map(set, [line.split(' ') for line in text]))
len(tokens[0])

94

In [53]:
len(input_dataset[0])

168

In [18]:
norms = np.sum(weights_0_1 * weights_0_1, axis=1)

In [19]:
norms.shape

(74074,)

In [24]:
norms.resize(norms.shape[0], 1)

In [27]:
normed_weights = weights_0_1 * norms

In [41]:
def make_sent_vect(words):
    words = [w.upper() for w in words]
    indices = [word2index[word] for word in words if word in word2index]
    return np.mean(normed_weights[indices], axis=0)

In [57]:
reviews2vectors = np.array([make_sent_vect(review) for review in tokens])

In [63]:
def most_similar_reviews(review):
    v = make_sent_vect(review)
    scores = Counter()
    for i, val in enumerate(reviews2vectors.dot(v)):
        scores[i] = val
    most_similar = list()
    for idx, score in scores.most_common(3):
        most_similar.append(text[idx][:100])
    return most_similar

In [64]:
most_similar_reviews(['boring','awful'])

['THIS IS WITHOUT A DOUBT THE WORST MOVIE I HAVE EVER SEEN . IT IS NOT FUNNY . IT IS NOT INTERESTING A',
 'THIS MOVIE IS SO BAD  IT CAN ONLY BE COMPARED TO THE ALL  TIME WORST  COMEDY   POLICE ACADEMY  . NO ',
 'WITHOUT QUESTION  THE WORST ELVIS FILM EVER MADE . THE MOVIE PORTRAYS ALL INDIANS AS DRUNK  STUPID  ']

In [65]:
most_similar_reviews(['great','amazing'])

['ADRIAN PASDAR IS EXCELLENT IS THIS FILM . HE MAKES A FASCINATING WOMAN .  \n',
 'EXCELLENT EPISODE MOVIE ALA PULP FICTION .  DAYS   SUICIDES . IT DOESNT GET MORE DEPRESSING THAN THI',
 'THIS FILM HAS GOOD CHARACTERS WITH EXCELLENT PERFORMANCES FROM THE CAST . DAVID STRATHAIRN IS DIABOL']