# SECTION 1: Import

In [6]:
import numpy as np

In [4]:
with open('labels.txt') as labels:
    target_dataset = [1 if lbl[0]=='p' else 0 for lbl in labels.readlines()]

In [7]:
np.array(target_dataset).shape

(25000,)

In [8]:
with open('reviews.txt') as reviews:
    raw_reviews = [review.upper() for review in reviews.readlines()]

In [9]:
np.array(raw_reviews).shape

(25000,)

In [12]:
words = set(' '.join(raw_reviews).split(' '))

In [19]:
word2index = {word: i for i, word in enumerate(words)}

In [20]:
word2index

{'': 0,
 'HANDYMAN': 1,
 'BEDROOMS': 2,
 'CRAWL': 3,
 'TUSSLES': 4,
 'STUDIOUSLY': 5,
 'INTERMEDIATE': 6,
 'LONGEVITY': 7,
 'TELEPORTATION': 8,
 'DOPES': 9,
 'CRUCIFIES': 10,
 'MACMURPHY': 11,
 'INCINERATES': 12,
 'VALID': 13,
 'CAB': 14,
 'HODGES': 15,
 'HOLMAN': 16,
 'COGSWORTH': 17,
 'YAKMALLAH': 18,
 'SUZU': 19,
 'CAMPUS': 20,
 'YUMA': 21,
 'DEPRESSES': 22,
 'LEENA': 23,
 'HEADTRIPPING': 24,
 'DHRY': 25,
 'ISRAELO': 26,
 'OLDS': 27,
 'POCASNI': 28,
 'ARCHIPELAGO': 29,
 'ENABLING': 30,
 'VAPOORIZE': 31,
 'GLACIALLY': 32,
 'PREFERRED': 33,
 'KEEL': 34,
 'BIGTIME': 35,
 'BENNY': 36,
 'YOUNGEST': 37,
 'INTENSIFIES': 38,
 'WINDSWEPT': 39,
 'EMRAAN': 40,
 'GJON': 41,
 'GRAZIA': 42,
 'SQUIBS': 43,
 'SEQUITURS': 44,
 'OHANA': 45,
 'ZIVAGHO': 46,
 'UNPALATABLY': 47,
 'MASTERCARD': 48,
 'POUCHY': 49,
 'NATIVIDAD': 50,
 'SERVICEMEN': 51,
 'THICKENS': 52,
 'WAINRIGHTS': 53,
 'LEVENS': 54,
 'CHERIE': 55,
 'JOY': 56,
 'EYEBROWS': 57,
 'SOFTLY': 58,
 'CONSISTANCY': 59,
 'LOVEABILITY': 60,
 'BOYUM

In [21]:
input_dataset = [[word2index[word] for word in review.split(' ')] for review in raw_reviews]

In [22]:
input_dataset[0]

[34015,
 37545,
 36198,
 39465,
 58413,
 39928,
 57587,
 29660,
 72073,
 20640,
 58974,
 6934,
 47512,
 4349,
 69789,
 5769,
 4034,
 11025,
 62813,
 46733,
 0,
 18114,
 4349,
 0,
 17671,
 0,
 57587,
 42680,
 0,
 0,
 57067,
 43093,
 58974,
 44325,
 53121,
 64022,
 58854,
 63656,
 66166,
 63658,
 34015,
 37545,
 0,
 30902,
 57246,
 36198,
 60615,
 1514,
 63656,
 70326,
 42396,
 36198,
 0,
 17671,
 0,
 57587,
 58974,
 21732,
 63656,
 51122,
 73688,
 0,
 58974,
 50384,
 46905,
 52713,
 33789,
 19968,
 8088,
 54064,
 17865,
 36407,
 17671,
 0,
 38739,
 0,
 58974,
 543,
 65919,
 58974,
 27541,
 25338,
 0,
 17220,
 15297,
 58854,
 65919,
 58974,
 22705,
 57997,
 61394,
 70688,
 17865,
 46905,
 57587,
 24921,
 57997,
 63606,
 58974,
 37227,
 43093,
 30941,
 39465,
 57809,
 72916,
 69170,
 63656,
 6044,
 34291,
 58974,
 62813,
 0,
 57997,
 24999,
 48400,
 57587,
 57587,
 57587,
 57587,
 57587,
 57587,
 57587,
 57587,
 57587,
 20640,
 57587,
 57587,
 57587,
 57587,
 57587,
 57587,
 57587,
 57587

# SECTION 2: network

In [23]:
np.random.seed(1)

In [24]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [26]:
ALPHA = 0.01
ITERATIONS = 2
HIDDEN_SIZE = 100

In [35]:
weights_0_1 = 0.2 * np.random.random((len(words), HIDDEN_SIZE)) - 0.1
weights_1_2 = 0.2 * np.random.random((HIDDEN_SIZE, 1)) - 0.1

In [44]:
def fit(x_train, y_train):
    correct = 0
    total = 0
    
    for iter in range(ITERATIONS):
        for i in range(len(x_train)):
            x = input_dataset[i]
            y = target_dataset[i]
            
            global weights_0_1, weights_1_2
            
            layer_1 = sigmoid(np.sum(weights_0_1[x], axis = 0))
            layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
            
            layer_2_delta = layer_2 - y
            layer_1_delta = layer_2_delta.dot(weights_1_2.T)
            
            weights_0_1[x] -= layer_1_delta * ALPHA
            weights_1_2 -= np.outer(layer_1, layer_2_delta) * ALPHA
            
            if (np.abs(layer_2_delta) < 0.5):
                correct += 1
            total += 1
        print('iter: {}, train: {}'.format(iter, correct/total))

In [45]:
fit(input_dataset[:-1000], target_dataset[:-1000])

iter: 0, train: 0.8082083333333333
iter: 1, train: 0.8418958333333333


In [48]:
def predict(x_test, y_test):
    correct = 0
    total = 0

    for i in range(len(x_test)):
    
        x = input_dataset[i]
        y = target_dataset[i]

        global weights_0_1, weights_1_2

        layer_1 = sigmoid(np.sum(weights_0_1[x], axis = 0))
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))

        if (np.abs(layer_2 - y) < 0.5):
            correct += 1
        total += 1
    print('test: {}'.format(correct/total))

In [49]:
predict(input_dataset[-1000:], target_dataset[-1000:])

test: 0.877
