In [2]:
import numpy as np
from pdb import set_trace
from sys import stdout

In [3]:

# from time import sleep
# for i in range(1,20):
#     stdout.write("\r%d" % i)
#     sleep(0.5)

In [4]:
# Load the reviews (lower-cased) and their labels (upper-cased), one per line.
# The `with` blocks close the files even if reading raises, and rstrip('\n')
# removes only the line terminator, so a final line that lacks a trailing
# newline keeps its last character.
with open('./reviews.txt', mode='r') as g:
    reviews = [r.rstrip('\n').lower() for r in g]

with open('./labels.txt', mode='r') as h:
    labels = [r.rstrip('\n').upper() for r in h]

# Keep only the first 100 examples of each list.
reviews, labels = reviews[0:100], labels[0:100]

In [5]:
reviews[0]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   '

In [6]:
labels[0]

'POSITIVE'

In [7]:
# Collect every distinct token (split on single spaces) seen in any review.
all_words = set()
for review in reviews:
    all_words.update(review.split(' '))

# Map each word to its column index in the review vector. The words are sorted
# first because the iteration order of a set of strings can change from one
# interpreter run to the next, which would silently reshuffle the indices.
word2index = {word: i for i, word in enumerate(sorted(all_words))}

In [8]:
len(word2index)

4450

In [9]:
# Number of distinct words; this is the width of a review vector.
vocab_length = len(all_words)
# Single-row zero buffer of shape (1, vocab_length), passed to
# get_review_vector so that it can be overwritten for each review.
input_vector = np.zeros(shape=(1, vocab_length))

In [10]:
# Function to convert each review into a (1, vocab_length) word-presence vector
def get_review_vector(review, input_vector):
    """Encode `review` as a 0/1 word-presence row written into `input_vector`.

    Parameters
    ----------
    review : str
        Review text; it is lower-cased and split on single spaces.
    input_vector : numpy.ndarray
        Buffer whose first row is indexed by `word2index`. It is zeroed and
        refilled in place on every call.

    Returns
    -------
    numpy.ndarray
        `input_vector[0][None, :]`, i.e. the first row with a leading axis.
        This is a view of `input_vector`, so the next call overwrites it.

    Words that are missing from `word2index` are skipped instead of raising
    KeyError, so text outside the vocabulary can still be encoded.
    """
    # Assigning 0 (rather than multiplying by 0) also clears NaN/inf entries.
    input_vector[...] = 0
    row = input_vector[0]
    for word in review.lower().split(' '):
        index = word2index.get(word)
        if index is not None:
            row[index] = 1
    return input_vector[0][None, :]



In [11]:
get_review_vector(reviews[1], input_vector)

array([[1., 0., 0., ..., 0., 0., 0.]])

In [12]:
def get_label_value(label):
    """Map a label to its numeric target: 1 for "POSITIVE", 0 for anything else."""
    if label == "POSITIVE":
        return 1
    return 0

In [13]:
def get_review_indices(review):
    """Return the distinct `word2index` indices of the words in `review`.

    The review is split on single spaces; every resulting token must be a key
    of `word2index`. The result is a list with no duplicates, in no guaranteed
    order.
    """
    unique_indices = {word2index[token] for token in review.split(' ')}
    return list(unique_indices)

In [14]:
def make_hidden_layer(review_indices):
    """Placeholder that is not implemented yet.

    The argument is ignored and the function always returns None.
    """
    return None

In [15]:
get_label_value(labels[0]), get_label_value(labels[1])

(1, 0)

In [16]:
# Fraction of the examples used for training; the remainder is for validation.
split = 0.99
# Index of the first validation example (truncated towards zero).
training_split = int(len(reviews) * split)

training_data = reviews[:training_split]
training_label = labels[:training_split]
valid_data = reviews[training_split:]
valid_label = labels[training_split:]

In [17]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)); element-wise for NumPy arrays."""
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)

def sigmoid_deriv(z):
    """Sigmoid derivative expressed through the sigmoid's output z: z * (1 - z)."""
    one_minus_z = 1 - z
    return z * one_minus_z

def relu(x):
    """Rectified linear unit: element-wise max(x, 0).

    Returns a new value and leaves `x` untouched. The earlier in-place version
    assigned the whole array into a boolean-masked subset (`x[x > 0] = x`),
    which raises a shape-mismatch ValueError for mixed-sign arrays, and it
    mapped non-positive entries to 0.001 rather than 0, which disagrees with
    relu_deriv returning a zero gradient there.
    """
    return np.maximum(x, 0)

def relu_deriv(x):
    """Derivative of ReLU: 1 where x > 0 and 0 elsewhere, in the dtype of `x`.

    A new array is returned. The earlier version overwrote `x` in place, which
    destroyed the caller's activations even though backpropagation still needs
    them for the weight update.
    """
    values = np.asarray(x)
    return (values > 0).astype(values.dtype)

In [18]:
def get_pred(output):
    """Threshold a network output: 1 when it is at least 0.5, otherwise 0."""
    if output >= 0.5:
        return 1
    return 0

In [None]:
epochs = 1000
hidden_1_size = 100
hidden_2_size = 150

lr = 1e-3
report_every = 1  # run validation and print progress every this many epochs
seed = 0  # fixed seed so the random weight initialisation is repeatable

# Private generator: seeding it does not touch NumPy's global random state.
rng = np.random.RandomState(seed)

# The first layer starts at zero; the two later layers are drawn from N(0, 1).
weights_1 = np.zeros((vocab_length, hidden_1_size))
weights_2 = rng.normal(0.0, 1.0, (hidden_1_size, hidden_2_size))
weights_3 = rng.normal(0.0, 1.0, (hidden_2_size, 1))


def forward_pass(review_vector, weights_1, weights_2, weights_3):
    """Push one review vector through the three sigmoid layers.

    Returns the tuple (hidden_1_output, hidden_2_output, final_layer_output).
    Shared by the training and validation loops so both use the same network.
    """
    hidden_1_output = sigmoid(np.dot(review_vector, weights_1))
    hidden_2_output = sigmoid(np.dot(hidden_1_output, weights_2))
    final_layer_output = sigmoid(np.dot(hidden_2_output, weights_3))
    return hidden_1_output, hidden_2_output, final_layer_output


for e in range(epochs):
    training_losses = []
    train_correct = 0
    for review, label in zip(training_data, training_label):
        target = get_label_value(label)
        # review_vector is whatever get_review_vector returns for the shared
        # input_vector buffer, so it is used here before the next call.
        review_vector = get_review_vector(review, input_vector)
        hidden_1_output, hidden_2_output, final_layer_output = forward_pass(
            review_vector, weights_1, weights_2, weights_3
        )

        if get_pred(final_layer_output) == target:
            train_correct += 1

        # Backpropagate the squared-error gradient layer by layer. All error
        # terms are computed from the current weights before any is updated.
        output_error = final_layer_output - target
        output_error_delta = output_error * sigmoid_deriv(final_layer_output)

        hidden_2_error = output_error_delta * weights_3.T
        hidden_2_error_delta = hidden_2_error * sigmoid_deriv(hidden_2_output)

        hidden_1_error = np.dot(hidden_2_error_delta, weights_2.T)
        hidden_1_error_delta = hidden_1_error * sigmoid_deriv(hidden_1_output)

        # Outer products of each layer's delta with that layer's input; they
        # are transposed below to match the weight matrices' shapes.
        weights_3_delta = output_error_delta.T * hidden_2_output
        weights_2_delta = hidden_2_error_delta.T * hidden_1_output
        weights_1_delta = hidden_1_error_delta.T * review_vector

        weights_3 -= lr * weights_3_delta.T
        weights_2 -= lr * weights_2_delta.T
        weights_1 -= lr * weights_1_delta.T

        training_losses.extend(output_error ** 2)

    training_loss_mean = np.mean(training_losses)

    if e % report_every == 0:
        valid_losses = []
        val_correct = 0
        for rev, lab in zip(valid_data, valid_label):
            valid_target = get_label_value(lab)
            rev_vector = get_review_vector(rev, input_vector)
            _, _, final_output = forward_pass(
                rev_vector, weights_1, weights_2, weights_3
            )

            if get_pred(final_output) == valid_target:
                val_correct += 1

            valid_losses.extend((final_output - valid_target) ** 2)
        valid_loss_mean = np.mean(valid_losses)

        string = "Epochs:{}/{} - Training Loss:{:.3f} - Validation Loss:{:.3f} - Training Acc:{:.3f}% - Validation Acc:{:.3f}%".format(
            e+1, epochs,
            training_loss_mean, valid_loss_mean,
            100*float(train_correct)/len(training_data),
            100*float(val_correct)/len(valid_data)
        )
        # "\r" rewrites the same line; flush because no newline is written.
        stdout.write("\r" + string)
        stdout.flush()
            
            
