In [220]:
import numpy as np
import pandas as pd

RNN Model Parameters

In [221]:
batchSize = 2
lstmUnits = 64
#numClasses = 2  # Binary classification
numClasses = 5

## Word Vectors

In [222]:
from pathlib import Path
import os.path
wordsList = np.load(os.path.join(str(Path.home()), '.kaggle/wordvectors/pretrained_glove/wordsList.npy'))
wordsList = wordsList.tolist() #Originally loaded as numpy array
wordsList = [word.decode('UTF-8') for word in wordsList] #Encode words as UTF-8
wordVectors = np.load(os.path.join(str(Path.home()), '.kaggle/wordvectors/pretrained_glove/wordVectors.npy'))

Word Vectors have dimension 300

In [223]:
numDimensions = 300

## Reviews

Load the Reviews

In [224]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /home/matt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [225]:
review_df = pd.read_csv('~/.kaggle/datasets/snap/amazon-fine-food-reviews/Reviews.csv', encoding='utf8')
review_df = review_df.drop(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Summary'], axis=1)

For development, just use 5000 rows for simplicity and speed

In [226]:
numReviews = 5000

In [227]:
review_df = review_df.loc[0:numReviews-1]

Lowercase, and then tokenize the reviews.  The tokens need to be lowercase for the embedding lookup.

In [228]:
review_df['Tokens'] = review_df['Text'].apply(lambda text: word_tokenize(text.lower()))

Create the array of input sentences converted to word IDs

In [229]:
maxSeqLength = 250  # Determined by EDA

In [230]:
word_ids = np.zeros((numReviews, maxSeqLength), dtype='int32')

Convert words to word IDs and store in word_ids

In [231]:
for sentence_index, row in review_df.iterrows():
    
    word_index = 0
    
    for word in row['Tokens']:

        try:
            word_ids[sentence_index][word_index] = wordsList.index(word)
        except ValueError:
            word_ids[sentence_index][word_index] = 399999 #Vector for unkown words
        
        word_index = word_index + 1

        if word_index == maxSeqLength:
            break

Create functions to get the train and test batches

In [232]:
# Train with 70%, test with 30%
train_percent = 0.7

# Get indicies of the rows in the dataframe for training and testing
train_lower_index = 0
train_upper_index = round(len(review_df)*train_percent)
test_lower_index = train_upper_index+1
test_upper_index = len(review_df)-1

In [233]:
# Global variables
next_train_index = train_lower_index
next_test_index = test_lower_index

TODO: Look into replacing these functions with Tensorflow's data iterators

In [234]:
def getTrainBatch(dataset):
    global next_train_index
    
    data_len = len(dataset)
    
    word_batch = np.zeros([batchSize, maxSeqLength])
    label_batch = np.zeros([batchSize, numClasses])
    
    continue_iterating = True
    
    if ( (next_train_index + batchSize) > data_len ):
        continue_iterating = False
    
    next_batch = min(batchSize, (data_len - next_train_index))

    for i in range(next_batch):
        word_batch[i] = word_ids[next_train_index + i]
        
# Non-binary classification
        label_batch[i] = np.zeros(numClasses)
        label_batch[i][review_df.iloc[next_train_index + i].Score - 1] = 1

# Binary classification
#        if review_df.iloc[next_train_index + i].Score >= 3:
#            label_batch[i] = [0, 1]
#        else:
#            label_batch[i] = [1, 0]
        
    next_train_index = next_train_index + next_batch
    
    return continue_iterating, word_batch, label_batch

def getTestBatch(dataset):
    global next_test_index
    
    data_len = len(dataset)
    
    word_batch = np.zeros([batchSize, maxSeqLength])
    label_batch = np.zeros([batchSize, numClasses])
    
    continue_iterating = True
    
    if ( (next_test_index + batchSize) > data_len ):
        continue_iterating = False
    
    next_batch = min(batchSize, (data_len - next_test_index))

    for i in range(next_batch):
        word_batch[i] = word_ids[next_test_index + i]
        
# Non-binary classification
        label_batch[i] = np.zeros(numClasses)
        label_batch[i][review_df.iloc[next_test_index + i].Score - 1] = 1

# Binary classification
#        if review_df.iloc[next_test_index + i].Score >= 3:
#            label_batch[i] = [0, 1]
#        else:
#            label_batch[i] = [1, 0]

    current_test_index = next_test_index
    next_test_index = next_test_index + next_batch
    
    return continue_iterating, current_test_index, word_batch, label_batch

def resetBatch():
    global next_train_index
    global next_test_index
    
    next_train_index = train_lower_index
    next_test_index = test_lower_index

Reset the batch iterators

In [235]:
resetBatch()

## Tensorflow Model

Hyper Parameters

In [236]:
learningRate = 0.01
dropout_keep_prob = 0.75

Model

In [242]:
import tensorflow as tf
tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])
    
data = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]), dtype=tf.float32)
data = tf.nn.embedding_lookup(wordVectors, input_data)
    
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=dropout_keep_prob)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)
    
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]), dtype=tf.float32)
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]), dtype=tf.float32)
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)
    
correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer(learning_rate=learningRate).minimize(loss)

## Train Model

In [239]:
def TrainModel(session, writer):
    
    i = 0

    while True:
    
        # Next Batch of reviews
        continue_iterating, nextBatch, nextBatchLabels = getTrainBatch(review_df)
    
        feed_dict_ = {
            input_data: nextBatch,
            labels: nextBatchLabels
        }

        loss_, _ = session.run([loss, optimizer], feed_dict=feed_dict_)

        #Write summary to Tensorboard
        if (i % 10 == 0):
            summary = session.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
            writer.add_summary(summary, i)

        if (i % 500 == 0):
            print("Loss is: ", loss_, "\n")

        i = i + 1

        if not continue_iterating:
            print("Loss is: ", loss_, "\n")
            break

## Test Model

TODO: Need to be able to test one sentence at a time and display the loss

In [244]:
def TestModel(session):
    
    while True:
    
        continue_iterating_, current_test_index_, nextBatch_, nextBatchLabels_ = getTestBatch(review_df);

        feed_dict = {
            input_data: nextBatch_,
            labels: nextBatchLabels_
        }

        #loss_ = sess.run(loss, feed_dict)
        #print("Loss is: ", loss_, "\n")

        accuracy_ = sess.run(accuracy, feed_dict)

        if accuracy_ < 1.0:
            print("Accuracy for this batch: ", accuracy_ * 100, ", Row index of batch: ", current_test_index_)

        if not continue_iterating_:
            break

## Train and Test

In [245]:
import datetime

with tf.Session() as sess:
    #saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    
    tf.summary.scalar('Loss', loss)
    tf.summary.scalar('Accuracy', accuracy)
    merged = tf.summary.merge_all()
    logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
    writer = tf.summary.FileWriter(logdir, sess.graph)
    
    resetBatch()
    
    # Train model
    TrainModel(sess, writer)
    
    # Test model
    TestModel(sess)
    
    writer.close()

Loss is:  1.609438 

Loss is:  1.4075 

Loss is:  1.6253046 

Loss is:  0.536147 

Loss is:  0.3133614 

Loss is:  0.0 

Loss is:  0.0 

Accuracy for this batch:  50.0 , Row index of batch:  3501
Accuracy for this batch:  50.0 , Row index of batch:  3503
Accuracy for this batch:  50.0 , Row index of batch:  3507
Accuracy for this batch:  0.0 , Row index of batch:  3509
Accuracy for this batch:  50.0 , Row index of batch:  3511
Accuracy for this batch:  0.0 , Row index of batch:  3513
Accuracy for this batch:  0.0 , Row index of batch:  3515
Accuracy for this batch:  0.0 , Row index of batch:  3517
Accuracy for this batch:  50.0 , Row index of batch:  3519
Accuracy for this batch:  0.0 , Row index of batch:  3527
Accuracy for this batch:  50.0 , Row index of batch:  3531
Accuracy for this batch:  50.0 , Row index of batch:  3533
Accuracy for this batch:  0.0 , Row index of batch:  3535
Accuracy for this batch:  0.0 , Row index of batch:  3539
Accuracy for this batch:  50.0 , Row index o

Accuracy for this batch:  50.0 , Row index of batch:  4095
Accuracy for this batch:  50.0 , Row index of batch:  4097
Accuracy for this batch:  0.0 , Row index of batch:  4099
Accuracy for this batch:  0.0 , Row index of batch:  4101
Accuracy for this batch:  50.0 , Row index of batch:  4103
Accuracy for this batch:  50.0 , Row index of batch:  4105
Accuracy for this batch:  50.0 , Row index of batch:  4107
Accuracy for this batch:  0.0 , Row index of batch:  4109
Accuracy for this batch:  50.0 , Row index of batch:  4111
Accuracy for this batch:  50.0 , Row index of batch:  4115
Accuracy for this batch:  50.0 , Row index of batch:  4117
Accuracy for this batch:  0.0 , Row index of batch:  4119
Accuracy for this batch:  50.0 , Row index of batch:  4121
Accuracy for this batch:  50.0 , Row index of batch:  4123
Accuracy for this batch:  0.0 , Row index of batch:  4125
Accuracy for this batch:  50.0 , Row index of batch:  4127
Accuracy for this batch:  50.0 , Row index of batch:  4129
Ac

Accuracy for this batch:  50.0 , Row index of batch:  4579
Accuracy for this batch:  0.0 , Row index of batch:  4581
Accuracy for this batch:  0.0 , Row index of batch:  4583
Accuracy for this batch:  0.0 , Row index of batch:  4585
Accuracy for this batch:  50.0 , Row index of batch:  4587
Accuracy for this batch:  0.0 , Row index of batch:  4589
Accuracy for this batch:  0.0 , Row index of batch:  4591
Accuracy for this batch:  50.0 , Row index of batch:  4593
Accuracy for this batch:  50.0 , Row index of batch:  4595
Accuracy for this batch:  50.0 , Row index of batch:  4599
Accuracy for this batch:  50.0 , Row index of batch:  4601
Accuracy for this batch:  50.0 , Row index of batch:  4609
Accuracy for this batch:  0.0 , Row index of batch:  4611
Accuracy for this batch:  0.0 , Row index of batch:  4613
Accuracy for this batch:  50.0 , Row index of batch:  4615
Accuracy for this batch:  50.0 , Row index of batch:  4617
Accuracy for this batch:  50.0 , Row index of batch:  4621
Accu