In [2]:
import numpy as np
import pandas as pd

RNN Model Parameters

In [3]:
batchSize = 24  # Number of reviews per batch; also the fixed first dimension of the model's placeholders
#lstmUnits = 64 # Not used
numClasses = 2  # Binary classification
#numClasses = 5  # Alternative for the non-binary (one class per Score value) labelling
hiddenSize = 50  # Number of units in the LSTM cell and rows in the output weight matrix

## Word Vectors

In [4]:
from pathlib import Path
import os.path
# Directory that holds the pretrained GloVe arrays
glove_dir = Path.home() / '.kaggle/wordvectors/pretrained_glove'

# Vocabulary: loaded as a numpy array of byte strings, converted to a list of decoded (UTF-8) words
wordsList = [word.decode('UTF-8') for word in np.load(glove_dir / 'wordsList.npy').tolist()]

# Word vector array; used later as the embedding lookup table
wordVectors = np.load(glove_dir / 'wordVectors.npy')

FileNotFoundError: [Errno 2] No such file or directory: '/home/dal7p/.kaggle/wordvectors/pretrained_glove/wordsList.npy'

Word Vectors have dimension 50

In [None]:
# Length of each pretrained word vector -- TODO confirm it matches wordVectors.shape[1]
embedding_dimension = 50

## Reviews

Load the Reviews

In [None]:
import nltk
# Download the 'punkt' resource; presumably required by word_tokenize imported below -- confirm
nltk.download('punkt')
from nltk.tokenize import word_tokenize

In [None]:
# Columns present in the CSV that are dropped immediately after loading
unused_columns = ['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Summary']

# Read the reviews and discard the unused columns in one chained expression
review_df = (
    pd.read_csv('~/.kaggle/datasets/snap/amazon-fine-food-reviews/Reviews.csv', encoding='utf8')
    .drop(unused_columns, axis=1)
)

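For development, optionally use only a subset of the rows (e.g. the first 20,000) for simplicity and speed. The subsetting cells below are currently commented out, so the full dataset is used.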

In [None]:
#numReviews = 20000

In [None]:
#review_df = review_df.loc[0:numReviews-1]

Lowercase, and then tokenize the reviews.  The tokens need to be lowercase for the embedding lookup.

In [None]:
def tokenize_lowercase(text):
    """Lowercase a review's text and split it into tokens with word_tokenize."""
    return word_tokenize(text.lower())

# One token list per review, stored alongside the original text
review_df['Tokens'] = review_df['Text'].apply(tokenize_lowercase)

Create the array of input sentences converted to word IDs

In [None]:
maxSeqLength = 250  # Maximum number of tokens kept per review; determined by EDA

In [None]:
# One row per review, one column per token position; entries that are never filled in stay 0
word_ids = np.zeros((review_df.shape[0], maxSeqLength), dtype='int32')

Convert words to word IDs and store in word_ids

In [None]:
# Map each word to its position in wordsList once, so every token lookup is a
# constant-time dict access instead of a linear scan of the whole vocabulary.
# setdefault keeps the FIRST position of a duplicated word, matching list.index().
word_to_id = {}
for vocabulary_index, vocabulary_word in enumerate(wordsList):
    word_to_id.setdefault(vocabulary_word, vocabulary_index)

unknown_word_id = 399999  # Vector for unknown words

# enumerate() gives the positional row number, which is what word_ids is indexed by
# (the DataFrame index label would only coincide with it for a default 0..n-1 index).
for sentence_index, tokens in enumerate(review_df['Tokens']):

    # Only the first maxSeqLength tokens of a review are stored
    for word_index, word in enumerate(tokens[:maxSeqLength]):
        word_ids[sentence_index][word_index] = word_to_id.get(word, unknown_word_id)

Create functions to get the train and test batches

In [None]:
# Train with 70%, test with 30%
train_percent = 0.7

# Total number of reviews available for splitting
num_reviews = len(review_df)

# Get indices of the rows in the dataframe for training and testing
train_lower_index, train_upper_index = 0, round(num_reviews * train_percent)
test_lower_index, test_upper_index = train_upper_index + 1, num_reviews - 1

In [None]:
# Global batch cursors: row at which the next train / test batch starts
next_train_index, next_test_index = train_lower_index, test_lower_index

TODO: Look into replacing these functions with Tensorflow's data iterators

In [None]:
def getTrainBatch(dataset):
    """Return the next batch of training reviews and advance the training cursor.

    Rows are read from the module-level ``word_ids`` and ``review_df`` starting
    at the global ``next_train_index``. Only rows up to and including
    ``train_upper_index`` (capped at ``len(dataset)``) are served, so the rows
    reserved for testing are never used for training.

    Args:
        dataset: Object whose ``len()`` is the total number of reviews.

    Returns:
        Tuple ``(continue_iterating, word_batch, label_batch)``:
            continue_iterating: True while training rows remain after this batch.
            word_batch: ``[batchSize, maxSeqLength]`` array of word IDs.
            label_batch: ``[batchSize, numClasses]`` array holding ``[0, 1]``
                when the review's Score is >= 3 and ``[1, 0]`` otherwise.
        When fewer than ``batchSize`` rows remain, the unused trailing rows of
        both arrays are left as zeros.
    """
    global next_train_index

    # One past the last training row; never read beyond the dataset itself
    train_end = min(len(dataset), train_upper_index + 1)

    word_batch = np.zeros([batchSize, maxSeqLength])
    label_batch = np.zeros([batchSize, numClasses])

    first_row = next_train_index
    rows_in_batch = max(0, min(batchSize, train_end - first_row))
    last_row = first_row + rows_in_batch

    word_batch[:rows_in_batch] = word_ids[first_row:last_row]

# Non-binary classification
#        label_batch[i] = np.zeros(numClasses)
#        label_batch[i][review_df.iloc[next_train_index + i].Score - 1] = 1

# Binary classification
    # Read the Score column once for the whole batch instead of building a row object per review
    is_positive = review_df['Score'].iloc[first_row:last_row].values >= 3
    label_batch[:rows_in_batch] = np.where(is_positive[:, None], [0, 1], [1, 0])

    next_train_index = last_row

    # Stop as soon as the training rows are exhausted, so no all-padding batch is ever emitted
    continue_iterating = next_train_index < train_end

    return continue_iterating, word_batch, label_batch

def getTestBatch(dataset):
    """Return the next batch of test reviews and advance the test cursor.

    Rows are read from the module-level ``word_ids`` and ``review_df`` starting
    at the global ``next_test_index`` and running to the end of ``dataset``.

    Args:
        dataset: Object whose ``len()`` is the total number of reviews.

    Returns:
        Tuple ``(continue_iterating, current_test_index, word_batch, label_batch)``:
            continue_iterating: True while test rows remain after this batch.
            current_test_index: Row number of the first review in this batch.
            word_batch: ``[batchSize, maxSeqLength]`` array of word IDs.
            label_batch: ``[batchSize, numClasses]`` array holding ``[0, 1]``
                when the review's Score is >= 3 and ``[1, 0]`` otherwise.
        When fewer than ``batchSize`` rows remain, the unused trailing rows of
        both arrays are left as zeros.
    """
    global next_test_index

    data_len = len(dataset)

    word_batch = np.zeros([batchSize, maxSeqLength])
    label_batch = np.zeros([batchSize, numClasses])

    current_test_index = next_test_index
    rows_in_batch = max(0, min(batchSize, data_len - current_test_index))
    last_row = current_test_index + rows_in_batch

    word_batch[:rows_in_batch] = word_ids[current_test_index:last_row]

# Non-binary classification
#        label_batch[i] = np.zeros(numClasses)
#        label_batch[i][review_df.iloc[next_test_index + i].Score - 1] = 1

# Binary classification
    # Read the Score column once for the whole batch instead of building a row object per review
    is_positive = review_df['Score'].iloc[current_test_index:last_row].values >= 3
    label_batch[:rows_in_batch] = np.where(is_positive[:, None], [0, 1], [1, 0])

    next_test_index = last_row

    # Stop as soon as the rows are exhausted, so no all-padding batch is ever emitted
    continue_iterating = next_test_index < data_len

    return continue_iterating, current_test_index, word_batch, label_batch

def resetBatch():
    """Rewind the global train and test batch cursors to their lower indices."""
    global next_train_index, next_test_index

    next_train_index, next_test_index = train_lower_index, test_lower_index

Reset the batch iterators

In [None]:
# Rewind the train and test batch cursors to their lower indices
resetBatch()

## Tensorflow Model

Hyper Parameters

In [None]:
learning_rate = 0.01  # Step size passed to the Adam optimizer
dropout_keep_prob = 0.75  # Passed as output_keep_prob to the LSTM cell's dropout wrapper

Model

In [None]:
import tensorflow as tf
tf.reset_default_graph()

# Inputs: one-hot labels and word IDs for a fixed-size batch of reviews
labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])

# Replace each word ID with its pretrained word vector.
# (A zero-initialised tf.Variable used to be assigned to `data` first; it was
# overwritten immediately and only added an unused variable to the graph.)
data = tf.nn.embedding_lookup(wordVectors, input_data)

# Single LSTM layer with dropout applied to its outputs
lstmCell = tf.contrib.rnn.BasicLSTMCell(hiddenSize)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=dropout_keep_prob)
rnn_out, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

# Output layer parameters
W_out = tf.Variable(tf.truncated_normal([hiddenSize, numClasses]), dtype=tf.float32)
b_out = tf.Variable(tf.constant(0.1, shape=[numClasses]), dtype=tf.float32)

# Get the output of the last RNN cell:
# swap the first two axes so the final entry along axis 0 can be gathered
rnn_out = tf.transpose(rnn_out, [1, 0, 2])
last_cell_out = tf.gather(rnn_out, int(rnn_out.get_shape()[0]) - 1)

# Calculate logits
logits = (tf.matmul(last_cell_out, W_out) + b_out)

# Calculate prediction and accuracy
prediction = tf.argmax(logits,1)
correctPred = tf.equal(prediction, tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

# Loss function and optimizer
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

## Train Model

In [None]:
import datetime

def TrainModel(session, writer):
    """Run one pass over the training batches, logging progress as it goes.

    Each batch from getTrainBatch is fed through the loss and optimizer ops.
    Every 10th step the merged summary is evaluated and written to `writer`;
    every 500th step the loss and elapsed seconds are printed. The final loss
    is printed once getTrainBatch reports that no further batches follow.

    Args:
        session: Session used to run the graph ops.
        writer: Summary writer that receives the merged summaries.
    """
    started_at = datetime.datetime.now()
    step = 0
    more_batches = True

    while more_batches:

        # Next batch of reviews
        more_batches, batch_words, batch_labels = getTrainBatch(review_df)
        feeds = {input_data: batch_words, labels: batch_labels}

        loss_, _ = session.run([loss, optimizer], feed_dict=feeds)

        # Write summary to Tensorboard
        if step % 10 == 0:
            writer.add_summary(session.run(merged, feeds), step)

        if step % 500 == 0:
            print("Loss is: ", loss_, ", ", (datetime.datetime.now() - started_at).seconds, " seconds")

        step += 1

    # The loop ends right after the last batch; report its loss
    print("Loss is: ", loss_, "\n")

## Test Model

In [None]:
def TestModel(session):
    """Evaluate the model on every test batch and report mispredicted reviews.

    For each batch from getTestBatch, the accuracy and predictions are fetched.
    When a batch is not 100% accurate, its accuracy is printed together with
    the row number of every review whose prediction differs from its label.

    Args:
        session: Session used to run the graph ops. It is used for all
            evaluation; no module-level session is required.
    """
    while True:

        continue_iterating_, current_test_index_, nextBatch_, nextBatchLabels_ = getTestBatch(review_df)

        feed_dict = {
            input_data: nextBatch_,
            labels: nextBatchLabels_
        }

        #loss_ = sess.run(loss, feed_dict)
        #print("Loss is: ", loss_, "\n")

        # Fetch both values in ONE run so they come from the same forward pass;
        # the graph's dropout wrapper uses a fixed keep probability, so two
        # separate runs could disagree with each other.
        accuracy_, predictions_ = session.run([accuracy, prediction], feed_dict)

        if accuracy_ < 1.0:
            print("Accuracy for this batch: ", accuracy_ * 100)

            # NOTE(review): rows of a short final batch carry all-zero labels from
            # getTestBatch and are still compared here -- confirm whether that is intended.
            for index in range(len(predictions_)):

                if predictions_[index] != np.argmax(nextBatchLabels_[index]):
                    print("Sentence mispredicted: ", current_test_index_+index)

        if not continue_iterating_:
            break

## Train and Test

In [None]:
import datetime

with tf.Session() as sess:
    #saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    # Summaries written to Tensorboard during training
    tf.summary.scalar('Loss', loss)
    tf.summary.scalar('Accuracy', accuracy)
    merged = tf.summary.merge_all()

    # Each run gets its own timestamped log directory
    logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
    writer = tf.summary.FileWriter(logdir, sess.graph)

    # Close the writer even if training or testing raises
    try:
        resetBatch()

        # Train model
        TrainModel(sess, writer)

        # Test model
        TestModel(sess)
    finally:
        writer.close()