# Binary Sentiment Classification of Amazon Food Reviews

In [90]:
import numpy as np
import pandas as pd

RNN Model Parameters

In [91]:
batchSize = 24
numClasses = 2  # Binary classification
hiddenSize = 50

assert(batchSize % numClasses == 0)

## Word Vectors

In [92]:
from pathlib import Path
import os.path
wordsList = np.load(os.path.join(str(Path.home()), '.kaggle/wordvectors/pretrained_glove/wordsList.npy'))
wordsList = wordsList.tolist() #Originally loaded as numpy array
wordsList = [word.decode('UTF-8') for word in wordsList] #Encode words as UTF-8
wordVectors = np.load(os.path.join(str(Path.home()), '.kaggle/wordvectors/pretrained_glove/wordVectors.npy'))

Word Vectors have dimension 50

In [93]:
embedding_dimension = 50

## Reviews

Load the Reviews

In [94]:
import nltk
from nltk.tokenize import word_tokenize

In [95]:
review_df = pd.read_csv('~/.kaggle/datasets/snap/amazon-fine-food-reviews/Reviews.csv', encoding='utf8')
review_df = review_df.drop(['ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Summary'], axis=1)

Use the one and five star reviews

In [96]:
one_df = review_df[review_df.Score == 1]
one_df.reset_index(inplace=True)

five_df = review_df[review_df.Score == 5]
five_df.reset_index(inplace=True)

Limit number of ratings for development

In [97]:
max_num_ratings = 12500
one_df = one_df[0:max_num_ratings]
five_df = five_df[0:max_num_ratings]

Define the size of the train, dev, and test datasets

In [98]:
import math

# Train with 60%, , Dev: 10%, Test: 30%
train_percent = 0.6
dev_percent = 0.1
test_percent = 0.3

# Get indicies of the rows in the dataframe for training and testing
train_lower_index = 0
dev_lower_index   = math.floor(train_percent*max_num_ratings)
test_lower_index  = math.floor( (train_percent+dev_percent)*max_num_ratings )

train_size = dev_lower_index - train_lower_index
dev_size   = test_lower_index - dev_lower_index
test_size  = max_num_ratings - test_lower_index

Remove punctuation, lowercase, and then tokenize the reviews.  The tokens need to be lowercase for the embedding lookup.

In [99]:
import re

strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

def cleanSentences(string):
    return word_tokenize(re.sub(strip_special_chars, " ", string.lower()))

one_df['Tokens'] = one_df['Text'].apply(lambda text: cleanSentences(text))
five_df['Tokens'] = five_df['Text'].apply(lambda text: cleanSentences(text))

Create the array of input sentences converted to word IDs. 
One extra integer to store the review ID

In [100]:
maxSeqLength = 267 # From EDA

In [101]:
word_ids = np.zeros((2*max_num_ratings, maxSeqLength+1), dtype='int32')

Convert words to word IDs and store in word_ids

In [102]:
sentence_index = 0

for df in [one_df, five_df]:
    
    for _, row in df.iterrows():

        # Store the review Id for identifying misclassified reviews in testing
        word_index = 0
        word_ids[sentence_index][word_index] = row['Id']
        word_index = word_index + 1

        for word in row['Tokens']:

            try:
                word_ids[sentence_index][word_index] = wordsList.index(word)
            except ValueError:
                word_ids[sentence_index][word_index] = 399999 #Vector for unkown words

            word_index = word_index + 1

            if word_index == maxSeqLength:
                break

        sentence_index = sentence_index + 1


Create functions to get the train and test batches

In [103]:
from random import randint

def getBalancedReviews(sectionOffset, sectionSize):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    ids = np.zeros(batchSize)
    
    for i in range(batchSize):
        
        rating = i % 2
        
        if (rating == 0): 
            num = randint(0,sectionSize-1)
            labels.append([1, 0])
        elif (rating == 1): 
            num = randint(1*sectionSize,2*sectionSize-1)
            labels.append([0, 1])
        
        num = num + sectionOffset
        arr[i] = word_ids[num, 1:]
        ids[i] = word_ids[num, 0]
        
    return arr, labels, ids

def getRandomReviews(sectionOffset, sectionSize):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    ids = np.zeros(batchSize)
    
    for i in range(batchSize):
        
        num = randint(0, 2*sectionSize-1)
        
        if (num < sectionSize): 
            labels.append([1, 0])
        elif (num < 2*sectionSize): 
            labels.append([0, 1])
        
        num = num + sectionOffset
        arr[i] = word_ids[num, 1:]
        ids[i] = word_ids[num, 0]
        
    return arr, labels, ids

def getTrainBatch():
    return getBalancedReviews(train_lower_index, train_size)

def getDevBatch():
    return getRandomReviews(dev_lower_index, dev_size)

def getTestBatch():
    return getRandomReviews(test_lower_index, test_size)

## Tensorflow Model

Hyper Parameters

In [104]:
learning_rate =  0.001
dropout_keep_prob = 0.75

Model

In [105]:
import tensorflow as tf
tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])

data = tf.Variable(tf.zeros([batchSize, maxSeqLength, embedding_dimension]), dtype=tf.float32)
data = tf.nn.embedding_lookup(wordVectors, input_data)

lstmCell = tf.contrib.rnn.BasicLSTMCell(hiddenSize)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=dropout_keep_prob)
rnn_out, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

W_out = tf.Variable(tf.truncated_normal([hiddenSize, numClasses]), dtype=tf.float32)
b_out = tf.Variable(tf.constant(0.1, shape=[numClasses]), dtype=tf.float32)

# Get the output of the last RNN cell
rnn_out = tf.transpose(rnn_out, [1, 0, 2])
last_cell_out = tf.gather(rnn_out, int(rnn_out.get_shape()[0]) - 1)

# Calculate logits
logits = (tf.matmul(last_cell_out, W_out) + b_out)

# Calculate prediction and accuracy
prediction = tf.argmax(logits,1)
correctPred = tf.equal(prediction, tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

# Loss function and optimizer
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

## Train Model

In [106]:
import datetime

def TrainModel(session, writer):
    
    start_time = datetime.datetime.now()
    
    i = 0  # Must stay outside the loops
    
    for epoch in range(500000):
        
        # Next Batch of reviews
        nextBatch, nextBatchLabels, reviewIds = getTrainBatch()

        feed_dict_ = {
            input_data: nextBatch,
            labels: nextBatchLabels
        }

        loss_, _ = session.run([loss, optimizer], feed_dict=feed_dict_)

        #Write summary to Tensorboard
        if (i % 10 == 0):
            summary = session.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
            writer.add_summary(summary, i)

        if (i % 500 == 0):
            print("Loss is: ", loss_, ", ", (datetime.datetime.now() - start_time).seconds, " seconds")

        i = i + 1

## Test Model

In [107]:
def TestModel(session):
    
    csv = open('Mispredicted_AmazonBinaryClassification.csv', 'w')
    csv.write("Id\n")
    
    for epoch in range(20):
    
        nextBatch, nextBatchLabels, reviewIds = getTestBatch()

        feed_dict = {
            input_data: nextBatch,
            labels: nextBatchLabels
        }

        accuracy_, loss_ = sess.run([accuracy, loss], feed_dict)

        if accuracy_ < 1.0:
            #print("Accuracy for this batch: ", accuracy_ * 100)
            
            predictions_ = sess.run(prediction, feed_dict)
            
            for index in range(len(predictions_)):

                if predictions_[index] != np.argmax(nextBatchLabels[index]):
                    csv.write(str(reviewIds[index]) + "\n")


## Train and Test

Use the following:
tensorboard --logdir=tensorboard

In [108]:
import datetime

with tf.Session() as sess:
    #saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    
    tf.summary.scalar('Loss', loss)
    tf.summary.scalar('Accuracy', accuracy)
    merged = tf.summary.merge_all()
    logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
    writer = tf.summary.FileWriter(logdir, sess.graph)
        
    # Train model
    TrainModel(sess, writer)
    
    # Test model
    TestModel(sess)
    
    writer.close()

Loss is:  0.72258425 ,  2  seconds
Loss is:  0.7123161 ,  32  seconds
Loss is:  0.69969684 ,  61  seconds
Loss is:  0.6710457 ,  90  seconds
Loss is:  0.6939872 ,  119  seconds
Loss is:  0.69266003 ,  148  seconds
Loss is:  0.6911041 ,  178  seconds
Loss is:  0.697511 ,  207  seconds
Loss is:  0.683413 ,  237  seconds
Loss is:  0.6926566 ,  266  seconds
Loss is:  0.6479017 ,  295  seconds
Loss is:  0.6767521 ,  324  seconds
Loss is:  0.68794566 ,  353  seconds
Loss is:  0.6932008 ,  382  seconds
Loss is:  0.6994125 ,  411  seconds
Loss is:  0.6939947 ,  440  seconds
Loss is:  0.6805227 ,  469  seconds
Loss is:  0.6710515 ,  499  seconds
Loss is:  0.68880343 ,  529  seconds
Loss is:  0.63386387 ,  558  seconds
Loss is:  0.69727534 ,  588  seconds
Loss is:  0.64367485 ,  617  seconds
Loss is:  0.67635554 ,  646  seconds
Loss is:  0.647926 ,  676  seconds
Loss is:  0.67787486 ,  705  seconds
Loss is:  0.66235536 ,  734  seconds
Loss is:  0.69567555 ,  764  seconds
Loss is:  0.6671702 ,  7