# Binary Sentiment Classification of Amazon Food Reviews

In [1]:
import numpy as np
import pandas as pd

RNN Model Parameters

In [2]:
batchSize = 24
numClasses = 2  # Binary classification
hiddenSize = 50

assert(batchSize % numClasses == 0)

## Word Vectors

In [3]:
from pathlib import Path
import os.path
wordsList = np.load(os.path.join(str(Path.home()), '.kaggle/wordvectors/pretrained_glove/wordsList.npy'))
wordsList = wordsList.tolist() #Originally loaded as numpy array
wordsList = [word.decode('UTF-8') for word in wordsList] #Encode words as UTF-8
wordVectors = np.load(os.path.join(str(Path.home()), '.kaggle/wordvectors/pretrained_glove/wordVectors.npy'))

Word Vectors have dimension 50

In [4]:
embedding_dimension = 50

## Reviews

Load the Reviews

In [5]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /home/mprout/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
review_df = pd.read_csv('~/.kaggle/datasets/snap/amazon-fine-food-reviews/Reviews.csv', encoding='utf8')
review_df = review_df.drop(['ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Summary'], axis=1)

Use the one and five star reviews

In [7]:
one_df = review_df[review_df.Score == 1]
one_df.reset_index(inplace=True)

two_df = review_df[review_df.Score == 2]
two_df.reset_index(inplace=True)

four_df = review_df[review_df.Score == 4]
four_df.reset_index(inplace=True)

five_df = review_df[review_df.Score == 5]
five_df.reset_index(inplace=True)

Limit number of ratings for development

In [8]:
max_num_ratings = 12500
one_df = one_df[0:max_num_ratings]
two_df = two_df[0:max_num_ratings]
four_df = four_df[0:max_num_ratings]
five_df = five_df[0:max_num_ratings]

Define the size of the train, dev, and test datasets

In [9]:
import math

# Train with 60%, , Dev: 10%, Test: 30%
train_percent = 0.6
dev_percent = 0.1
test_percent = 0.3

# Get indicies of the rows in the dataframe for training and testing
train_lower_index = 0
dev_lower_index   = math.floor(train_percent*max_num_ratings)
test_lower_index  = math.floor( (train_percent+dev_percent)*max_num_ratings )

train_size = dev_lower_index - train_lower_index
dev_size   = test_lower_index - dev_lower_index
test_size  = max_num_ratings - test_lower_index

Remove punctuation, lowercase, and then tokenize the reviews.  The tokens need to be lowercase for the embedding lookup.

In [10]:
import re

strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

def cleanSentences(string):
    return word_tokenize(re.sub(strip_special_chars, " ", string.lower()))

one_df['Tokens'] = one_df['Text'].apply(lambda text: cleanSentences(text))
two_df['Tokens'] = two_df['Text'].apply(lambda text: cleanSentences(text))
four_df['Tokens'] = four_df['Text'].apply(lambda text: cleanSentences(text))
five_df['Tokens'] = five_df['Text'].apply(lambda text: cleanSentences(text))

Create the array of input sentences converted to word IDs. 
One extra integer to store the review ID

In [11]:
maxSeqLength = 267 # From EDA

In [12]:
word_ids = np.zeros((4*max_num_ratings, maxSeqLength+1), dtype='int32')

Convert words to word IDs and store in word_ids

In [13]:
sentence_index = 0

for df in [one_df, two_df, four_df, five_df]:
    
    for _, row in df.iterrows():

        # Store the review Id for identifying misclassified reviews in testing
        word_index = 0
        word_ids[sentence_index][word_index] = row['Id']
        word_index = word_index + 1

        for word in row['Tokens']:

            try:
                word_ids[sentence_index][word_index] = wordsList.index(word)
            except ValueError:
                word_ids[sentence_index][word_index] = 399999 #Vector for unkown words

            word_index = word_index + 1

            if word_index == maxSeqLength:
                break

        sentence_index = sentence_index + 1


Create functions to get the train and test batches

In [14]:
from random import randint

def getBalancedReviews(sectionOffset, sectionSize):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    ids = np.zeros(batchSize)
    
    for i in range(batchSize):
        
        rating = i % 4
        
        if (rating == 0): 
            num = randint(0,sectionSize-1)
            labels.append([1, 0])
        elif (rating == 1): 
            num = randint(1*sectionSize,2*sectionSize-1)
            labels.append([1, 0])
        elif (rating == 2): 
            num = randint(2*sectionSize,3*sectionSize-1)
            labels.append([0, 1])
        elif (rating == 3): 
            num = randint(3*sectionSize,4*sectionSize-1)
            labels.append([0, 1])
        
        num = num + sectionOffset
        arr[i] = word_ids[num, 1:]
        ids[i] = word_ids[num, 0]
        
    return arr, labels, ids

def getRandomReviews(sectionOffset, sectionSize):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    ids = np.zeros(batchSize)
    
    for i in range(batchSize):
        
        num = randint(0, 4*sectionSize-1)
        
        if (num < 2*sectionSize): 
            labels.append([1, 0])
        else: 
            labels.append([0, 1])
        
        num = num + sectionOffset
        arr[i] = word_ids[num, 1:]
        ids[i] = word_ids[num, 0]
        
    return arr, labels, ids

def getTrainBatch():
    return getBalancedReviews(train_lower_index, train_size)

def getDevBatch():
    return getRandomReviews(dev_lower_index, dev_size)

def getTestBatch():
    return getRandomReviews(test_lower_index, test_size)

## Tensorflow Model

Hyper Parameters

In [15]:
learning_rate =  0.001
dropout_keep_prob = 0.75

Model

In [16]:
import tensorflow as tf
tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])

data = tf.Variable(tf.zeros([batchSize, maxSeqLength, embedding_dimension]), dtype=tf.float32)
data = tf.nn.embedding_lookup(wordVectors, input_data)

lstmCell = tf.contrib.rnn.BasicLSTMCell(hiddenSize)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=dropout_keep_prob)
rnn_out, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

W_out = tf.Variable(tf.truncated_normal([hiddenSize, numClasses]), dtype=tf.float32)
b_out = tf.Variable(tf.constant(0.1, shape=[numClasses]), dtype=tf.float32)

# Get the output of the last RNN cell
rnn_out = tf.transpose(rnn_out, [1, 0, 2])
last_cell_out = tf.gather(rnn_out, int(rnn_out.get_shape()[0]) - 1)

# Calculate logits
logits = (tf.matmul(last_cell_out, W_out) + b_out)

# Calculate prediction and accuracy
prediction = tf.argmax(logits,1)
correctPred = tf.equal(prediction, tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

# Loss function and optimizer
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



## Train Model

In [17]:
import datetime

def TrainModel(session, writer):
    
    start_time = datetime.datetime.now()
    
    i = 0  # Must stay outside the loops
    
    for epoch in range(260000):
        
        # Next Batch of reviews
        nextBatch, nextBatchLabels, reviewIds = getTrainBatch()

        feed_dict_ = {
            input_data: nextBatch,
            labels: nextBatchLabels
        }

        loss_, _ = session.run([loss, optimizer], feed_dict=feed_dict_)

        #Write summary to Tensorboard
        if (i % 10 == 0):
            summary = session.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
            writer.add_summary(summary, i)

        if (i % 1000 == 0):
            print("Loss is: ", loss_, ", ", (datetime.datetime.now() - start_time).seconds, " seconds")

        i = i + 1

## Test Model

In [18]:
def TestModel(session):
    
    csv = open('Mispredicted_AmazonBinaryClassification.csv', 'w')
    csv.write("Id\n")
    
    accuracy_measurements = []
    loss_measurements = []
    
    for epoch in range(20):
    
        nextBatch, nextBatchLabels, reviewIds = getTestBatch()

        feed_dict = {
            input_data: nextBatch,
            labels: nextBatchLabels
        }

        accuracy_, loss_ = sess.run([accuracy, loss], feed_dict)
        
        accuracy_measurements.append(accuracy_)
        loss_measurements.append(loss_)

        if accuracy_ < 1.0:

            predictions_ = sess.run(prediction, feed_dict)
            
            for index in range(len(predictions_)):

                if predictions_[index] != np.argmax(nextBatchLabels[index]):
                    csv.write(str(int(reviewIds[index])) + "\n")
                    
    print('Testing Results:')
    print('The average accuracy is: ', np.mean(accuracy_measurements))
    print('The average loss is: ', np.mean(loss_measurements))

## Train and Test

Use the following:
tensorboard --logdir=tensorboard

In [19]:
import datetime

with tf.Session() as sess:
    #saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    
    tf.summary.scalar('Loss', loss)
    tf.summary.scalar('Accuracy', accuracy)
    merged = tf.summary.merge_all()
    logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
    writer = tf.summary.FileWriter(logdir, sess.graph)
        
    # Train model
    TrainModel(sess, writer)
    
    # Test model
    TestModel(sess)
    
    writer.close()

Loss is:  0.691156 ,  1  seconds
Loss is:  0.693606 ,  226  seconds
Loss is:  0.660645 ,  450  seconds
Loss is:  0.688019 ,  673  seconds
Loss is:  0.684117 ,  804  seconds
Loss is:  0.706919 ,  920  seconds
Loss is:  0.674572 ,  1035  seconds
Loss is:  0.695485 ,  1151  seconds
Loss is:  0.695464 ,  1267  seconds
Loss is:  0.698458 ,  1382  seconds
Loss is:  0.683549 ,  1497  seconds
Loss is:  0.691485 ,  1612  seconds
Loss is:  0.67309 ,  1729  seconds
Loss is:  0.72179 ,  1845  seconds
Loss is:  0.750071 ,  1961  seconds
Loss is:  0.704254 ,  2076  seconds
Loss is:  0.706589 ,  2192  seconds
Loss is:  0.655658 ,  2308  seconds
Loss is:  0.687527 ,  2425  seconds
Loss is:  0.664242 ,  2541  seconds
Loss is:  0.604972 ,  2657  seconds
Loss is:  0.655624 ,  2773  seconds
Loss is:  0.661902 ,  2889  seconds
Loss is:  0.690561 ,  3005  seconds
Loss is:  0.651645 ,  3121  seconds
Loss is:  0.620551 ,  3236  seconds
Loss is:  0.670532 ,  3352  seconds
Loss is:  0.642177 ,  3468  seconds
Lo

Loss is:  0.397257 ,  26396  seconds
Loss is:  0.168147 ,  26512  seconds
Loss is:  0.0679485 ,  26628  seconds
Loss is:  0.129056 ,  26745  seconds
Loss is:  0.343421 ,  26861  seconds
Loss is:  0.352545 ,  26977  seconds
Loss is:  0.45172 ,  27093  seconds
Loss is:  0.199107 ,  27210  seconds
Loss is:  0.581389 ,  27326  seconds
Loss is:  0.273102 ,  27442  seconds
Loss is:  0.257325 ,  27559  seconds
Loss is:  0.409929 ,  27675  seconds
Loss is:  0.241559 ,  27792  seconds
Loss is:  0.212319 ,  27908  seconds
Loss is:  0.203805 ,  28024  seconds
Loss is:  0.151152 ,  28140  seconds
Loss is:  0.241127 ,  28256  seconds
Loss is:  0.619059 ,  28372  seconds
Loss is:  0.299013 ,  28488  seconds
Loss is:  0.194496 ,  28605  seconds
Loss is:  0.226682 ,  28721  seconds
Loss is:  0.394552 ,  28838  seconds
Loss is:  0.262626 ,  28955  seconds
Loss is:  0.367318 ,  29072  seconds
Loss is:  0.327194 ,  29189  seconds
Loss is:  0.175168 ,  29305  seconds
Loss is:  0.479147 ,  29421  seconds
L