# Binary Sentiment Classification of Amazon Food Reviews

In [28]:
import numpy as np
import pandas as pd

Paths to serialization files

In [29]:
# NOTE(review): these are absolute, user-specific paths; adjust them before running elsewhere.
# Directory that holds the artifacts serialized by this notebook.
pathToBinClassDir = '/home/matt/w266_saved/binaryclassifier'
# Cached word-ID matrix: written the first time it is built, reloaded on later runs.
pathToWordId      = '/home/matt/w266_saved/binaryclassifier/wordId.npy'
# TensorFlow checkpoint saved after training and restored for validation and testing.
pathToCheckpoint  = '/home/matt/w266_saved/binaryclassifier/model.ckpt'

RNN Model Parameters

In [30]:
batchSize = 24  # Reviews per batch; also the fixed first dimension of the model placeholders
numClasses = 2  # Binary classification
hiddenSize = 50  # Number of units in each LSTM cell

# Sanity check: the batch size must be a multiple of the number of classes
assert(batchSize % numClasses == 0)

## Word Vectors

In [31]:
from pathlib import Path
import os.path
# Vocabulary of the pretrained GloVe embeddings; a word's position in this list is its word ID
wordsList = np.load(os.path.join(str(Path.home()), '.kaggle/wordvectors/pretrained_glove/wordsList.npy'))
wordsList = wordsList.tolist() #Originally loaded as numpy array
wordsList = [word.decode('UTF-8') for word in wordsList] #Decode the UTF-8 bytes into Python strings
# Embedding matrix passed to the embedding lookup in the model
# NOTE(review): presumably row i is the vector of wordsList[i] -- confirm against the files
wordVectors = np.load(os.path.join(str(Path.home()), '.kaggle/wordvectors/pretrained_glove/wordVectors.npy'))

Word Vectors have dimension 50

In [32]:
# NOTE(review): presumably this must equal wordVectors.shape[1] -- confirm against the loaded file
embedding_dimension = 50

## Reviews

Load the Reviews

In [33]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /home/matt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [34]:
# Read the raw reviews export
reviews_csv_path = '~/.kaggle/datasets/snap/amazon-fine-food-reviews/Reviews.csv'
review_df = pd.read_csv(reviews_csv_path, encoding='utf8')

# Discard the metadata columns that the rest of the notebook never reads
unused_review_columns = [
    'ProductId',
    'UserId',
    'ProfileName',
    'HelpfulnessNumerator',
    'HelpfulnessDenominator',
    'Time',
]
review_df = review_df.drop(columns=unused_review_columns)

Use the one-, two-, four-, and five-star reviews (three-star reviews are left out)

In [35]:
# One frame per star rating that is kept, each re-indexed from zero.
# reset_index() keeps the previous index as an 'index' column.
one_df = review_df[review_df.Score == 1].reset_index()

two_df = review_df[review_df.Score == 2].reset_index()

four_df = review_df[review_df.Score == 4].reset_index()

five_df = review_df[review_df.Score == 5].reset_index()

Limit number of ratings for development

In [36]:
# Total number of reviews used for the train/dev/test split
max_num_ratings = 116000
# Number of reviews kept for each star rating
rating_level_length = 29000
assert(max_num_ratings <= 4*rating_level_length)

# Take explicit copies of the slices.  A plain slice is a view of the parent
# frame, and adding the token columns to a view later in the notebook triggers
# pandas' SettingWithCopyWarning.
one_df = one_df[0:rating_level_length].copy()
two_df = two_df[0:rating_level_length].copy()
four_df = four_df[0:rating_level_length].copy()
five_df = five_df[0:rating_level_length].copy()

Define the size of the train, dev, and test datasets

In [37]:
import math

# Train: 60%, Dev: 10%, Test: 30%
train_percent = 0.6
dev_percent = 0.1
test_percent = 0.3  # Not read below: the test split is whatever remains after train and dev

# Get indices of the first row of each section (train, dev, test) in the shuffled word_ids matrix
train_lower_index = 0
dev_lower_index   = math.floor(train_percent*max_num_ratings)
test_lower_index  = math.floor( (train_percent+dev_percent)*max_num_ratings )

# Number of rows in each section
train_size = dev_lower_index - train_lower_index
dev_size   = test_lower_index - dev_lower_index
test_size  = max_num_ratings - test_lower_index

Remove punctuation, lowercase, and then tokenize the reviews.  The tokens need to be lowercase for the embedding lookup.

In [38]:
import re

# Matches every run of characters other than ASCII letters, digits, and spaces
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

def cleanSentences(string):
    """Lowercase ``string``, blank out special characters, and tokenize the result.

    Every run of characters matched by ``strip_special_chars`` is replaced by a
    single space before the text is passed to ``word_tokenize``.
    """
    lowered = string.lower()
    scrubbed = strip_special_chars.sub(" ", lowered)
    return word_tokenize(scrubbed)

def custom_tokenize(string):
    """Tokenize ``string`` like ``cleanSentences``, treating any falsy value as empty text.

    The text is lowercased, every run of characters matched by
    ``strip_special_chars`` becomes a single space, and the result is passed to
    ``word_tokenize``.
    """
    text = string if string else ''
    return word_tokenize(strip_special_chars.sub(" ", text.lower()))

# Add the token columns to every per-rating frame: 'Tokens' from the review
# text, 'Summary_Tokens' from the summary (missing summaries become empty text).
for rating_df in (one_df, two_df, four_df, five_df):
    rating_df['Tokens'] = rating_df['Text'].apply(cleanSentences)
    rating_df['Summary_Tokens'] = rating_df['Summary'].fillna("").apply(custom_tokenize)

Create the array of input sentences converted to word IDs. 
Two extra integers per row store the review ID and the review score

In [39]:
# Token budget for the summary part of each stored review
maxSummaryLength = 11  # From EDA
# Token budget for summary and review text together; also the sequence length of the model input
maxSeqLength     = maxSummaryLength + 267 # From EDA

In [40]:
# One row per review: column 0 is the review Id, column 1 the Score, and the
# remaining maxSeqLength columns hold word IDs (positions never written stay 0)
word_ids = np.zeros((4*rating_level_length, maxSeqLength+2), dtype='int32')

Convert words to word IDs and store in word_ids

In [41]:
from pathlib import Path
word_id_file = Path(pathToWordId)

# Every row of word_ids starts with two header columns (review Id, Score);
# token IDs are stored from this column onward.
numHeaderColumns = 2
unknownWordId = 399999  # Vector for unknown words

if not word_id_file.exists():

    # Build the word -> ID lookup once instead of scanning wordsList for every
    # token.  setdefault keeps the first occurrence of a word, which is the ID
    # that wordsList.index(word) would return.
    word_to_id = {}
    for known_id, known_word in enumerate(wordsList):
        word_to_id.setdefault(known_word, known_id)

    sentence_index = 0

    for df in [one_df, two_df, four_df, five_df]:

        for _, row in df.iterrows():

            # Store the review Id for identifying misclassified reviews in testing,
            # and the Score from which the batch functions derive the label
            word_ids[sentence_index, 0] = row['Id']
            word_ids[sentence_index, 1] = row['Score']

            # Summary tokens come first (at most maxSummaryLength of them),
            # immediately followed by review tokens until maxSeqLength tokens
            # are stored in total.  The limits count tokens only, so the two
            # header columns no longer eat into the token budget.
            summary_tokens = list(row['Summary_Tokens'])[:maxSummaryLength]
            review_tokens = list(row['Tokens'])[:maxSeqLength - len(summary_tokens)]

            for token_offset, word in enumerate(summary_tokens + review_tokens):
                word_ids[sentence_index, numHeaderColumns + token_offset] = word_to_id.get(word, unknownWordId)

            sentence_index = sentence_index + 1

    # Shuffle the word_ids matrix (rows only)
    # NOTE(review): the shuffle is unseeded, so the train/dev/test split is only
    # reproducible through the cached file saved below.
    np.random.shuffle(word_ids)

    # Save the word_ids matrix, creating the target directory (and any missing
    # parents) first
    binClassDir = Path(pathToBinClassDir)
    binClassDir.mkdir(parents=True, exist_ok=True)

    np.save(pathToWordId, word_ids)
else:
    # Reuse the cached matrix.  A cache written by an earlier version of this
    # cell keeps its old contents until the file is deleted.
    word_ids = np.load(pathToWordId)

Create functions to get the train and test batches

In [42]:
from random import randint

def getRandomReviews(sectionOffset, sectionSize):
    """Draw ``batchSize`` reviews at random (with replacement) from one section of ``word_ids``.

    sectionOffset: index of the first row of the section.
    sectionSize: number of rows in the section.

    Returns ``(arr, labels, ids)``: the word-ID rows, a list of one-hot labels
    (``[1, 0]`` for scores 1 and 2, ``[0, 1]`` otherwise), and the review Ids.
    """
    batch_tokens = np.zeros([batchSize, maxSeqLength])
    batch_ids = np.zeros(batchSize)
    batch_labels = []

    for slot in range(batchSize):
        review_row = word_ids[randint(0, sectionSize - 1) + sectionOffset]
        score = review_row[1]

        # Columns: 0 = review Id, 1 = Score, 2.. = word IDs
        batch_labels.append([1, 0] if score in (1, 2) else [0, 1])
        batch_tokens[slot] = review_row[2:]
        batch_ids[slot] = review_row[0]

    return batch_tokens, batch_labels, batch_ids


# Read cursors of the ordered dev/test batch functions: the number of reviews
# already returned from each section.  Rewound by resetDevTestIndicies().
reviewsDevIndex = 0
reviewsTestIndex = 0

def getOrderedDevReviews(sectionOffset, sectionSize):
    """Return the next batch of dev reviews in stored order.

    Reading continues from the module-level cursor ``reviewsDevIndex``, which is
    advanced by one for every review returned.

    sectionOffset: index of the first row of the dev section in ``word_ids``.
    sectionSize: number of rows in the dev section.

    Returns ``(arr, labels, ids, finished)``: the word-ID rows, a list of one-hot
    labels (``[1, 0]`` for scores 1 and 2, ``[0, 1]`` otherwise), the review Ids,
    and a flag.  ``finished`` is True only when the section ran out before the
    batch was full, so the batch is partial or empty (unused rows stay zero and
    ``labels`` is shorter than ``batchSize``).  A full batch is always returned
    with ``finished`` False, even when it ends exactly on the last review;
    otherwise callers that discard the batch flagged as finished would silently
    drop it.
    """
    global reviewsDevIndex

    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    ids = np.zeros(batchSize)

    for i in range(batchSize):

        if reviewsDevIndex >= sectionSize:
            # Section exhausted before the batch was filled
            return arr, labels, ids, True

        # Columns: 0 = review Id, 1 = Score, 2.. = word IDs
        row = word_ids[reviewsDevIndex + sectionOffset]

        if (row[1] == 1) or (row[1] == 2):
            labels.append([1, 0])
        else:
            labels.append([0, 1])

        arr[i] = row[2:]
        ids[i] = row[0]
        reviewsDevIndex += 1

    return arr, labels, ids, False

def getOrderedTestReviews(sectionOffset, sectionSize):
    """Return the next batch of test reviews in stored order.

    Reading continues from the module-level cursor ``reviewsTestIndex``, which
    is advanced by one for every review returned.

    sectionOffset: index of the first row of the test section in ``word_ids``.
    sectionSize: number of rows in the test section.

    Returns ``(arr, labels, ids, finished)``: the word-ID rows, a list of one-hot
    labels (``[1, 0]`` for scores 1 and 2, ``[0, 1]`` otherwise), the review Ids,
    and a flag.  ``finished`` is True only when the section ran out before the
    batch was full, so the batch is partial or empty (unused rows stay zero and
    ``labels`` is shorter than ``batchSize``).  A full batch is always returned
    with ``finished`` False, even when it ends exactly on the last review;
    otherwise callers that discard the batch flagged as finished would silently
    drop it.
    """
    global reviewsTestIndex

    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    ids = np.zeros(batchSize)

    for i in range(batchSize):

        if reviewsTestIndex >= sectionSize:
            # Section exhausted before the batch was filled
            return arr, labels, ids, True

        # Columns: 0 = review Id, 1 = Score, 2.. = word IDs
        row = word_ids[reviewsTestIndex + sectionOffset]

        if (row[1] == 1) or (row[1] == 2):
            labels.append([1, 0])
        else:
            labels.append([0, 1])

        arr[i] = row[2:]
        ids[i] = row[0]
        reviewsTestIndex += 1

    return arr, labels, ids, False

def resetDevTestIndicies():
    """Rewind the dev and test read cursors to the first review of their sections."""
    global reviewsDevIndex, reviewsTestIndex

    reviewsDevIndex = reviewsTestIndex = 0

def getTrainBatch():
    """Return a random batch ``(arr, labels, ids)`` drawn from the training section."""
    return getRandomReviews(sectionOffset=train_lower_index, sectionSize=train_size)

def getDevBatch():
    """Return the next ordered batch ``(arr, labels, ids, finished)`` of the dev section."""
    return getOrderedDevReviews(sectionOffset=dev_lower_index, sectionSize=dev_size)

def getTestBatch():
    """Return the next ordered batch ``(arr, labels, ids, finished)`` of the test section."""
    return getOrderedTestReviews(sectionOffset=test_lower_index, sectionSize=test_size)

## Tensorflow Model

Hyper Parameters

In [43]:
train_iterations = 200000  # Number of training batches (one optimizer step each)
learning_rate =  0.001  # Learning rate passed to the Adam optimizer
dropout_keep_prob = 0.75  # Keep probability for the inputs and outputs of every LSTM layer
num_layers = 2  # Number of stacked LSTM layers

Model

In [44]:
import tensorflow as tf

class SentimentGraph:
    """Multi-layer LSTM sentiment classifier expressed as a TensorFlow graph.

    The constructor only creates the input placeholders; CreateGraph wires up
    the network and fills in ``prediction``, ``accuracy``, ``loss`` and
    ``optimizer``.
    """
    
    def __init__(self):
        """Create the label and input placeholders; the other attributes are set by CreateGraph."""
        # One label row per review in the batch (the batch functions supply [1, 0] or [0, 1])
        self.labels = tf.placeholder(tf.float32, [batchSize, numClasses])
        # Word IDs, one row of maxSeqLength IDs per review in the batch
        self.input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])
        self.prediction = None
        self.accuracy = None
        self.loss = None
        self.optimizer = None

    def MakeFancyRNNCell(self, H, keep_prob, num_layers=1):
        """Stack ``num_layers`` dropout-wrapped LSTM cells into one MultiRNNCell.

        H: number of units in each BasicLSTMCell.
        keep_prob: keep probability applied to the inputs and the outputs of every layer.
        num_layers: number of stacked cells.
        """
        cells = []
        for _ in range(num_layers):
          cell = tf.nn.rnn_cell.BasicLSTMCell(H, forget_bias=0.0)
          cell = tf.nn.rnn_cell.DropoutWrapper(
              cell, input_keep_prob=keep_prob, output_keep_prob=keep_prob)
          cells.append(cell)
        return tf.nn.rnn_cell.MultiRNNCell(cells)

    def CreateGraph(self):
        """Build the network and set ``prediction``, ``accuracy``, ``loss`` and ``optimizer``."""
        # NOTE(review): this tf.Variable is rebound on the next line and never read.
        # It looks like dead code, but removing it would change the creation order
        # (and so presumably the auto-generated names) of the variables stored in
        # existing checkpoints -- confirm before deleting.
        data = tf.Variable(tf.zeros([batchSize, maxSeqLength, embedding_dimension]), dtype=tf.float32)
        # Replace every word ID by its pretrained word vector
        data = tf.nn.embedding_lookup(wordVectors, self.input_data)

        # NOTE(review): dropout_keep_prob is read as a Python constant here, so the
        # validation and test cells, which build this same graph, appear to run with
        # dropout enabled as well -- confirm whether that is intended.
        lstmCell = self.MakeFancyRNNCell(hiddenSize, dropout_keep_prob, num_layers)
        initial_h_ = lstmCell.zero_state(batchSize, dtype=tf.float32)
        
        rnn_out, _ = tf.nn.dynamic_rnn(lstmCell,
                                       data,
                                       initial_state=initial_h_,
                                       dtype=tf.float32)

        # Output projection from the LSTM hidden state to one logit per class
        W_out = tf.Variable(tf.random_uniform([hiddenSize, numClasses], minval=-1.0, maxval=1.0, dtype=tf.float32), dtype=tf.float32)
        b_out = tf.Variable(tf.zeros([numClasses,], dtype=tf.float32), dtype=tf.float32)

        # Get the output of the last RNN cell: swap the first two axes, then
        # take the final entry along the new leading axis
        rnn_out = tf.transpose(rnn_out, [1, 0, 2])
        last_cell_out = tf.gather(rnn_out, int(rnn_out.get_shape()[0]) - 1)

        # Calculate logits
        logits = (tf.matmul(last_cell_out, W_out) + b_out)

        # Calculate prediction and accuracy
        self.prediction = tf.argmax(logits,1)
        correctPred = tf.equal(self.prediction, tf.argmax(self.labels,1))
        self.accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

        # Loss function and optimizer
        self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=self.labels))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss)

## Train Model

In [45]:
import datetime

def TrainModel(session, logdir, graph):
    """Train ``graph`` on ``train_iterations`` random training batches.

    session: TensorFlow session whose variables are already initialized.
    logdir: directory that receives the TensorBoard event files.
    graph: SentimentGraph whose CreateGraph() has been called.

    The training loss and accuracy are written to TensorBoard every 10
    iterations and the loss is printed every 1000 iterations.
    """
    # Open the writer on the graph of the session that was passed in (not on a
    # global session variable)
    writer = tf.summary.FileWriter(logdir, session.graph)

    # Merge only the summaries created here, so summaries registered elsewhere
    # on the same graph are not logged under the training run
    merged = tf.summary.merge([
        tf.summary.scalar('Train_Loss', graph.loss),
        tf.summary.scalar('Train_Accuracy', graph.accuracy),
    ])

    start_time = datetime.datetime.now()

    try:
        for i in range(train_iterations):

            # Next Batch of reviews (the review Ids are not needed for training)
            nextBatch, nextBatchLabels, _ = getTrainBatch()

            feed_dict_ = {
                graph.input_data: nextBatch,
                graph.labels: nextBatchLabels
            }

            loss_, _ = session.run([graph.loss, graph.optimizer], feed_dict=feed_dict_)

            # Write summary to Tensorboard
            if i % 10 == 0:
                summary = session.run(merged, feed_dict_)
                writer.add_summary(summary, i)

            if i % 1000 == 0:
                # total_seconds() keeps counting past 24 hours, unlike .seconds
                elapsed_seconds = int((datetime.datetime.now() - start_time).total_seconds())
                print("Loss is: ", loss_, ", ", elapsed_seconds, " seconds, iteration: ", i)
    finally:
        # Close the writer even when training is interrupted
        writer.close()

## Validate Model

Examine the Mispredicted_AmazonBinaryClassification.csv file for error analysis.

In [46]:
def TestModelDev(session, logdir, graph):
    """Evaluate ``graph`` on the dev section and record the mispredicted review Ids.

    session: TensorFlow session holding the trained variables.
    logdir: directory that receives the TensorBoard event files.
    graph: SentimentGraph whose CreateGraph() has been called.

    Only batches that getDevBatch() returns with ``finished`` False are
    evaluated.  The Ids of mispredicted reviews are written to
    Mispredicted_AmazonBinaryClassification.csv, and the mean accuracy and
    loss over the evaluated batches are printed.
    """
    # Tensorboard support; use the session that was passed in, not a global one
    writer = tf.summary.FileWriter(logdir, session.graph)
    # Merge only the summaries created here
    merged = tf.summary.merge([
        tf.summary.scalar('Dev_Loss', graph.loss),
        tf.summary.scalar('Dev_Accuracy', graph.accuracy),
    ])

    i = 0  # Number of batches evaluated so far
    accuracy_measurements = []
    loss_measurements = []
    finished = False

    # Fetch accuracy, loss and predictions in ONE run so that they all come
    # from the same forward pass; separate runs would each draw a different
    # dropout mask and could disagree with each other.
    fetches = [graph.accuracy, graph.loss, graph.prediction]

    try:
        # Support for saving mispredicted reviews; the file is closed on exit
        with open('Mispredicted_AmazonBinaryClassification.csv', 'w') as mispredicted_file:
            mispredicted_file.write("Id\n")

            while not finished:

                nextBatch, nextBatchLabels, reviewIds, finished = getDevBatch()

                # For ease of implementation, just skip partially filled batches
                if finished:
                    continue

                feed_dict = {
                    graph.input_data: nextBatch,
                    graph.labels: nextBatchLabels
                }

                # Write summary to Tensorboard every 10th batch
                if i % 10 == 0:
                    accuracy_, loss_, predictions_, summary = session.run(fetches + [merged], feed_dict)
                    writer.add_summary(summary, i)
                else:
                    accuracy_, loss_, predictions_ = session.run(fetches, feed_dict)

                i += 1

                accuracy_measurements.append(accuracy_)
                loss_measurements.append(loss_)

                # Write out mispredictions (review IDs) to the .csv file
                for index, predicted_class in enumerate(predictions_):
                    if predicted_class != np.argmax(nextBatchLabels[index]):
                        mispredicted_file.write(str(int(reviewIds[index])) + "\n")
    finally:
        # Close the writer even when evaluation is interrupted
        writer.close()

    print('Testing (Dev) Results:')
    print('The average accuracy is: ', np.mean(accuracy_measurements))
    print('The average loss is: ', np.mean(loss_measurements))

## Test Model

In [47]:
def TestModelTest(session, logdir, graph):
    """Evaluate ``graph`` on the test section and print the mean accuracy and loss.

    session: TensorFlow session holding the trained variables.
    logdir: directory that receives the TensorBoard event files.
    graph: SentimentGraph whose CreateGraph() has been called.

    Only batches that getTestBatch() returns with ``finished`` False are
    evaluated.
    """
    # Tensorboard support; use the session that was passed in, not a global one
    writer = tf.summary.FileWriter(logdir, session.graph)
    # Merge only the summaries created here
    merged = tf.summary.merge([
        tf.summary.scalar('Test_Loss', graph.loss),
        tf.summary.scalar('Test_Accuracy', graph.accuracy),
    ])

    i = 0  # Number of batches evaluated so far
    accuracy_measurements = []
    loss_measurements = []
    finished = False

    fetches = [graph.accuracy, graph.loss]

    try:
        while not finished:

            nextBatch, nextBatchLabels, reviewIds, finished = getTestBatch()

            # For ease of implementation, just skip partially filled batches
            if finished:
                continue

            feed_dict = {
                graph.input_data: nextBatch,
                graph.labels: nextBatchLabels
            }

            # Write summary to Tensorboard every 10th batch, computed in the
            # same forward pass as the recorded accuracy and loss
            if i % 10 == 0:
                accuracy_, loss_, summary = session.run(fetches + [merged], feed_dict)
                writer.add_summary(summary, i)
            else:
                accuracy_, loss_ = session.run(fetches, feed_dict)

            i += 1

            accuracy_measurements.append(accuracy_)
            loss_measurements.append(loss_)
    finally:
        # Close the writer even when evaluation is interrupted
        writer.close()

    print('Testing (Test) Results:')
    print('The average accuracy is: ', np.mean(accuracy_measurements))
    print('The average loss is: ', np.mean(loss_measurements))

### Run prior to training or testing

In [48]:
# Timestamped TensorBoard log directory passed to the training, validation, and test cells below
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"

Use the following:
tensorboard --logdir=tensorboard

## Train

In [49]:
import datetime

# Start from an empty default graph so that re-running this cell does not add
# a second copy of the model to the previous graph
tf.reset_default_graph()
    
with tf.Session() as sess:
        
    # Create the graph
    sentimentGraph = SentimentGraph()
    sentimentGraph.CreateGraph()

    # Initialize the graph
    sess.run(tf.global_variables_initializer())
    
    # Train model
    TrainModel(sess, logdir, sentimentGraph)
    
    # Save the model variables (the checkpoint is restored by the validation and test cells)
    saver = tf.train.Saver()
    saver.save(sess, pathToCheckpoint)

Loss is:  0.6931472 ,  0  seconds, iteration:  0
Loss is:  0.6828273 ,  94  seconds, iteration:  1000
Loss is:  0.702293 ,  178  seconds, iteration:  2000
Loss is:  0.71521074 ,  245  seconds, iteration:  3000
Loss is:  0.69490576 ,  313  seconds, iteration:  4000
Loss is:  0.67615104 ,  380  seconds, iteration:  5000
Loss is:  0.68719053 ,  448  seconds, iteration:  6000
Loss is:  0.6983556 ,  515  seconds, iteration:  7000
Loss is:  0.6970288 ,  583  seconds, iteration:  8000
Loss is:  0.6914541 ,  651  seconds, iteration:  9000
Loss is:  0.6644372 ,  718  seconds, iteration:  10000
Loss is:  0.6964162 ,  786  seconds, iteration:  11000
Loss is:  0.6950415 ,  853  seconds, iteration:  12000
Loss is:  0.69777656 ,  921  seconds, iteration:  13000
Loss is:  0.6818164 ,  988  seconds, iteration:  14000
Loss is:  0.6848648 ,  1056  seconds, iteration:  15000
Loss is:  0.6888134 ,  1123  seconds, iteration:  16000
Loss is:  0.7102415 ,  1190  seconds, iteration:  17000
Loss is:  0.6923963

Loss is:  0.23297672 ,  11523  seconds, iteration:  145000
Loss is:  0.14541161 ,  11619  seconds, iteration:  146000
Loss is:  0.2659092 ,  11713  seconds, iteration:  147000
Loss is:  0.14263712 ,  11809  seconds, iteration:  148000
Loss is:  0.22243015 ,  11904  seconds, iteration:  149000
Loss is:  0.105272435 ,  12001  seconds, iteration:  150000
Loss is:  0.13330153 ,  12097  seconds, iteration:  151000
Loss is:  0.26485875 ,  12194  seconds, iteration:  152000
Loss is:  0.19790058 ,  12290  seconds, iteration:  153000
Loss is:  0.23897326 ,  12386  seconds, iteration:  154000
Loss is:  0.26053932 ,  12482  seconds, iteration:  155000
Loss is:  0.3214817 ,  12578  seconds, iteration:  156000
Loss is:  0.11014327 ,  12674  seconds, iteration:  157000
Loss is:  0.14146888 ,  12769  seconds, iteration:  158000
Loss is:  0.047383357 ,  12864  seconds, iteration:  159000
Loss is:  0.2725958 ,  12959  seconds, iteration:  160000
Loss is:  0.07970559 ,  13055  seconds, iteration:  16100

Loss is:  0.14576487 ,  24945  seconds, iteration:  285000
Loss is:  0.14820077 ,  25041  seconds, iteration:  286000
Loss is:  0.16473836 ,  25137  seconds, iteration:  287000
Loss is:  0.17299946 ,  25233  seconds, iteration:  288000
Loss is:  0.14207996 ,  25329  seconds, iteration:  289000
Loss is:  0.29819313 ,  25425  seconds, iteration:  290000
Loss is:  0.09078858 ,  25521  seconds, iteration:  291000
Loss is:  0.03545466 ,  25617  seconds, iteration:  292000
Loss is:  0.46477103 ,  25711  seconds, iteration:  293000
Loss is:  0.26849577 ,  25806  seconds, iteration:  294000
Loss is:  0.1948148 ,  25902  seconds, iteration:  295000
Loss is:  0.06398646 ,  25997  seconds, iteration:  296000
Loss is:  0.37778068 ,  26093  seconds, iteration:  297000
Loss is:  0.15315269 ,  26188  seconds, iteration:  298000
Loss is:  0.121391654 ,  26284  seconds, iteration:  299000
Loss is:  0.117205776 ,  26380  seconds, iteration:  300000
Loss is:  0.318795 ,  26476  seconds, iteration:  30100

### Run prior to validation and testing

In [50]:
# Rewind the ordered dev/test readers so validation and testing start at the first review of their sections
resetDevTestIndicies()

## Validation

In [51]:
# Rebuild the model in an empty default graph before restoring the checkpoint
tf.reset_default_graph()
    
with tf.Session() as sess:

    # Create the graph
    sentimentGraph = SentimentGraph()
    sentimentGraph.CreateGraph()
    
    sess.run(tf.global_variables_initializer())
    
    # Load the variable values saved by the training cell
    saver = tf.train.Saver()
    saver.restore(sess, pathToCheckpoint)

    # Validate model
    TestModelDev(sess, logdir, sentimentGraph)

INFO:tensorflow:Restoring parameters from /home/matt/w266_saved/binaryclassifier/model.ckpt
Testing (Dev) Results:
The average accuracy is:  0.9109731
The average loss is:  0.22690444


## Test

In [52]:
# Rebuild the model in an empty default graph before restoring the checkpoint
tf.reset_default_graph()
    
with tf.Session() as sess:

    # Create the graph
    sentimentGraph = SentimentGraph()
    sentimentGraph.CreateGraph()
    
    sess.run(tf.global_variables_initializer())
    
    # Load the variable values saved by the training cell
    saver = tf.train.Saver()
    saver.restore(sess, pathToCheckpoint)

    # Test model
    TestModelTest(sess, logdir, sentimentGraph)

INFO:tensorflow:Restoring parameters from /home/matt/w266_saved/binaryclassifier/model.ckpt
Testing (Test) Results:
The average accuracy is:  0.9100816
The average loss is:  0.23001097
