# Binary Sentiment Classification of Amazon Food Reviews

In [1]:
import numpy as np
import pandas as pd

Paths to serialization files

In [2]:
# Matt's paths
# pathToBinClassDir = '/home/matt/w266_saved/binaryclassifier'
# pathToWordId      = '/home/matt/w266_saved/binaryclassifier/wordId.npy'
# pathToCheckpoint  = '/home/matt/w266_saved/binaryclassifier/model.ckpt'

# Dave's paths
pathToBinClassDir = '/home/dal7p/project_model/'
pathToWordId      = '/home/dal7p/project_model/wordId.npy'
pathToCheckpoint  = '/home/dal7p/project_model/model.ckpt'

RNN Model Parameters

In [3]:
batchSize = 24
numClasses = 2  # Binary classification
hiddenSize = 50

assert(batchSize % numClasses == 0)

## Word Vectors

In [4]:
from pathlib import Path
import os.path
wordsList = np.load(os.path.join(str(Path.home()), '.kaggle/wordvectors/pretrained_glove/wordsList.npy'))
wordsList = wordsList.tolist() #Originally loaded as numpy array
wordsList = [word.decode('UTF-8') for word in wordsList] #Encode words as UTF-8
wordVectors = np.load(os.path.join(str(Path.home()), '.kaggle/wordvectors/pretrained_glove/wordVectors.npy'))

Word Vectors have dimension 50

In [5]:
embedding_dimension = 50

## Reviews

Load the Reviews

In [6]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /home/dal7p/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
review_df = pd.read_csv('~/.kaggle/datasets/snap/amazon-fine-food-reviews/Reviews.csv', encoding='utf8')
review_df = review_df.drop(['ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Summary'], axis=1)

Use the one and five star reviews

In [8]:
one_df = review_df[review_df.Score == 1]
one_df.reset_index(inplace=True)

two_df = review_df[review_df.Score == 2]
two_df.reset_index(inplace=True)

four_df = review_df[review_df.Score == 4]
four_df.reset_index(inplace=True)

five_df = review_df[review_df.Score == 5]
five_df.reset_index(inplace=True)

Limit number of ratings for development

In [9]:
max_num_ratings = 116000
rating_level_length = 29000
assert(max_num_ratings <= 4*rating_level_length)

one_df = one_df[0:rating_level_length]
two_df = two_df[0:rating_level_length]
four_df = four_df[0:rating_level_length]
five_df = five_df[0:rating_level_length]

Define the size of the train, dev, and test datasets

In [10]:
import math

# Train with 60%, , Dev: 10%, Test: 30%
train_percent = 0.6
dev_percent = 0.1
test_percent = 0.3

# Get indicies of the rows in the dataframe for training and testing
train_lower_index = 0
dev_lower_index   = math.floor(train_percent*max_num_ratings)
test_lower_index  = math.floor( (train_percent+dev_percent)*max_num_ratings )

train_size = dev_lower_index - train_lower_index
dev_size   = test_lower_index - dev_lower_index
test_size  = max_num_ratings - test_lower_index

Remove punctuation, lowercase, and then tokenize the reviews.  The tokens need to be lowercase for the embedding lookup.

In [11]:
import re

strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

def cleanSentences(string):
    return word_tokenize(re.sub(strip_special_chars, " ", string.lower()))

one_df['Tokens'] = one_df['Text'].apply(lambda text: cleanSentences(text))
two_df['Tokens'] = two_df['Text'].apply(lambda text: cleanSentences(text))
four_df['Tokens'] = four_df['Text'].apply(lambda text: cleanSentences(text))
five_df['Tokens'] = five_df['Text'].apply(lambda text: cleanSentences(text))

Create the array of input sentences converted to word IDs. 
One extra integer to store the review ID

In [12]:
maxSeqLength = 267 # From EDA

In [13]:
word_ids = np.zeros((4*rating_level_length, maxSeqLength+2), dtype='int32')

Convert words to word IDs and store in word_ids

In [14]:
from pathlib import Path
word_id_file = Path(pathToWordId)

if not word_id_file.exists():

    sentence_index = 0

    for df in [one_df, two_df, four_df, five_df]:

        for _, row in df.iterrows():

            # Store the review Id for identifying misclassified reviews in testing
            word_index = 0
            word_ids[sentence_index][word_index] = row['Id']
            word_index = word_index + 1
            word_ids[sentence_index][word_index] = row['Score']
            word_index = word_index + 1

            for word in row['Tokens']:

                try:
                    word_ids[sentence_index][word_index] = wordsList.index(word)
                except ValueError:
                    word_ids[sentence_index][word_index] = 399999 #Vector for unkown words

                word_index = word_index + 1

                if word_index == maxSeqLength:
                    break

            sentence_index = sentence_index + 1

    # Shuffle the word_ids matrix
    np.random.shuffle(word_ids)
    
    # Save the word_ids matrix
    binClassDir = Path(pathToBinClassDir)
    
    if not binClassDir.exists():
        os.mkdir(pathToBinClassDir)
    
    np.save(pathToWordId, word_ids)
else:
    word_ids = np.load(pathToWordId)

Create functions to get the train and test batches

In [15]:
reviewsTrainIndex = 0
reviewsDevIndex = 0
reviewsTestIndex = 0

def getOrderedTrainReviews(sectionOffset, sectionSize):
    global reviewsTrainIndex
    
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    ids = np.zeros(batchSize)
    finished = False
    
    for i in range(batchSize):
        
        if reviewsTrainIndex >= sectionSize:
            finished = True
            break;

        num = reviewsTrainIndex + sectionOffset
        
        if (word_ids[num, 1] == 1) or (word_ids[num, 1] == 2):
            labels.append([1, 0])
        else: 
            labels.append([0, 1])
        
        arr[i] = word_ids[num, 2:]
        ids[i] = word_ids[num, 0]
        reviewsTrainIndex += 1
        
    if reviewsTrainIndex == sectionSize:
        finished = True
        
    return arr, labels, ids, finished

def getOrderedDevReviews(sectionOffset, sectionSize):
    global reviewsDevIndex
    
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    ids = np.zeros(batchSize)
    finished = False
    
    for i in range(batchSize):
        
        if reviewsDevIndex >= sectionSize:
            finished = True
            break;

        num = reviewsDevIndex + sectionOffset
        
        if (word_ids[num, 1] == 1) or (word_ids[num, 1] == 2):
            labels.append([1, 0])
        else: 
            labels.append([0, 1])
        
        arr[i] = word_ids[num, 2:]
        ids[i] = word_ids[num, 0]
        reviewsDevIndex += 1
        
    if reviewsDevIndex == sectionSize:
        finished = True
        
    return arr, labels, ids, finished

def getOrderedTestReviews(sectionOffset, sectionSize):
    global reviewsTestIndex
    
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    ids = np.zeros(batchSize)
    finished = False
    
    for i in range(batchSize):
        
        if reviewsTestIndex >= sectionSize:
            finished = True
            break;

        num = reviewsTestIndex + sectionOffset
        
        if (word_ids[num, 1] == 1) or (word_ids[num, 1] == 2):
            labels.append([1, 0])
        else: 
            labels.append([0, 1])
        
        arr[i] = word_ids[num, 2:]
        ids[i] = word_ids[num, 0]
        reviewsTestIndex += 1
        
    if reviewsTestIndex == sectionSize:
        finished = True
        
    return arr, labels, ids, finished

def resetTrainIndex():
    global reviewsTrainIndex
    reviewsTrainIndex = 0

def resetDevTestIndicies():
    global reviewsDevIndex
    global reviewsTestIndex

    reviewsDevIndex = 0
    reviewsTestIndex = 0

def getTrainBatch():
    return getOrderedTrainReviews(train_lower_index, train_size)

def getDevBatch():
    return getOrderedDevReviews(dev_lower_index, dev_size)

def getTestBatch():
    return getOrderedTestReviews(test_lower_index, test_size)

## Tensorflow Model

Hyper Parameters

In [16]:
epochs = 100
learning_rate =  0.001
dropout_keep_prob = 0.75
num_layers = 2

Model

In [17]:
import tensorflow as tf

class SentimentGraph:
    
    def __init__(self):
        self.labels = tf.placeholder(tf.float32, [batchSize, numClasses])
        self.input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])
        self.prediction = None
        self.accuracy = None
        self.loss = None
        self.optimizer = None

    def MakeFancyRNNCell(self, H, keep_prob, num_layers=1):
        cells = []
        for _ in range(num_layers):
          cell = tf.nn.rnn_cell.BasicLSTMCell(H, forget_bias=0.0)
          cell = tf.nn.rnn_cell.DropoutWrapper(
              cell, input_keep_prob=keep_prob, output_keep_prob=keep_prob)
          cells.append(cell)
        return tf.nn.rnn_cell.MultiRNNCell(cells)

    def CreateGraph(self):
        data = tf.Variable(tf.zeros([batchSize, maxSeqLength, embedding_dimension]), dtype=tf.float32)
        data = tf.nn.embedding_lookup(wordVectors, self.input_data)

        lstmCell = self.MakeFancyRNNCell(hiddenSize, dropout_keep_prob, num_layers)
        initial_h_ = lstmCell.zero_state(batchSize, dtype=tf.float32)
        
        rnn_out, _ = tf.nn.dynamic_rnn(lstmCell,
                                       data,
                                       initial_state=initial_h_,
                                       dtype=tf.float32)

        W_out = tf.Variable(tf.truncated_normal([hiddenSize, numClasses]), dtype=tf.float32)
        b_out = tf.Variable(tf.constant(0.1, shape=[numClasses]), dtype=tf.float32)

        # Get the output of the last RNN cell
        rnn_out = tf.transpose(rnn_out, [1, 0, 2])
        last_cell_out = tf.gather(rnn_out, int(rnn_out.get_shape()[0]) - 1)

        # Calculate logits
        logits = (tf.matmul(last_cell_out, W_out) + b_out)

        # Calculate prediction and accuracy
        self.prediction = tf.argmax(logits,1)
        correctPred = tf.equal(self.prediction, tf.argmax(self.labels,1))
        self.accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

        # Loss function and optimizer
        self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=self.labels))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss)

  from ._conv import register_converters as _register_converters


## Train Model

In [None]:
import datetime

def TrainModel(session, logdir, graph):
    
    # Open the writer
    writer = tf.summary.FileWriter(logdir, sess.graph)
    
    tf.summary.scalar('Train_Loss', graph.loss)
    tf.summary.scalar('Train_Accuracy', graph.accuracy)
    merged = tf.summary.merge_all()
    start_time = datetime.datetime.now()
    i = 0  # Must stay outside the loops
    
    for epoch in range(epochs):
        
        print("Epoch: ", epoch)
        
        resetTrainIndex()
        finished = False
        
        while not finished:
            
            # Next Batch of reviews
            nextBatch, nextBatchLabels, reviewIds, finished = getTrainBatch()

            # For ease of implementation, just skip partially filled batches
            if not finished:

                feed_dict_ = {
                    graph.input_data: nextBatch,
                    graph.labels: nextBatchLabels
                }

                loss_, _ = session.run([graph.loss, graph.optimizer], feed_dict=feed_dict_)

                # Write summary to Tensorboard
                if (i % 10 == 0):
                    summary = session.run(merged, {graph.input_data: nextBatch, graph.labels: nextBatchLabels})
                    writer.add_summary(summary, i)

                if (i % 1000 == 0):
                    print("Loss is: ", loss_, ", ", (datetime.datetime.now() - start_time).seconds, " seconds, iteration: ", i)

                i += 1
    
    # Close the writer
    writer.close()

## Validate Model

Examine the Mispredicted_AmazonBinaryClassification.csv file for error analysis.

In [None]:
def TestModelDev(session, logdir, graph):

    # Support for saving mispredicted reviews
    csv = open('Mispredicted_AmazonBinaryClassification.csv', 'w')
    csv.write("Id\n")
    
    # Tensorboard support
    writer = tf.summary.FileWriter(logdir, sess.graph)
    tf.summary.scalar('Dev_Loss', graph.loss)
    tf.summary.scalar('Dev_Accuracy', graph.accuracy)
    merged = tf.summary.merge_all()
    start_time = datetime.datetime.now()
    i = 0  # Must stay outside the loops
    
    accuracy_measurements = []
    loss_measurements = []
    finished = False
    
    while not finished:
    
        nextBatch, nextBatchLabels, reviewIds, finished = getDevBatch()

        # For ease of implementation, just skip partially filled batches
        if not finished:
            feed_dict = {
                graph.input_data: nextBatch,
                graph.labels: nextBatchLabels
            }

            accuracy_, loss_ = sess.run([graph.accuracy, graph.loss], feed_dict)
            
            # Write summary to Tensorboard
            if (i % 10 == 0):
                summary = session.run(merged, {graph.input_data: nextBatch, graph.labels: nextBatchLabels})
                writer.add_summary(summary, i)

            i += 1

            accuracy_measurements.append(accuracy_)
            loss_measurements.append(loss_)
            
            # Write out mispredictions (review IDs) to a .csv file
            if accuracy_ < 1.0:
                
                predictions_ = sess.run(graph.prediction, feed_dict)

                for index in range(len(predictions_)):

                    if predictions_[index] != np.argmax(nextBatchLabels[index]):
                        csv.write(str(int(reviewIds[index])) + "\n")

    # Close the writer
    writer.close()
    
    print('Testing (Dev) Results:')
    print('The average accuracy is: ', np.mean(accuracy_measurements))
    print('The average loss is: ', np.mean(loss_measurements))

## Test Model

In [18]:
def TestModelTest(session, graph):
    
    i = 0  # Must stay outside the loops
    
    accuracy_measurements = []
    loss_measurements = []
    finished = False
    
    while not finished:
    
        nextBatch, nextBatchLabels, reviewIds, finished = getTestBatch()

        # For ease of implementation, just skip partially filled batches
        if not finished:
            feed_dict = {
                graph.input_data: nextBatch,
                graph.labels: nextBatchLabels
            }

            accuracy_, loss_ = sess.run([graph.accuracy, graph.loss], feed_dict)

            i += 1

            accuracy_measurements.append(accuracy_)
            loss_measurements.append(loss_)
              
    print('Testing (Test) Results:')
    print('The average accuracy is: ', np.mean(accuracy_measurements))
    print('The average loss is: ', np.mean(loss_measurements))

### Run prior to training or testing

In [20]:
import datetime

logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"

Use the following:
tensorboard --logdir=tensorboard

## Train

In [None]:
import datetime

tf.reset_default_graph()
    
with tf.Session() as sess:
        
    # Create the graph
    sentimentGraph = SentimentGraph()
    sentimentGraph.CreateGraph()

    # Initialize the graph
    sess.run(tf.global_variables_initializer())
    
    # Train model
    TrainModel(sess, logdir, sentimentGraph)
    
    # Save the model variables
    saver = tf.train.Saver()
    saver.save(sess, pathToCheckpoint)

### Run prior to validation and testing

In [21]:
resetDevTestIndicies()

## Validation

In [None]:
tf.reset_default_graph()
    
with tf.Session() as sess:

    # Create the graph
    sentimentGraph = SentimentGraph()
    sentimentGraph.CreateGraph()
    
    sess.run(tf.global_variables_initializer())
    
    saver = tf.train.Saver()
    saver.restore(sess, pathToCheckpoint)

    # Validate model
    TestModelDev(sess, logdir, sentimentGraph)

## Test

In [22]:
tf.reset_default_graph()
    
with tf.Session() as sess:

    # Create the graph
    sentimentGraph = SentimentGraph()
    sentimentGraph.CreateGraph()
    
    sess.run(tf.global_variables_initializer())
    
    saver = tf.train.Saver()
    saver.restore(sess, pathToCheckpoint)

    # Test model
    TestModelTest(sess, sentimentGraph)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.

INFO:tensorflow:Restoring parameters from /home/dal7p/project_model/model.ckpt
Testing (Test) Results:
The average accuracy is:  0.8760352
The average loss is:  0.2954482
