# Binary Sentiment Classification of Amazon Food Reviews

In [1]:
import numpy as np
import pandas as pd

Paths to serialization files

In [2]:
pathToBinClassDir = '/tmp/binaryclassifier'
pathToWordId      = '/tmp/binaryclassifier/wordId.npy'
pathToCheckpoint  = '/tmp/binaryclassifier/model.ckpt'

RNN Model Parameters

In [3]:
batchSize = 24
numClasses = 2  # Binary classification
hiddenSize = 50

assert(batchSize % numClasses == 0)

## Word Vectors

In [4]:
from pathlib import Path
import os.path
wordsList = np.load(os.path.join(str(Path.home()), '.kaggle/wordvectors/pretrained_glove/wordsList.npy'))
wordsList = wordsList.tolist() #Originally loaded as numpy array
wordsList = [word.decode('UTF-8') for word in wordsList] #Encode words as UTF-8
wordVectors = np.load(os.path.join(str(Path.home()), '.kaggle/wordvectors/pretrained_glove/wordVectors.npy'))

Word Vectors have dimension 50

In [5]:
embedding_dimension = 50

## Reviews

Load the Reviews

In [6]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /home/matt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
review_df = pd.read_csv('~/.kaggle/datasets/snap/amazon-fine-food-reviews/Reviews.csv', encoding='utf8')
review_df = review_df.drop(['ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Summary'], axis=1)

Use the one and five star reviews

In [8]:
one_df = review_df[review_df.Score == 1]
one_df.reset_index(inplace=True)

two_df = review_df[review_df.Score == 2]
two_df.reset_index(inplace=True)

four_df = review_df[review_df.Score == 4]
four_df.reset_index(inplace=True)

five_df = review_df[review_df.Score == 5]
five_df.reset_index(inplace=True)

Limit number of ratings for development

In [9]:
max_num_ratings = 12500
one_df = one_df[0:max_num_ratings]
two_df = two_df[0:max_num_ratings]
four_df = four_df[0:max_num_ratings]
five_df = five_df[0:max_num_ratings]

Define the size of the train, dev, and test datasets

In [10]:
import math

# Train with 60%, , Dev: 10%, Test: 30%
train_percent = 0.6
dev_percent = 0.1
test_percent = 0.3

# Get indicies of the rows in the dataframe for training and testing
train_lower_index = 0
dev_lower_index   = math.floor(train_percent*max_num_ratings)
test_lower_index  = math.floor( (train_percent+dev_percent)*max_num_ratings )

train_size = dev_lower_index - train_lower_index
dev_size   = test_lower_index - dev_lower_index
test_size  = max_num_ratings - test_lower_index

Remove punctuation, lowercase, and then tokenize the reviews.  The tokens need to be lowercase for the embedding lookup.

In [11]:
import re

strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

def cleanSentences(string):
    return word_tokenize(re.sub(strip_special_chars, " ", string.lower()))

one_df['Tokens'] = one_df['Text'].apply(lambda text: cleanSentences(text))
two_df['Tokens'] = two_df['Text'].apply(lambda text: cleanSentences(text))
four_df['Tokens'] = four_df['Text'].apply(lambda text: cleanSentences(text))
five_df['Tokens'] = five_df['Text'].apply(lambda text: cleanSentences(text))

Create the array of input sentences converted to word IDs. 
One extra integer to store the review ID

In [12]:
maxSeqLength = 267 # From EDA

In [13]:
word_ids = np.zeros((4*max_num_ratings, maxSeqLength+1), dtype='int32')

Convert words to word IDs and store in word_ids

In [14]:
from pathlib import Path
word_id_file = Path(pathToWordId)

if not word_id_file.exists():

    sentence_index = 0

    for df in [one_df, two_df, four_df, five_df]:

        for _, row in df.iterrows():

            # Store the review Id for identifying misclassified reviews in testing
            word_index = 0
            word_ids[sentence_index][word_index] = row['Id']
            word_index = word_index + 1

            for word in row['Tokens']:

                try:
                    word_ids[sentence_index][word_index] = wordsList.index(word)
                except ValueError:
                    word_ids[sentence_index][word_index] = 399999 #Vector for unkown words

                word_index = word_index + 1

                if word_index == maxSeqLength:
                    break

            sentence_index = sentence_index + 1

    binClassDir = Path(pathToBinClassDir)
    
    if not binClassDir.exists():
        os.mkdir(pathToBinClassDir)
    
    np.save(pathToWordId, word_ids)
else:
    word_ids = np.load(pathToWordId)

Create functions to get the train and test batches

In [15]:
from random import randint

def getBalancedReviews(sectionOffset, sectionSize):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    ids = np.zeros(batchSize)
    
    for i in range(batchSize):
        
        rating = i % 4
        
        if (rating == 0): 
            num = randint(0,sectionSize-1)
            labels.append([1, 0])
        elif (rating == 1): 
            num = randint(1*sectionSize,2*sectionSize-1)
            labels.append([1, 0])
        elif (rating == 2): 
            num = randint(2*sectionSize,3*sectionSize-1)
            labels.append([0, 1])
        elif (rating == 3): 
            num = randint(3*sectionSize,4*sectionSize-1)
            labels.append([0, 1])
        
        num = num + sectionOffset
        arr[i] = word_ids[num, 1:]
        ids[i] = word_ids[num, 0]
        
    return arr, labels, ids

def getRandomReviews(sectionOffset, sectionSize):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    ids = np.zeros(batchSize)
    
    for i in range(batchSize):
        
        num = randint(0, 4*sectionSize-1)
        
        if (num < 2*sectionSize): 
            labels.append([1, 0])
        else: 
            labels.append([0, 1])
        
        num = num + sectionOffset
        arr[i] = word_ids[num, 1:]
        ids[i] = word_ids[num, 0]
        
    return arr, labels, ids

def getTrainBatch():
    return getBalancedReviews(train_lower_index, train_size)

def getDevBatch():
    return getRandomReviews(dev_lower_index, dev_size)

def getTestBatch():
    return getRandomReviews(test_lower_index, test_size)

## Tensorflow Model

Hyper Parameters

In [16]:
learning_rate =  0.001
dropout_keep_prob = 0.75

Model

In [17]:
import tensorflow as tf

class SentimentGraph:
    
    def __init__(self):
        self.labels = tf.placeholder(tf.float32, [batchSize, numClasses])
        self.input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])
        self.prediction = None
        self.accuracy = None
        self.loss = None
        self.optimizer = None

    def CreateGraph(self):
        data = tf.Variable(tf.zeros([batchSize, maxSeqLength, embedding_dimension]), dtype=tf.float32)
        data = tf.nn.embedding_lookup(wordVectors, self.input_data)

        lstmCell = tf.contrib.rnn.BasicLSTMCell(hiddenSize)
        lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=dropout_keep_prob)
        rnn_out, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

        W_out = tf.Variable(tf.truncated_normal([hiddenSize, numClasses]), dtype=tf.float32)
        b_out = tf.Variable(tf.constant(0.1, shape=[numClasses]), dtype=tf.float32)

        # Get the output of the last RNN cell
        rnn_out = tf.transpose(rnn_out, [1, 0, 2])
        last_cell_out = tf.gather(rnn_out, int(rnn_out.get_shape()[0]) - 1)

        # Calculate logits
        logits = (tf.matmul(last_cell_out, W_out) + b_out)

        # Calculate prediction and accuracy
        self.prediction = tf.argmax(logits,1)
        correctPred = tf.equal(self.prediction, tf.argmax(self.labels,1))
        self.accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

        # Loss function and optimizer
        self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=self.labels))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss)

## Train Model

In [18]:
import datetime

def TrainModel(session, graph):
    
    logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
    
    # Open the writer
    writer = tf.summary.FileWriter(logdir, sess.graph)
    
    tf.summary.scalar('Loss', graph.loss)
    tf.summary.scalar('Accuracy', graph.accuracy)
    merged = tf.summary.merge_all()
        
    start_time = datetime.datetime.now()
    
    i = 0  # Must stay outside the loops
    
    for epoch in range(260000):
        
        # Next Batch of reviews
        nextBatch, nextBatchLabels, reviewIds = getTrainBatch()

        feed_dict_ = {
            graph.input_data: nextBatch,
            graph.labels: nextBatchLabels
        }

        loss_, _ = session.run([graph.loss, graph.optimizer], feed_dict=feed_dict_)

        #Write summary to Tensorboard
        if (i % 10 == 0):
            summary = session.run(merged, {graph.input_data: nextBatch, graph.labels: nextBatchLabels})
            writer.add_summary(summary, i)

        if (i % 1000 == 0):
            print("Loss is: ", loss_, ", ", (datetime.datetime.now() - start_time).seconds, " seconds")

        i = i + 1
    
    # Close the writer
    writer.close()

## Test Model

In [19]:
def TestModel(session, graph):
    
    csv = open('Mispredicted_AmazonBinaryClassification.csv', 'w')
    csv.write("Id\n")
    
    accuracy_measurements = []
    loss_measurements = []
    
    for epoch in range(20):
    
        nextBatch, nextBatchLabels, reviewIds = getTestBatch()

        feed_dict = {
            graph.input_data: nextBatch,
            graph.labels: nextBatchLabels
        }

        accuracy_, loss_ = sess.run([graph.accuracy, graph.loss], feed_dict)
        
        accuracy_measurements.append(accuracy_)
        loss_measurements.append(loss_)

        if accuracy_ < 1.0:

            predictions_ = sess.run(graph.prediction, feed_dict)
            
            for index in range(len(predictions_)):

                if predictions_[index] != np.argmax(nextBatchLabels[index]):
                    csv.write(str(int(reviewIds[index])) + "\n")
                    
    print('Testing Results:')
    print('The average accuracy is: ', np.mean(accuracy_measurements))
    print('The average loss is: ', np.mean(loss_measurements))

## Train

Use the following:
tensorboard --logdir=tensorboard

In [20]:
import datetime

tf.reset_default_graph()
    
with tf.Session() as sess:
        
    # Create the graph
    sentimentGraph = SentimentGraph()
    sentimentGraph.CreateGraph()

    # Initialize the graph
    sess.run(tf.global_variables_initializer())
    
    # Train model
    TrainModel(sess, sentimentGraph)
    
    # Save the model variables
    saver = tf.train.Saver()
    saver.save(sess, pathToCheckpoint)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.

Loss is:  0.73513705 ,  0  seconds
Loss is:  0.6984404 ,  67  seconds
Loss is:  0.70522165 ,  133  seconds
Loss is:  0.68207407 ,  199  seconds
Loss is:  0.7056764 ,  265  seconds
Loss is:  0.73364943 ,  331  seconds
Loss is:  0.6982748 ,  397  seconds
Loss is:  0.6906988 ,  463  seconds
Loss is:  0.6323196 ,  530  seconds
Loss is:  0.64474493 ,  596  seconds
Loss is:  0.6914451 ,  662  seconds
Loss is:  0.6795556 ,  728  seconds
Loss is:  0.69616675 ,  794  seconds
Loss is:  0.7011967 ,  860  seconds
Loss is:  0.6767094 ,  926  seconds
Loss is:  0.6773462 ,  992  seconds
Loss is:  0.67855865 ,  1058  seconds
Loss is:  0.6835862 ,  1124  seconds
Loss is:  0.65718466 ,  1191  seconds
Loss is:  0.62862915 ,  1257  seconds
Loss is:  0.65963346 ,  1323  seconds
Loss is:  0.65871096 ,  1389  seconds
Lo

Loss is:  0.26807606 ,  13767  seconds
Loss is:  0.15837936 ,  13833  seconds
Loss is:  0.09417925 ,  13899  seconds
Loss is:  0.2542978 ,  13966  seconds
Loss is:  0.1962748 ,  14032  seconds
Loss is:  0.2528983 ,  14098  seconds
Loss is:  0.23969913 ,  14164  seconds
Loss is:  0.2147023 ,  14230  seconds
Loss is:  0.19946782 ,  14296  seconds
Loss is:  0.14208232 ,  14362  seconds
Loss is:  0.22451814 ,  14428  seconds
Loss is:  0.37149408 ,  14494  seconds
Loss is:  0.24153613 ,  14560  seconds
Loss is:  0.078781456 ,  14627  seconds
Loss is:  0.10087118 ,  14693  seconds
Loss is:  0.2157792 ,  14759  seconds
Loss is:  0.37425718 ,  14825  seconds
Loss is:  0.1001982 ,  14891  seconds
Loss is:  0.18356828 ,  14958  seconds
Loss is:  0.28114736 ,  15024  seconds
Loss is:  0.25548646 ,  15090  seconds
Loss is:  0.26080656 ,  15157  seconds
Loss is:  0.3924642 ,  15223  seconds
Loss is:  0.27326497 ,  15289  seconds
Loss is:  0.3881944 ,  15355  seconds
Loss is:  0.2668226 ,  15421  se

## Test

In [21]:
tf.reset_default_graph()
    
with tf.Session() as sess:

    # Create the graph
    sentimentGraph = SentimentGraph()
    sentimentGraph.CreateGraph()
    
    sess.run(tf.global_variables_initializer())
    
    saver = tf.train.Saver()
    saver.restore(sess, pathToCheckpoint)

    # Test model
    TestModel(sess, sentimentGraph)

INFO:tensorflow:Restoring parameters from /tmp/binaryclassifier/model.ckpt
Testing Results:
The average accuracy is:  0.7895833
The average loss is:  0.69423616
