In [1]:
import numpy as np
import pandas as pd

RNN Model Parameters

In [2]:
# Number of reviews fed to the network per training step.
batchSize = 24
#lstmUnits = 64 # Not used
# Width of the output layer and of each one-hot label vector.
numClasses = 2  # Binary classification
#numClasses = 5
# Number of units in the LSTM cell.
hiddenSize = 50

## Word Vectors

In [3]:
from pathlib import Path
import os.path
# Locate the pretrained word-vector files under the user's home directory.
home_dir = str(Path.home())
words_list_path = os.path.join(home_dir, '.kaggle/wordvectors/pretrained_glove/wordsList.npy')
word_vectors_path = os.path.join(home_dir, '.kaggle/wordvectors/pretrained_glove/wordVectors.npy')

# The vocabulary is stored as a numpy array of byte strings; turn it into a
# plain Python list of str by decoding every entry from UTF-8.
raw_words = np.load(words_list_path).tolist()
wordsList = [raw_word.decode('UTF-8') for raw_word in raw_words]

# Word-vector array loaded alongside the vocabulary.
# NOTE(review): presumably row i is the vector for wordsList[i] — confirm.
wordVectors = np.load(word_vectors_path)

Word Vectors have dimension 50

In [4]:
# Dimensionality of each pretrained word vector.
embedding_dimension = 50

## Reviews

Load the Reviews

In [5]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /home/matt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Columns present in Reviews.csv that this notebook does not use.
unused_columns = [
    'Id', 'ProductId', 'UserId', 'ProfileName',
    'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Summary',
]

# Read the raw reviews and discard the unused columns in one chained expression.
review_df = (
    pd.read_csv('~/.kaggle/datasets/snap/amazon-fine-food-reviews/Reviews.csv', encoding='utf8')
    .drop(unused_columns, axis=1)
)

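For development, use only a subset of the reviews for simplicity and speed: the 1-star and 5-star reviews are separated here and truncated to `numReviews` rows each further below.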

In [7]:
# Split the reviews into the two classes used for binary classification:
# 1-star reviews and 5-star reviews.
# (A previous bare `one_df.reset_index()` call was removed here: reset_index()
# returns a new frame, so the discarded call had no effect on one_df.)
one_df = review_df[review_df.Score == 1]

five_df = review_df[review_df.Score == 5]

# Display only: reset_index() returns a NEW frame, so five_df itself keeps the
# original row labels from review_df.
five_df.reset_index()

Unnamed: 0,index,Score,Text
0,0,5,I have bought several of the Vitality canned d...
1,4,5,Great taffy at a great price. There was a wid...
2,6,5,This saltwater taffy had great flavors and was...
3,7,5,This taffy is so good. It is very soft and ch...
4,8,5,Right now I'm mostly just sprouting this so my...
5,9,5,This is a very healthy dog food. Good for thei...
6,10,5,I don't know if it's the cactus or the tequila...
7,11,5,One of my boys needed to lose some weight and ...
8,14,5,The Strawberry Twizzlers are my guilty pleasur...
9,15,5,My daughter loves twizzlers and this shipment ...


In [8]:
#numReviews = 12500  # This works
# Number of reviews kept PER CLASS (1-star and 5-star); word_ids is allocated
# with 2*numReviews rows.
numReviews = 25000

In [9]:
# Keep at most numReviews rows per class.
# .copy() makes each result an independent frame, so adding the 'Tokens' column
# later does not write into a slice of review_df (which raises pandas'
# SettingWithCopyWarning and may silently fail to set the column).
one_df = one_df[0:numReviews].copy()
five_df = five_df[0:numReviews].copy()

Lowercase, and then tokenize the reviews.  The tokens need to be lowercase for the embedding lookup.

In [10]:
# one_df['Length'] = one_df['Tokens'].apply(lambda token_list: len(token_list))
# one_df['Length'].min()
# five_df['Length'] = five_df['Tokens'].apply(lambda token_list: len(token_list))
# five_df['Length'].min()

In [11]:
import re

# Matches every run of characters that is not an ASCII letter, digit or space.
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")


def cleanSentences(string):
    """Lowercase `string`, replace special-character runs with a space, and tokenize.

    Returns the list of tokens produced by nltk's word_tokenize.
    """
    lowered = string.lower()
    without_specials = strip_special_chars.sub(" ", lowered)
    return word_tokenize(without_specials)


# Tokenize every review text and store the token list next to it.
one_df['Tokens'] = one_df['Text'].apply(cleanSentences)
five_df['Tokens'] = five_df['Text'].apply(cleanSentences)

Create the array of input sentences converted to word IDs

In [12]:
#maxSeqLength = 250  # Determined by EDA
# Number of token positions stored per review; longer reviews are truncated to this length.
# NOTE(review): the "works" remark below was recorded for numReviews = 12500, but
# numReviews is currently set to 25000 elsewhere in this notebook — confirm it still holds.
maxSeqLength = 200  # This works, in combination with numReviews = 12500

In [13]:
# One row per review and one column per token position; positions that are
# never filled in keep the initial value 0.
word_ids = np.zeros((2*numReviews, maxSeqLength), dtype='int32')

Convert words to word IDs and store in word_ids

In [None]:
# Map every vocabulary word to the index of its FIRST occurrence in wordsList.
# This is exactly what wordsList.index(word) returns, but a dict lookup is O(1)
# per token instead of a linear scan over the whole vocabulary.
word_to_id = {}
for vocab_index, vocab_word in enumerate(wordsList):
    word_to_id.setdefault(vocab_word, vocab_index)

UNKNOWN_WORD_ID = 399999  # Vector for unknown words


def _fill_word_ids(df, first_row):
    """Write the word IDs of every review in `df` into consecutive rows of word_ids.

    Args:
        df: DataFrame with a 'Tokens' column holding one list of tokens per review.
        first_row: Row of word_ids that receives the first review of `df`.

    Only the first maxSeqLength tokens of a review are stored; words missing
    from the vocabulary are stored as UNKNOWN_WORD_ID. Positions past the end of
    a short review are left untouched.
    """
    for offset, tokens in enumerate(df['Tokens']):
        kept_tokens = tokens[:maxSeqLength]
        if not kept_tokens:
            continue  # Empty review: leave the whole row as it is
        word_ids[first_row + offset, :len(kept_tokens)] = [
            word_to_id.get(token, UNKNOWN_WORD_ID) for token in kept_tokens
        ]


# 1-star reviews occupy rows [0, numReviews) and 5-star reviews occupy rows
# [numReviews, 2*numReviews). Starting the 5-star block at a fixed row keeps the
# rows aligned with the labels assigned in getTrainBatch even when there are
# fewer than numReviews 1-star reviews.
_fill_word_ids(one_df, 0)
_fill_word_ids(five_df, numReviews)


Create functions to get the train and test batches

TODO: Look into replacing these functions with Tensorflow's data iterators

In [None]:
from random import randint

def getTrainBatch():
    """Build one random training batch.

    Even batch positions receive a random row from the first numReviews rows of
    word_ids with label [1,0]; odd positions receive a random row from the
    second numReviews rows with label [0,1].

    Returns:
        (arr, labels): arr is a [batchSize, maxSeqLength] numpy array of word
        IDs and labels is a list of batchSize one-hot label lists.
    """
    batch_labels = []
    batch_ids = np.zeros([batchSize, maxSeqLength])
    for position in range(batchSize):
        takes_first_half = (position % 2 == 0)
        if takes_first_half:
            pick = randint(1, numReviews)
            one_hot = [1,0]
        else:
            pick = randint(numReviews+1, 2*numReviews)
            one_hot = [0,1]
        batch_labels.append(one_hot)
        # pick is 1-based, so the selected review is row pick-1 of word_ids.
        batch_ids[position] = word_ids[pick-1:pick]
    return batch_ids, batch_labels

# def getTestBatch():
#     labels = []
#     arr = np.zeros([batchSize, maxSeqLength])
#     for i in range(batchSize):
#         num = randint(11499,13499)
#         if (num <= 12499):
#             labels.append([1,0])
#         else:
#             labels.append([0,1])
#         arr[i] = ids[num-1:num]
#     return arr, labels

Reset the batch iterators

## Tensorflow Model

Hyper Parameters

In [None]:
# Step size passed to the Adam optimizer.
learning_rate =  0.001
# Keep probability passed to the LSTM DropoutWrapper as output_keep_prob.
dropout_keep_prob = 0.75
#dropout_keep_prob = 0.9

Model

In [None]:
import tensorflow as tf
# Start from an empty graph so re-running this cell does not pile up duplicate ops.
tf.reset_default_graph()

# Inputs: one-hot labels and the word IDs of each review in the batch.
labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])

# Replace every word ID by its pretrained word vector.
# (The zero-initialised tf.Variable that used to be assigned to `data` first was
# overwritten immediately; it only added an unused trainable variable to the
# graph, so it has been removed.)
# NOTE(review): assumes wordVectors is a float32 array, as required by
# dynamic_rnn(dtype=tf.float32) — confirm.
data = tf.nn.embedding_lookup(wordVectors, input_data)

# Single LSTM layer with dropout applied to its outputs.
# NOTE(review): dropout_keep_prob is baked into the graph as a constant, so
# dropout is also active if this graph is used for evaluation.
lstmCell = tf.contrib.rnn.BasicLSTMCell(hiddenSize)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=dropout_keep_prob)
rnn_out, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

# Output layer parameters
W_out = tf.Variable(tf.truncated_normal([hiddenSize, numClasses]), dtype=tf.float32)
b_out = tf.Variable(tf.constant(0.1, shape=[numClasses]), dtype=tf.float32)

# Get the output of the last RNN cell: move the time axis to the front, then
# pick the final time step.
rnn_out = tf.transpose(rnn_out, [1, 0, 2])
last_cell_out = tf.gather(rnn_out, int(rnn_out.get_shape()[0]) - 1)

# Calculate logits
logits = (tf.matmul(last_cell_out, W_out) + b_out)

# Calculate prediction and accuracy
prediction = tf.argmax(logits,1)
correctPred = tf.equal(prediction, tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

# Loss function and optimizer
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

## Train Model

In [None]:
import datetime

def TrainModel(session, writer, num_steps=100000):
    """Run the training loop and log progress.

    Uses the module-level names getTrainBatch, input_data, labels, loss,
    optimizer and merged; `merged` must have been created before this function
    is called.

    Args:
        session: Session used to run the training and summary ops.
        writer: Summary writer that receives a summary every 10 steps.
        num_steps: Number of training batches to run (defaults to the
            previously hard-coded 100000).
    """
    start_time = datetime.datetime.now()

    for step in range(num_steps):

        # Next Batch of reviews
        nextBatch, nextBatchLabels = getTrainBatch()

        feed_dict_ = {
            input_data: nextBatch,
            labels: nextBatchLabels
        }

        loss_, _ = session.run([loss, optimizer], feed_dict=feed_dict_)

        # Write summary to Tensorboard
        if step % 10 == 0:
            summary = session.run(merged, feed_dict_)
            writer.add_summary(summary, step)

        if step % 500 == 0:
            # total_seconds() rather than .seconds: .seconds is only the
            # seconds component of the timedelta and wraps around after 24 hours.
            elapsed_seconds = int((datetime.datetime.now() - start_time).total_seconds())
            print("Loss is: ", loss_, ", ", elapsed_seconds, " seconds")

## Train and Test

To monitor training, run the following command from the directory that contains this notebook:
tensorboard --logdir=tensorboard

In [None]:
import datetime

with tf.Session() as sess:
    #saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    # Summaries written during training; `merged` is read by TrainModel.
    tf.summary.scalar('Loss', loss)
    tf.summary.scalar('Accuracy', accuracy)
    merged = tf.summary.merge_all()

    # One timestamped sub-directory per run so runs show up separately in TensorBoard.
    logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
    writer = tf.summary.FileWriter(logdir, sess.graph)

    try:
        # Train model
        TrainModel(sess, writer)

        # Test model
        #TestModel(sess)
    finally:
        # Close the writer even if training raises or is interrupted, so the
        # summaries written so far are flushed to disk.
        writer.close()