In [101]:
import numpy as np
import pandas as pd

RNN Model Parameters

In [102]:
batchSize = 2
lstmUnits = 64
numClasses = 2
iterations = 100000

## Word Vectors

In [103]:
from pathlib import Path
import os.path
wordsList = np.load(os.path.join(str(Path.home()), '.kaggle/wordvectors/pretrained_glove/wordsList.npy'))
wordsList = wordsList.tolist() #Originally loaded as numpy array
wordsList = [word.decode('UTF-8') for word in wordsList] #Encode words as UTF-8
wordVectors = np.load(os.path.join(str(Path.home()), '.kaggle/wordvectors/pretrained_glove/wordVectors.npy'))

Word Vectors have dimension 300

In [104]:
numDimensions = 300

## Reviews

Load the Reviews

In [105]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /home/matt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [106]:
review_df = pd.read_csv('~/.kaggle/datasets/snap/amazon-fine-food-reviews/Reviews.csv', encoding='utf8')
review_df = review_df.drop(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Summary'], axis=1)

For development, just use 1000 rows for simplicity and speed

In [107]:
review_df = review_df.loc[0:999]

Lowercase, and then tokenize the reviews.  The tokens need to be lowercase for the embedding lookup.

In [108]:
review_df['Tokens'] = review_df['Text'].apply(lambda text: word_tokenize(text.lower()))

Create the array of input sentences converted to word IDs

In [109]:
maxSeqLength = 250
numFiles = 1000

In [110]:
word_ids = np.zeros((numFiles, maxSeqLength), dtype='int32')

Convert words to word IDs and store in word_ids

In [111]:
for sentence_index, row in review_df.iterrows():
    
    word_index = 0
    
    for word in row['Tokens']:

        try:
            word_ids[sentence_index][word_index] = wordsList.index(word)
        except ValueError:
            word_ids[sentence_index][word_index] = 399999 #Vector for unkown words
        
        word_index = word_index + 1

        if word_index == maxSeqLength:
            break

Create functions to get the train and test batches

In [112]:
# Train with 70%, test with 30%
train_percent = 0.7
test_percent = 1-train_percent

# Get indicies of the rows in the dataframe for training and testing
train_lower_index = 0
train_upper_index = round(len(review_df)*train_percent)
test_lower_index = train_upper_index+1
test_upper_index = len(df)-1

In [113]:
next_train_index = 0
next_test_index = 0

TODO: Look into replacing these functions with Tensorflow's data iterators

In [125]:
def getTrainBatch(dataset):
    global next_train_index
    
    data_len = len(dataset)
    
    word_batch = np.zeros([batchSize, maxSeqLength])
    label_batch = np.zeros([batchSize, numClasses])
    
    continue_iterating = True
    
    if ( (next_train_index + batchSize) > data_len ):
        continue_iterating = False
    
    next_batch = min(batchSize, (data_len - next_train_index))

    for i in range(next_batch):
        word_batch[i] = word_ids[next_train_index + i]
        
        if review_df.iloc[index].Score >= 3:
            label_batch[i] = [0, 1]
        else:
            label_batch[i] = [1, 0]
        
    next_train_index = next_train_index + next_batch
    
    return continue_iterating, word_batch, label_batch

def getTestBatch(dataset):
    global next_test_index
    
    data_len = len(dataset)
    
    word_batch = np.zeros([batchSize, maxSeqLength])
    label_batch = np.zeros([batchSize, numClasses])
    
    continue_iterating = True
    
    if ( (next_test_index + batchSize) > data_len ):
        continue_iterating = False
    
    next_batch = min(batchSize, (data_len - next_test_index))

    for i in range(next_batch):
        word_batch[i] = word_ids[next_test_index + i]
        
        if review_df.iloc[index].Score >= 3:
            label_batch[i] = [0, 1]
        else:
            label_batch[i] = [1, 0]
        
    next_test_index = next_test_index + next_batch
    
    return continue_iterating, word_batch, label_batch

def resetBatch():
    global next_train_index
    global next_test_index
    
    next_train_index = 0
    next_test_index = 0

## Tensorflow Model

In [115]:
import tensorflow as tf
tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])

In [116]:
data = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]), dtype=tf.float32)
data = tf.nn.embedding_lookup(wordVectors, input_data)

In [117]:
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

In [118]:
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]), dtype=tf.float32)
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]), dtype=tf.float32)
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

In [119]:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)

## Train Model

In [130]:
sess = tf.InteractiveSession()
#saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())



In [131]:
import datetime

tf.summary.scalar('Loss', loss)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

In [132]:
resetBatch()

In [133]:
while True:
    
    # Next Batch of reviews
    continue_iterating, nextBatch, nextBatchLabels = getTrainBatch(review_df)
    
    feed_dict_ = {
        input_data: nextBatch,
        labels: nextBatchLabels
    }
    
    loss_, _ = sess.run([loss, optimizer], feed_dict=feed_dict_)
   
    #Write summary to Tensorboard
    if (i % 50 == 0):
        summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
        writer.add_summary(summary, i)

    if (i % 500 == 0):
        print("Loss is: ", loss_, "\n")
        
    if not continue_iterating:
        break

writer.close()

Loss is:  0.6931472 

Loss is:  0.6588851 

Loss is:  0.6287744 

Loss is:  0.58440405 

Loss is:  0.5427661 

Loss is:  0.5044264 

Loss is:  0.44538435 

Loss is:  0.38684908 

Loss is:  0.31886077 

Loss is:  0.26486444 

Loss is:  0.10051168 

Loss is:  0.07077278 

Loss is:  0.006014003 

Loss is:  4.726447e-05 

Loss is:  1.19209275e-07 

Loss is:  3.5762775e-07 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  5.960464e-08 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  

## Test Model

TODO: Need to be able to test one sentence at a time and display the loss

In [126]:
while True:
    
    continue_iterating, nextBatch, nextBatchLabels = getTestBatch(review_df);
    
    feed_dict = {
        input_data: nextBatch,
        labels: nextBatchLabels
    }
    
    loss_ = sess.run(loss, feed_dict)
    print("Loss is: ", loss_, "\n")
    
    if not continue_iterating:
        break

Loss is:  0.0 

Loss is:  1.788139e-07 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  5.960464e-08 

Loss is:  0.0 

Loss is:  5.960464e-08 

Loss is:  5.960464e-08 

Loss is:  0.0 

Loss is:  1.19209275e-07 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  1.1920928e-07 

Loss is:  0.0 

Loss is:  1.19209275e-07 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  8.940689e-07 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  5.960464e-08 

Loss is:  5.960464e-08 

Loss is:  0.0 

Loss is:  2.9802268e-06 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  5.960464e-08 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  5.125973e-06 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  4.1723234e-07 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  2.3841852e-07 

Loss is:  4.7683693e-07 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  5.960464e-08 

Loss is:  0.0 

Loss is:  0.0 

Loss i

Loss is:  0.0 

Loss is:  5.960464e-08 

Loss is:  4.410724e-06 

Loss is:  0.0 

Loss is:  1.7881073e-05 

Loss is:  4.768349e-06 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  1.788139e-07 

Loss is:  5.960464e-08 

Loss is:  5.9604616e-07 

Loss is:  0.0 

Loss is:  1.788139e-07 

Loss is:  0.0 

Loss is:  5.960464e-08 

Loss is:  3.5762778e-07 

Loss is:  0.0 

Loss is:  2.3841852e-07 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  6.5565064e-07 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  1.19209275e-07 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  5.960464e-08 

Loss is:  0.0 

Loss is:  5.960461e-07 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  1.19209275e-07 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  2.3841852e-07 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  5.960464e-08 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.0 

Loss is:  0.

Close the session

In [129]:
sess.close()