# Neural Bag of Words for Amazon Reviews

In [1]:
from __future__ import division
import os, sys, re, json, time, datetime, shutil
import itertools, collections
from importlib import reload
from IPython.display import display, HTML

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import pandas as pd
import tensorflow as tf
# The notebook relies on the TensorFlow 1.8 Estimator API; stop early otherwise.
assert tf.__version__.startswith("1.8")

# This creates a symbolic link on python in tmp directory

# Helper libraries from the w266_common package
from w266_common import utils, vocabulary, tf_embed_viz, treeviz
from w266_common import patched_numpy_io

# Code for this assignment

import models

import nltk
from nltk.tokenize import word_tokenize

## Word Vectors

In [2]:
from pathlib import Path
import os.path
# Locations of the pretrained GloVe files under the user's home directory.
home_dir = str(Path.home())
words_path = os.path.join(home_dir, '.kaggle/wordvectors/pretrained_glove/wordsList.npy')
vectors_path = os.path.join(home_dir, '.kaggle/wordvectors/pretrained_glove/wordVectors.npy')

# The vocabulary is loaded as a NumPy array whose entries are decoded from
# UTF-8 bytes, giving a plain Python list of str.
wordsList = [raw_word.decode('UTF-8') for raw_word in np.load(words_path).tolist()]

# Array stored alongside the vocabulary (presumably one embedding row per
# word -- TODO confirm).
wordVectors = np.load(vectors_path)

## Load Reviews

In [3]:
# Metadata columns that are discarded right after loading the CSV.
unused_columns = ['Id', 'ProductId', 'UserId', 'ProfileName',
                  'HelpfulnessNumerator', 'HelpfulnessDenominator',
                  'Time', 'Summary']

review_df = pd.read_csv(
    '~/.kaggle/datasets/snap/amazon-fine-food-reviews/Reviews.csv',
    encoding='utf8',
).drop(unused_columns, axis=1)

In [4]:
# Reduce size for development
#numReviews = 100
#review_df = review_df.loc[0:numReviews-1]

## Tokenize

In [5]:
def _lowercase_tokens(text):
    """Return the NLTK word tokens of `text` after lower-casing it."""
    return word_tokenize(text.lower())

# One token list per review, stored next to the raw text.
review_df['Tokens'] = review_df['Text'].map(_lowercase_tokens)

## Train, Dev, Test data

In [6]:
# Split fractions -- train: 60%, dev: 10%, test: 30% (whatever remains).
train_percent = 0.6
dev_percent = 0.1
test_percent = 0.3

# Row boundaries of each partition, expressed as half-open [lower, upper)
# ranges because they are consumed by Python slices, which exclude the upper
# bound.  Each partition therefore starts exactly where the previous one
# ends.  Treating the bounds as inclusive ("+1" between partitions and
# "len - 1" at the end) would leave one row out at every boundary and also
# drop the last row of the dataset.
num_reviews = len(review_df)
train_lower_index = 0
train_upper_index = train_lower_index + round(num_reviews * train_percent)
dev_lower_index   = train_upper_index
dev_upper_index   = dev_lower_index + round(num_reviews * dev_percent)
test_lower_index  = dev_upper_index
test_upper_index  = num_reviews  # the test set takes every remaining row

In [7]:
# Number of token-id slots per review: it sets the width of `word_ids`, and
# the id-filling loop stops reading a review once this many tokens are stored.
maxSeqLength = 267  # Determined by EDA

In [8]:
# Zero-initialised int32 buffers with one row/entry per review:
#   word_ids        -- token ids, maxSeqLength slots per review
#   word_ids_ns     -- number of token ids actually stored for the review
#   word_ids_labels -- binary class label
word_ids = np.zeros((len(review_df), maxSeqLength), dtype=np.int32)
word_ids_ns = np.zeros(len(review_df), dtype=np.int32)
word_ids_labels = np.zeros_like(word_ids_ns)

In [9]:
# Map every vocabulary word to its position once, so that each token lookup
# is a constant-time dict access instead of a linear `list.index` scan over
# the whole vocabulary.  `setdefault` keeps the first position of a word that
# appears more than once, which is the position `list.index` would return.
word_to_id = {}
for vocab_index, vocab_word in enumerate(wordsList):
    word_to_id.setdefault(vocab_word, vocab_index)

UNK_ID = 399999  # Vector for unknown words

# Walk the reviews positionally so that row i of the DataFrame always lands
# in row i of the preallocated arrays, whatever the DataFrame index labels are.
for sentence_index, (tokens, score) in enumerate(zip(review_df['Tokens'], review_df['Score'])):

    # Keep at most maxSeqLength tokens; unused slots stay at their initial 0.
    ids = [word_to_id.get(word, UNK_ID) for word in tokens[:maxSeqLength]]
    if ids:
        word_ids[sentence_index, :len(ids)] = ids

    # Number of token ids stored for this review.
    word_ids_ns[sentence_index] = len(ids)

    # Binary label: a score of 3 or more is class 1, anything lower is class 0.
    word_ids_labels[sentence_index] = 1 if score >= 3 else 0

In [10]:
def Split(lower_idx, upper_idx):
    """Slice the three parallel review arrays over rows [lower_idx, upper_idx).

    Args:
        lower_idx: first row to include.
        upper_idx: row at which to stop; it is not included (slice semantics).

    Returns:
        Tuple (ids, ns, labels) taken from the module-level `word_ids`,
        `word_ids_ns` and `word_ids_labels` respectively.
    """
    rows = slice(lower_idx, upper_idx)
    return word_ids[rows], word_ids_ns[rows], word_ids_labels[rows]

In [11]:
# Materialise the three partitions; each call yields (ids, ns, labels).
train_x, train_ns, train_y = Split(lower_idx=train_lower_index,
                                   upper_idx=train_upper_index)
dev_x, dev_ns, dev_y = Split(lower_idx=dev_lower_index,
                             upper_idx=dev_upper_index)
test_x, test_ns, test_y = Split(lower_idx=test_lower_index,
                                upper_idx=test_upper_index)

## Model Parameters

In [12]:
import models; reload(models)

<module 'models' from '/home/matt/w266_final_project/NBOW/models.py'>

In [13]:
# Hyperparameters consumed by model_fn.
model_params = {
    'V': len(wordsList),        # vocabulary size = number of loaded words
    'embed_dim': 50,
    'hidden_dims': [25],
    'num_classes': 2,           # 2 for binary classifier
    'encoder_type': 'bow',
    'lr': 0.1,
    'optimizer': 'adagrad',
    'beta': 0.01,
    'dropout_rate': 0.1,
}

## Training Parameters

In [14]:
# Training schedule: mini-batch size, total number of epochs, and how many
# epochs to train between two evaluations.
train_params = {
    'batch_size': 32,
    'total_epochs': 20,
    'eval_every': 2,
}

In [15]:
# Minute-resolution timestamp that names this run's checkpoint directory.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M")
checkpoint_dir = "/tmp/tf_bow_sst_" + run_stamp

# Start from a clean directory if a run from the same minute left one behind.
stale_run_exists = os.path.isdir(checkpoint_dir)
if stale_run_exists:
    shutil.rmtree(checkpoint_dir)
#ds.vocab.write_projector_config(checkpoint_dir, "Encoder/Embedding_Layer/W_embed")

In [16]:
# Estimator wrapping the classifier model function; checkpoints and summaries
# are written to checkpoint_dir.
model = tf.estimator.Estimator(
    model_fn=models.classifier_model_fn,
    model_dir=checkpoint_dir,
    params=model_params,
)

# Tell the reader how to follow the run in TensorBoard.
tensorboard_cmd = "    tensorboard --logdir='{:s}' --port 6006".format(checkpoint_dir)
print("\nTo view training (once it starts), run:\n")
print(tensorboard_cmd)
print("\nThen in your browser, open: http://localhost:6006\n")

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_service': None, '_is_chief': True, '_session_config': None, '_keep_checkpoint_max': 5, '_model_dir': '/tmp/tf_bow_sst_20180716-1957', '_save_checkpoints_steps': None, '_task_type': 'worker', '_save_summary_steps': 100, '_master': '', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fddaa5f77f0>, '_evaluation_master': '', '_train_distribute': None, '_tf_random_seed': None, '_task_id': 0, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_global_id_in_cluster': 0, '_num_worker_replicas': 1, '_keep_checkpoint_every_n_hours': 10000, '_num_ps_replicas': 0}

To view training (once it starts), run:

    tensorboard --logdir='/tmp/tf_bow_sst_20180716-1957' --port 6006

Then in your browser, open: http://localhost:6006



In [17]:
# Feature dictionaries: token ids ("ids") and stored-token counts ("ns").
train_features = {"ids": train_x, "ns": train_ns}
dev_features = {"ids": dev_x, "ns": dev_ns}

# Training input: shuffled with a fixed seed, and limited to `eval_every`
# epochs per call.
train_input_fn = patched_numpy_io.numpy_input_fn(
    x=train_features,
    y=train_y,
    batch_size=train_params['batch_size'],
    num_epochs=train_params['eval_every'],
    shuffle=True,
    seed=42,
)

# Dev input: a single in-order pass over the dev partition.
dev_input_fn = patched_numpy_io.numpy_input_fn(
    x=dev_features,
    y=dev_y,
    batch_size=128,
    num_epochs=1,
    shuffle=False,
)

## Train

In [None]:
# Alternate training and dev evaluation; the number of rounds is the total
# epoch budget divided by the epochs run between evaluations.
num_rounds = train_params['total_epochs'] // train_params['eval_every']
for _ in range(num_rounds):
    model.train(input_fn=train_input_fn)
    model.evaluate(input_fn=dev_input_fn, name="dev")

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tf_bow_sst_20180716-1957/model.ckpt.
INFO:tensorflow:step = 1, loss = 1.0669179
INFO:tensorflow:global_step/sec: 133.706
INFO:tensorflow:step = 101, loss = 0.8419497 (0.749 sec)
INFO:tensorflow:global_step/sec: 151.011
INFO:tensorflow:step = 201, loss = 0.69379115 (0.662 sec)
INFO:tensorflow:global_step/sec: 150.749
INFO:tensorflow:step = 301, loss = 0.65251577 (0.663 sec)
INFO:tensorflow:global_step/sec: 151.319
INFO:tensorflow:step = 401, loss = 0.5651376 (0.661 sec)
INFO:tensorflow:global_step/sec: 151.567
INFO:tensorflow:step = 501, loss = 0.6205501 (0.660 sec)
INFO:tensorflow:global_step/sec: 154.354
INFO:tensorflow:step = 601, loss = 0.80715436 (0.648 sec)
INFO:tensorflow:global_step/sec: 153.455


INFO:tensorflow:global_step/sec: 154.074
INFO:tensorflow:step = 8001, loss = 0.46782455 (0.649 sec)
INFO:tensorflow:global_step/sec: 153.441
INFO:tensorflow:step = 8101, loss = 0.28226662 (0.652 sec)
INFO:tensorflow:global_step/sec: 154.838
INFO:tensorflow:step = 8201, loss = 0.48881003 (0.646 sec)
INFO:tensorflow:global_step/sec: 152.425
INFO:tensorflow:step = 8301, loss = 0.1522807 (0.656 sec)
INFO:tensorflow:global_step/sec: 154.177
INFO:tensorflow:step = 8401, loss = 0.24422354 (0.649 sec)
INFO:tensorflow:global_step/sec: 151.991
INFO:tensorflow:step = 8501, loss = 0.23373565 (0.658 sec)
INFO:tensorflow:global_step/sec: 153.661
INFO:tensorflow:step = 8601, loss = 0.33318967 (0.651 sec)
INFO:tensorflow:global_step/sec: 154.358
INFO:tensorflow:step = 8701, loss = 0.29639566 (0.648 sec)
INFO:tensorflow:global_step/sec: 157.611
INFO:tensorflow:step = 8801, loss = 0.43716794 (0.635 sec)
INFO:tensorflow:global_step/sec: 150.613
INFO:tensorflow:step = 8901, loss = 0.43872315 (0.664 sec)
I

INFO:tensorflow:global_step/sec: 153.79
INFO:tensorflow:step = 16201, loss = 0.1954542 (0.650 sec)
INFO:tensorflow:global_step/sec: 152.5
INFO:tensorflow:step = 16301, loss = 0.2495805 (0.656 sec)
INFO:tensorflow:global_step/sec: 152.228
INFO:tensorflow:step = 16401, loss = 0.2959849 (0.657 sec)
INFO:tensorflow:global_step/sec: 154.735
INFO:tensorflow:step = 16501, loss = 0.31862655 (0.646 sec)
INFO:tensorflow:global_step/sec: 153.119
INFO:tensorflow:step = 16601, loss = 0.73399556 (0.653 sec)
INFO:tensorflow:global_step/sec: 153.573
INFO:tensorflow:step = 16701, loss = 0.3150748 (0.651 sec)
INFO:tensorflow:global_step/sec: 153.088
INFO:tensorflow:step = 16801, loss = 0.31691894 (0.653 sec)
INFO:tensorflow:global_step/sec: 151.083
INFO:tensorflow:step = 16901, loss = 0.31591785 (0.662 sec)
INFO:tensorflow:global_step/sec: 155.219
INFO:tensorflow:step = 17001, loss = 0.27460468 (0.644 sec)
INFO:tensorflow:global_step/sec: 153.082
INFO:tensorflow:step = 17101, loss = 0.24159057 (0.653 se

INFO:tensorflow:step = 23218, loss = 0.26645288 (0.674 sec)
INFO:tensorflow:global_step/sec: 144.82
INFO:tensorflow:step = 23318, loss = 0.24123272 (0.691 sec)
INFO:tensorflow:global_step/sec: 146.28
INFO:tensorflow:step = 23418, loss = 0.22011092 (0.684 sec)
INFO:tensorflow:global_step/sec: 149.092
INFO:tensorflow:step = 23518, loss = 0.40201825 (0.671 sec)
INFO:tensorflow:global_step/sec: 147.504
INFO:tensorflow:step = 23618, loss = 0.22855562 (0.678 sec)
INFO:tensorflow:global_step/sec: 147.456
INFO:tensorflow:step = 23718, loss = 0.3188772 (0.678 sec)
INFO:tensorflow:global_step/sec: 150.159
INFO:tensorflow:step = 23818, loss = 0.23210402 (0.666 sec)
INFO:tensorflow:global_step/sec: 145.786
INFO:tensorflow:step = 23918, loss = 0.456314 (0.686 sec)
INFO:tensorflow:global_step/sec: 146.648
INFO:tensorflow:step = 24018, loss = 0.26345894 (0.682 sec)
INFO:tensorflow:global_step/sec: 148.251
INFO:tensorflow:step = 24118, loss = 0.19339655 (0.675 sec)
INFO:tensorflow:global_step/sec: 148

INFO:tensorflow:global_step/sec: 149.382
INFO:tensorflow:step = 31418, loss = 0.30943364 (0.669 sec)
INFO:tensorflow:global_step/sec: 147.12
INFO:tensorflow:step = 31518, loss = 0.1802097 (0.680 sec)
INFO:tensorflow:global_step/sec: 145.456
INFO:tensorflow:step = 31618, loss = 0.28916904 (0.687 sec)
INFO:tensorflow:global_step/sec: 148.023
INFO:tensorflow:step = 31718, loss = 0.2195368 (0.676 sec)
INFO:tensorflow:global_step/sec: 151.589
INFO:tensorflow:step = 31818, loss = 0.2431789 (0.660 sec)
INFO:tensorflow:global_step/sec: 145.226
INFO:tensorflow:step = 31918, loss = 0.17933375 (0.689 sec)
INFO:tensorflow:global_step/sec: 144.999
INFO:tensorflow:step = 32018, loss = 0.19065793 (0.690 sec)
INFO:tensorflow:global_step/sec: 151.103
INFO:tensorflow:step = 32118, loss = 0.30745992 (0.662 sec)
INFO:tensorflow:global_step/sec: 146.623
INFO:tensorflow:step = 32218, loss = 0.43614066 (0.682 sec)
INFO:tensorflow:global_step/sec: 153.115
INFO:tensorflow:step = 32318, loss = 0.18155742 (0.653

INFO:tensorflow:global_step/sec: 150.443
INFO:tensorflow:step = 39618, loss = 0.19555624 (0.664 sec)
INFO:tensorflow:global_step/sec: 152.761
INFO:tensorflow:step = 39718, loss = 0.28709245 (0.655 sec)
INFO:tensorflow:global_step/sec: 148.371
INFO:tensorflow:step = 39818, loss = 0.27633265 (0.674 sec)
INFO:tensorflow:global_step/sec: 147.973
INFO:tensorflow:step = 39918, loss = 0.124420375 (0.676 sec)
INFO:tensorflow:global_step/sec: 151.346
INFO:tensorflow:step = 40018, loss = 0.41222662 (0.661 sec)
INFO:tensorflow:global_step/sec: 146.109
INFO:tensorflow:step = 40118, loss = 0.29541996 (0.685 sec)
INFO:tensorflow:global_step/sec: 147.87
INFO:tensorflow:step = 40218, loss = 0.34863433 (0.676 sec)
INFO:tensorflow:global_step/sec: 151.06
INFO:tensorflow:step = 40318, loss = 0.3571859 (0.662 sec)
INFO:tensorflow:global_step/sec: 150.262
INFO:tensorflow:step = 40418, loss = 0.20200679 (0.667 sec)
INFO:tensorflow:global_step/sec: 149.455
INFO:tensorflow:step = 40518, loss = 0.26760036 (0.6

INFO:tensorflow:global_step/sec: 148.271
INFO:tensorflow:step = 46635, loss = 0.3315764 (0.674 sec)
INFO:tensorflow:global_step/sec: 147.075
INFO:tensorflow:step = 46735, loss = 0.26368695 (0.680 sec)
INFO:tensorflow:global_step/sec: 147.904
INFO:tensorflow:step = 46835, loss = 0.21220496 (0.676 sec)
INFO:tensorflow:global_step/sec: 150.293
INFO:tensorflow:step = 46935, loss = 0.17751189 (0.665 sec)
INFO:tensorflow:global_step/sec: 148.647
INFO:tensorflow:step = 47035, loss = 0.2522796 (0.673 sec)
INFO:tensorflow:global_step/sec: 148.678
INFO:tensorflow:step = 47135, loss = 0.33620104 (0.673 sec)
INFO:tensorflow:global_step/sec: 148.045
INFO:tensorflow:step = 47235, loss = 0.23828769 (0.675 sec)
INFO:tensorflow:global_step/sec: 147.356
INFO:tensorflow:step = 47335, loss = 0.1510463 (0.679 sec)
INFO:tensorflow:global_step/sec: 146.296
INFO:tensorflow:step = 47435, loss = 0.5819788 (0.683 sec)
INFO:tensorflow:global_step/sec: 149.959
INFO:tensorflow:step = 47535, loss = 0.22551681 (0.667

INFO:tensorflow:step = 54735, loss = 0.3303502 (0.666 sec)
INFO:tensorflow:global_step/sec: 149.905
INFO:tensorflow:step = 54835, loss = 0.124507174 (0.667 sec)
INFO:tensorflow:global_step/sec: 146.967
INFO:tensorflow:step = 54935, loss = 0.3473299 (0.681 sec)
INFO:tensorflow:global_step/sec: 149.418
INFO:tensorflow:step = 55035, loss = 0.21864077 (0.669 sec)
INFO:tensorflow:global_step/sec: 151.989
INFO:tensorflow:step = 55135, loss = 0.30540618 (0.658 sec)
INFO:tensorflow:global_step/sec: 146.685
INFO:tensorflow:step = 55235, loss = 0.27373025 (0.682 sec)
INFO:tensorflow:global_step/sec: 150.428
INFO:tensorflow:step = 55335, loss = 0.2692796 (0.665 sec)
INFO:tensorflow:global_step/sec: 150.704
INFO:tensorflow:step = 55435, loss = 0.3391435 (0.664 sec)
INFO:tensorflow:global_step/sec: 145.995
INFO:tensorflow:step = 55535, loss = 0.2861011 (0.685 sec)
INFO:tensorflow:global_step/sec: 146.534
INFO:tensorflow:step = 55635, loss = 0.2614666 (0.682 sec)
INFO:tensorflow:global_step/sec: 148

INFO:tensorflow:global_step/sec: 148.832
INFO:tensorflow:step = 62935, loss = 0.23772784 (0.672 sec)
INFO:tensorflow:global_step/sec: 148.418
INFO:tensorflow:step = 63035, loss = 0.3942246 (0.674 sec)
INFO:tensorflow:global_step/sec: 150.657
INFO:tensorflow:step = 63135, loss = 0.14987148 (0.664 sec)
INFO:tensorflow:global_step/sec: 148.265
INFO:tensorflow:step = 63235, loss = 0.17430514 (0.675 sec)
INFO:tensorflow:global_step/sec: 152.667
INFO:tensorflow:step = 63335, loss = 0.34184623 (0.655 sec)
INFO:tensorflow:global_step/sec: 146.452
INFO:tensorflow:step = 63435, loss = 0.22505178 (0.684 sec)
INFO:tensorflow:global_step/sec: 145.872
INFO:tensorflow:step = 63535, loss = 0.45064646 (0.684 sec)
INFO:tensorflow:global_step/sec: 149.239
INFO:tensorflow:step = 63635, loss = 0.15170148 (0.670 sec)
INFO:tensorflow:global_step/sec: 149.058
INFO:tensorflow:step = 63735, loss = 0.29827106 (0.671 sec)
INFO:tensorflow:global_step/sec: 147.87
INFO:tensorflow:step = 63835, loss = 0.38204017 (0.6

INFO:tensorflow:global_step/sec: 112.525
INFO:tensorflow:step = 69952, loss = 0.2086315 (0.889 sec)
INFO:tensorflow:global_step/sec: 113.864
INFO:tensorflow:step = 70052, loss = 0.18232298 (0.879 sec)
INFO:tensorflow:global_step/sec: 115.39
INFO:tensorflow:step = 70152, loss = 0.23467731 (0.867 sec)
INFO:tensorflow:global_step/sec: 112.58
INFO:tensorflow:step = 70252, loss = 0.109007604 (0.888 sec)
INFO:tensorflow:global_step/sec: 113.841
INFO:tensorflow:step = 70352, loss = 0.20999017 (0.878 sec)
INFO:tensorflow:global_step/sec: 118.58
INFO:tensorflow:step = 70452, loss = 0.29199043 (0.843 sec)
INFO:tensorflow:global_step/sec: 114.093
INFO:tensorflow:step = 70552, loss = 0.20067357 (0.877 sec)
INFO:tensorflow:global_step/sec: 114.605
INFO:tensorflow:step = 70652, loss = 0.14576843 (0.872 sec)
INFO:tensorflow:global_step/sec: 113.601
INFO:tensorflow:step = 70752, loss = 0.18800843 (0.880 sec)
INFO:tensorflow:global_step/sec: 115.341
INFO:tensorflow:step = 70852, loss = 0.26044437 (0.86

INFO:tensorflow:global_step/sec: 119.79
INFO:tensorflow:step = 78152, loss = 0.25484428 (0.835 sec)
INFO:tensorflow:global_step/sec: 117.558
INFO:tensorflow:step = 78252, loss = 0.20307812 (0.851 sec)
INFO:tensorflow:global_step/sec: 114.972
INFO:tensorflow:step = 78352, loss = 0.14989738 (0.869 sec)
INFO:tensorflow:global_step/sec: 117.703
INFO:tensorflow:step = 78452, loss = 0.3162644 (0.850 sec)
INFO:tensorflow:global_step/sec: 111.961
INFO:tensorflow:step = 78552, loss = 0.20752393 (0.893 sec)
INFO:tensorflow:global_step/sec: 114.953
INFO:tensorflow:step = 78652, loss = 0.1471021 (0.870 sec)
INFO:tensorflow:global_step/sec: 117.194
INFO:tensorflow:step = 78752, loss = 0.22612415 (0.855 sec)
INFO:tensorflow:global_step/sec: 116.417
INFO:tensorflow:step = 78852, loss = 0.366576 (0.857 sec)
INFO:tensorflow:global_step/sec: 114.256
INFO:tensorflow:step = 78952, loss = 0.22439986 (0.877 sec)
INFO:tensorflow:global_step/sec: 113.969
INFO:tensorflow:step = 79052, loss = 0.321012 (0.876 se

INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 85269 into /tmp/tf_bow_sst_20180716-1957/model.ckpt.
INFO:tensorflow:step = 85269, loss = 0.1860573
INFO:tensorflow:global_step/sec: 97.3188
INFO:tensorflow:step = 85369, loss = 0.29845554 (1.025 sec)
INFO:tensorflow:global_step/sec: 113.535
INFO:tensorflow:step = 85469, loss = 0.20672464 (0.880 sec)
INFO:tensorflow:global_step/sec: 116.871
INFO:tensorflow:step = 85569, loss = 0.33948398 (0.856 sec)
INFO:tensorflow:global_step/sec: 114.206
INFO:tensorflow:step = 85669, loss = 0.37586808 (0.875 sec)
INFO:tensorflow:global_step/sec: 113.938
INFO:tensorflow:step = 85769, loss = 0.3255954 (0.883 sec)
INFO:tensorflow:global_step/sec: 117.466
INFO:tensorflow:step = 85869, loss = 0.420182 (0.848 sec)
INFO:tensorflow:global_step/sec: 115.725
INFO:tensorflow:step = 85969, loss = 0.20294131 (0.865 sec)
INFO:tensorflow:global_step/sec: 113.933
INFO:tensorflow:step = 86069, loss = 0.15488249 (0.877 sec)
INFO:tensorf

## Test

In [None]:
# Single in-order pass over the held-out test partition.
test_features = {"ids": test_x, "ns": test_ns}
test_input_fn = patched_numpy_io.numpy_input_fn(
    x=test_features,
    y=test_y,
    batch_size=128,
    num_epochs=1,
    shuffle=False,
)

eval_metrics = model.evaluate(input_fn=test_input_fn, name="test")
test_accuracy = eval_metrics['accuracy']
print("Accuracy on test set: {:.02%}".format(test_accuracy))