# Neural Bag of Words for Amazon Reviews

In [1]:
from __future__ import division
import os, sys, re, json, time, datetime, shutil
import itertools, collections
from importlib import reload
from IPython.display import display, HTML

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import pandas as pd
import tensorflow as tf
assert(tf.__version__.startswith("1.8"))

# Helper libraries
from w266_common import utils, vocabulary, tf_embed_viz, treeviz
from w266_common import patched_numpy_io

# Helper libraries for Dave's instance
from w266_common import utils, vocabulary, tf_embed_viz, treeviz
from w266_common import patched_numpy_io

# Code for this assignment

import models

import nltk
from nltk.tokenize import word_tokenize

  from ._conv import register_converters as _register_converters


ModuleNotFoundError: No module named 'w266_common'

## Word Vectors

In [None]:
from pathlib import Path
import os.path

# Both GloVe artifacts are resolved relative to the current user's home directory.
home_dir = str(Path.home())
words_path = os.path.join(home_dir, '.kaggle/wordvectors/pretrained_glove/wordsList.npy')
vectors_path = os.path.join(home_dir, '.kaggle/wordvectors/pretrained_glove/wordVectors.npy')

# The vocabulary file is loaded as a NumPy array whose entries are decoded
# from UTF-8 bytes, leaving a plain Python list of str.
raw_words = np.load(words_path).tolist()
wordsList = [raw_word.decode('UTF-8') for raw_word in raw_words]

# Pretrained vectors; presumably row i belongs to wordsList[i] -- TODO confirm.
wordVectors = np.load(vectors_path)

## Load Reviews

In [None]:
# Read the raw review export, then discard the columns named below.
reviews_csv = '~/.kaggle/datasets/snap/amazon-fine-food-reviews/Reviews.csv'
unused_columns = [
    'Id',
    'ProductId',
    'UserId',
    'ProfileName',
    'HelpfulnessNumerator',
    'HelpfulnessDenominator',
    'Time',
    'Summary',
]
review_df = pd.read_csv(reviews_csv, encoding='utf8')
review_df = review_df.drop(unused_columns, axis='columns')

In [None]:
# Development-only subsample: keep the rows labelled 0 through numReviews - 1.
# `.loc` slices by label and includes the end label.
numReviews = 100
last_label = numReviews - 1
review_df = review_df.loc[0:last_label]

## Tokenize

In [None]:
def _tokenize_lowercased(text):
    """Lower-case a review's text and split it into tokens with NLTK's word_tokenize."""
    return word_tokenize(text.lower())


# One token list per review, stored next to the original text.
review_df['Tokens'] = review_df['Text'].apply(_tokenize_lowercased)

## Train, Dev, Test data

In [None]:
# Split proportions: Train 60%, Dev 30%, Test 10% (whatever remains).
# NOTE(review): the earlier comment said Dev 10% / Test 30%, which contradicted
# dev_percent = 0.3. The comment now follows the code; swap the values if the
# other split was intended.
train_percent = 0.6
dev_percent = 0.3
test_percent = 1 - (train_percent + dev_percent)

# Row boundaries of each split, as half-open ranges [lower, upper).
# These bounds are consumed with Python slice semantics (arr[lower:upper]),
# where the upper bound is already excluded. Each split therefore starts exactly
# where the previous one ends, and the test split runs to len(review_df).
# Adding 1 to a lower bound, or stopping at len - 1, silently drops a row.
num_rows = len(review_df)
train_lower_index = 0
train_upper_index = train_lower_index + int(round(num_rows * train_percent))
dev_lower_index   = train_upper_index
dev_upper_index   = dev_lower_index + int(round(num_rows * dev_percent))
test_lower_index  = dev_upper_index
test_upper_index  = num_rows

In [None]:
# Maximum number of tokens kept per review; also the width of the word-id matrix.
maxSeqLength = 250  # Determined by EDA

In [None]:
# Pre-allocate zero-filled int32 buffers sized by the number of reviews:
#   word_ids        -- (reviews, maxSeqLength) matrix of vocabulary ids
#   word_ids_ns     -- number of token slots actually filled per review
#   word_ids_labels -- one class label per review
row_count = len(review_df)
word_ids = np.zeros((row_count, maxSeqLength), dtype='int32')
word_ids_ns = np.zeros(row_count, dtype='int32')
word_ids_labels = np.zeros(row_count, dtype='int32')

In [None]:
# Id written for tokens that do not appear in wordsList (vector for unknown words).
UNK_WORD_ID = 399999

# Build the word -> id table once. The previous wordsList.index(word) call
# scanned the whole vocabulary for every token. setdefault keeps the FIRST
# position of a word, which is exactly what list.index would have returned.
word_to_id = {}
for vocab_index, vocab_word in enumerate(wordsList):
    word_to_id.setdefault(vocab_word, vocab_index)

# Enumerate rows by position rather than by DataFrame index label, so that the
# row written in the NumPy buffers always matches the review's position in
# review_df, even if the frame's index is not 0..n-1.
for sentence_index, (tokens, score) in enumerate(zip(review_df['Tokens'], review_df['Score'])):

    # Reviews longer than maxSeqLength are truncated; shorter ones leave the
    # remaining slots at their initial value of 0.
    kept_tokens = tokens[:maxSeqLength]

    for word_index, word in enumerate(kept_tokens):
        word_ids[sentence_index, word_index] = word_to_id.get(word, UNK_WORD_ID)

    # Number of token slots actually filled for this review.
    word_ids_ns[sentence_index] = len(kept_tokens)

    # Binary label: scores of 3 and above map to class 1, everything else to 0.
    word_ids_labels[sentence_index] = 1 if score >= 3 else 0

In [None]:
def Split(lower_idx, upper_idx):
    """Return the slice [lower_idx:upper_idx] of the three module-level buffers.

    Args:
        lower_idx: first row included in the slice.
        upper_idx: row at which the slice stops (excluded, as in any Python slice).

    Returns:
        A tuple (ids, lengths, labels) taken from word_ids, word_ids_ns and
        word_ids_labels respectively, all cut with the same bounds.
    """
    rows = slice(lower_idx, upper_idx)
    ids = word_ids[rows]
    lengths = word_ids_ns[rows]
    labels = word_ids_labels[rows]
    return ids, lengths, labels

In [None]:
# Cut the id matrix, the per-review lengths and the labels into the three
# partitions, using the row boundaries computed earlier.
train_x, train_ns, train_y = Split(lower_idx=train_lower_index, upper_idx=train_upper_index)
dev_x, dev_ns, dev_y = Split(lower_idx=dev_lower_index, upper_idx=dev_upper_index)
test_x, test_ns, test_y = Split(lower_idx=test_lower_index, upper_idx=test_upper_index)

## Model Parameters

In [None]:
# Re-import the local `models` module and reload it, so that edits made to it
# since the first import take effect in the running kernel.
import models; reload(models)

In [None]:
# Hyperparameters handed to the estimator's model_fn through `params`.
model_params = {
    "V": len(wordsList),      # vocabulary size = number of entries in wordsList
    "embed_dim": 50,
    "hidden_dims": [25],
    "num_classes": 2,         # 2 for binary classifier
    "encoder_type": 'bow',
    "lr": 0.1,
    "optimizer": 'adagrad',
    "beta": 0.01,
    "dropout_rate": 0.1,
}

## Training Parameters

In [None]:
# Training schedule:
#   batch_size   -- examples per training batch
#   total_epochs -- epochs of training across the whole run
#   eval_every   -- epochs trained between two dev-set evaluations
train_params = {
    "batch_size": 32,
    "total_epochs": 20,
    "eval_every": 2,
}

In [None]:
# Name the run directory after the current time, to the minute.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M")
checkpoint_dir = "/tmp/tf_bow_sst_" + run_stamp

# Start from a clean directory when one with the same name is already there.
already_present = os.path.isdir(checkpoint_dir)
if already_present:
    shutil.rmtree(checkpoint_dir)
# NOTE(review): projector-config call left disabled; `ds` is not defined in the visible cells.
#ds.vocab.write_projector_config(checkpoint_dir, "Encoder/Embedding_Layer/W_embed")

In [None]:
# Wrap the project's classifier model_fn in an Estimator whose model_dir is
# the run directory chosen above.
model = tf.estimator.Estimator(
    model_fn=models.classifier_model_fn,
    params=model_params,
    model_dir=checkpoint_dir,
)

# Tell the reader how to follow the run in TensorBoard.
tensorboard_cmd = "    tensorboard --logdir='{:s}' --port 6006".format(checkpoint_dir)
hint_lines = (
    "\nTo view training (once it starts), run:\n",
    tensorboard_cmd,
    "\nThen in your browser, open: http://localhost:6006\n",
)
for hint_line in hint_lines:
    print(hint_line)

In [None]:
# Training input: shuffled with a fixed seed, and limited to `eval_every`
# epochs per call so that training can be interleaved with evaluation.
train_features = {"ids": train_x, "ns": train_ns}
train_input_fn = patched_numpy_io.numpy_input_fn(
    x=train_features,
    y=train_y,
    batch_size=train_params['batch_size'],
    num_epochs=train_params['eval_every'],
    shuffle=True,
    seed=42,
)

# Dev input: a single unshuffled pass over the dev split.
dev_features = {"ids": dev_x, "ns": dev_ns}
dev_input_fn = patched_numpy_io.numpy_input_fn(
    x=dev_features,
    y=dev_y,
    batch_size=128,
    num_epochs=1,
    shuffle=False,
)

## Train

In [None]:
# Alternate between training and dev evaluation. Each train() call consumes
# train_input_fn, which is configured for `eval_every` epochs, so the loop
# runs total_epochs // eval_every times.
num_rounds = train_params['total_epochs'] // train_params['eval_every']
for _ in range(num_rounds):
    model.train(input_fn=train_input_fn)
    model.evaluate(input_fn=dev_input_fn, name="dev")

## Test

In [None]:
# Test input: a single unshuffled pass over the test split.
test_features = {"ids": test_x, "ns": test_ns}
test_input_fn = patched_numpy_io.numpy_input_fn(
    x=test_features,
    y=test_y,
    batch_size=128,
    num_epochs=1,
    shuffle=False,
)

# Evaluate once and report the 'accuracy' entry of the returned metrics.
eval_metrics = model.evaluate(input_fn=test_input_fn, name="test")
test_accuracy = eval_metrics['accuracy']
print("Accuracy on test set: {:.02%}".format(test_accuracy))