# Neural Bag of Words for RateBeer

In [1]:
from __future__ import division
import os, sys, re, json, time, datetime, shutil
import itertools, collections
from importlib import reload
from IPython.display import display, HTML

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import pandas as pd
import tensorflow as tf
assert(tf.__version__.startswith("1.8"))

# This creates a symbolic link on python in tmp directory

# Helper libraries from the w266_common package
from w266_common import utils, vocabulary, tf_embed_viz, treeviz
from w266_common import patched_numpy_io

# Code for this assignment

import models

import nltk
from nltk.tokenize import word_tokenize

## Word Vectors

In [2]:
from pathlib import Path
import os.path

# Both GloVe files are resolved relative to the current user's home directory.
home_dir = str(Path.home())
words_path = os.path.join(home_dir, '.kaggle/wordvectors/pretrained_glove/wordsList.npy')
vectors_path = os.path.join(home_dir, '.kaggle/wordvectors/pretrained_glove/wordVectors.npy')

# The vocabulary is loaded as a NumPy array; turn it into a plain Python list
# and decode every entry from UTF-8 bytes into str.
raw_words = np.load(words_path).tolist()
wordsList = [raw_word.decode('UTF-8') for raw_word in raw_words]

# Embedding matrix stored alongside the vocabulary file.
wordVectors = np.load(vectors_path)

## Load Reviews

In [3]:
# Review data produced by the data-prep step, read as UTF-8 text.
reviews_csv_path = '../data_prep/rate_beer_binary_medium.csv'
review_df = pd.read_csv(reviews_csv_path, encoding='utf8')

## Tokenize

In [4]:
def _lowercase_tokens(text):
    """Lower-case a review text and split it into tokens with NLTK's word_tokenize."""
    return word_tokenize(text.lower())


# Adds a 'Tokens' column holding the token list of every review's 'Text'.
review_df['Tokens'] = review_df['Text'].apply(_lowercase_tokens)

## Train, Dev, Test data

In [5]:
# Train: 60%, Dev: 10%, Test: 30%
train_percent = 0.6
dev_percent = 0.1
test_percent = 0.3  # informational: the test split takes every row left after train and dev

# Row boundaries for each split. They are consumed as half-open slice bounds
# [lower, upper) by Split(), so each split must start exactly where the
# previous one ends and the last one must end at len(review_df). Adding 1 to a
# lower bound, or subtracting 1 from the final upper bound, would silently
# drop a row at every boundary.
train_lower_index = 0
train_upper_index = train_lower_index + round(len(review_df)*train_percent)
dev_lower_index   = train_upper_index
dev_upper_index   = dev_lower_index + round(len(review_df)*dev_percent)
test_lower_index  = dev_upper_index
test_upper_index  = len(review_df)

In [6]:
# Number of token slots kept per review: it is the width of the word_ids
# matrix, and reviews with more tokens are truncated to this length.
maxSeqLength = 267  # Determined by EDA

In [7]:
# One row / entry per review, all zero-initialised int32:
#   word_ids        - token ids, padded with 0 up to maxSeqLength columns
#   word_ids_ns     - number of token slots actually filled for each review
#   word_ids_labels - binary label for each review
num_reviews = review_df.shape[0]
word_ids = np.zeros((num_reviews, maxSeqLength), dtype=np.int32)
word_ids_ns = np.zeros(num_reviews, dtype=np.int32)
word_ids_labels = np.zeros(num_reviews, dtype=np.int32)

In [8]:
# Build a word -> id lookup once. setdefault keeps the FIRST position of a
# word, which is exactly what wordsList.index(word) would return, but each
# lookup is now a constant-time dict access instead of a scan over the whole
# vocabulary list for every token of every review.
word_to_id = {}
for vocab_index, vocab_word in enumerate(wordsList):
    word_to_id.setdefault(vocab_word, vocab_index)

UNKNOWN_WORD_ID = 399999  # Vector for unknown words

# Iterate by position (not by DataFrame index label) so each review always
# lands in the matching row of the pre-allocated arrays.
for sentence_index, (tokens, score) in enumerate(zip(review_df['Tokens'], review_df['Score'])):

    # Keep at most maxSeqLength tokens; longer reviews are truncated.
    kept_tokens = tokens[:maxSeqLength]

    for word_index, word in enumerate(kept_tokens):
        word_ids[sentence_index][word_index] = word_to_id.get(word, UNKNOWN_WORD_ID)

    # Number of token slots actually filled for this review.
    word_ids_ns[sentence_index] = len(kept_tokens)

    # Scores above 3 are labelled 1 and scores below 3 are labelled 0. A score
    # of exactly 3 is left untouched, i.e. it keeps the array's initial value.
    if score > 3:
        word_ids_labels[sentence_index] = 1
    elif score < 3:
        word_ids_labels[sentence_index] = 0

In [9]:
def Split(lower_idx, upper_idx):
    """Return the rows in [lower_idx, upper_idx) of the three review arrays.

    Args:
        lower_idx: first row to include.
        upper_idx: row at which to stop (excluded, as in regular slicing).

    Returns:
        A tuple (ids, ns, labels) holding the same slice of word_ids,
        word_ids_ns and word_ids_labels respectively.
    """
    rows = slice(lower_idx, upper_idx)
    ids_part = word_ids[rows]
    ns_part = word_ids_ns[rows]
    labels_part = word_ids_labels[rows]
    return ids_part, ns_part, labels_part

In [10]:
# Cut the id matrix, the per-review lengths and the labels into the three
# data sets using the row boundaries computed earlier.
train_x, train_ns, train_y = Split(lower_idx=train_lower_index, upper_idx=train_upper_index)
dev_x, dev_ns, dev_y = Split(lower_idx=dev_lower_index, upper_idx=dev_upper_index)
test_x, test_ns, test_y = Split(lower_idx=test_lower_index, upper_idx=test_upper_index)

## Model Parameters

In [11]:
import models; reload(models)

<module 'models' from '/home/matt/w266_final_project/NBOW/models.py'>

In [12]:
# Hyperparameters handed to the Estimator's model_fn through `params`.
model_params = {
    'V': len(wordsList),      # one id per entry of the vocabulary list
    'embed_dim': 50,
    'hidden_dims': [25],
    'num_classes': 2,         # 2 for binary classifier
    'encoder_type': 'bow',
    'lr': 0.1,
    'optimizer': 'adagrad',
    'beta': 0.01,
    'dropout_rate': 0.1,
}

## Training Parameters

In [13]:
# Training schedule: mini-batch size, total number of epochs, and how many
# epochs to train between two evaluations on the dev set.
train_params = {
    'batch_size': 32,
    'total_epochs': 20,
    'eval_every': 2,
}

In [14]:
# Model directory under /tmp, named after the current time with minute resolution.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M")
checkpoint_dir = "/tmp/tf_bow_sst_" + run_stamp

# Start from a clean slate: remove a directory left over from an earlier run
# that happened within the same minute.
if os.path.isdir(checkpoint_dir):
    shutil.rmtree(checkpoint_dir)
# Disabled leftover (`ds` is not defined in the visible cells of this notebook):
#ds.vocab.write_projector_config(checkpoint_dir, "Encoder/Embedding_Layer/W_embed")

In [15]:
# Estimator wrapping the classifier model function; checkpoints and summaries
# are written to checkpoint_dir.
model = tf.estimator.Estimator(
    model_fn=models.classifier_model_fn,
    model_dir=checkpoint_dir,
    params=model_params,
)

# Tell the user how to follow the run in TensorBoard.
tensorboard_command = "    tensorboard --logdir='{:s}' --port 6006".format(checkpoint_dir)
print("\nTo view training (once it starts), run:\n")
print(tensorboard_command)
print("\nThen in your browser, open: http://localhost:6006\n")

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_train_distribute': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f9d9ae77f98>, '_tf_random_seed': None, '_task_id': 0, '_keep_checkpoint_max': 5, '_model_dir': '/tmp/tf_bow_sst_20180806-1112', '_num_worker_replicas': 1, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_evaluation_master': '', '_master': '', '_task_type': 'worker', '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_global_id_in_cluster': 0, '_is_chief': True, '_save_summary_steps': 100, '_num_ps_replicas': 0}

To view training (once it starts), run:

    tensorboard --logdir='/tmp/tf_bow_sst_20180806-1112' --port 6006

Then in your browser, open: http://localhost:6006



In [16]:
# Training input: shuffled mini-batches with a fixed seed. Each call to
# model.train() with this input_fn covers `eval_every` epochs.
train_features = {"ids": train_x, "ns": train_ns}
train_input_fn = patched_numpy_io.numpy_input_fn(
    x=train_features,
    y=train_y,
    batch_size=train_params['batch_size'],
    num_epochs=train_params['eval_every'],
    shuffle=True,
    seed=42,
)

# Dev input: a single, unshuffled pass over the dev set.
dev_features = {"ids": dev_x, "ns": dev_ns}
dev_input_fn = patched_numpy_io.numpy_input_fn(
    x=dev_features,
    y=dev_y,
    batch_size=128,
    num_epochs=1,
    shuffle=False,
)

## Train

In [17]:
# Alternate between training for `eval_every` epochs and evaluating on the
# dev set until `total_epochs` epochs have been run.
num_rounds = train_params['total_epochs'] // train_params['eval_every']
for _ in range(num_rounds):
    model.train(input_fn=train_input_fn)
    model.evaluate(input_fn=dev_input_fn, name="dev")

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tf_bow_sst_20180806-1112/model.ckpt.
INFO:tensorflow:loss = 1.0866891, step = 1
INFO:tensorflow:global_step/sec: 130.074
INFO:tensorflow:loss = 1.2514586, step = 101 (0.770 sec)
INFO:tensorflow:global_step/sec: 141.03
INFO:tensorflow:loss = 0.6938255, step = 201 (0.709 sec)
INFO:tensorflow:global_step/sec: 149.633
INFO:tensorflow:loss = 0.3372344, step = 301 (0.668 sec)
INFO:tensorflow:global_step/sec: 150.158
INFO:tensorflow:loss = 0.78381675, step = 401 (0.666 sec)
INFO:tensorflow:global_step/sec: 140.6
INFO:tensorflow:loss = 0.7844502, step = 501 (0.711 sec)
INFO:tensorflow:global_step/sec: 145.47
INFO:tensorflow:loss = 0.5304333, step = 601 (0.687 sec)
INFO:tensorflow:global_step/sec: 139.728
INFO:t

INFO:tensorflow:global_step/sec: 155.825
INFO:tensorflow:loss = 0.5528286, step = 5859 (0.642 sec)
INFO:tensorflow:global_step/sec: 152.525
INFO:tensorflow:loss = 0.34307146, step = 5959 (0.656 sec)
INFO:tensorflow:global_step/sec: 155.149
INFO:tensorflow:loss = 0.6826588, step = 6059 (0.644 sec)
INFO:tensorflow:global_step/sec: 154.701
INFO:tensorflow:loss = 0.57454747, step = 6159 (0.647 sec)
INFO:tensorflow:global_step/sec: 149.716
INFO:tensorflow:loss = 0.5086871, step = 6259 (0.668 sec)
INFO:tensorflow:global_step/sec: 151.92
INFO:tensorflow:loss = 0.47786918, step = 6359 (0.658 sec)
INFO:tensorflow:global_step/sec: 147.346
INFO:tensorflow:loss = 0.45035273, step = 6459 (0.679 sec)
INFO:tensorflow:global_step/sec: 151.518
INFO:tensorflow:loss = 0.653935, step = 6559 (0.660 sec)
INFO:tensorflow:global_step/sec: 150.888
INFO:tensorflow:loss = 0.45558855, step = 6659 (0.663 sec)
INFO:tensorflow:global_step/sec: 152.77
INFO:tensorflow:loss = 0.39192724, step = 6759 (0.655 sec)
INFO:te

INFO:tensorflow:loss = 0.63990414, step = 11917 (0.642 sec)
INFO:tensorflow:global_step/sec: 123.334
INFO:tensorflow:loss = 0.4526822, step = 12017 (0.811 sec)
INFO:tensorflow:global_step/sec: 135.488
INFO:tensorflow:loss = 0.34753275, step = 12117 (0.738 sec)
INFO:tensorflow:global_step/sec: 139.012
INFO:tensorflow:loss = 0.42806676, step = 12217 (0.719 sec)
INFO:tensorflow:global_step/sec: 158.147
INFO:tensorflow:loss = 0.3754716, step = 12317 (0.632 sec)
INFO:tensorflow:global_step/sec: 150.92
INFO:tensorflow:loss = 0.35284504, step = 12417 (0.663 sec)
INFO:tensorflow:global_step/sec: 156.556
INFO:tensorflow:loss = 0.41960835, step = 12517 (0.639 sec)
INFO:tensorflow:global_step/sec: 153.446
INFO:tensorflow:loss = 0.47681555, step = 12617 (0.652 sec)
INFO:tensorflow:global_step/sec: 151.183
INFO:tensorflow:loss = 0.33245546, step = 12717 (0.661 sec)
INFO:tensorflow:global_step/sec: 156.105
INFO:tensorflow:loss = 0.4436457, step = 12817 (0.641 sec)
INFO:tensorflow:global_step/sec: 15

INFO:tensorflow:loss = 0.43167493, step = 17975 (0.667 sec)
INFO:tensorflow:global_step/sec: 155.067
INFO:tensorflow:loss = 0.32351458, step = 18075 (0.645 sec)
INFO:tensorflow:global_step/sec: 157.424
INFO:tensorflow:loss = 0.444236, step = 18175 (0.635 sec)
INFO:tensorflow:global_step/sec: 148.921
INFO:tensorflow:loss = 0.3208844, step = 18275 (0.671 sec)
INFO:tensorflow:global_step/sec: 153.053
INFO:tensorflow:loss = 0.6179539, step = 18375 (0.653 sec)
INFO:tensorflow:global_step/sec: 149.713
INFO:tensorflow:loss = 0.53429717, step = 18475 (0.668 sec)
INFO:tensorflow:global_step/sec: 153.068
INFO:tensorflow:loss = 0.45707932, step = 18575 (0.653 sec)
INFO:tensorflow:global_step/sec: 148.629
INFO:tensorflow:loss = 0.34504402, step = 18675 (0.673 sec)
INFO:tensorflow:Saving checkpoints for 18753 into /tmp/tf_bow_sst_20180806-1112/model.ckpt.
INFO:tensorflow:Loss for final step: 0.33754092.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Startin

INFO:tensorflow:loss = 0.3417241, step = 24033 (0.662 sec)
INFO:tensorflow:Saving checkpoints for 24111 into /tmp/tf_bow_sst_20180806-1112/model.ckpt.
INFO:tensorflow:Loss for final step: 0.32631397.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-08-06-18:15:59
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20180806-1112/model.ckpt-24111
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-08-06-18:16:00
INFO:tensorflow:Saving dict for global step 24111: accuracy = 0.74618506, cross_entropy_loss = 0.60032564, global_step = 24111, loss = 0.8169865
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20180806-1112/model.ckpt-24111
INFO:tensorf

## Test

In [18]:
# Test input: a single, unshuffled pass over the held-out test set.
test_features = {"ids": test_x, "ns": test_ns}
test_input_fn = patched_numpy_io.numpy_input_fn(
    x=test_features,
    y=test_y,
    batch_size=128,
    num_epochs=1,
    shuffle=False,
)

# Evaluate the trained model and report its accuracy on the test set.
eval_metrics = model.evaluate(input_fn=test_input_fn, name="test")
test_accuracy = eval_metrics['accuracy']
print("Accuracy on test set: {:.02%}".format(test_accuracy))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-08-06-18:16:23
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20180806-1112/model.ckpt-26790
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-08-06-18:16:25
INFO:tensorflow:Saving dict for global step 26790: accuracy = 0.7382964, cross_entropy_loss = 0.61276066, global_step = 26790, loss = 0.8435243
Accuracy on test set: 73.83%
