# Neural Bag of Words for IMDB

In [37]:
from __future__ import division
import os, sys, re, json, time, datetime, shutil
import itertools, collections
from importlib import reload
from IPython.display import display, HTML

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import pandas as pd
import tensorflow as tf
assert(tf.__version__.startswith("1.8"))

# This creates a symbolic link on python in tmp directory

# Helper libraries for 's instance
from w266_common import utils, vocabulary, tf_embed_viz, treeviz
from w266_common import patched_numpy_io

# Code for this assignment

import models

import nltk
from nltk.tokenize import word_tokenize

## Word Vectors

In [38]:
from pathlib import Path
import os.path
wordsList = np.load(os.path.join(str(Path.home()), '.kaggle/wordvectors/pretrained_glove/wordsList.npy'))
wordsList = wordsList.tolist() #Originally loaded as numpy array
wordsList = [word.decode('UTF-8') for word in wordsList] #Encode words as UTF-8
wordVectors = np.load(os.path.join(str(Path.home()), '.kaggle/wordvectors/pretrained_glove/wordVectors.npy'))

## Load Reviews

In [39]:
review_df = pd.read_csv('../data_prep/imdb_1_5_v2.csv', encoding='utf8')

## Tokenize

In [40]:
review_df['Tokens'] = review_df['Text'].apply(lambda text: word_tokenize(text.lower()))

## Train, Dev, Test data

In [41]:
# Train with 60%, , Dev: 10%, Test: 30%
train_percent = 0.6
dev_percent = 0.1
test_percent = 0.3

# Get indicies of the rows in the dataframe for training and testing
train_lower_index = 0
train_upper_index = train_lower_index + round(len(review_df)*train_percent)
dev_lower_index   = train_upper_index+1
dev_upper_index   = dev_lower_index + round(len(review_df)*dev_percent)
test_lower_index  = dev_upper_index+1
test_upper_index  = len(review_df)-1

In [42]:
maxSeqLength = 267  # Determined by EDA

In [43]:
word_ids = np.zeros([review_df.shape[0], maxSeqLength], dtype=np.int32)
word_ids_ns = np.zeros([review_df.shape[0]], dtype=np.int32)
word_ids_labels = np.zeros([review_df.shape[0]], dtype=np.int32)

In [44]:
for sentence_index, row in review_df.iterrows():
    
    word_index = 0
    
    for word in row['Tokens']:

        try:
            word_ids[sentence_index][word_index] = wordsList.index(word)
        except ValueError:
            word_ids[sentence_index][word_index] = 399999 #Vector for unkown words
        
        word_index = word_index + 1

        if word_index == maxSeqLength:
            break

    word_ids_ns[sentence_index] = word_index

    if row['Score'] > 3:
        word_ids_labels[sentence_index] = 1
    elif row['Score'] < 3:
        word_ids_labels[sentence_index] = 0

In [45]:
def Split(lower_idx, upper_idx):
    return word_ids[lower_idx:upper_idx], word_ids_ns[lower_idx:upper_idx], word_ids_labels[lower_idx:upper_idx]

In [46]:
train_x, train_ns, train_y = Split(train_lower_index, train_upper_index)
dev_x,   dev_ns,   dev_y   = Split(dev_lower_index, dev_upper_index)
test_x,  test_ns,  test_y  = Split(test_lower_index, test_upper_index)

## Model Parameters

In [47]:
import models; reload(models)

<module 'models' from '/home/matt/w266_final_project/NBOW/models.py'>

In [48]:
# Specify model hyperparameters as used by model_fn
model_params = dict(V=len(wordsList),
                    embed_dim=50,
                    hidden_dims=[25],
                    num_classes=2,      # 2 for binary classifier
                    encoder_type='bow',
                    lr=0.1,
                    optimizer='adagrad',
                    beta=0.01,
                    dropout_rate=0.1)  # fill this in

## Training Parameters

In [49]:
# Specify training schedule
train_params = dict(batch_size=32,
                    total_epochs=20,
                    eval_every=2)  # fill this in

In [50]:
checkpoint_dir = "/tmp/tf_bow_sst_" + datetime.datetime.now().strftime("%Y%m%d-%H%M")

if os.path.isdir(checkpoint_dir):
    shutil.rmtree(checkpoint_dir)
#ds.vocab.write_projector_config(checkpoint_dir, "Encoder/Embedding_Layer/W_embed")

In [51]:
model = tf.estimator.Estimator(model_fn=models.classifier_model_fn,
                               params=model_params,
                               model_dir=checkpoint_dir)

print("\nTo view training (once it starts), run:\n")
print("    tensorboard --logdir='{:s}' --port 6006".format(checkpoint_dir))
print("\nThen in your browser, open: http://localhost:6006\n")

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_tf_random_seed': None, '_service': None, '_num_ps_replicas': 0, '_session_config': None, '_save_checkpoints_secs': 600, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_global_id_in_cluster': 0, '_master': '', '_num_worker_replicas': 1, '_evaluation_master': '', '_train_distribute': None, '_save_checkpoints_steps': None, '_task_id': 0, '_is_chief': True, '_keep_checkpoint_max': 5, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f7458727eb8>, '_task_type': 'worker', '_save_summary_steps': 100, '_model_dir': '/tmp/tf_bow_sst_20180806-1124'}

To view training (once it starts), run:

    tensorboard --logdir='/tmp/tf_bow_sst_20180806-1124' --port 6006

Then in your browser, open: http://localhost:6006



In [52]:
train_input_fn = patched_numpy_io.numpy_input_fn(
                    x={"ids": train_x, "ns": train_ns},
                    y=train_y,
                    batch_size=train_params['batch_size'], 
                    num_epochs=train_params['eval_every'],
                    shuffle=True,
                    seed=42)

dev_input_fn = patched_numpy_io.numpy_input_fn(
                    x={"ids": dev_x, "ns": dev_ns},
                    y=dev_y,
                    batch_size=128,
                    num_epochs=1,
                    shuffle=False)

## Train

In [53]:
for _ in range(train_params['total_epochs'] // train_params['eval_every']):
    model.train(input_fn=train_input_fn)
    model.evaluate(input_fn=dev_input_fn, name="dev")

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tf_bow_sst_20180806-1124/model.ckpt.
INFO:tensorflow:step = 1, loss = 1.1352223
INFO:tensorflow:Saving checkpoints for 75 into /tmp/tf_bow_sst_20180806-1124/model.ckpt.
INFO:tensorflow:Loss for final step: 1.6921148.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-08-06-18:24:48
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20180806-1124/model.ckpt-75
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-08-06-18:24:48
INFO:tensorflow:Saving dict for global step 75: accuracy = 0.0, cross_entropy_loss = 3.1401286, gl

INFO:tensorflow:step = 526, loss = 0.55274737
INFO:tensorflow:Saving checkpoints for 600 into /tmp/tf_bow_sst_20180806-1124/model.ckpt.
INFO:tensorflow:Loss for final step: 0.20039369.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-08-06-18:25:24
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20180806-1124/model.ckpt-600
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-08-06-18:25:25
INFO:tensorflow:Saving dict for global step 600: accuracy = 0.005, cross_entropy_loss = 3.8428016, global_step = 600, loss = 4.0795345
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20180806-1124/model.ckpt-600
INFO:tensorflow:Running local_init_op.
IN

## Test

In [54]:
test_input_fn = patched_numpy_io.numpy_input_fn(
                    x={"ids": test_x, "ns": test_ns},
                    y=test_y,
                    batch_size=128,
                    num_epochs=1,
                    shuffle=False)

eval_metrics = model.evaluate(input_fn=test_input_fn, name="test")
print("Accuracy on test set: {:.02%}".format(eval_metrics['accuracy']))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-08-06-18:25:37
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20180806-1124/model.ckpt-750
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-08-06-18:25:37
INFO:tensorflow:Saving dict for global step 750: accuracy = 0.013400335, cross_entropy_loss = 3.437101, global_step = 750, loss = 3.7066135
Accuracy on test set: 1.34%
