# This notebook implements the BOW model with Ordinal Output Categories 

In [1]:
import pandas as pd 
import numpy as np
import matplotlib as plt 
import glob
from importlib import reload

import os, sys, re, json, time, datetime, shutil
from common import utils, constants, spell

import tensorflow as tf
import tripadvisor_ds
import visualization

try:
    import cPickle as pickle
except ModuleNotFoundError:
    import pickle

## 1. Load tripadvisor data 

In [46]:

reload(tripadvisor_ds)

input_length = 500
max_bytes = 2**31 - 1

data_file = 'data/tripadvisor_ds.pkl'

if os.path.isfile(data_file):

    bytes_in = bytearray(0)
    input_size = os.path.getsize(data_file)
    with open(data_file, 'rb') as f_in:
        for _ in range(0, input_size, max_bytes):
            bytes_in += f_in.read(max_bytes)
    ds = pickle.loads(bytes_in)
        
else:
    ds = tripadvisor_ds.TripAdvisor_DS().process(input_length=input_length)
    ds.save(data_file)


In [47]:
# -----------------------------------------
# convert the output to Ordinal Categories 
# 2-star rating: [0,0,0]
# 3-star rating: [1,0,0]
# 4-star rating: [1,1,0]
# 5-star rating: [1,1,1]
# -----------------------------------------

ds.get_ord_labels()

In [50]:
labels = pd.DataFrame({'rating': ds.train_labels})
labels.rating.value_counts()

3    413463
2    231827
1    106079
0     90053
Name: rating, dtype: int64

## 2. Build the Model

In [135]:
# set model parameters 
model_params = dict(V=ds.vocab.size, 
                    embed_dim=100, 
                    num_classes=len(ds.target_labels),
                    encoder_type='bow', 
                    hidden_dims=[1024, 64], 
                    input_length=input_length,
                    lr=0.0001, 
                    optimizer='adam', 
                    beta=0.00001)
                    
train_params = dict(batch_size=64, 
                    total_epochs=20, 
                    eval_every=2)


summary_params = dict(chkpt_dir="./tmp/266_bow_ord" + datetime.datetime.now().strftime("%Y%m%d-%H%M"))

In [137]:
# setup tensorboard 

if os.path.isdir(summary_params['chkpt_dir']):
    shutil.rmtree(summary_params['chkpt_dir'])

def variable_summaries(var):
    """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
    with tf.name_scope('summaries'):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)


In [138]:
def embedding_layer(ids_, V, embed_dim, init_scale=0.001):
    
    # prepare vocabulary  
    W_embed_ = tf.get_variable("W_embed", shape=[V, embed_dim], \
                               initializer=tf.random_uniform_initializer(-init_scale, init_scale), \
                               trainable=True)
        
    # look up word embedding 
    xs_ = tf.nn.embedding_lookup(W_embed_, ids_, name="embed_x")
        
    return xs_

def fully_connected_layers(h0_, hidden_dims, activation=tf.nn.relu,
                           dropout_rate=0, is_training=False):
    h_ = h0_
    for i, hdim in enumerate(hidden_dims):
        h_ = tf.layers.dense(h_, hdim, activation=activation, name=("Hidden_%d"%i))
        if dropout_rate > 0:
            h_ = tf.layers.dropout(h_, rate=dropout_rate, training=is_training )

    return h_

def softmax_output_layer(h_, labels_, num_classes):
    
    W_out_ = tf.get_variable("W_out",  shape=[h_.get_shape().as_list()[1], num_classes], \
                               initializer=tf.random_normal_initializer())
    b_out_ = tf.get_variable("b_out", shape=[num_classes])

    logits_ = tf.add(tf.matmul(h_, W_out_), b_out_)
        
    if labels_ is None:
        return None, logits_
    
    with tf.variable_scope("Softmax_Layer"):

        softmax_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels_, logits=logits_)
        
        loss_ = tf.reduce_mean(softmax_)
    
    return loss_, logits_


def sigmoid_output_layer(h_, labels_, num_classes):
    
    W_out_ = tf.get_variable("W_out",  shape=[h_.get_shape().as_list()[1], num_classes], \
                               initializer=tf.random_normal_initializer())
    b_out_ = tf.get_variable("b_out", shape=[num_classes])

    logits_ = tf.add(tf.matmul(h_, W_out_), b_out_)
        
    if labels_ is None:
        return None, logits_
    
    with tf.variable_scope("Sigmoid_Layer"):
        
        sigmoid_ = tf.nn.sigmoid(logits_)
        
        loss_ = num_classes * tf.reduce_mean(tf.squared_difference( labels_, sigmoid_))
        
    return loss_, logits_


def BOW(ids_, V, embed_dim, hidden_dims, dropout_rate=0, is_training=None):
    assert is_training is not None, "is_training must be explicitly set to True or False"

    with tf.variable_scope("Embedding_Layer"):
        xs_ = embedding_layer(ids_, V, embed_dim)
     
    sum_xs_ = tf.reduce_sum(xs_, 1)

    h_ = fully_connected_layers(sum_xs_, hidden_dims, \
                           dropout_rate=dropout_rate, is_training=is_training)
    return h_, xs_


def conv_net(ids_, V, embed_dim, filter_sizes, num_filters, hidden_dims, input_length, dropout_rate=0, is_training=None):

    assert is_training is not None, "is_training must be explicitly set to True or False"

    with tf.variable_scope("Embedding_Layer"):
        xs_ = embedding_layer(ids_, V, embed_dim)

    xs_ = tf.expand_dims(xs_, -1)
        
    pooled_outputs_ = []
    for _, filter_size in enumerate(filter_sizes):
        with tf.name_scope("Conv_MaxPool_%d"%filter_size):
            
            # Convolution Layer
            filter_shape = [filter_size, embed_dim, 1, num_filters]
            W_ = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
            b_ = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
            conv_ = tf.nn.conv2d(
                xs_,
                W_,
                strides=[1, 1, 1, 1],
                padding="VALID",
                name="conv")
            
            # Activation
            h_ = tf.nn.relu(tf.nn.bias_add(conv_, b_), name="relu")
            
            # Maxpooling 
            pooled_ = tf.nn.max_pool(
                h_,
                ksize=[1, input_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name="pool")
            pooled_outputs_.append(pooled_)
            
            variable_summaries(pooled_)

    # Combine all the pooled features and flatten it
    num_filters_total = num_filters * len(filter_sizes)
    h_ = tf.concat(pooled_outputs_, 3)
    h_ = tf.reshape(h_, [-1, num_filters_total])
    
    # fully connected layers
    with tf.variable_scope("FC_Layer"):
        h_ = fully_connected_layers(h_, hidden_dims, is_training = is_training)

    return h_, xs_


In [139]:
tf.reset_default_graph() 

X = tf.placeholder(tf.int32, [None, input_length], name='input_x')
Y = tf.placeholder(tf.float32, [None, 3], name='input_y')
    
if model_params['encoder_type'] == 'bow':
    h_, xs_ = BOW(X, model_params['V'], 
                      model_params['embed_dim'],  
                      model_params['hidden_dims'],
                      is_training=True)


with tf.variable_scope("Output_Layer"):
    loss_, logits_ = sigmoid_output_layer(h_, Y, model_params['num_classes'] -1)
    

with tf.name_scope("Prediction"):
    pred_proba_ = tf.nn.sigmoid(logits_, name="pred_proba")
    
    pred_max_ = tf.reduce_sum(tf.cast(tf.great(pred_proba_ ,0.5), tf.int32), axis = 1, name="pred_max")
    
    predictions_dict = {"proba": pred_proba_, "max": pred_max_}

with tf.variable_scope("Regularization"):
    l2_penalty_ = tf.nn.l2_loss(xs_)  # l2 loss on embeddings
    for var_ in tf.trainable_variables():
        if "Embedding_Layer" in var_.name:
            continue
        l2_penalty_ += tf.nn.l2_loss(var_)
    l2_penalty_ *= model_params['beta']  # scale by regularization strength
    tf.summary.scalar('l2_penalty', l2_penalty_)
    regularized_loss_ = loss_ + l2_penalty_
    tf.summary.scalar('regularized_loss', regularized_loss_)

with tf.variable_scope("Training"):
    if model_params['optimizer'] == 'adagrad':
        optimizer_ = tf.train.AdagradOptimizer(model_params['lr'])
    elif  model_params['optimizer'] == 'adam':
        optimizer_ = tf.train.AdamOptimizer(model_params['lr'])
    else:
        optimizer_ = tf.train.GradientDescentOptimizer(model_params['lr'])
    train_op_ = optimizer_.minimize(regularized_loss_,
                    global_step=tf.train.get_global_step())


with tf.name_scope("Evaluation"):

    correct_pred_ = tf.equal(tf.cast(pred_max_, tf.int32), tf.cast(tf.reduce_sum(Y, 1), tf.int32))
    accuracy_ = tf.reduce_mean(tf.cast(correct_pred_, tf.float32))

    tf.summary.scalar('loss', loss_)
    tf.summary.scalar('accuracy', accuracy_)



## 3. Train the Model 

In [None]:

# start session
sess = tf.Session()
graph = tf.get_default_graph()

# Tensorboard - Visualize graph 
merged = tf.summary.merge_all()
train_writer = tf.summary.FileWriter(summary_params['chkpt_dir'] + '/train', sess.graph)
test_writer = tf.summary.FileWriter(summary_params['chkpt_dir'] + '/test')

print("tensorboard --logdir={}/train".format(summary_params['chkpt_dir']))
print("tensorboard --logdir={}/test".format(summary_params['chkpt_dir']))

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()

# Run the initializer
sess.run(init)
sess.run(init_l)

total_batches = 0
total_examples = 0
total_loss = 0
loss_ema = np.log(2)  # track exponential-moving-average of loss
ema_decay = np.exp(-1/10)  # decay parameter for moving average = np.exp(-1/history_length)



for i in range(train_params['total_epochs']):
    t0 = time.time()

    train_batches = 1
    train_accuracy = 0.0
    
    for (bx, by) in utils.multi_batch_generator(train_params['batch_size'], \
                                        ds.padded_train_features, ds.train_ord_labels):

        summary, batch_loss, _, batch_accuracy, pred_proba, pred_max = sess.run(
            [merged, regularized_loss_, train_op_, accuracy_, pred_proba_, pred_max_], feed_dict={X: bx, Y: by})
        
        #print(pred_proba)
        #print(pred_max)
        
        train_batches +=1
        train_accuracy += batch_accuracy
        
        # Compute some statistics
        total_batches += 1
        total_examples += len(bx)
        total_loss += batch_loss * len(bx)  # re-scale, since batch loss is mean

        # Compute moving average to smooth out noisy per-batch loss
        loss_ema = ema_decay * loss_ema + (1 - ema_decay) * batch_loss
        
        if (total_batches % 25 == 0):
            print("{:5,} examples, moving-average loss {:.2f}, train accuracy {:.2f}"\
                  .format(total_examples, loss_ema, train_accuracy/train_batches))    
            
        train_writer.add_summary(summary, total_batches)

    print("Completed {} epoch in {:s}".format(i, utils.pretty_timedelta(since=t0)))
    
    train_accuracy = train_accuracy/train_batches
    print("Train accurary:{:.5f}".format(train_accuracy))
    
    
    # run the validation dataset 
    validate_batches = 1
    validate_accuracy = 0.0
    for (vx, vy) in utils.multi_batch_generator(train_params['batch_size'], \
                                            ds.padded_validate_features, ds.validate_ord_labels):

        summary, batch_accuracy = sess.run([merged, accuracy_], feed_dict={X: vx, Y: vy})

        validate_batches +=1
        validate_accuracy += batch_accuracy

        test_writer.add_summary(summary, total_batches + validate_batches)

    validate_accuracy = validate_accuracy/validate_batches
    print("Validate accuracy:{:.5f}".format(validate_accuracy))
        

## 5. Evaluate the Model 

In [142]:
test_batches = 1
test_accuracy = 0.0
test_pred_y = []

for (tx, ty) in utils.multi_batch_generator(train_params['batch_size'], \
                                        ds.padded_test_features, ds.test_ord_labels):

    batch_accuracy, pred_max = sess.run([accuracy_, pred_max_], feed_dict={X: tx, Y: ty})

    test_batches +=1
    test_accuracy += batch_accuracy
    test_pred_y.append(pred_max.tolist())

test_accuracy = test_accuracy/test_batches
print("Test accuracy:{}".format(test_accuracy))

pred_y = [y for x in test_pred_y for y in x]

Test accuracy:0.7119574175981375


In [143]:
from sklearn.metrics import confusion_matrix
confusion_matrix(ds.test_labels, pred_y)

array([[ 7718,  2946,   257,    35],
       [ 1593,  7844,  3351,   243],
       [  224,  3233, 16614,  8606],
       [   65,   561,  8754, 41835]])

In [2]:
#op = sess.graph.get_operations()
#[m.values() for m in op]

