# Paragraph Vector Model for Sentiment

In [118]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import json
import pandas as pd
import itertools
import sys
import string
from collections import defaultdict
import re
import time
import numpy as np
import matplotlib.pyplot as plt
import sys
import nltk
import random
from random import shuffle
import progressbar
from collections import namedtuple

# Gensim library
import gensim
from gensim.models import *
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec

# Import SKLearn Tools
from sklearn.feature_extraction.text import *
from sklearn import svm
from sklearn import metrics
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.metrics import classification_report, accuracy_score

# Import tensorflow
import tensorflow as tf

# Import utils
from shared_lib import utils
reload(utils)

<module 'shared_lib.utils' from 'shared_lib/utils.py'>

## Reading Input Dataframe

In [87]:
reviewData_df = pd.read_csv("product_reviews.csv")
reviewData_df.head()

Unnamed: 0,rating,ratingDate,reviewComments,reviewTitle,modelId
0,4,08/15/2017,beautifullllllllllllllllllllllllllllllllllllll...,,4002178_W
1,1,05/08/2017,The actual product came out looking much diffe...,Disappointing,4002178_W
2,1,04/10/2017,These shoes look nothing like the picture! I e...,,4002178_W
3,1,02/26/2017,I ordered this shoe because i loved the displa...,color sample was way off,4002178_W
4,5,09/15/2017,They are comfortable sneakers for working out ...,awesome sneakers,4002179_W


In [88]:
reviewData_df['reviewComments'] = reviewData_df['reviewComments'].apply(utils.preprocess_doc)
reviewData_df['label'] = reviewData_df.apply(utils.create_senti_label_num, axis = 1)
reviewData_df.head()

Unnamed: 0,rating,ratingDate,reviewComments,reviewTitle,modelId,label
0,4,08/15/2017,beautiful,,4002178_W,0
1,1,05/08/2017,actual product came looking much different onl...,Disappointing,4002178_W,2
2,1,04/10/2017,shoes look nothing like picture expected grey ...,,4002178_W,2
3,1,02/26/2017,ordered shoe loved displayed blush pink color ...,color sample was way off,4002178_W,2
4,5,09/15/2017,comfortable sneakers working running,awesome sneakers,4002179_W,0



## paragraph vector model

### labeled sentence data structure

In [89]:
#Helper function to convert newsgroup corpus into paragraph2Vec formats
def convert_reviews(docs, split):
    #global doc_count
    tagged_documents = []
    
    for i,v in enumerate(docs):
        label = '%s_%s'%(split, i)
        tagged_documents.append(LabeledSentence(v, [label]))
    
    return tagged_documents

### Create train (80%), validation (20%), and test split (20%)

In [90]:
all_reviews = reviewData_df.loc[:, 'reviewComments']
all_labels = reviewData_df.loc[:, 'label']

# Split for test set
train_docs, train_labels, test_docs, test_labels = utils.get_train_test_docs(all_reviews, 
                                                                             all_labels, 
                                                                             split = 0.8, 
                                                                             shuffle = True)

# Further split for validation
train_docs, train_labels, validation_docs, validation_labels = utils.get_train_val_docs(train_docs, 
                                                                                        train_labels, 
                                                                                        split = 0.75, 
                                                                                        shuffle = True)

Loaded 4288 docs (434214 tokens)
Training set: 3430 docs (352393 tokens)
Test set: 858 docs (81821 tokens)
Loaded 3430 docs (352393 tokens)
Training set: 2572 docs (262842 tokens)
Validation set: 858 docs (89551 tokens)


In [91]:
## Convert to paragraph2vec library input format
train_docs = convert_reviews(train_docs,'train')
validation_docs = convert_reviews(validation_docs,'validation')
test_docs = convert_reviews(test_docs, 'test')

## We combine all data to build the paragraph vector model
all_docs = []
all_docs.extend(train_docs)
all_docs.extend(validation_docs)
all_docs.extend(test_docs)
doc_list = all_docs[:]  # for reshuffling per pass

print '{} docs: {} train, {} validation, {} test'.format(len(doc_list), len(train_docs), \
                                                          len(validation_docs), len(test_docs))

4288 docs: 2572 train, 858 validation, 858 test


### Define Paragraph Vector model

* Use feature size of 100

In [92]:
# dm = 0 use distributed bag of words, dm=1 distributed memory
dm_model = Doc2Vec(dm=1, dm_mean=1, sample=1e-5, size=100, window=10, 
                   negative=5, hs=0, min_count=2, workers=10, max_vocab_size = 20000)
dm_model.build_vocab(all_docs)

In [93]:
alpha, min_alpha, passes = (0.025, 0.001, 100)
alpha_delta = (alpha - min_alpha) / passes
bar = progressbar.ProgressBar()

for epoch in bar(range(passes)):
    shuffle(doc_list)
    dm_model.alpha, dm_model.min_alpha = alpha, alpha
    dm_model.train(doc_list, total_examples=len(doc_list))
    alpha -= alpha_delta

100% |########################################################################|


### Extract features vectors from the model

In [94]:
#Get Vectors From Doc2Vec
def extract_vectors(model, docs):    
    vectors_list = []
    for doc_no in range(len(docs)):
        doc_label = docs[doc_no].tags[0] # Use tag to id
        doc_vector = model.docvecs[doc_label]
        vectors_list.append(doc_vector)      
    return vectors_list

# TODO inferred vectors
def get_infer_vectors(model,docs):   
    vecs = []
    for doc in docs:
        vecs.append(model.infer_vector(doc.words))
    return vecs

In [96]:
train_docs_ids = np.array(extract_vectors(dm_model,train_docs))
validation_docs_ids = np.array(extract_vectors(dm_model,validation_docs))
test_docs_ids = np.array(extract_vectors(dm_model,test_docs))

# Convert label to one-hot-code
num_class = 3 # pos, neg, neu
train_labels_oh = np.eye(num_class)[train_labels]
validation_labels_oh = np.eye(num_class)[validation_labels]
test_labels_oh = np.eye(num_class)[test_labels]

print "Training doc shape: {}".format(train_docs_ids.shape)
print "Validation doc shape: {}".format(validation_docs_ids.shape)
print "Test doc shape: {}".format(test_docs_ids.shape)
print "Training label shape: {}".format(train_labels_oh.shape)
print "Validation label shape: {}".format(validation_labels_oh.shape)
print "Test label shape: {}".format(test_labels_oh.shape)

Training doc shape: (2572, 100)
Validation doc shape: (858, 100)
Test doc shape: (858, 100)
Training label shape: (2572, 3)
Validation label shape: (858, 3)
Test label shape: (858, 3)


## Build a Tensorflow softmax output layer

In [99]:
num_classes = num_class
feature_size = 100
l2_reg_lambda = 0.0

In [104]:
# x_ is the Paragraph2vec input vectors
x_ = tf.placeholder(tf.float32, [None, feature_size], name="x")
y_ = tf.placeholder(tf.float32, [None, num_classes], name="y")
l2_loss = tf.constant(0.0)

# Output Layer: Softmax (1 affine layer)
with tf.name_scope("Output_layer"):
    Z_ = tf.Variable(tf.random_uniform([feature_size, num_classes], -1.0, 1.0), name = "Z")
    b_ = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
    logits_ = tf.add(tf.matmul(x_, Z_), b_, name="logits")
    
    # L2 Regularization
    l2_loss += tf.nn.l2_loss(Z_)
    l2_loss += tf.nn.l2_loss(b_)
    predictions_ = tf.argmax(logits_, 1, name="predictions")

    
# Calculate mean cross-entropy loss
with tf.name_scope("cost_function"):
    per_example_losses_ = tf.nn.softmax_cross_entropy_with_logits(logits=logits_, 
                                                                 labels=y_,
                                                                 name="per_example_loss")
    loss_ = tf.reduce_mean(per_example_losses_) + l2_reg_lambda * l2_loss

# Accuracy
with tf.name_scope("accuracy"):
    correct_predictions_ = tf.equal(predictions_, tf.argmax(y_, 1))
    accuracy_ = tf.reduce_mean(tf.cast(correct_predictions_, "float"), name="accuracy")
    
with tf.name_scope("Training"):
    alpha_ = tf.placeholder(tf.float32, name="learning_rate")
    optimizer_ = tf.train.AdagradOptimizer(alpha_)
    #optimizer_ = tf.train.AdamOptimizer(alpha_)
    train_step_ = optimizer_.minimize(loss_)

In [105]:
# Helper functions for training
def batch_generator(data, batch_size):
    """Generate minibatches from data."""
    for i in xrange(0, len(data), batch_size):
        yield data[i:i+batch_size]
        
def train_batch(session, batch, alpha):
    # Feed last column as targets
    feed_dict = {x_:train_docs_ids,
                 y_:train_labels_oh,
                 alpha_:alpha}
    c, a, pred, _ = session.run([loss_, accuracy_, predictions_, train_step_],
                       feed_dict=feed_dict)
    return c, a, pred        

def validation_batch(session):
    feed_dict = {x_:validation_docs_ids,
                 y_:validation_labels_oh}
    a, pred = session.run([accuracy_, predictions_, ], feed_dict=feed_dict)
    return a, pred

def predict_batch(session):
    feed_dict = {x_:test_docs_ids,
                 y_:test_labels_oh}
    a, pred = session.run([accuracy_, predictions_], feed_dict=feed_dict)
    return a, pred

In [124]:
def run_epochs(num_epochs, learning_rate, batch_size = 100, min_rate = 0.1, print_freq = 10, seed = 42):
    # One epoch = one pass through the training data
    num_epochs = num_epochs
    batch_size = batch_size
    alpha = learning_rate  # learning rate
    min_alpha = min_rate
    alpha_delta = (alpha - min_alpha) / num_epochs
    print_every = print_freq

    # Initializer step
    init_ = tf.global_variables_initializer()
    
    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()
    
    np.random.seed(seed)
    
    # For plotting
    train_accuracy = []
    validation_accuracy = []

    session = tf.Session()
    session.run(init_)

    t0 = time.time()
    for epoch in xrange(1,num_epochs+1):
        t0_epoch = time.time()
        epoch_cost = 0.0
        total_batches = 0
        print ""
        print "----------- Training ------------"
        for i, batch in enumerate(batch_generator(train_docs_ids, batch_size)):
            if (i % print_every == 0):
                print "[epoch %d] seen %d minibatches" % (epoch, i)

            cost, accuracy, pred = train_batch(session, batch, alpha)
            epoch_cost += cost
            total_batches = i + 1

        avg_cost = epoch_cost / total_batches
        alpha = alpha - alpha_delta

        print "[epoch %d] Completed %d minibatches in %s" % (epoch, i, utils.pretty_timedelta(since=t0_epoch))
        print "[epoch %d] Average cost: %.03f" % (epoch, avg_cost,)
        print "[epoch %d] Accuracy %.03f" %(epoch, accuracy)
        print "[epoch %d] Training Classificaiton Report\n" %(epoch)
        print pred.shape
        print classification_report(train_labels, pred)
        print "Training Vector Accuracy: ", accuracy_score(train_labels, pred)
        train_accuracy.append(accuracy)
        
        print ""
        print "---------- Validation ----------"
        accuracy, pred = validation_batch(session)
        print "[epoch %d] Validation Accuracy is %.03f" %(epoch, accuracy)
        print "[epoch %d] Validation Classificaiton Report\n" %(epoch)
        print classification_report(validation_labels, pred)
        print "Validation Vector Accuracy:", accuracy_score(validation_labels, pred)
        validation_accuracy.append(accuracy)        

    # Save the variables to disk.
    save_path = saver.save(session, "./saved_model/p2v_model")
    print("Model saved in file: %s" % save_path)
    
    return train_accuracy, validation_accuracy, session


In [131]:
train_accur, validation_accur, sess = run_epochs(num_epochs = 10, 
                                                 batch_size = 100, 
                                                 learning_rate = 0.5, 
                                                 min_rate = 0.1, 
                                                 print_freq = 100)


----------- Training ------------
[epoch 1] seen 0 minibatches
[epoch 1] Completed 25 minibatches in 0:00:00
[epoch 1] Average cost: 0.478
[epoch 1] Accuracy 0.885
[epoch 1] Training Classificaiton Report

(2572,)
             precision    recall  f1-score   support

          0       0.88      1.00      0.94      2275
          1       0.00      0.00      0.00       211
          2       0.00      0.00      0.00        86

avg / total       0.78      0.88      0.83      2572

Training Vector Accuracy:  0.884525660964

---------- Validation ----------
[epoch 1] Validation Accuracy is 0.871
[epoch 1] Validation Classificaiton Report

             precision    recall  f1-score   support

          0       0.87      1.00      0.93       747
          1       0.00      0.00      0.00        78
          2       0.00      0.00      0.00        33

avg / total       0.76      0.87      0.81       858

Validation Vector Accuracy: 0.870629370629

----------- Training ------------
[epoch 2] se

Model saved in file: ./saved_model/p2v_model


In [138]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

clf_svc = OneVsRestClassifier(SVC(probability=True))
clf_svc.fit(train_docs_ids, train_labels)

parameters_SVC = {
    "estimator__C": [0.001,0.01,0.1,0.5,1,2,5],
    "estimator__kernel": ["poly","rbf", "sigmoid","linear"],
    "estimator__degree": [1, 2, 3, 4]
}

print "--- 1-vs-Rest SVM ---"
mod_svc = GridSearchCV(estimator=clf_svc, param_grid=parameters_SVC, scoring='f1_macro', verbose=True)
mod_svc.fit(validation_docs_ids, validation_labels)
print mod_svc.best_score_ 
print mod_svc.best_params_

--- 1-vs-Rest SVM ---
Fitting 3 folds for each of 112 candidates, totalling 336 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Done 336 out of 336 | elapsed:   58.8s finished


0.310280373832
{'estimator__kernel': 'poly', 'estimator__C': 0.001, 'estimator__degree': 1}


In [139]:
clf_svc = OneVsRestClassifier(SVC(probability=True, 
                              kernel=mod_svc.best_params_['estimator__kernel'], 
                              C=mod_svc.best_params_['estimator__C'], 
                              degree=mod_svc.best_params_['estimator__degree']))

clf_svc.fit(train_docs_ids, train_labels)
pred = clf_svc.predict(test_docs_ids)

In [140]:
print classification_report(test_labels, pred)

             precision    recall  f1-score   support

          0       0.88      1.00      0.94       755
          1       0.00      0.00      0.00        76
          2       0.00      0.00      0.00        27

avg / total       0.77      0.88      0.82       858

