In [782]:
import numpy as np
import pandas as pd
import nltk
import string
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

import tensorflow as tf
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

%matplotlib inline

First of all, let's upload the data and have a look at it. 

In [767]:
df_train = pd.read_csv('Reviews.csv')
df_train.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


Now, as we want to predict positive and negative reviews. I am going to create a new columns with either 'Positive' or 'Negative' as values. Before that, I am going to delete the rows where the score is 3, as this is neither positive nor negative. 

In [768]:
df_train = df_train[df_train.Score != 3]
df_train['Target'] = 'Pos'
df_train['Target'][df_train.Score < 3] = 'Neg'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


We won't be needing most of the columns, so let's get rid of them.

In [769]:
df_train = df_train.drop(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator',
                         'Score', 'Time'], 1)

I am going to merge both the 'Summary' and 'Text' column into one column called 'Finaltext'. Then I am going to make sure that Python will not complain later about not being a String. 

In [770]:
df_train['Finaltext'] = df_train.Summary.str.cat(df_train.Text, sep = ' . ')
df_train.Finaltext = df_train.Finaltext.astype(str)

As my computer is not the most powerful computer in the world, I am going to work with only 250K of the entries. Feel free to skip this step.

In [771]:
df_train_final = df_train[:250000]

I am going to separate our data into two datasets: one with all the Positive entries and one with all the negative entries. I will use these two datasets later. Also, note how our classes are unbalanced. 

In [772]:
df_pos = df_train_final[df_train_final.Target == 'Pos']
df_neg = df_train_final[df_train_final.Target == 'Neg']

In [775]:
print('Positive entries:', len(df_pos), 'Negative entries:', len(df_neg))

Positive entries: 210615 Negative entries: 39385


Now I am going to code a function that I will use to create the lexicon. The words in this lexicon will be the features in our future dataset. 

To help cleaning the data we will use:

* Stop words: get rid of the most common words that will not help us in our predictions.
* Lemmatizer: this function from nltk is helpful to merge similar words.
* Exclude the punctuation signs. 

One importat thing to notice:

I am excluding the most popular words and the least popular words. As you can see in the code below, I am keeping the words that appear at least once every 150 entries. You can play with this number, the smaller it is, the fewer features you will get and probably less accuracy later, but it also means a decrease in the running time. 

In [776]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
exclude = set(string.punctuation)

def create_lexicon(data):
    lexicon = []
    for lines in data:
        if type(lines) is str:
            words = word_tokenize(lines.lower())
            lexicon += [w for w in words if w not in [stop_words,exclude]]
    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    word_count = Counter(lexicon)
    final_lexicon = []
    for word in word_count:
        if (len(data) / 2) > word_count[word] > (len(data)/150):
            final_lexicon.append(word)
    return final_lexicon, word_count

Now let's create our lexicon. This step might take a while. 

In [777]:
lexicon_amazon, word_count_amazon = create_lexicon(df_train_final.Finaltext)

As you can see, we have 1157 words in our lexicon. 

In [778]:
len(lexicon_amazon)

1157

In [779]:
print(lexicon_amazon[:20])

['good', 'quality', 'dog', 'food', 'bought', 'several', 'canned', 'product', 'found', 'them', 'all', 'be', 'look', 'more', 'like', 'than', 'meat', 'smell', 'better', 'she']


The following function will take the positive entries, the negative entries, the lexicon and then create a combined dataset with all the features. 

In [780]:
def create_dataset(pos,neg,lexicon):
    dataset = []
    for lp in pos:
        words_pos = word_tokenize(lp)
        if type(words_pos) is str:
            words_pos = words_pos.lower()
        words_pos = [lemmatizer.lemmatize(i) for i in words_pos]
        features_pos = np.zeros(len(lexicon) + 2)
        features_pos[-1] = 1
        for word in words_pos:
            if word in lexicon:
                index = lexicon.index(word)
                features_pos[index] += 1
        dataset.append(features_pos)
    
    for ln in neg:
        words_neg = word_tokenize(ln)
        if type(words_neg) is str:
            words_neg = words_neg.lower()
        words_neg = [lemmatizer.lemmatize(i) for i in words_neg]
        features_neg = np.zeros(len(lexicon) + 2)
        features_neg[-1] = 0
        for word in words_neg:
            if word in lexicon:
                index = lexicon.index(word)
                features_neg[index] += 1
        dataset.append(features_neg)
    dataset = np.array(dataset)
    np.random.shuffle(dataset)
    return dataset

Let's create our final dataset! This step will definitely take a while. 

In [781]:
data = create_dataset(df_pos.Finaltext, df_neg.Finaltext, lexicon_amazon)

Now we divide the data into X, y and T. y is the target and T is the target as one hot encoding. 

In [783]:
X = data[:, :-1]
y = data[:,-1]

def y2indicator(y):
    N = len(y)
    y = y.astype(np.int32)
    ind = np.zeros((N, 2))
    for i in range(N):
        ind[i, y[i]] = 1
    return ind

T = y2indicator(y)

Let's divide the X, y and T into training and test sets. 

In [784]:
X_train, X_test, y_train, y_test, T_train, T_test = train_test_split(X, y, T, test_size=0.2, random_state=42)

Before coding the Neural Network, we can play with several 'simpler' algorithms:

In [785]:
lvc = LinearSVC()
lvc.fit(X_train, y_train)
lvc.score(X_test, y_test)

0.90834000000000004

In [786]:
mnbmod = MultinomialNB()
mnbmod.fit(X_train, y_train)
mnbmod.score(X_test, y_test)

0.88544

In [787]:
gnbmod = GaussianNB()
gnbmod.fit(X_train, y_train)
gnbmod.score(X_test, y_test)

0.82179999999999997

In [788]:
bnbmod = BernoulliNB()
bnbmod.fit(X_train, y_train)
bnbmod.score(X_test, y_test)

0.85633999999999999

In [642]:
logistic = LogisticRegression()
logistic.fit(X_train, y_train)
logistic.score(X_test, y_test)

0.88983333333333337

This is the code for the Neural Network. I explain it inside. 

In [789]:
#First, we create this function to calculate the accuracy. 

def accuracy(p, t):
    accuracy = np.mean(p == t)
    return accuracy

#Now let's define some parameters of our NN. You can defintely play with these to improve the speed and accuracy. 

max_iter = 6
print_period = 50
lr = 0.00002
reg = 0.001

N, D = X_train.shape
batch_sz = 1500
n_batches = int(N / batch_sz)

#I will be using one NN with 3 layers of 500 hidden nodes each. 

M1 = 500
M2 = 500
M3 = 500
K = 2

#These are the values to initialize the weights and biases. 

W1_init = np.random.randn(D, M1) / np.sqrt(N)
b1_init = np.zeros(M1)
W2_init = np.random.randn(M1, M2) / np.sqrt(M1)
b2_init = np.zeros(M2)
W3_init = np.random.randn(M2, M3) / np.sqrt(M2)
b3_init = np.zeros(M3)
W4_init = np.random.randn(M3, K) / np.sqrt(M3)
b4_init = np.zeros(K)

#And now these are the tf variables. 

X = tf.placeholder(tf.float32, shape=(None, D), name='X')
T = tf.placeholder(tf.float32, shape=(None, K), name='T')
W1 = tf.Variable(W1_init.astype(np.float32))
b1 = tf.Variable(b1_init.astype(np.float32))
W2 = tf.Variable(W2_init.astype(np.float32))
b2 = tf.Variable(b2_init.astype(np.float32))
W3 = tf.Variable(W3_init.astype(np.float32))
b3 = tf.Variable(b3_init.astype(np.float32))
W4 = tf.Variable(W4_init.astype(np.float32))
b4 = tf.Variable(b4_init.astype(np.float32))

#These are the activation values. I am using relu but this can be changed as well. 

Z1 = tf.nn.relu( tf.matmul(X, W1) + b1 )
Z2 = tf.nn.relu( tf.matmul(Z1, W2) + b2 )
Z3 = tf.nn.relu( tf.matmul(Z2, W3) + b3 )
Yish = tf.matmul(Z3, W4) + b4 


#This is our cost function, with will use Softmax. 

cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T))

#This line is for the optimizer. I am using RMSProp as it allows for momentum, but feel free to change it as well. 

train_op = tf.train.RMSPropOptimizer(lr, decay=0.99, momentum=0.9).minimize(cost)

#This is our prediction line

predict_op = tf.argmax(Yish, 1)

#And now we can start!

costs = []
init = tf.global_variables_initializer()
with tf.Session() as session:
    session.run(init)

    for i in range(0,max_iter):
        for j in range(0,n_batches):
            Xbatch = X_train[j*batch_sz:(j*batch_sz + batch_sz),:]
            Ybatch = T_train[j*batch_sz:(j*batch_sz + batch_sz),:]

            session.run(train_op, feed_dict={X: Xbatch, T: Ybatch})
            if j % print_period == 0:
                test_cost = session.run(cost, feed_dict={X: X_test, T: T_test})
                prediction = session.run(predict_op, feed_dict={X: X_test})
                acc = accuracy(prediction, y_test)
                print ("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, test_cost, acc))
                costs.append(test_cost)

Cost / err at iteration i=0, j=0: 34529.641 / 0.810
Cost / err at iteration i=0, j=50: 18442.072 / 0.842
Cost / err at iteration i=0, j=100: 13726.021 / 0.872
Cost / err at iteration i=1, j=0: 12387.218 / 0.902
Cost / err at iteration i=1, j=50: 11408.704 / 0.909
Cost / err at iteration i=1, j=100: 10991.662 / 0.912
Cost / err at iteration i=2, j=0: 10900.015 / 0.912
Cost / err at iteration i=2, j=50: 10676.706 / 0.914
Cost / err at iteration i=2, j=100: 10434.529 / 0.916
Cost / err at iteration i=3, j=0: 10364.512 / 0.916
Cost / err at iteration i=3, j=50: 10103.355 / 0.919
Cost / err at iteration i=3, j=100: 9790.184 / 0.922
Cost / err at iteration i=4, j=0: 9652.279 / 0.924
Cost / err at iteration i=4, j=50: 9553.721 / 0.925
Cost / err at iteration i=4, j=100: 9130.189 / 0.929
Cost / err at iteration i=5, j=0: 9214.127 / 0.931
Cost / err at iteration i=5, j=50: 9502.551 / 0.929
Cost / err at iteration i=5, j=100: 9595.035 / 0.934


In [790]:
print(classification_report(prediction, y_test))

             precision    recall  f1-score   support

          0       0.73      0.83      0.78      6977
          1       0.97      0.95      0.96     43023

avg / total       0.94      0.93      0.94     50000

