## Sentiment Analysis on Amazon Unlocked Phone Purchase Review data. Using Naive Bayes and LSTM to find the accuracy of sentiment analysis with training and testing models.

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from wordcloud import WordCloud

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline

from bs4 import BeautifulSoup  
import re
import nltk
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize, pos_tag

import logging
from gensim.models import word2vec
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Lambda
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, SimpleRNN, GRU
from keras.preprocessing.text import Tokenizer
from collections import defaultdict
from keras.layers.convolutional import Convolution1D
from keras import backend as K
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


In [33]:
#load csv file
df = pd.read_csv('Amazon_Unlocked_Mobile.csv')
df.head(10)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0
5,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,1,I already had a phone with problems... I know ...,1.0
6,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,The charging port was loose. I got that solder...,0.0
7,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,"Phone looks good but wouldn't stay charged, ha...",0.0
8,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I originally was using the Samsung S2 Galaxy f...,0.0
9,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,3,It's battery life is great. It's very responsi...,0.0


## Prep Data

In [20]:
df.dropna(inplace=True) #drop null 
df = df[df['Rating'] != 3] #drop neutral rating

#encode 4,5 as 1 for positive sentiment & 1,2 as 0 for negative sentiment
df['Sentiment'] = np.where(df['Rating'] > 3, 1, 0)
print(df.head())

#split training and test 
X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], df['Sentiment'], \
                                                    test_size=0.1, random_state=0)

print('Load %d training examples and %d validation examples. \n' %(X_train.shape[0],X_test.shape[0]))
print('Show a review in the training set : \n', X_train.iloc[10])

#sample less from positive for testing 
#for training repeat negative ones and put them twice and use half of the positive 

                                             Product Name Brand Name    Price  \
134801  BLU Studio 5.0 C HD - Unlocked Cell Phones - R...        BLU  2000.00   
123493                         Blu LIFE 8 Unlocked (Pink)        BLU   199.98   
335592  Samsung Galaxy S Duos II S7582 DUAL SIM Factor...    Samsung   299.99   
246353  Motorola Droid 2 A955 Verizon Phone 5MP Cam, W...   Motorola    82.00   
273324  Nokia Lumia 920 32GB Unlocked GSM 4G LTE Windo...      Nokia   149.35   

        Rating                                            Reviews  \
134801       5  For the price I paid for this devices, its fan...   
123493       5  love love love it....good buy...recommend to a...   
335592       4                                               Good   
246353       1  Not good. Returned first phone and they sent m...   
273324       4  Met expectations! I'm very satisfied!Even arri...   

        Review Votes  Sentiment  
134801           0.0          1  
123493           0.0          

## bag of words
#### 1. find a word embedding to convert a text into a numerical representation. 
#### 2. fit the numerical representations of text to machine learning algorithms or deep learning architectures.

1. preprocess raw & create clean reviews
2. create bag of words using count vectorizor
3. get feature vectors for each review
4. fit feature vectors

In [5]:
#cleaning done is: remove html tag, remove special char & num, make lowercase, remove stop words, stemming
def cleanText(raw_text, remove_stopwords=False, stemming=False, split_text=False, \):
    text = BeautifulSoup(raw_text, 'lxml').get_text() 
    letters_only = re.sub("[^a-zA-Z]", " ", text) 
    words = letters_only.lower().split()
    if remove_stopwords: 
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    if stemming==True:
        stemmer = SnowballStemmer('english') 
        words = [stemmer.stem(w) for w in words]
    if split_text==True: 
        return (words)
    return( " ".join(words)) 

X_train_cleaned = []
X_test_cleaned = []
for d in X_train:
    X_train_cleaned.append(cleanText(d))
print('Show a cleaned review in the training set : ',  X_train_cleaned[10])
for d in X_test:
    X_test_cleaned.append(cleanText(d))

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


Show a cleaned review in the training set : 
 good product and fast shipping thank you


In [6]:
#fit & transform  training data to doc-term matrix withwo CountVectorizer
countVect = CountVectorizer() 
X_train_countVect = countVect.fit_transform(X_train_cleaned)
print("Number of features : %d \n" %len(countVect.get_feature_names())) # there are 6378 
print("Show some feature names : \n", countVect.get_feature_names()[::1000])

mnb = MultinomialNB()
mnb.fit(X_train_countVect, y_train)

Number of features : 19607 

Show some feature names : 
 ['aa', 'areable', 'boot', 'clean', 'crushing', 'distortions', 'excatly', 'frills', 'heart', 'inverter', 'lolit', 'movie', 'over', 'predictable', 'reconnecting', 'scaling', 'soldto', 'tapped', 'ubuntu', 'wedges']


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [8]:
#print model eval predicted res
def modelEvaluation(predictions):
    print ("\nAccuracy on validation set: {:.4f}".format(accuracy_score(y_test, predictions)))
    print("\nAUC score : {:.4f}".format(roc_auc_score(y_test, predictions)))
    print("\nClassification report : \n", metrics.classification_report(y_test, predictions))
    print("\nConfusion Matrix : \n", metrics.confusion_matrix(y_test, predictions))
predictions = mnb.predict(countVect.transform(X_test_cleaned))
modelEvaluation(predictions)


Accuracy on validation set: 0.9184

AUC score : 0.8790

Classification report : 
               precision    recall  f1-score   support

           0       0.87      0.80      0.83       778
           1       0.93      0.96      0.95      2311

    accuracy                           0.92      3089
   macro avg       0.90      0.88      0.89      3089
weighted avg       0.92      0.92      0.92      3089


Confusion Matrix : 
 [[ 622  156]
 [  96 2215]]


## Word2Vec
 - Train Word2Vec model using gensim library
 - Fit the feature vectors of the reviews to Random Forest Classifier
 ##### Here's the workflow of this part.
     1. parse review to sentences bc Word2Vec model takes a list of sentences as input
     
     2. create vocab using Word2Vec model
     
     3. transform review into numerical representation by computing average feature vectors of words
     
     4. fit the average feature vectors to Random Forest Classifier

### 1. parse into sentences

In [9]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def parseSent(review, tokenizer, remove_stopwords=False):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(cleanText(raw_sentence, remove_stopwords, split_text=True))
    return sentences
#parse each review in set into sentences
sentences = []
for review in X_train_cleaned:
    sentences += parseSent(review, tokenizer)
print('%d parsed sentence in the training set\n'  %len(sentences))
print('Show a parsed sentence in the training set : \n',  sentences[10])

27768 parsed sentence in the training set

Show a parsed sentence in the training set : 
 ['good', 'product', 'and', 'fast', 'shipping', 'thank', 'you']


### 2. create vocab using Word2Vec

In [24]:
num_features = 400                   
min_word_count = 15               
num_workers = 4       
context = 10                                                                                          
downsampling = 1e-3 

print("Training Word2Vec model ...\n")
w2v = Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count,window = context, sample = downsampling)
w2v.init_sims(replace=True)
w2v.save("w2v_model")

print("Number of words in the vocabulary list : %d \n" %len(w2v.wv.index2word)) #4016 
print("Show first 10 words in the vocalbulary list  vocabulary list: \n", w2v.wv.index2word[0:10])

Training Word2Vec model ...

Number of words in the vocabulary list : 3254 

Show first 10 words in the vocalbulary list  vocabulary list: 
 ['the', 'i', 'it', 'and', 'phone', 'a', 'to', 'is', 'this', 'for']


### 3. averaging feature vectors
For words appear in the volcabulary list, compute average feature vectors of each word. The average feature vector is the numerical represenation of the review.

In [11]:
def makeFeatureVec(review, model, num_features):
    featureVec = np.zeros((num_features,),dtype="float32")
    nwords = 0.
    index2word_set = set(model.wv.index2word) #index2word is the volcabulary list of the Word2Vec model
    isZeroVec = True
    for word in review:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec, model[word])
            isZeroVec = False
    if isZeroVec == False:
        featureVec = np.divide(featureVec, nwords)
    return featureVec

def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        reviewFeatureVecs[counter] = makeFeatureVec(review, model,num_features)
        counter = counter + 1
    return reviewFeatureVecs

In [12]:
X_train_cleaned = []
for review in X_train:
    X_train_cleaned.append(cleanText(review, remove_stopwords=True, split_text=True))
trainVector = getAvgFeatureVecs(X_train_cleaned, w2v, num_features)
print("Training set : %d feature vectors with %d dimensions" %trainVector.shape)
X_test_cleaned = []
for review in X_test:
    X_test_cleaned.append(cleanText(review, remove_stopwords=True, split_text=True))
testVector = getAvgFeatureVecs(X_test_cleaned, w2v, num_features)
print("Validation set : %d feature vectors with %d dimensions" %testVector.shape)

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  from ipykernel import kernelapp as app


Training set : 27799 feature vectors with 300 dimensions
Validation set : 3089 feature vectors with 300 dimensions


### 4. Random Forest Classifier

In [13]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(trainVector, y_train)
predictions = rf.predict(testVector)
modelEvaluation(predictions)


Accuracy on validation set: 0.9249

AUC score : 0.8897

Classification report : 
               precision    recall  f1-score   support

           0       0.88      0.82      0.85       778
           1       0.94      0.96      0.95      2311

    accuracy                           0.92      3089
   macro avg       0.91      0.89      0.90      3089
weighted avg       0.92      0.92      0.92      3089


Confusion Matrix : 
 [[ 637  141]
 [  91 2220]]


### LSTM - no Word2Vec embedding
#### Recurrent Neural Networks (RNN), capable of learning long-term dependencies.
- LSTM with Word2Vec embedding to classify the reviews into positive and negative sentiment using Keras libarary.
    1. prepare X_train and X_test to 2D tensor
    2. train a simple LSTM 
        --> (embeddign layer => LSTM layer => dense layer)
    3. compile and fit the model using log loss function

In [14]:
top_words = 20000  #only consider top 20000 words in the corpus
maxlen = 100 
batch_size = 32
nb_classes = 2
nb_epoch = 3

tokenizer = Tokenizer(nb_words=top_words)
tokenizer.fit_on_texts(X_train)

sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

X_train_seq = sequence.pad_sequences(sequences_train, maxlen=maxlen)
X_test_seq = sequence.pad_sequences(sequences_test, maxlen=maxlen)

y_train_seq = np_utils.to_categorical(y_train, nb_classes)
y_test_seq = np_utils.to_categorical(y_test, nb_classes)

print('X_train shape:', X_train_seq.shape)
print('X_test shape:', X_test_seq.shape)
print('y_train shape:', y_train_seq.shape)
print('y_test shape:', y_test_seq.shape)



X_train shape: (27799, 100)
X_test shape: (3089, 100)
y_train shape: (27799, 2)
y_test shape: (3089, 2)


In [15]:
#construct LSTM
model1 = Sequential()
model1.add(Embedding(top_words, 128, dropout=0.2))
model1.add(LSTM(128, dropout_W=0.2, dropout_U=0.2)) 
model1.add(Dense(nb_classes))
model1.add(Activation('softmax'))
model1.summary()

#https://towardsdatascience.com/adam-latest-trends-in-deep-learning-optimization-6be9a291375c why i used adam optimizer
#https://machinelearningmastery.com/how-to-choose-loss-functions-when-training-deep-learning-neural-networks/ choosing loss
model1.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
model1.fit(X_train_seq, y_train_seq, batch_size=batch_size, nb_epoch=nb_epoch, verbose=1)

score = model1.evaluate(X_test_seq, y_test_seq, batch_size=batch_size)
print('Test loss : {:.4f}'.format(score[0]))
print('Test accuracy : {:.4f}'.format(score[1]))

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         2560000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_1 (Activation)    (None, 2)                 0         
Total params: 2,691,842
Trainable params: 2,691,842
Non-trainable params: 0
_________________________________________________________________


  
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/3
Epoch 2/3
Epoch 3/3
Test loss : 0.1708
Test accuracy : 0.9440


In [22]:
#weight matrix of embedding layer
model1.layers[0].get_weights()[0] 
print("Size of weight matrix in the embedding layer : ", model1.layers[0].get_weights()[0].shape) 

#weight matrix of hidden layer
print("Size of weight matrix in the hidden layer : ", model1.layers[1].get_weights()[0].shape)

#weight matrix of output layer
print("Size of weight matrix in the output layer : ", model1.layers[2].get_weights()[0].shape) 

Size of weight matrix in the embedding layer :  (20000, 128)
Size of weight matrix in the hidden layer :  (128, 512)
Size of weight matrix in the output layer :  (128, 2)


## LSTM with Word2Vec Embedding
##### In the LSTM model constructed above, the embedding class in Keras does not take the semantic similarity of the words into account. The model assigns random weights to the embedding layer and learn the embeddings by minimizing the global error of the network.
- Instead of using random weights, this will use the pretrained word embeddings to initialize the weight of an embedding layer. Use Word2Vec embedding trained in to  intialize the weights of embedding layer in LSTM.
    1. Load pretrained word embedding model
    2. Construct embedding layer using embedding matrix as weights
    3. Train a LSTM with Word2Vec embedding (embeddign layer => LSTM layer => dense layer)
    4. Compile and fit the model using log loss function and ADAM optimizer

In [26]:
#https://towardsdatascience.com/introduction-to-word-embedding-and-word2vec-652d0c2060fa used this to w2v embedding
w2v = Word2Vec.load("w2v_model")

embedding_matrix = w2v.wv.syn0 
print("Shape of embedding matrix : ", embedding_matrix.shape) 


Shape of embedding matrix :  (3254, 400)


  after removing the cwd from sys.path.


In [28]:
top_words = embedding_matrix.shape[0]
maxlen = 100 
batch_size = 32
nb_classes = 2
nb_epoch = 3

tokenizer = Tokenizer(nb_words=top_words)
tokenizer.fit_on_texts(X_train)

sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

X_train_seq = sequence.pad_sequences(sequences_train, maxlen=maxlen)
X_test_seq = sequence.pad_sequences(sequences_test, maxlen=maxlen)

y_train_seq = np_utils.to_categorical(y_train, nb_classes)
y_test_seq = np_utils.to_categorical(y_test, nb_classes)

print('X_train shape:', X_train_seq.shape) 
print('X_test shape:', X_test_seq.shape) 
print('y_train shape:', y_train_seq.shape)
print('y_test shape:', y_test_seq.shape)


#w2v embedding layer
embedding_layer = Embedding(embedding_matrix.shape[0],embedding_matrix.shape[1],weights=[embedding_matrix])


#LSTM with embedding layer
model2 = Sequential()
model2.add(embedding_layer)
model2.add(LSTM(128, dropout_W=0.2, dropout_U=0.2)) 
model2.add(Dense(nb_classes))
model2.add(Activation('softmax'))
model2.summary()
model2.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model2.fit(X_train_seq, y_train_seq, batch_size=batch_size, nb_epoch=nb_epoch, verbose=1)

score = model2.evaluate(X_test_seq, y_test_seq, batch_size=batch_size)
print('Test loss : {:.3f}'.format(score[0]))
print('Test accuracy : {:.3f}'.format(score[1]))



X_train shape: (27799, 100)
X_test shape: (3089, 100)
y_train shape: (27799, 2)
y_test shape: (3089, 2)




Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 400)         1301600   
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               270848    
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_4 (Activation)    (None, 2)                 0         
Total params: 1,572,706
Trainable params: 1,572,706
Non-trainable params: 0
_________________________________________________________________


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/3
Epoch 2/3
Epoch 3/3
Test loss : 0.162
Test accuracy : 0.941


In [19]:
#weight matrix of embedding layer
print("Size of weight matrix in the embedding layer : ",model2.layers[0].get_weights()[0].shape) 
#weight matrix of hidden layer
print("Size of weight matrix in the hidden layer : ",model2.layers[1].get_weights()[0].shape)

#weight matrix of output layer
print("Size of weight matrix in the output layer : ", model2.layers[2].get_weights()[0].shape)

Size of weight matrix in the embedding layer :  (4016, 300)
Size of weight matrix in the hidden layer :  (300, 512)
Size of weight matrix in the output layer :  (128, 2)
