# Sentiment analysis
#### Louis Ehwerhemuepha, PhD and Afnan (Nana) Alqahtani, MS

In [0]:
import numpy as np
import pandas as pd

import re
import nltk
from nltk.stem import PorterStemmer as ps


from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras import layers, losses
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from wordcloud import WordCloud


Retrieve the data for sentiment analysis from Louis' GitHub repository

In [0]:
# get data from Louis' github repository
! git clone https://github.com/ehwerhemuepha/sentimentanalysis.git

In [0]:
! ls sentimentanalysis # list the contents of the cloned repository folder

In [0]:
# Load the tab-separated, ISO-8859-1 encoded review file. The first line of the
# file is skipped (presumably its own header row -- confirm) and the four
# columns are named explicitly instead.
d = pd.read_csv(
    'sentimentanalysis/AmazonProductReviewKaggle.txt',
    sep='\t',
    encoding='ISO-8859-1',
    names=['id', 'rating', 'reviewtext', 'reviewtitle'],
    skiprows=[0],
)

In [0]:
# Preview the first five rows of the loaded reviews.
d.head(n=5)

In [0]:
# (number of rows, number of columns) of the loaded review table
d.shape

In [0]:
# Build one lower-cased document per review: the review body followed by its title.
reviewsTitle = d['reviewtitle'].values
reviewsText = d['reviewtext'].values


def _as_text(value):
  """Return `value` as a lower-cased string; missing values become ''."""
  # str(NaN) would otherwise inject the literal token "nan" into the corpus
  # for every review with a missing body or title.
  return '' if pd.isna(value) else str(value).lower()


reviews = [_as_text(text) + ' ' + _as_text(title)
           for text, title in zip(reviewsText, reviewsTitle)]

# Remove noise. Raw-string patterns avoid invalid-escape warnings for \W and \d.
_html_tag = re.compile(r"<.*?>")
_non_word_or_digits = re.compile(r"\W|\d+")

# HTML tags: replaced by a space (not '') so that words on either side of a
# tag such as <br/> are not glued together into one token.
reviews = [_html_tag.sub(" ", elem).strip() for elem in reviews]
# Punctuation / other non-word characters and runs of digits become spaces.
reviews = [_non_word_or_digits.sub(" ", elem).strip() for elem in reviews]

# Stemming (e.g. running, runs -> run) is performed in a separate cell further down.

In [0]:

# Binary sentiment label: 1 for a rating of 4 or 5, 0 otherwise.
rating = [int(stars > 3) for stars in d['rating'].values]
# One-hot encode the labels for the softmax output layer of the models below.
rating_categorical = to_categorical(rating)

In [0]:
# Sanity check: the binarised labels can only be 0 or 1.
min(rating), max(rating)

In [0]:
# Peek at the first nine cleaned reviews together with their labels.
reviews[:9], rating[:9], rating_categorical[:9]

In [0]:
# Optional stemming step -- improved accuracy, although the stems are somewhat imperfect.
stemmer = ps()


def stemlist(word_list):
  """Stem each whitespace-separated token of `word_list` and re-join them with single spaces."""
  stems = []
  for token in word_list.split():
    stems.append(stemmer.stem(word=token))
  return ' '.join(stems)


reviews = list(map(stemlist, reviews))
  

In [0]:
# The row count of the frame should equal the number of reviews and of labels,
# since both lists were built with one entry per row.
d.shape, len(reviews), len(rating)

In [0]:
# Same preview as before, now showing the stemmed review text.
reviews[:9], rating[:9], rating_categorical[:9]

In [0]:
# Word frequencies via wordcloud

complete_text = ''
for elem in reviews:
  complete_text += elem
  
import matplotlib.pyplot as plt
%matplotlib inline

wc = WordCloud(max_words=200, height = 1000, width = 1000, 
              background_color = 'white').generate(complete_text)
fig = plt.figure(1, figsize=(10,10))

plt.imshow(wc)
plt.show()
complete_text=''

Split the data into training and test sets

In [0]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    reviews,
    rating_categorical,
    test_size=0.20,
    random_state=727,
)

In [0]:
# First four training reviews and their one-hot labels.
xtrain[:4], ytrain[:4]

Tokenization

Tokenization is the process of breaking text up into tokens (for English text, roughly words).

In [0]:
# Count all tokens in the training text and keep only the most frequent ones
# (num_words=2000 caps the token indices that are emitted).
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(xtrain)  # fitted on the training split only
xtrain = tokenizer.texts_to_sequences(xtrain)
xtest = tokenizer.texts_to_sequences(xtest)
# texts_to_sequences never emits an index >= num_words, so the embedding layers
# only need num_words rows (index 0 is reserved for padding). Sizing by the full
# word_index would allocate rows for words that can never appear in the input.
vocab_size = min(tokenizer.num_words, len(tokenizer.word_index) + 1)

In [0]:
# First encoded training and test review (lists of token indices) and the vocabulary size.
xtrain[0], xtest[0], vocab_size


In [0]:
# Number of training reviews, number of test reviews, and their total.
len(xtrain) , len(xtest), len(xtrain) + len(xtest)

In [0]:
# A sample of (token, frequency) pairs and the number of distinct tokens seen during fitting.
list(tokenizer.word_counts.items())[:19], len(tokenizer.word_counts)

Pad in order to ensure that all sequences have the same length


In [0]:
# Bring every sequence to exactly `maxlen` tokens. The working assumption is that
# the first 100 words are the most important in a review (especially in good
# ones) and that most reviews are shorter than this anyway -- adjust `maxlen`
# to see the effect on accuracy, if any.
maxlen = 100
# pad_sequences truncates from the *front* by default (truncating='pre'), which
# keeps the LAST `maxlen` tokens and contradicts the assumption above.
# truncating='post' keeps the first `maxlen` tokens; shorter reviews are still
# left-padded with zeros.
xtrain = pad_sequences(xtrain, padding='pre', truncating='post', maxlen=maxlen)
xtest = pad_sequences(xtest, padding='pre', truncating='post', maxlen=maxlen)

In [0]:
# First padded training and test sequence (each of length maxlen).
xtrain[0], xtest[0]

Build model and get model performance 

In [0]:
# RNN model: embedding -> SimpleRNN -> two-way softmax classifier.
rnnmodel = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=50, input_length=maxlen),
    layers.SimpleRNN(units=50),
    layers.Dense(2, activation='softmax'),
])
rnnmodel.compile(
    optimizer='adam',
    loss=losses.categorical_crossentropy,
    metrics=['accuracy'],
)
rnnmodel.summary()

In [0]:
# Training settings; the GRU and LSTM training cells reuse these two values.
epochs = 20
batch_size = 256

rnnmodel.fit(xtrain, ytrain, epochs=epochs, batch_size=batch_size)
# verbose=False silences the evaluation progress bar, matching the GRU and LSTM
# cells, so only the two accuracy lines are printed.
loss, accuracy = rnnmodel.evaluate(xtrain, ytrain, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = rnnmodel.evaluate(xtest, ytest, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [0]:
# GRU model: embedding -> GRU -> two-way softmax classifier.
grumodel = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=50, input_length=maxlen),
    layers.GRU(units=50),
    layers.Dense(2, activation='softmax'),
])
grumodel.compile(
    optimizer='adam',
    loss=losses.categorical_crossentropy,
    metrics=['accuracy'],
)
grumodel.summary()



In [0]:
# Train the GRU model, then report accuracy on the training and the test split.
grumodel.fit(xtrain, ytrain, epochs=epochs, batch_size=batch_size)
for report, features, targets in (
    ("Training Accuracy: {:.4f}", xtrain, ytrain),
    ("Testing Accuracy:  {:.4f}", xtest, ytest),
):
  loss, accuracy = grumodel.evaluate(features, targets, verbose=False)
  print(report.format(accuracy))

In [0]:
# LSTM model: embedding -> LSTM -> two-way softmax classifier.
lstm_model = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=50, input_length=maxlen),
    layers.LSTM(units=50),
    layers.Dense(2, activation='softmax'),
])
lstm_model.compile(
    optimizer='adam',
    loss=losses.categorical_crossentropy,
    metrics=['accuracy'],
)
lstm_model.summary()



In [0]:
# Train the LSTM model, then report accuracy on the training and the test split.
lstm_model.fit(xtrain, ytrain, epochs=epochs, batch_size=batch_size)
for report, features, targets in (
    ("Training Accuracy: {:.4f}", xtrain, ytrain),
    ("Testing Accuracy:  {:.4f}", xtest, ytest),
):
  loss, accuracy = lstm_model.evaluate(features, targets, verbose=False)
  print(report.format(accuracy))