# Movie Review Sentiment Analysis with (CNN and Word Embedding Model)

# Stage1. Import Dependencies

In [59]:
from string import punctuation
from os import listdir
import nltk
from collections import Counter
from nltk.corpus import stopwords
import pandas as pd
import re
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D



In [33]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Stage2. Data Preparation


1.   Loading and cleaning the data to remove punctuation and numbers.
2.   Defining a vocabulary of preferred words.




In [13]:
df=pd.read_csv('/content/sample_data/MovieReview.csv')

In [14]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,"Petter Mattei's ""Love in the Time of Money"" is...",1
4,"Probably my all-time favorite movie, a story o...",1


In [29]:
def clean_df(df):
  corpus=[]
  for i in range(0,len(df)):
    review=re.sub("[^a-zA-Z]",' ',df['review'][i])
    review=review.lower()
    review=review.split()
    # #remove remaining tokens that are not alphabetic
    review=[word for word in review if str(word).isalpha]
    # remove stopwords
    review=[word for word in review if not word in set(stopwords.words('english'))]
    #filter out short tokens
	  
    review=[word for word in review if len(word)>1]
    corpus.append(review)

  flat_tokens=[item for sublist in corpus for item in sublist]
  return flat_tokens

In [72]:
  tokens=clean_df(df)


## Define a Vocabulary

* It is important to define a vocabulary of known words when using a bag-of-words or embedding model.

* The more words, the larger the representation of documents, therefore it is important to constrain the words to only those believed to be predictive. 

In [35]:
def add_token_to_vocab(df,vocab):
  # call the clean_df function to get clean tokens
  tokens=clean_df(df)
  # update counts of vocab with tokens
  vocab.update(tokens)

In [36]:
# Define vocab
vocab= Counter()
# Add all docs to vocab
add_token_to_vocab(df,vocab)
# print the len of vocab
print(len(vocab))
# print the top 50 words of vocab
print(vocab.most_common(50))

17510
[('br', 4130), ('movie', 1790), ('film', 1558), ('one', 1009), ('like', 804), ('good', 540), ('even', 528), ('see', 525), ('would', 497), ('story', 462), ('time', 457), ('really', 456), ('get', 414), ('well', 412), ('much', 400), ('people', 364), ('bad', 353), ('made', 353), ('first', 352), ('way', 347), ('also', 347), ('great', 345), ('movies', 339), ('think', 314), ('make', 307), ('plot', 292), ('watch', 288), ('many', 285), ('characters', 278), ('never', 275), ('two', 273), ('little', 271), ('life', 271), ('could', 271), ('character', 270), ('films', 265), ('best', 264), ('seen', 256), ('know', 250), ('acting', 249), ('man', 241), ('ever', 231), ('show', 230), ('go', 229), ('scene', 229), ('scenes', 227), ('love', 220), ('better', 219), ('back', 217), ('still', 208)]


We can step through the vocabulary and remove all words that have a low occurrence, such as only being used once or twice in all reviews.

In [41]:
# keep tokens with a min occurrence
min_occurane = 2
tokens = [k for k,c in vocab.items() if c >= min_occurane]
tokens = [each for each in tokens if each!='br']
print(len(tokens))


9277


In [42]:
# Save vocab list to file
def save_list(lines,filename):
  # convert lines to a single blob of text
	data = '\n'.join(lines)
	# open file
	file = open(filename, 'w')
	# write text
	file.write(data)
	# close file
	file.close()

In [43]:
# save tokens to a vocabulary file
save_list(tokens, 'vocab.txt')

Now we have vocab.txt with only the words we are interested in.

# 3. Train Embedding Layer

* In this section, we will learn a word embedding while training a neural network on the classification problem.

* A word embedding is a way of representing text where each word in the vocabulary is represented by a real valued vector in a high-dimensional space. The vectors are learned in such a way that words that have similar meanings will have similar representation in the vector space (close in the vector space). 

* The real valued vector representation for words can be learned while training the neural network. We can do this in the Keras deep learning library using the Embedding layer.

In [44]:
# load vocab doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text
 
# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

### we need to load all of the training data movie reviews. updated_clean_doc

In [56]:
def updated_clean_doc(df,vocab):
  corpus=[]

  for i in range(0,len(df)):
    review = df['review'][i]
    review = review.lower()
    review = review.split()
    
    # filter out tokens not in vocab
    review = [w for w in review  if w in vocab]

    review = ' '.join(review)
    corpus.append(review)

  return corpus

In [58]:
# load training reviews
train_review=updated_clean_doc(df,vocab)

* The next step is to encode each document as a `sequence of integers`.

* The `Keras Embedding` layer requires integer inputs where each integer maps to a single token that has a specific real-valued vector representation within the embedding. These vectors are random at the beginning of training, but during training become meaningful to the network.

* We can encode the training documents as sequences of integers using the `Tokenizer` class in the Keras API.

In [60]:
# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(train_review)

# encode the reviews in the training dataset. We can do that by calling the texts_to_sequences()
# function on the Tokenizer.

# sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_review)
	
# pad sequences
max_length = max([len(s.split()) for s in train_review])
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [61]:
# Define the training labels
ytrain=array(df['sentiment'])

## Load test dataset

In [63]:
df_test=pd.read_csv('/content/sample_data/movie_review_test.csv')
df_test.head()

Unnamed: 0,review,sentiment
0,Nothing is sacred. Just ask Ernie Fosselius. T...,1
1,This film really used its locations well with ...,1
2,Strangely enough this movie never made it to t...,1
3,"In Nordestina, a village in the middle of nowh...",1
4,"""I like cheap perfume better; it doesn't last ...",1


In [70]:
# preprocessing with test data
test_review=updated_clean_doc(df_test,vocab)
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(test_review)
# pad sequences
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define the test labels
ytest=array(df_test['sentiment'])

## We are now ready to define our neural network model.

* The model will use an Embedding layer as the first hidden layer. 
* The Embedding requires the specification of the vocabulary size, the size of the real-valued vector space, and the maximum length of input documents.

In [66]:
# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1

The complete model definition is listed below including the Embedding layer.

* We use a Convolutional Neural Network (CNN) as they have proven to be successful at document classification problems.

* the 2D output from the CNN part of the model is flattened to one long 2D vector to represent the ‘features’ extracted by the CNN. 
* The back-end of the model is a standard Multilayer Perceptron layers to interpret the CNN features.
* The output layer uses a sigmoid activation function to output a value between 0 and 1 for the negative and positive sentiment in the review.

In [67]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 486, 100)          883500    
_________________________________________________________________
conv1d (Conv1D)              (None, 479, 32)           25632     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 239, 32)           0         
_________________________________________________________________
flatten (Flatten)            (None, 7648)              0         
_________________________________________________________________
dense (Dense)                (None, 10)                76490     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 985,633
Trainable params: 985,633
Non-trainable params: 0
________________________________________________

## Next, we fit the network on the training data.

* We use a binary cross entropy loss function because the problem we are learning is a binary classification problem. 
* The efficient Adam implementation of stochastic gradient descent is used and we keep track of accuracy in addition to loss during training. 
* The model is trained for 10 epochs, or 10 passes through the training data.

In [68]:
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(Xtrain, ytrain, epochs=10, verbose=2)

Epoch 1/10
32/32 - 3s - loss: 0.6959 - accuracy: 0.4880
Epoch 2/10
32/32 - 3s - loss: 0.6377 - accuracy: 0.6920
Epoch 3/10
32/32 - 3s - loss: 0.3684 - accuracy: 0.9500
Epoch 4/10
32/32 - 3s - loss: 0.0666 - accuracy: 0.9940
Epoch 5/10
32/32 - 3s - loss: 0.0100 - accuracy: 1.0000
Epoch 6/10
32/32 - 3s - loss: 0.0040 - accuracy: 1.0000
Epoch 7/10
32/32 - 3s - loss: 0.0023 - accuracy: 1.0000
Epoch 8/10
32/32 - 3s - loss: 0.0016 - accuracy: 1.0000
Epoch 9/10
32/32 - 3s - loss: 0.0012 - accuracy: 1.0000
Epoch 10/10
32/32 - 3s - loss: 8.3461e-04 - accuracy: 1.0000


<tensorflow.python.keras.callbacks.History at 0x7ff1632e9cf8>

After the model is fit, it is evaluated on the test dataset. This dataset contains words that we have not seen before and reviews not seen during training.

In [71]:
# evaluate
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 76.400000
