## Predict Movie Review Sentiment Positive/Negative

### Inspiration: https://machinelearningmastery.com/develop-word-embedding-model-predicting-movie-review-sentiment/

### Data: http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz


In [1]:
from gensim.models import Word2Vec
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from sklearn.model_selection import train_test_split
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

Using TensorFlow backend.


## Keras Tokenizer

In [3]:
# Two toy documents used to show how the Keras Tokenizer builds and applies a vocabulary.
doc1 = ["Texas is the    second-largest U.S. state, after Alaska, with an area of 268,820 square miles (696,200 km2)."]
doc2 = ["The name Texas, based on the Caddo word táyshaʼ (/t'ajʃaʔ/) 'friend', was applied, in the spelling Tejas or Texas, \
[17] by the Spanish to the Caddo themselves, specifically the Hasinai Confederacy,[18] the final -s representing the Spanish plural."]

# The full passage is simply both documents separated by one space.
text = " ".join(doc1 + doc2)

# Size limit handed to the tokenizer as num_words.
top_used = 4
print("top_used", top_used)

# initialize tokenizer
tk = Tokenizer(
    num_words=top_used,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    char_level=False,
    oov_token='<unk>',
)

# fit tokenizer on both documents
tk.fit_on_texts(doc1 + doc2)

## **Key Step to fix Tokenizer**
# Keep only entries whose index is <= top_used (the tokenizer is 1-indexed),
# then move the out-of-vocabulary token to index top_used + 1.
kept_vocab = {word: idx for word, idx in tk.word_index.items() if idx <= top_used}
kept_vocab[tk.oov_token] = top_used + 1
tk.word_index = kept_vocab

# Show the vocabulary ordered by index (ties broken by word).
sorted_words = dict(sorted(tk.word_index.items(), key=lambda pair: (pair[1], pair[0])))
print(sorted_words)

# Encode an unseen sentence: words outside the kept vocabulary get the OOV index.
test = ["I live in Texas"]
encoded = tk.texts_to_sequences(test)
print(test)
print(encoded)

# Right-pad with zeros up to five tokens.
hot_emb = pad_sequences(encoded, maxlen=5, padding='post')
print(hot_emb)

top_used 4
{'the': 2, 'texas': 3, 's': 4, '<unk>': 5}
['I live in Texas']
[[5, 5, 5, 3]]
[[5 5 5 3 0]]


In [4]:
""" CLEAN TEXT FUNCTION """

def clean_text(text):
    """Reduce raw text to space-separated ASCII alphanumeric tokens.

    Steps, in order:
      1. Unicode-decompose (NFD) and drop every non-ASCII code point, so an
         accented letter keeps its base letter.
      2. Remove thousands separators, i.e. commas sitting directly between
         two digits ("268,820" -> "268820").
      3. Replace every remaining punctuation/symbol character with a space.
      4. Collapse runs of whitespace and trim both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; empty when no ASCII letter or digit survives.
    """
    import re
    from unicodedata import normalize

    number_handler = re.compile(r'(?<=\d),(?=\d)')
    abreviation = re.compile('[^a-zA-Z0-9-_.]')
    punct_re = re.compile('[{}]'.format(re.escape('!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~-')))

    # normalize unicode
    clean = normalize('NFD', text).encode('ascii', 'ignore').decode('ascii')

    # Join digit groups first: the passes below turn every comma into a space,
    # after which the separator pattern could never match.
    clean = number_handler.sub('', clean)

    # remove punctuation: first everything except letters, digits, '-', '_' and '.',
    # then the remaining punctuation characters as well
    clean = abreviation.sub(' ', clean)
    clean = punct_re.sub(' ', clean)

    # remove any double whitespace
    return ' '.join(clean.split())


""" PASS THE FOLDER PATH """

def parse_folder(path):
    """Read, clean and tokenize every file found directly inside a folder.

    Args:
        path: Directory holding one text document per file. A trailing path
            separator is optional.

    Returns:
        A list with one entry per file, each entry being the list of tokens
        produced by word_tokenize from the cleaned file contents. Files are
        visited in reverse-sorted filename order.
    """
    import os
    import nltk
    from nltk.tokenize import word_tokenize

    # fetch the 'punkt' tokenizer data; this is attempted on every call
    nltk.download('punkt')

    text_data = []
    for file in sorted(os.listdir(path), reverse=True):
        # os.path.join works whether or not `path` ends with a separator
        file_path = os.path.join(path, file)
        # skip sub-directories and other non-file entries: they cannot be read as text
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r') as f:
            # read text file
            text = f.read()
        # clean text data
        text = clean_text(text)
        # tokenize text
        words = word_tokenize(text)
        # append to data
        text_data.append(words)
    return text_data

In [5]:
# root folder of the extracted review files
path = "./review_polarity/txt_sentoken/"

# tokenized reviews, one list of words per file, for each class
pos_rev = parse_folder(path=path + "pos/")
neg_rev = parse_folder(path=path + "neg/")

# input data: all positive reviews followed by all negative reviews
X = pos_rev + neg_rev
# output data: label 1 for each positive review, 0 for each negative one
y = [1] * len(pos_rev) + [0] * len(neg_rev)

# hold out 15% of the reviews for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

print("Train: ", len(y_train))
print("Test: ", len(y_test))

[nltk_data] Downloading package punkt to /home/gm0234/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/gm0234/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Train:  1700
Test:  300


In [7]:
# Size limit handed to the tokenizer as num_words.
top_used = 2000

# initialize tokenizer
tk = Tokenizer(
    num_words=top_used,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    char_level=False,
    oov_token='<unk>',
)

# fit tokenizer one review at a time, re-joining each token list into a single string
n_reviews = len(X_train)
for idx, tokens in enumerate(X_train):
    tk.fit_on_texts([' '.join(tokens)])
    # progress feedback every 100 reviews
    if not idx % 100:
        print(idx, '/', n_reviews)

## **Key Step to fix Tokenizer**
# Keep only entries whose index is <= top_used (the tokenizer is 1-indexed),
# then move the out-of-vocabulary token to index top_used + 1.
kept_vocab = {word: rank for word, rank in tk.word_index.items() if rank <= top_used}
kept_vocab[tk.oov_token] = top_used + 1
tk.word_index = kept_vocab

0 / 1700
100 / 1700
200 / 1700
300 / 1700
400 / 1700
500 / 1700
600 / 1700
700 / 1700
800 / 1700
900 / 1700
1000 / 1700
1100 / 1700
1200 / 1700
1300 / 1700
1400 / 1700
1500 / 1700
1600 / 1700


In [8]:
# Vocabulary ordered by index (ties broken alphabetically by word).
sorted_words = dict(sorted(tk.word_index.items(), key=lambda pair: (pair[1], pair[0])))
print(sorted_words)

{'the': 2, 'a': 3, 'and': 4, 'of': 5, 'to': 6, 'is': 7, 'in': 8, 's': 9, 'it': 10, 'that': 11, 'as': 12, 'with': 13, 'for': 14, 'this': 15, 'film': 16, 'his': 17, 'i': 18, 'he': 19, 'but': 20, 'on': 21, 'are': 22, 't': 23, 'be': 24, 'by': 25, 'one': 26, 'not': 27, 'movie': 28, 'an': 29, 'who': 30, 'you': 31, 'at': 32, 'from': 33, 'they': 34, 'have': 35, 'was': 36, 'has': 37, 'her': 38, 'all': 39, 'there': 40, 'like': 41, 'so': 42, 'out': 43, 'about': 44, 'up': 45, 'what': 46, 'more': 47, 'when': 48, 'she': 49, 'or': 50, 'which': 51, 'their': 52, 'can': 53, 'some': 54, 'just': 55, 'if': 56, 'we': 57, 'him': 58, 'into': 59, 'even': 60, 'no': 61, 'only': 62, 'than': 63, 'good': 64, 'time': 65, 'its': 66, 'most': 67, 'will': 68, 'story': 69, 'would': 70, 'been': 71, 'much': 72, 'character': 73, 'also': 74, 'get': 75, 'do': 76, 'other': 77, 'well': 78, 'characters': 79, 'them': 80, 'two': 81, 'very': 82, 'first': 83, 'see': 84, 'after': 85, 'because': 86, 'way': 87, 'make': 88, 'really': 89

In [9]:
# sequence encode: replace every token by its integer id from the fitted tokenizer
X_train, X_test = [tk.texts_to_sequences(split) for split in (X_train, X_test)]

# pad sequences: the longest training review sets the common length
max_length = max(map(len, X_train))
print("Max Length Review: ",max_length)

# zeros are appended at the end of shorter reviews ('post' padding)
X_train, X_test = [
    pad_sequences(split, maxlen=max_length, padding='post')
    for split in (X_train, X_test)
]

# vocabulary size: number of entries in the tokenizer's word index
vocab_size = len(tk.word_index)
print("Vocabulary size: ",vocab_size)

Max Length Review:  2462
Vocabulary size:  2000


In [10]:
# define model: embedding -> 1D convolution -> max pooling -> dense binary classifier
model = Sequential()
# The embedding needs one row per possible token id, including 0 used for padding.
# NOTE(review): with the OOV id moved to top_used + 1 the largest id appears to be
# vocab_size + 1, hence vocab_size + 2 rows; re-check if the tokenizer setup changes.
model.add(Embedding(vocab_size+2, 100, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
# single sigmoid unit: the labels are 0 (negative) / 1 (positive)
model.add(Dense(1, activation='sigmoid'))
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output
model.summary()
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2462, 100)         200200    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2455, 32)          25632     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1227, 32)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 39264)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                392650    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 618,493
Trainable params: 618,493
Non-trainable params: 0
_________________________________________________________________
None

In [11]:
# fit network - 2 epochs here, try 10 epochs
# The labels come out of train_test_split as a plain Python list; pass them as an
# array, since Keras documents a list as "one array per model output" and some
# versions reject a list of scalars.
model.fit(X_train, np.asarray(y_train), epochs=2, verbose=1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2aab718b2cc0>

In [12]:
# evaluate on the held-out reviews
# The labels are a plain Python list; pass them as an array, since Keras documents
# a list as "one array per model output" and some versions reject a list of scalars.
loss, acc = model.evaluate(X_test, np.asarray(y_test), verbose=0)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 66.000000
