In [1]:
import urllib.request
import os
import tarfile
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filepath = "../Dataset/IMDB/aclImdb_v1.tar.gz"
if not os.path.isfile(filepath):
    result = urllib.request.urlretrieve(url, filepath)
    print('download: ', result)
    
if not os.path.exists("../Dataset/IMDB/aclImdb"):
    tfile = tarfile.open("../Dataset/IMDB/aclImdb_v1.tar.gz", 'r:gz')
    result = tfile.extractall('../Dataset/IMDB/')

In [2]:
import re
def rm_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)

def read_files(filetype):
    path = "../Dataset/IMDB/aclImdb/"
    file_list = []
    
    positive_path = path + filetype + "/pos/"
    for f in os.listdir(positive_path):
        file_list += [positive_path + f]
        
    negative_path = path + filetype + "/neg/"
    for f in os.listdir(negative_path):
        file_list += [negative_path + f]
        
    print('read', filetype, 'files: ', len(file_list))
    
    all_labels = ([1] * 12500 + [0] * 12500)    
    all_texts = []
    
    for fi in file_list:
        with open(fi, encoding = 'utf8') as file_input:
            all_texts += [rm_tags(" ".join(file_input.readlines()))]
            
    return all_labels, all_texts

In [3]:
y_train, train_text = read_files("train")
y_test, test_text = read_files("test")
print (train_text[0])
print (y_train[0])

read train files:  25000
read test files:  25000
For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan "The Skipper" Hale jr. as a police Sgt.
1


In [4]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
token = Tokenizer(num_words = 4000)
token.fit_on_texts(train_text)
print(token.document_count)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


25000


In [5]:
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)
x_train = sequence.pad_sequences(x_train_seq, maxlen = 400)
x_test = sequence.pad_sequences(x_test_seq, maxlen = 400)

print('Before pad_sequences length=', len(x_train_seq[0]))
print(x_train_seq[0])

print('After pad_sequences length=', len(x_train[0]))
print(x_train[0])

Before pad_sequences length= 43
[14, 3, 16, 11, 210, 53, 1157, 46, 248, 22, 3, 172, 4, 902, 3558, 14, 10, 1524, 833, 3, 16, 117, 912, 6, 161, 158, 6, 3, 132, 1, 105, 6, 31, 1551, 2030, 102, 14, 1604, 1, 1787, 13, 3, 564]
After pad_sequences length= 400
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0  

# Multilayer Perceptron

In [6]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding

model = Sequential()
model.add(Embedding(output_dim = 32, input_dim = 4000, input_length = 400))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(units = 256, activation = 'relu'))
model.add(Dropout(0.35))
model.add(Dense(units = 1, activation = 'sigmoid'))
model.summary()
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
train_history = model.fit(x_train, y_train, batch_size = 100, epochs = 10, verbose = 2, validation_split = 0.2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 400, 32)           128000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 400, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 12800)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               3277056   
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 3,405,313
Trainable params: 3,405,313
Non-trainable params: 0
_________________________________________________________________


In [16]:
scores = model.evaluate(x_test, y_test, verbose = 1)
print(scores[0])
predict = model.predict_classes(x_test)
print(predict[:10])
predict_classes = predict.reshape(-1)
print(predict_classes[:10])

0.8552


In [8]:
SentimentDict = {1: 'positive', 0: 'negative'}
def display_test_Sentiment(idx):
    print(test_text[idx])
    print('Label: ', SentimentDict[y_test[idx]], 
          'Prediction: ', SentimentDict[predict_classes[idx]])

display_test_Sentiment(2)

I really like this show. It has drama, romance, and comedy all rolled into one. I am 28 and I am a married mother, so I can identify both with Lorelei's and Rory's experiences in the show. I have been watching mostly the repeats on the Family Channel lately, so I am not up-to-date on what is going on now. I think females would like this show more than males, but I know some men out there would enjoy it! I really like that is an hour long and not a half hour, as th hour seems to fly by when I am watching it! Give it a chance if you have never seen the show! I think Lorelei and Luke are my favorite characters on the show though, mainly because of the way they are with one another. How could you not see something was there (or take that long to see it I guess I should say)? Happy viewing!
Label:  positive Prediction:  positive


In [9]:
input_text = "Infinity War was hyped up to be an incredible spectacle, but although there were some incredible moments, especially with most one-liners being absolutely on point, some parts feel unnecessary; some drag on somewhat (Vision is a boring chap); and the whole film, especially the start, feels overtly Star Wars-y, which really confuses the viewing experience of a Marvel film, and leaves it overall as acceptably good, but not great."
input_seq = token.texts_to_sequences([input_text])
print(input_seq[0])
print(len(input_seq[0]))

pad_input_seq = sequence.pad_sequences(input_seq, maxlen = 400)
print(len(pad_input_seq[0]))

predict_result = model.predict_classes(pad_input_seq)
predict_result[0][0]

print(SentimentDict[predict_result[0][0]])

[321, 12, 52, 5, 25, 31, 1044, 17, 258, 46, 67, 45, 1044, 384, 257, 15, 87, 27, 2431, 108, 423, 19, 209, 45, 527, 230, 1745, 45, 2382, 19, 639, 1768, 6, 3, 354, 2, 1, 222, 18, 257, 1, 376, 760, 319, 1643, 59, 62, 1, 823, 581, 4, 3, 18, 2, 885, 8, 443, 13, 48, 17, 20, 83]
62
400
positive


In [10]:
def predict_review(input_text):
    input_seq = token.texts_to_sequences([input_text])
    pad_input_seq = sequence.pad_sequences(input_seq, maxlen = 400)
    predict_result = model.predict_classes(pad_input_seq)
    print(SentimentDict[predict_result[0][0]])
    
predict_review(input_text)

positive


# Recurrent Neural Network

In [11]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN

model = Sequential()
model.add(Embedding(output_dim = 32, input_dim = 4000, input_length = 400))
model.add(Dropout(0.35))
model.add(SimpleRNN(units = 16)) #number of RNN layers
model.add(Dense(units = 256, activation = 'relu'))
model.add(Dropout(0.35))
model.add(Dense(units = 1, activation = 'sigmoid'))
model.summary()
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
train_history = model.fit(x_train, y_train, batch_size = 100, epochs = 10, verbose = 2, validation_split = 0.2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 400, 32)           128000    
_________________________________________________________________
dropout_3 (Dropout)          (None, 400, 32)           0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 16)                784       
_________________________________________________________________
dense_3 (Dense)              (None, 256)               4352      
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 257       
Total params: 133,393
Trainable params: 133,393
Non-trainable params: 0
_________________________________________________________________
Trai

In [12]:
scores = model.evaluate(x_test, y_test, verbose = 1)
scores[1]



0.8358

In [13]:
#add LSTM layer

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM

model = Sequential()
model.add(Embedding(output_dim = 32, input_dim = 4000, input_length = 400))
model.add(Dropout(0.35))
model.add(LSTM(units = 16))
model.add(Dense(units = 256, activation = 'relu'))
model.add(Dropout(0.35))
model.add(Dense(units = 1, activation = 'sigmoid'))
model.summary()
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
train_history = model.fit(x_train, y_train, batch_size = 100, epochs = 10, verbose = 2, validation_split = 0.2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 400, 32)           128000    
_________________________________________________________________
dropout_5 (Dropout)          (None, 400, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 16)                3136      
_________________________________________________________________
dense_5 (Dense)              (None, 256)               4352      
_________________________________________________________________
dropout_6 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 257       
Total params: 135,745
Trainable params: 135,745
Non-trainable params: 0
_________________________________________________________________
Trai

In [14]:
scores = model.evaluate(x_test, y_test, verbose = 1)
scores[1]



0.8552