Example adapted from: https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

Required installs:
1. jupyter notebooks
2. keras 
3. tensorflow

My install process was:
1. follow instructions for python virtual environment (Virtualenv) install at https://www.tensorflow.org/install/
2. install keras python env using https://keras.io/#installation
3. install jupyter notebooks (http://jupyter.org/install) and set up a tensorflow kernel that uses the virtualenv set up above. 
4. start jupyter notebooks in a parent directory of this notebook and open this notebook. Make sure the Tensorflow Virtualenv jupyter kernel is active when running the notebook.
The logs of my install are at: https://www.evernote.com/l/ACtXalW9qSpOVZOUU04V2ATOmJOvw4Ffido

In [1]:
import numpy as np
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
# NOTE(review): this seeds only NumPy's global generator. The TensorFlow
# backend presumably keeps its own random state, so training results may
# still vary between runs -- confirm before relying on exact numbers.
np.random.seed(7)

Using TensorFlow backend.
  return f(*args, **kwds)


# Load IMDB Dataset

In [2]:
# Load the IMDB reviews as sequences of word IDs. Only the `top_words` most
# frequent words keep their own ID; the rest are replaced by `oov_char`.
# docs at: https://www.tensorflow.org/api_docs/python/tf/keras/datasets/imdb/load_data
top_words = 5000   # vocabulary size kept by the loader
start_char = 1     # ID that marks the start of every review
oov_char = 2       # ID substituted for out-of-vocabulary words
index_from = 3     # offset added to the raw word-index values

train_split, test_split = imdb.load_data(
    num_words=top_words,
    start_char=start_char,
    oov_char=oov_char,
    index_from=index_from,
)
X_train, y_train = train_split
X_test, y_test = test_split

In [3]:
# Show the shape of the training reviews, then of the training labels.
for split in (X_train, y_train):
    print(split.shape)

(25000,)
(25000,)


In [4]:
# Show the length of the first two training reviews.
for review in X_train[:2]:
    print(len(review))

218
189


In [5]:
# Show the shape of the test reviews, then of the test labels.
for split in (X_test, y_test):
    print(split.shape)

(25000,)
(25000,)


In [26]:
X_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

## Pad sequences so they are all the same length (required by keras/tensorflow).

In [6]:
# Bring every review to exactly `max_review_length` word IDs: longer reviews
# are truncated and shorter ones are padded by pad_sequences.
max_review_length = 500
X_train, X_test = (
    sequence.pad_sequences(split, maxlen=max_review_length)
    for split in (X_train, X_test)
)

In [7]:
# Show the shape of the padded training reviews, then of the labels.
for split in (X_train, y_train):
    print(split.shape)

(25000, 500)
(25000,)


In [8]:
# Show the length of the first two training reviews after padding.
for review in X_train[:2]:
    print(len(review))

500
500


In [9]:
# Show the shape of the padded test reviews, then of the test labels.
for split in (X_test, y_test):
    print(split.shape)

(25000, 500)
(25000,)


In [10]:
X_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [11]:
y_train[0:20]  # first 20 sentiment labels

array([1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1])

# Setup Vocabulary Dictionary
The index value loaded differs from the dictionary value by "index_from" so that special characters for padding, start of sentence, and out of vocabulary can be prepended to the start of the vocabulary.

In [12]:
# word_index maps each word to its raw frequency-rank value. The IDs in the
# loaded data are those values shifted by `index_from`.
word_index = imdb.get_word_index()

# Inverse lookup table: word ID -> word. `np.object` was removed from NumPy
# (deprecated in 1.20, gone in 1.24), so the builtin `object` is used as dtype.
# The "+3" keeps the original head-room beyond the largest shifted ID.
inv_word_index = np.empty(len(word_index) + index_from + 3, dtype=object)
for word, rank in word_index.items():
    inv_word_index[rank + index_from] = word

# Special tokens occupy the low IDs. Slots that are never assigned (such as
# ID 3 with the settings used here) stay None.
inv_word_index[0] = '<pad>'
inv_word_index[start_char] = '<start>'
inv_word_index[oov_char] = '<oov>'

In [13]:
word_index['ai']

16942

In [14]:
inv_word_index[16942+index_from]

'ai'

In [15]:
inv_word_index[:50]

array(['<pad>', '<start>', '<oov>', None, 'the', 'and', 'a', 'of', 'to',
       'is', 'br', 'in', 'it', 'i', 'this', 'that', 'was', 'as', 'for',
       'with', 'movie', 'but', 'film', 'on', 'not', 'you', 'are', 'his',
       'have', 'he', 'be', 'one', 'all', 'at', 'by', 'an', 'they', 'who',
       'so', 'from', 'like', 'her', 'or', 'just', 'about', "it's", 'out',
       'has', 'if', 'some'], dtype=object)

# Convert Encoded Sentences to Readable Text

In [16]:
def toText(wordIDs, vocab=None):
    """Decode a sequence of word IDs into readable, space-separated text.

    Args:
        wordIDs: iterable of integer word IDs. ID 0 is padding and is skipped.
        vocab: optional indexable lookup from word ID to token. When omitted,
            the notebook-level ``inv_word_index`` table is used.

    Returns:
        The decoded tokens, each followed by one space (so any non-empty
        result ends with a trailing space), or '' if nothing but padding
        was supplied.
    """
    if vocab is None:
        vocab = inv_word_index
    # Iterate the IDs directly and join once instead of indexing by position
    # and growing the string with repeated `+=`.
    return ''.join(str(vocab[word_id]) + ' ' for word_id in wordIDs if word_id != 0)

In [17]:
# Print the first five training reviews as text, each preceded by a blank
# line and its sentiment label (0 is negative, anything else positive).
for i in range(5):
    label = 'positive' if y_train[i] != 0 else 'negative'
    print()
    print(str(i) + ') sentiment = ' + label)
    print(toText(X_train[i]))


0) sentiment = positive
<start> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <oov> is an amazing actor and now the same being director <oov> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <oov> and would recommend it to everyone to watch and the fly <oov> was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also <oov> to the two little <oov> that played the <oov> of norman and paul they were just brilliant children are often left out of the <oov> list i think because the stars that play them all grown up are such a big <oov> for the whole film but these children are amazing and shou

# Build the model
[Sequential guide, compile() and fit()](https://keras.io/getting-started/sequential-model-guide/) 

[Embedding](https://keras.io/layers/embeddings/) The embedding layer works like an efficient one-hot encoding of the word index followed by a dense layer of size embedding_vector_length.

[LSTM (middle of page)](https://keras.io/layers/recurrent/)

[Dense](https://keras.io/layers/core/)

`model.compile(...)` sets up the "adam" optimizer, which is similar to SGD but adds gradient averaging that acts like a larger batch size, reducing the variability in the gradient from one small batch to the next. Each SGD step uses batch_size training records. Adam is also a variant of the momentum optimizers.

'binary_crossentropy' is the loss function used most often with logistic regression; it is equivalent to softmax cross-entropy with only two classes.

In the "Output Shape" column, None stands for a dimension whose size is not yet known: the variable number of training records to be supplied later.

In [18]:
# Size of the dense vector learned for each word ID.
embedding_vector_length = 32

# Embedding -> GRU -> single sigmoid unit, giving one score in [0, 1] per
# review.
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))
# To compare recurrent cells, replace the GRU below with LSTM(100).
model.add(GRU(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
gru_1 (GRU)                  (None, 100)               39900     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 200,001
Trainable params: 200,001
Non-trainable params: 0
_________________________________________________________________
None


# Train the Model

Each epoch takes about 3 min. You can reduce the epochs to 3 for a faster build and still get good accuracy. Overfitting starts to happen at epoch 8 or 9.

Note: you can run this cell multiple times to add more epochs to the model training.


In [19]:
# Train for 7 epochs in mini-batches of 64 reviews. The test split is passed
# as validation data, so it is scored at the end of each epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=7,
    batch_size=64,
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x115e19a90>

# Accuracy on the Test Set

In [20]:
# evaluate() returns the loss followed by the compiled metrics, so index 1
# holds the accuracy requested in model.compile().
scores = model.evaluate(X_test, y_test, verbose=0)
accuracy_percent = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_percent)


Accuracy: 88.32%


# Evaluate on Custom Text

In [21]:
import re

# A token is any run of characters that are not whitespace, digits, or one of
# the listed punctuation marks. Apostrophes are kept, so contractions such as
# "something's" stay in one piece.
words_only = r'[^\s!,.?\-":;0-9]+'

# Quick check of the tokenizer on a small sample.
sample_text = "Some text to, tokenize. something's.Something-else?"
re.findall(words_only, sample_text.lower())

['some', 'text', 'to', 'tokenize', "something's", 'something', 'else']

In [22]:
def encode(reviewText):
    """Convert raw review text into the word-ID list the model was trained on.

    The text is lower-cased and tokenized with the ``words_only`` pattern.
    Each token is looked up in ``word_index`` and shifted by ``index_from``.
    Unknown words, and words outside the ``top_words`` vocabulary, become
    ``oov_char``.

    Args:
        reviewText: the review as a plain string.

    Returns:
        A list of integer IDs beginning with ``start_char``.
    """
    reviewIDs = [start_char]
    for word in re.findall(words_only, reviewText.lower()):
        rank = word_index.get(word)
        if rank is None:
            # Word never seen in the IMDB vocabulary.
            index = oov_char
        else:
            index = rank + index_from
            # Valid IDs are strictly below top_words: load_data(num_words=...)
            # drops everything else and the Embedding layer has exactly
            # top_words rows. The previous `>` test let ID == top_words
            # through, which is out of range for the embedding.
            if index >= top_words:
                index = oov_char
        reviewIDs.append(index)
    return reviewIDs

# Round-trip a sample sentence: encode it to IDs, then decode it back to text
# to see which words survive and which fall back to <oov>.
demo_ids = encode('To code and back again. ikkyikyptangzooboing ni !!')
toText(demo_ids)


'<start> to code and back again <oov> <oov> '

In [23]:
# Free-text reviews to score with the trained model.
# reviews from:
#   https://www.pluggedin.com/movie-reviews/solo-a-star-wars-story
#   http://badmovie-badreview.com/category/bad-reviews/
user_reviews = [
    "This movie is horrible",
    "This wasn't a horrible movie",
    "This movie was great.",
    "What a waste of time. It was too long and didn't make any sense.",
    "This was boring and drab.",
    "I liked the movie.",
    "I didn't like the movie.",
    "I like the lead actor but the movie as a whole fell flat",
    "There are definitely heroic seeds at our favorite space scoundrel's core, though, seeds that simply need a little life experience to nurture them to growth. And that's exactly what this swooping heist tale is all about. You get a yarn filled with romance, high-stakes gambits, flashy sidekicks, a spunky robot and a whole lot of who's-going-to-outfox-who intrigue. Ultimately, it's the kind of colorful adventure that one could imagine Harrison Ford's version of Han recalling with a great deal of flourish … and a twinkle in his eye.",
    "There are times to be politically correct and there are times to write things about midget movies, and I’m afraid that sharing Ankle Biters with the wider world is an impossible task without taking the low road, so to speak. There are horrible reasons for this, all of them the direct result of the midgets that this film contains, which makes it sound like I am blaming midgets for my inability to regulate my own moral temperament but I like to think I am a…big…enough person (geddit?) to admit that the problem rests with me, and not the disabled.",
    "While Beowulf didn’t really remind me much of Beowulf, it did reminded me of something else. At first I thought it was Van Helsing, but that just wasn’t it. It only hit me when Beowulf finally told his backstory and suddenly even the dumbest of the dumb will realise that this is a simple ripoff of Blade. The badass hero, who is actually born from evil, now wants to destroy it, while he apparently has to fight his urges to become evil himself (not that it is mentioned beyond a single reference at the end of Beowulf) and even the music fits into the same range. Sadly Beowulf is not even nearly as interesting or entertaining as its role model. The only good aspects I can see in Beowulf would be the stupid beginning and Christopher Lamberts hair. But after those first 10 minutes, the movie becomes just boring and you don’t care much anymore.",
]

# Encode every review to its ID list. The lists have different lengths, so the
# array must be created with dtype=object: NumPy 1.24+ raises ValueError when
# asked to build an array from ragged nested sequences without it.
X_user = np.array([encode(review) for review in user_reviews], dtype=object)
X_user


array([list([1, 14, 20, 9, 527]), list([1, 14, 286, 6, 527, 20]),
       list([1, 14, 20, 16, 87]),
       list([1, 51, 6, 437, 7, 58, 12, 16, 99, 196, 5, 161, 97, 101, 281]),
       list([1, 14, 16, 357, 5, 2]), list([1, 13, 423, 4, 20]),
       list([1, 13, 161, 40, 4, 20]),
       list([1, 13, 40, 4, 485, 284, 21, 4, 20, 17, 6, 226, 1583, 1035]),
       list([1, 50, 26, 407, 3818, 2, 33, 263, 514, 834, 2, 2026, 151, 2, 15, 331, 359, 6, 117, 113, 585, 8, 2, 98, 8, 2, 5, 198, 618, 51, 14, 2, 2, 787, 9, 32, 44, 25, 79, 6, 2, 1061, 19, 883, 312, 2, 2, 2, 2, 6, 2, 2362, 5, 6, 226, 176, 7, 871, 170, 8, 2, 37, 4030, 1116, 45, 4, 243, 7, 3221, 1154, 15, 31, 100, 838, 2, 2, 310, 7, 2, 2, 19, 6, 87, 855, 7, 2, 2, 5, 6, 2, 11, 27, 744]),
       list([1, 50, 26, 211, 8, 30, 4103, 2296, 5, 50, 26, 211, 8, 901, 183, 44, 2, 102, 5, 2, 1595, 15, 2, 2, 2, 19, 4, 2, 182, 9, 35, 1167, 2790, 209, 656, 4, 364, 1320, 38, 8, 1128, 50, 26, 527, 1007, 18, 14, 32, 7, 98, 4, 1504, 959, 7, 4, 2, 15, 14, 22, 13

In [24]:
# Pad or truncate the user reviews to the same fixed length used for the
# training data so they can be fed to the model.
X_user_pad = sequence.pad_sequences(
    X_user,
    maxlen=max_review_length,
)
X_user_pad

array([[   0,    0,    0, ...,   20,    9,  527],
       [   0,    0,    0, ...,    6,  527,   20],
       [   0,    0,    0, ...,   20,   16,   87],
       ..., 
       [   0,    0,    0, ...,   11,   27,  744],
       [   0,    0,    0, ...,   24,    4,    2],
       [   0,    0,    0, ...,  459,   76, 1627]], dtype=int32)

In [25]:
# Score every padded user review. Each prediction row holds one sigmoid
# output.
user_scores = model.predict(X_user_pad)

# Print each score, to two decimals, next to the review text it belongs to.
for score_row, review in zip(user_scores, user_reviews):
    print('\n%.2f' % (score_row[0]) + ' ' + review)


0.03 This movie is horrible

0.04 This wasn't a horrible movie

0.95 This movie was great.

0.01 What a waste of time. It was too long and didn't make any sense.

0.02 This was boring and drab.

0.96 I liked the movie.

0.35 I didn't like the movie.

0.27 I like the lead actor but the movie as a whole fell flat

1.00 There are definitely heroic seeds at our favorite space scoundrel's core, though, seeds that simply need a little life experience to nurture them to growth. And that's exactly what this swooping heist tale is all about. You get a yarn filled with romance, high-stakes gambits, flashy sidekicks, a spunky robot and a whole lot of who's-going-to-outfox-who intrigue. Ultimately, it's the kind of colorful adventure that one could imagine Harrison Ford's version of Han recalling with a great deal of flourish … and a twinkle in his eye.

0.02 There are times to be politically correct and there are times to write things about midget movies, and I’m afraid that sharing Ankle Biters