In [1]:
import gzip
import pyprind
import pandas as pd
from string import punctuation
import re
import numpy as np
import os
from collections import Counter
import tensorflow as tf



  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
## Load the movie reviews CSV and preview its first three rows
## (the preview shows two columns: review, sentiment)
df = pd.read_csv('movie_data.csv', encoding='utf-8')
print(df.iloc[:3])

                                              review  sentiment
0  In 1974, the teenager Martha Moxley (Maggie Gr...          1
1  OK... so... I really like Kris Kristofferson a...          0
2  ***SPOILER*** Do not read this, if you think a...          0


In [3]:
## Preprocessing the data:
## Separate words and
## count each word's occurrence

## Translation table that wraps every punctuation character in spaces, so
## that str.split() yields each punctuation mark as its own token.
pad_punctuation = str.maketrans({c: ' ' + c + ' ' for c in punctuation})

counts = Counter()
cleaned_reviews = []
pbar = pyprind.ProgBar(len(df['review']),
                       title='Counting words occurences')
for review in df['review']:
    text = review.translate(pad_punctuation).lower()
    cleaned_reviews.append(text)
    pbar.update()
    counts.update(text.split())

## Write the cleaned text back with ONE positional column assignment.
## A per-row `df.loc[i, 'review'] = text` is label-based (it is only correct
## when the index is exactly 0..n-1) and is very slow on a frame this size.
df['review'] = cleaned_reviews

Counting words occurences
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:51


In [4]:
## Create a mapping:
## Map each unique word to an integer

## Vocabulary ordered from the most to the least frequent token
word_counts = [word for word, _ in counts.most_common()]
print(word_counts[:5])

## Ids start at 1, so the value 0 is never assigned to a word
word_to_int = dict(zip(word_counts, range(1, len(word_counts) + 1)))

## Encode every review as the list of ids of its whitespace-separated tokens
mapped_reviews = []
pbar = pyprind.ProgBar(len(df['review']),
                       title='Map reviews to ints')
for review in df['review']:
    encoded = list(map(word_to_int.__getitem__, review.split()))
    mapped_reviews.append(encoded)
    pbar.update()

Map reviews to ints


['the', '.', ',', 'and', 'a']


0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:03


In [5]:
## Define fixed-length sequences:
## Use the last 200 elements of each sequence
## if sequence length < 200: left-pad with zeros

sequence_length = 200  ## sequence length (or T in our formulas)
sequences = np.zeros((len(mapped_reviews), sequence_length), dtype=int)
for i, row in enumerate(mapped_reviews):
    tail = row[-sequence_length:]
    ## An empty review has nothing to copy: its row stays all zeros.
    ## (Without this guard, `-0:` would select the whole row and the
    ## assignment of an empty array would raise a broadcast error.)
    if tail:
        sequences[i, -len(tail):] = tail

## Split features and labels at the SAME positional boundary.
## `df.loc[:25000, ...]` is label-based and includes the end label, which
## returned 25001 labels for the 25000 training rows.
n_train = 25000
sentiment_labels = df['sentiment'].values

X_train = sequences[:n_train, :]
y_train = sentiment_labels[:n_train]
X_test = sequences[n_train:, :]
y_test = sentiment_labels[n_train:]

assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)


np.random.seed(123) # for reproducibility

## Function to generate minibatches:
def create_batch_generator(x, y=None, batch_size=64):
    """Yield consecutive mini-batches of exactly ``batch_size`` samples.

    Only complete batches are produced: trailing samples that do not fill
    a whole batch are dropped.

    Parameters
    ----------
    x : sequence supporting ``len()`` and slicing
        Input samples.
    y : sequence supporting slicing, optional
        Targets aligned with ``x``. When omitted, only inputs are yielded.
    batch_size : int
        Number of samples per batch.

    Yields
    ------
    ``x_batch`` when ``y`` is None, otherwise the tuple ``(x_batch, y_batch)``.
    """
    usable = (len(x) // batch_size) * batch_size
    x = x[:usable]
    window_starts = range(0, len(x), batch_size)

    if y is None:
        for start in window_starts:
            yield x[start:start + batch_size]
    else:
        y = y[:usable]
        for start in window_starts:
            stop = start + batch_size
            yield x[start:stop], y[start:stop]

In [46]:
# Re-seeds NumPy's global RNG with the same value already set in the
# sequence-preparation cell above.
np.random.seed(123) # for reproducibility

Building RNN

In [9]:
class SentimentRNN(object):
    """Many-to-one LSTM sentiment classifier (TensorFlow 1.x graph API).

    The graph is: embedding lookup -> stacked LSTM cells with output
    dropout -> dense layer with one unit applied to the last time step ->
    sigmoid. The model is trained with sigmoid cross-entropy and Adam.

    The placeholders have a fixed batch dimension, so both ``train`` and
    ``predict`` consume whole batches of ``batch_size`` samples only.
    """

    def __init__(self, n_words, seq_len=200,
                 lstm_size=256, num_layers=1, batch_size=64,
                 learning_rate=0.0001, embed_size=200):
        """Store the hyper-parameters and build the computation graph.

        Parameters
        ----------
        n_words : int
            Number of rows of the embedding matrix (vocabulary size).
        seq_len : int
            Length of every input sequence.
        lstm_size : int
            Number of hidden units per LSTM cell.
        num_layers : int
            Number of stacked LSTM cells.
        batch_size : int
            Fixed number of sequences per batch.
        learning_rate : float
            Learning rate passed to the Adam optimizer.
        embed_size : int
            Dimensionality of the word embeddings.
        """
        self.n_words = n_words
        self.seq_len = seq_len
        self.lstm_size = lstm_size   ## number of hidden units
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.embed_size = embed_size

        self.g = tf.Graph()
        with self.g.as_default():
            tf.set_random_seed(123)
            self.build()
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()

    def build(self):
        """Create all graph nodes.

        Tensors and ops are given explicit names ('tf_x', 'tf_y',
        'tf_keepprob', 'cost', 'train_op', 'probabilities', 'labels')
        because ``train`` and ``predict`` feed and fetch them by name.
        """
        ## Define the placeholders
        tf_x = tf.placeholder(tf.int32,
                    shape=(self.batch_size, self.seq_len),
                    name='tf_x')
        tf_y = tf.placeholder(tf.float32,
                    shape=(self.batch_size,),
                    name='tf_y')
        tf_keepprob = tf.placeholder(tf.float32,
                    name='tf_keepprob')
        ## Create the embedding layer
        embedding = tf.Variable(
                    tf.random_uniform(
                        (self.n_words, self.embed_size),
                        minval=-1, maxval=1),
                    name='embedding')
        embed_x = tf.nn.embedding_lookup(
                    embedding, tf_x,
                    name='embeded_x')

        ## Define LSTM cell and stack them together
        cells = tf.contrib.rnn.MultiRNNCell(
                [tf.contrib.rnn.DropoutWrapper(
                   tf.contrib.rnn.BasicLSTMCell(self.lstm_size),
                   output_keep_prob=tf_keepprob)
                 for _ in range(self.num_layers)])

        ## Define the initial state:
        self.initial_state = cells.zero_state(
                 self.batch_size, tf.float32)
        print('  << initial state >> ', self.initial_state)

        lstm_outputs, self.final_state = tf.nn.dynamic_rnn(
                 cells, embed_x,
                 initial_state=self.initial_state)
        ## Note: lstm_outputs shape:
        ##  [batch_size, max_time, cells.output_size]
        print('\n  << lstm_output   >> ', lstm_outputs)
        print('\n  << final state   >> ', self.final_state)

        ## Apply a FC layer on top of the RNN output of the last time step:
        logits = tf.layers.dense(
                 inputs=lstm_outputs[:, -1],
                 units=1, activation=None,
                 name='logits')

        ## Drop only the trailing unit axis: (batch_size, 1) -> (batch_size,).
        ## Squeezing every size-1 axis would also remove the batch axis when
        ## batch_size == 1 and break the shape match with tf_y.
        logits = tf.squeeze(logits, axis=[1], name='logits_squeezed')
        print ('\n  << logits        >> ', logits)

        y_proba = tf.nn.sigmoid(logits, name='probabilities')
        predictions = {
            'probabilities': y_proba,
            'labels' : tf.cast(tf.round(y_proba), tf.int32,
                 name='labels')
        }
        print('\n  << predictions   >> ', predictions)

        ## Define the cost function
        cost = tf.reduce_mean(
                 tf.nn.sigmoid_cross_entropy_with_logits(
                 labels=tf_y, logits=logits),
                 name='cost')

        ## Define the optimizer
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        train_op = optimizer.minimize(cost, name='train_op')

    def train(self, X_train, y_train, num_epochs):
        """Train the model and periodically checkpoint it.

        Variables are re-initialised at the start of every call. The LSTM
        state is reset to zeros at the start of each epoch and carried from
        one batch to the next within an epoch. Dropout keep probability is
        0.5. The loss is printed every 20 iterations, and a checkpoint is
        written to ``model/`` after every 10th epoch (so no checkpoint is
        written when ``num_epochs`` < 10).

        Parameters
        ----------
        X_train : array of shape (n_samples, seq_len)
            Integer-encoded sequences.
        y_train : array of shape (n_samples,)
            Binary targets.
        num_epochs : int
            Number of passes over the training data.
        """
        with tf.Session(graph=self.g) as sess:
            sess.run(self.init_op)
            iteration = 1
            for epoch in range(num_epochs):
                state = sess.run(self.initial_state)

                for batch_x, batch_y in create_batch_generator(
                            X_train, y_train, self.batch_size):
                    feed = {'tf_x:0': batch_x,
                            'tf_y:0': batch_y,
                            'tf_keepprob:0': 0.5,
                            self.initial_state : state}
                    loss, _, state = sess.run(
                            ['cost:0', 'train_op',
                             self.final_state],
                            feed_dict=feed)

                    if iteration % 20 == 0:
                        print("Epoch: %d/%d Iteration: %d "
                              "| Train loss: %.5f" % (
                               epoch + 1, num_epochs,
                               iteration, loss))

                    iteration +=1
                if (epoch+1)%10 == 0:
                    ## Make sure the checkpoint directory exists so that
                    ## saving works on a fresh checkout.
                    os.makedirs('model', exist_ok=True)
                    self.saver.save(sess,
                        "model/sentiment-%d.ckpt" % epoch)

    def predict(self, X_data, return_proba=False):
        """Predict with the most recent checkpoint found in ``model/``.

        Only whole batches are evaluated, so the result can contain fewer
        entries than ``X_data`` (callers should trim their labels to
        ``len(result)``). The LSTM state is carried from one batch to the
        next, and dropout is disabled (keep probability 1.0).

        Parameters
        ----------
        X_data : array of shape (n_samples, seq_len)
            Integer-encoded sequences.
        return_proba : bool
            If True return the sigmoid probabilities, otherwise the
            rounded integer class labels.

        Returns
        -------
        numpy.ndarray
            Concatenated per-batch predictions.
        """
        ## Fetch either the probabilities or the rounded labels
        target = 'probabilities:0' if return_proba else 'labels:0'
        preds = []
        with tf.Session(graph = self.g) as sess:
            self.saver.restore(
                sess, tf.train.latest_checkpoint('model/'))
            test_state = sess.run(self.initial_state)
            for batch_x in create_batch_generator(
                    X_data, None, batch_size=self.batch_size):
                feed = {'tf_x:0' : batch_x,
                        'tf_keepprob:0': 1.0,
                        self.initial_state : test_state}
                pred, test_state = sess.run(
                    [target, self.final_state],
                    feed_dict=feed)

                preds.append(pred)

        return np.concatenate(preds)

In [10]:
## Embedding rows needed: the largest word id plus one (row 0 is the extra one)
n_words = max(word_to_int.values()) + 1

In [11]:


## Instantiate the model; the constructor builds the TensorFlow graph
rnn = SentimentRNN(
    n_words=n_words,
    seq_len=sequence_length,
    lstm_size=128,
    num_layers=1,
    batch_size=100,
    learning_rate=0.0001,
    embed_size=256,
)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
  << initial state >>  (LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros_1:0' shape=(100, 128) dtype=float32>),)
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equiv

In [12]:
# Train for 40 epochs; train() prints the loss every 20 iterations and
# saves a checkpoint after every 10th epoch.
rnn.train(X_train, y_train, num_epochs=40)

Epoch: 1/40 Iteration: 20 | Train loss: 0.70256
Epoch: 1/40 Iteration: 40 | Train loss: 0.69832
Epoch: 1/40 Iteration: 60 | Train loss: 0.70363
Epoch: 1/40 Iteration: 80 | Train loss: 0.68928
Epoch: 1/40 Iteration: 100 | Train loss: 0.70398
Epoch: 1/40 Iteration: 120 | Train loss: 0.71784
Epoch: 1/40 Iteration: 140 | Train loss: 0.67034
Epoch: 1/40 Iteration: 160 | Train loss: 0.67440
Epoch: 1/40 Iteration: 180 | Train loss: 0.68110
Epoch: 1/40 Iteration: 200 | Train loss: 0.67271
Epoch: 1/40 Iteration: 220 | Train loss: 0.63612
Epoch: 1/40 Iteration: 240 | Train loss: 0.66422
Epoch: 2/40 Iteration: 260 | Train loss: 0.64080
Epoch: 2/40 Iteration: 280 | Train loss: 0.59376
Epoch: 2/40 Iteration: 300 | Train loss: 0.49504
Epoch: 2/40 Iteration: 320 | Train loss: 0.54293
Epoch: 2/40 Iteration: 340 | Train loss: 0.60307
Epoch: 2/40 Iteration: 360 | Train loss: 0.40400
Epoch: 2/40 Iteration: 380 | Train loss: 0.56816
Epoch: 2/40 Iteration: 400 | Train loss: 0.48348
Epoch: 2/40 Iteration: 4

Epoch: 14/40 Iteration: 3300 | Train loss: 0.13662
Epoch: 14/40 Iteration: 3320 | Train loss: 0.16117
Epoch: 14/40 Iteration: 3340 | Train loss: 0.06032
Epoch: 14/40 Iteration: 3360 | Train loss: 0.16183
Epoch: 14/40 Iteration: 3380 | Train loss: 0.21481
Epoch: 14/40 Iteration: 3400 | Train loss: 0.27538
Epoch: 14/40 Iteration: 3420 | Train loss: 0.20015
Epoch: 14/40 Iteration: 3440 | Train loss: 0.22401
Epoch: 14/40 Iteration: 3460 | Train loss: 0.23279
Epoch: 14/40 Iteration: 3480 | Train loss: 0.14454
Epoch: 14/40 Iteration: 3500 | Train loss: 0.06925
Epoch: 15/40 Iteration: 3520 | Train loss: 0.21045
Epoch: 15/40 Iteration: 3540 | Train loss: 0.14272
Epoch: 15/40 Iteration: 3560 | Train loss: 0.16046
Epoch: 15/40 Iteration: 3580 | Train loss: 0.17974
Epoch: 15/40 Iteration: 3600 | Train loss: 0.19338
Epoch: 15/40 Iteration: 3620 | Train loss: 0.16239
Epoch: 15/40 Iteration: 3640 | Train loss: 0.11042
Epoch: 15/40 Iteration: 3660 | Train loss: 0.13929
Epoch: 15/40 Iteration: 3680 | 

Epoch: 27/40 Iteration: 6520 | Train loss: 0.08153
Epoch: 27/40 Iteration: 6540 | Train loss: 0.02092
Epoch: 27/40 Iteration: 6560 | Train loss: 0.00898
Epoch: 27/40 Iteration: 6580 | Train loss: 0.09037
Epoch: 27/40 Iteration: 6600 | Train loss: 0.05836
Epoch: 27/40 Iteration: 6620 | Train loss: 0.06412
Epoch: 27/40 Iteration: 6640 | Train loss: 0.03503
Epoch: 27/40 Iteration: 6660 | Train loss: 0.03522
Epoch: 27/40 Iteration: 6680 | Train loss: 0.09101
Epoch: 27/40 Iteration: 6700 | Train loss: 0.02003
Epoch: 27/40 Iteration: 6720 | Train loss: 0.00767
Epoch: 27/40 Iteration: 6740 | Train loss: 0.10520
Epoch: 28/40 Iteration: 6760 | Train loss: 0.06725
Epoch: 28/40 Iteration: 6780 | Train loss: 0.04734
Epoch: 28/40 Iteration: 6800 | Train loss: 0.03642
Epoch: 28/40 Iteration: 6820 | Train loss: 0.01714
Epoch: 28/40 Iteration: 6840 | Train loss: 0.01212
Epoch: 28/40 Iteration: 6860 | Train loss: 0.02874
Epoch: 28/40 Iteration: 6880 | Train loss: 0.08180
Epoch: 28/40 Iteration: 6900 | 

Epoch: 39/40 Iteration: 9740 | Train loss: 0.00269
Epoch: 40/40 Iteration: 9760 | Train loss: 0.00222
Epoch: 40/40 Iteration: 9780 | Train loss: 0.00129
Epoch: 40/40 Iteration: 9800 | Train loss: 0.00473
Epoch: 40/40 Iteration: 9820 | Train loss: 0.00124
Epoch: 40/40 Iteration: 9840 | Train loss: 0.00367
Epoch: 40/40 Iteration: 9860 | Train loss: 0.00754
Epoch: 40/40 Iteration: 9880 | Train loss: 0.05668
Epoch: 40/40 Iteration: 9900 | Train loss: 0.00582
Epoch: 40/40 Iteration: 9920 | Train loss: 0.06730
Epoch: 40/40 Iteration: 9940 | Train loss: 0.00260
Epoch: 40/40 Iteration: 9960 | Train loss: 0.03897
Epoch: 40/40 Iteration: 9980 | Train loss: 0.00242
Epoch: 40/40 Iteration: 10000 | Train loss: 0.00527


In [13]:
## Test:
preds = rnn.predict(X_test)
## predict() only evaluates whole batches, so trim the labels to match
y_true = y_test[:len(preds)]
print('Test Acc.: %.3f' % np.mean(preds == y_true))

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from model/sentiment-39.ckpt
Test Acc.: 0.841


In [14]:
## Get probabilities:
## return_proba=True makes predict() fetch the sigmoid outputs instead of
## the rounded integer labels.
proba = rnn.predict(X_test, return_proba=True)

INFO:tensorflow:Restoring parameters from model/sentiment-39.ckpt


In [15]:
# Display the predicted probabilities (NumPy abbreviates the long array)
proba

array([2.6822090e-06, 9.9998462e-01, 8.2683563e-04, ..., 1.7203720e-06,
       1.8015897e-02, 8.8451606e-01], dtype=float32)