In [0]:
import numpy as np
import tensorflow as tf

In [0]:
# Read both data files completely into memory as single strings; later
# cells split them on newlines.
with open('reviews.txt', 'r') as reviews_file:
    reviews = reviews_file.read()
with open('labels.txt', 'r') as labels_file:
    labels = labels_file.read()

In [3]:
# Preview the first 2000 characters of the raw review text.
reviews[:2000]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   \nstory of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is tu

## Data preprocessing



In [0]:
from string import punctuation

# Delete every ASCII punctuation character in a single pass.
# `reviews` must still be the raw string read from disk at this point.
all_text = reviews.translate(str.maketrans('', '', punctuation))
# One review per line of the cleaned text.
reviews = all_text.split('\n')

# Flatten everything back into one whitespace-separated stream of words,
# used to build the vocabulary.
all_text = ' '.join(reviews)
words = all_text.split()

### Encoding the words



In [0]:
# Build the vocabulary: words ordered from most to least frequent, mapped to
# integer ids starting at 1 (0 is left free for padding).
from collections import Counter
words_count = Counter(words)
vocab = [word for word, _ in words_count.most_common()]
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

# Encode each review as the list of ids of its words; the result has one
# entry per element of `reviews`.
reviews_ints = [[vocab_to_int[word] for word in review.split()]
                for review in reviews]

### Encoding the labels



In [0]:
# Convert the newline-separated label text into a 0/1 array:
# 1 for the exact string 'positive', 0 for anything else.
labels = labels.split('\n')
labels = np.array([int(each == 'positive') for each in labels])

In [7]:
from collections import Counter
# Histogram of review lengths: key = number of tokens, value = number of reviews.
review_lens = Counter(map(len, reviews_ints))
zero_len_count = review_lens[0]
longest_review = max(review_lens.keys())
print("Zero-length reviews: {}".format(zero_len_count))
print("Maximum review length: {}".format(longest_review))

Zero-length reviews: 1
Maximum review length: 2514


In [8]:
# Filter out the zero-length review(s) and drop the label at the same
# position, so that reviews_ints[i] and labels[i] remain a matching pair
# regardless of where the empty review sits in the file.
non_zero_idx = [ii for ii, review in enumerate(reviews_ints) if len(review) != 0]
reviews_ints = [reviews_ints[ii] for ii in non_zero_idx]
labels = np.array([labels[ii] for ii in non_zero_idx])
print(len(reviews_ints))

25000


In [9]:
# Bring every review to exactly `seq_len` tokens: shorter reviews are
# left-padded with zeros, longer ones keep only their first `seq_len` tokens.
seq_len = 200
padded_rows = []
for tokens in reviews_ints:
    n_missing = seq_len - len(tokens)
    if n_missing > 0:
        padded_rows.append(np.pad(tokens, (n_missing, 0), 'constant'))
    else:
        padded_rows.append(tokens[:seq_len])
features = np.array(padded_rows)
print(len(features))
print(features[0])

25000
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
 21025   308     6     3  1050   207     8  2138    32     1   171    57
    15    49    81  5785    44   382   110   140    15  5194    60   154
     9     1  4975  5852   475    71     5   260    12 21025   308    13
  1978     6    74  2395     5   613    73     6  5194     1 24103     5
  1983 10166     1  5786  1499    36    51    66   204   145    67  1199
  5194 19869     1 37442     4     1   221   883    31  2988    71     4
     1  5787    10   686     2    67  1499    54    10   216     1   383
     9    62     3  1406  3686   783     5  3483   180     1   382    10
  1212 13583    32   308     3   349   341  2

In [10]:
# Show the first 100 columns of the first 10 rows of the padded feature matrix.
features[:10,:100]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 21025,   308,     6,
            3,  1050,   207,     8,  2138,    32,     1,   171,    57,
           15,    49,    81,  5785,    44,   382,   110,   140,    15,
         5194,    60,   154,     9,     1,  4975,  5852,   475,    71,
            5,   260,    12, 21025,   308,    13,  1978,     6,    74,
         2395],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     

## Training, Validation, Test



In [11]:
# Number of rows that fall into an 80% training split.
print(int(len(features)*0.8))

20000


In [12]:
# Fraction of the rows used for training; the remainder is divided evenly
# between the validation and test sets.
split_frac = 0.8
split_idx = int(len(features)*split_frac)
train_x, val_x = features[:split_idx], features[split_idx:]
train_y, val_y = labels[:split_idx], labels[split_idx:]

# Cut the held-out part in half: first half validation, second half test.
test_idx = int(len(val_x)*0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


## Build the graph


In [0]:
# Model / training hyperparameters.
# Number of units passed to each BasicLSTMCell.
lstm_size = 256
# Number of LSTM cells stacked inside the MultiRNNCell.
lstm_layers = 2
# Rows per mini-batch; also fixes the batch dimension of the RNN's zero state.
batch_size = 500
# Learning rate handed to the Adam optimizer.
learning_rate = 0.001

In [0]:
n_words = len(vocab_to_int) + 1 # Adding 1 because we use 0's for padding, dictionary started at 1

# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    # Integer-encoded reviews; both dimensions are left unspecified.
    inputs_ = tf.placeholder(tf.int32, [None, None], name="inputs")
    # Integer target labels; both dimensions are left unspecified.
    labels_ = tf.placeholder(tf.int32, [None, None], name="labels")
    # Keep probability for the dropout wrappers, supplied at run time.
    keep_prob = tf.placeholder(tf.float32)

### Embedding


In [0]:
# Size of the embedding vectors (number of units in the embedding layer)
embed_size = 300 

with graph.as_default():
    # Trainable embedding table with one row per word id (including the
    # padding id 0), initialised uniformly in [-1, 1).
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    # Replace every integer id in `inputs_` by its embedding vector.
    embed = tf.nn.embedding_lookup(embedding, inputs_)

### LSTM cell


In [0]:
with graph.as_default():
    def lstm_cell():
        """Return one LSTM layer of `lstm_size` units with dropout on its outputs.

        The dropout keep probability is read from the `keep_prob` placeholder,
        so it can be set independently for every session run.
        """
        lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)

    # Stack up multiple LSTM layers; each layer gets its own freshly built cell.
    cell = tf.contrib.rnn.MultiRNNCell(
        [lstm_cell() for _ in range(lstm_layers)])

    # All-zero starting state for a batch of exactly `batch_size` sequences.
    initial_state = state = cell.zero_state(batch_size, tf.float32)

### RNN forward pass



In [0]:
with graph.as_default():
    # Run the stacked cell over the embedded inputs, starting from
    # `initial_state`. `outputs` holds the cell output for every time step,
    # `final_state` the state after the last step.
    outputs, final_state = tf.nn.dynamic_rnn(cell, 
                                             embed, 
                                             initial_state=initial_state)

### Output


In [30]:
with graph.as_default():
    # Only the output of the last time step feeds the single sigmoid unit
    # that produces the prediction.
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
    # Mean squared error between the fed labels and the sigmoid output.
    cost = tf.losses.mean_squared_error(labels_, predictions)
    
    # One Adam update step that minimises `cost`.
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

W0806 13:17:07.657267 139719125829504 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/losses/losses_impl.py:121: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


### Validation accuracy


In [0]:
with graph.as_default():
    # Round the sigmoid output to 0/1 and compare it element-wise with the labels.
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    # Fraction of matching predictions in the batch.
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

### Batching



In [0]:
def get_batches(x, y, batch_size=100):
    """Yield consecutive ``(x_batch, y_batch)`` slices of ``batch_size`` items.

    Only full batches are produced: the number of batches is
    ``len(x) // batch_size`` and any leftover items at the end are skipped.

    Args:
        x: Sliceable sequence of inputs; its length determines the batch count.
        y: Sliceable sequence of targets, sliced with the same bounds as ``x``.
        batch_size: Number of items per batch.

    Yields:
        Tuples ``(x[lo:hi], y[lo:hi])`` in order.
    """
    full_batches = len(x) // batch_size
    for batch_no in range(full_batches):
        lo = batch_no * batch_size
        hi = lo + batch_size
        yield x[lo:hi], y[lo:hi]

## Training


In [33]:
epochs = 20

with graph.as_default():
    saver = tf.train.Saver()

# Train for `epochs` passes over the training set. The training loss is
# printed every 5 iterations, validation accuracy every 25 iterations, and a
# checkpoint is written once training has finished.
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        # Every epoch starts from the all-zero RNN state.
        state = sess.run(initial_state)

        for x, y in get_batches(train_x, train_y, batch_size):
            # y[:, None] adds a trailing axis so the labels are fed as a column.
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            # The final state of this batch is fed back in as the initial
            # state of the next one.
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)

            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%25==0:
                val_acc = []
                # Reuse the zero-state op that already exists in the graph.
                # Calling cell.zero_state() here would add new ops to the
                # graph on every validation pass.
                val_state = sess.run(initial_state)
                for val_batch_x, val_batch_y in get_batches(val_x, val_y, batch_size):
                    # Dropout is disabled (keep_prob = 1) for evaluation.
                    feed = {inputs_: val_batch_x,
                            labels_: val_batch_y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/sentiment.ckpt")

Epoch: 0/20 Iteration: 5 Train loss: 0.238
Epoch: 0/20 Iteration: 10 Train loss: 0.226
Epoch: 0/20 Iteration: 15 Train loss: 0.216
Epoch: 0/20 Iteration: 20 Train loss: 0.236
Epoch: 0/20 Iteration: 25 Train loss: 0.207
Val acc: 0.670
Epoch: 0/20 Iteration: 30 Train loss: 0.202
Epoch: 0/20 Iteration: 35 Train loss: 0.199
Epoch: 0/20 Iteration: 40 Train loss: 0.183
Epoch: 1/20 Iteration: 45 Train loss: 0.166
Epoch: 1/20 Iteration: 50 Train loss: 0.195
Val acc: 0.773
Epoch: 1/20 Iteration: 55 Train loss: 0.163
Epoch: 1/20 Iteration: 60 Train loss: 0.189
Epoch: 1/20 Iteration: 65 Train loss: 0.160
Epoch: 1/20 Iteration: 70 Train loss: 0.233
Epoch: 1/20 Iteration: 75 Train loss: 0.176
Val acc: 0.744
Epoch: 1/20 Iteration: 80 Train loss: 0.165
Epoch: 2/20 Iteration: 85 Train loss: 0.150
Epoch: 2/20 Iteration: 90 Train loss: 0.146
Epoch: 2/20 Iteration: 95 Train loss: 0.155
Epoch: 2/20 Iteration: 100 Train loss: 0.151
Val acc: 0.786
Epoch: 2/20 Iteration: 105 Train loss: 0.190
Epoch: 2/20 Ite

## Testing

In [34]:
# Evaluate the most recent checkpoint on the test set and report the mean
# accuracy over all full test batches.
test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    # Reuse the zero-state op already in the graph instead of building a new
    # one with cell.zero_state().
    test_state = sess.run(initial_state)
    for x, y in get_batches(test_x, test_y, batch_size):
        # Dropout is disabled (keep_prob = 1) for evaluation.
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

W0806 13:22:52.816085 139719125829504 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py:1276: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.


Test accuracy: 0.784
