In [1]:
import re
import random
import csv
from string import punctuation
from collections import Counter
import numpy as np
import tensorflow as tf

### Dataset 
I used the Amazon Reviews dataset from Kaggle, as suggested: https://www.kaggle.com/bittlingmayer/amazonreviews
Follow the link for details about the data.


In [2]:
# In this dataset a line tagged `__label__2` is a positive review and a line tagged
# `__label__1` is a negative review.
# NOTE(review): the file handle is left open because the next cell reads from `f`;
# no visible cell closes it afterwards.
f = open('amazonreviews/data.txt','r')

In [3]:
data = f.read().split('\n')
len(data)

400001

In [4]:
reviews = []
sentiments = []

In [5]:
# Map each label tag to its sentiment name. The tag plus the separator that follows
# it occupies the first 11 characters of a line, so the review text starts at index 11.
# Lines that carry neither tag are skipped.
for line in data:
    for tag, sentiment in (('__label__2', 'positive'), ('__label__1', 'negative')):
        if line.startswith(tag):
            reviews.append(line[11:])
            sentiments.append(sentiment)
            break
        

In [6]:
len(reviews)

400000

In [7]:
len(sentiments)

400000

In [8]:
X_train = reviews
y_train = sentiments

In [9]:
len(X_train)

400000

In [10]:
# Delete every character listed in `string.punctuation` from each review, then join
# the cleaned reviews with single spaces into one corpus string.
strip_punctuation = str.maketrans('', '', punctuation)
x_train = [review.translate(strip_punctuation) for review in X_train]
whole_corpus = ' '.join(x_train)

In [11]:
words = whole_corpus.split()

In [12]:
whole_corpus[:2000]

'Great CD My lovely Pat has one of the GREAT voices of her generation I have listened to this CD for YEARS and I still LOVE IT When Im in a good mood it makes me feel better A bad mood just evaporates like sugar in the rain This CD just oozes LIFE Vocals are jusat STUUNNING and lyrics just kill One of lifes hidden gems This is a desert isle CD in my book Why she never made it big is just beyond me Everytime I play this no matter black white young old male female EVERYBODY says one thing Who was that singing  One of the best game music soundtracks  for a game I didnt really play Despite the fact that I have only played a small portion of the game the music I heard plus the connection to Chrono Trigger which was great as well led me to purchase the soundtrack and it remains one of my favorite albums There is an incredible mix of fun epic and emotional songs Those sad and beautiful tracks I especially like as theres not too many of those kinds of songs in my other video game soundtracks I

In [13]:
words[:20]

['Great',
 'CD',
 'My',
 'lovely',
 'Pat',
 'has',
 'one',
 'of',
 'the',
 'GREAT',
 'voices',
 'of',
 'her',
 'generation',
 'I',
 'have',
 'listened',
 'to',
 'this',
 'CD']

In [14]:
# total number of words in the corpus
len(words)

31251233

## Lets create dictionaries
First, let's sort the words by how often they occur in the corpus and assign an integer to each one, starting at 1 for the most frequent word.

In [15]:
counts = Counter(words)

In [16]:
# Words ordered from most to least frequent; ties keep their first-seen order.
vocab = [word for word, _count in counts.most_common()]

In [17]:
vocab_to_int = { word: ii for ii, word in enumerate(vocab,1)}

In [18]:
# Convert the training set to its integer representation:
# x_train_int holds, for every cleaned review, the list of vocabulary ids of its words.
x_train_int = [
    [vocab_to_int[token] for token in review.split()]
    for review in x_train
]

In [19]:
print(len(x_train_int))

400000


### Lets encode the labels
Encoding the labels means converting them to numbers. Since there are only two classes, we use 1 for positive and 0 for negative.

In [20]:
# Convert labels to 1s and 0s for 'positive' and 'negative'
labels = np.array([1 if i == "positive" else 0 for i in y_train])

In [22]:
len(labels)

400000

In [23]:
# visualize the maximum length review
review_len = Counter(len(i) for i in x_train_int)
print('the maximum length review is: {}'.format(max(review_len)))
print('the minimum length review is : {}'.format(min(review_len)))

the maximum length review is: 230
the minimum length review is : 6


The review lengths vary from a minimum of 6 words to a maximum of 230 words, so we need to fix a sequence length for the network. Let's use 200.
Reviews longer than 200 words are truncated; reviews shorter than 200 words are zero-padded.

In [24]:
# Cap every review at 200 tokens and drop reviews that ended up empty.
# An empty review must be removed from `labels` as well: filtering only the reviews
# would pair every later review with the wrong label.
non_empty_idx = [idx for idx, r in enumerate(x_train_int) if len(r) > 0]
if len(non_empty_idx) != len(x_train_int):
    # Only touch the labels when something was actually dropped.
    labels = labels[non_empty_idx]
x_train_int = [x_train_int[idx][0:200] for idx in non_empty_idx]

In [25]:
review_len = Counter(len(i) for i in x_train_int)
print('the maximum length review is: {}'.format(max(review_len)))
print('the minimum length review is : {}'.format(min(review_len)))

the maximum length review is: 200
the minimum length review is : 6


In [26]:
## `features` is the fixed-width model input: one row per review, seq_len columns.
## Each review is right-aligned in its row, so shorter reviews are zero-padded on the left.
seq_len = 200
features = np.zeros((len(x_train_int), seq_len), dtype=int)
for row_idx, token_ids in enumerate(x_train_int):
    features[row_idx, -len(token_ids):] = np.array(token_ids)[:seq_len]

In [27]:
features[:10,:100]

array([[     0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,    140,
           136,    130,   1936,   6293],
       [     0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,

In [28]:
print(len(features))

400000


## Creating Training and Test data
lets split the training and test data

In [29]:
# Fraction of the samples used for training; the remainder is held out.
split_ratio = 0.8
# Derive the size from split_ratio so that changing the ratio actually changes the split.
train_size = int(split_ratio * len(features))

In [30]:
train_size

320000

In [31]:
train_x, val_x = features[:train_size], features[train_size:] 

In [32]:
train_y, val_y = labels[:train_size], labels[train_size:]

Let's split the held-out data in half: 50% for validation and 50% for testing.

In [33]:
split_frac = 0.5
split_index = int(split_frac * len(val_x))
val_x, test_x = val_x[:split_index], val_x[split_index:]
val_y, test_y = val_y[:split_index], val_y[split_index:]

In [34]:
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))
print("label set: \t\t{}".format(train_y.shape), 
      "\nValidation label set: \t{}".format(val_y.shape),
      "\nTest label set: \t{}".format(test_y.shape))

			Feature Shapes:
Train set: 		(320000, 200) 
Validation set: 	(40000, 200) 
Test set: 		(40000, 200)
label set: 		(320000,) 
Validation label set: 	(40000,) 
Test label set: 	(40000,)


Let's build the LSTM model.
We use 256 hidden units in the LSTM layer (common choices are 128, 256, or 512).

In [35]:
## number of unique words in the vocabulary 
no_words = len(vocab_to_int) + 1
print(no_words)

599071


## Creating tensorflow graph here

In [36]:
#creating the graph object
## keep_prob is required to implement dropout over the lstm cell
tf.reset_default_graph()
## Here tf.name_scope is to tell that its under the hood of default graph
with tf.name_scope('inputs'):
    inputs_ = tf.placeholder(tf.int32, [None, None], name="inputs")
    labels_ = tf.placeholder(tf.int32, [None, None], name="labels")
    keep_prob = tf.placeholder(tf.float32, name="keep_prob")

### Embedding layer here
Since the vocabulary has about 600,000 entries (599,071, as printed above), we use an embedding dimension of 300.

In [37]:
embed_size = 300 
with tf.name_scope("Embeddings"):
    embedding = tf.Variable(tf.random_uniform((no_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)

## lets construct LSTM cell network now 

In [38]:
lstm_size = 256
lstm_layers = 1
batch_size = 1000
learning_rate = 0.01

In [39]:
def lstm_cell():
    """Build one LSTM cell with dropout on its outputs.

    Returns:
        A `DropoutWrapper` around a `BasicLSTMCell` of `lstm_size` units; the
        output keep probability is taken from the module-level `keep_prob`.
    """
    base_cell = tf.contrib.rnn.BasicLSTMCell(
        lstm_size,
        reuse=tf.get_variable_scope().reuse,
    )
    # Dropout is applied to the cell outputs only.
    return tf.contrib.rnn.DropoutWrapper(base_cell, output_keep_prob=keep_prob)

In [40]:
with tf.name_scope("RNN_layers"):
    # Stacking up multiple LSTM layers
    cell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(lstm_layers)])
    
    # Getting an initial state of all zeros
    initial_state = cell.zero_state(batch_size, tf.float32)

## forward pass

In [42]:
with tf.name_scope("RNN_forward"):
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)

## Prediction

In [43]:
with tf.name_scope('predictions'):
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)

## Cost

In [44]:
with tf.name_scope('cost'):
    cost = tf.losses.mean_squared_error(labels_, predictions)

## Gradient Descent and optimization

In [45]:
with tf.name_scope('train'):
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [46]:
### lets check the accuracy of the work 
# validation 
with tf.name_scope('validation'):
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [47]:
# lets do batching 
def get_batches(x, y, batch_size=100):
    """Yield consecutive, equally sized (x, y) batches.

    Only full batches are produced: trailing samples that do not fill a complete
    batch are dropped.

    Args:
        x: sliceable sequence of inputs.
        y: sliceable sequence of targets, aligned with `x`.
        batch_size: number of samples per batch.

    Yields:
        Tuples `(x_batch, y_batch)`, each holding `batch_size` samples.
    """
    usable = (len(x) // batch_size) * batch_size
    xs, ys = x[:usable], y[:usable]
    for start in range(0, len(xs), batch_size):
        stop = start + batch_size
        yield xs[start:stop], ys[start:stop]

## Training goes here 

In [48]:
## Train for 10 epochs.
epochs = 10
## The saver writes checkpoints so the model can be restored later.
saver = tf.train.Saver()

with tf.Session() as sess:
    ## Initialise all TensorFlow variables.
    sess.run(tf.global_variables_initializer())

    iteration = 1
    try:
        for e in range(epochs):
            ## Start every epoch from the all-zero LSTM state.
            state = sess.run(initial_state)
            ## Feed the training data batch by batch; the final state of one batch
            ## is passed in as the initial state of the next.
            for x, y in get_batches(train_x, train_y, batch_size):
                feed = {inputs_: x,
                        labels_: y[:, None],
                        keep_prob: 0.5,
                        initial_state: state}
                loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)

                ## Every 5 iterations print the training loss.
                if iteration%5==0:
                    print("Epoch: {}/{}".format(e, epochs),
                          "Iteration: {}".format(iteration),
                          "Train loss: {:.3f}".format(loss))
                ## Every 25 iterations measure validation accuracy (dropout disabled).
                if iteration%25==0:
                    val_acc = []
                    # Fetch the zero-state op that already exists in the graph.
                    # Calling cell.zero_state(...) here would add new ops to the
                    # graph on every validation pass.
                    val_state = sess.run(initial_state)
                    for val_batch_x, val_batch_y in get_batches(val_x, val_y, batch_size):
                        feed = {inputs_: val_batch_x,
                                labels_: val_batch_y[:, None],
                                keep_prob: 1,
                                initial_state: val_state}
                        batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                        val_acc.append(batch_acc)
                    print("Val acc: {:.3f}".format(np.mean(val_acc)))
                    # Checkpoint at the validation cadence instead of after every
                    # batch: each save rewrites all variables, including the full
                    # embedding matrix, which made every iteration much slower.
                    saver.save(sess, "checkpoints/sentiment.ckpt")
                iteration +=1
    except KeyboardInterrupt:
        # Manual interruption: keep the progress made so far, then re-raise so the
        # interruption is still visible to the user.
        saver.save(sess, "checkpoints/sentiment.ckpt")
        raise
    saver.save(sess, "checkpoints/sentiment.ckpt")

Epoch: 0/10 Iteration: 5 Train loss: 0.254
Epoch: 0/10 Iteration: 10 Train loss: 0.212
Epoch: 0/10 Iteration: 15 Train loss: 0.160
Epoch: 0/10 Iteration: 20 Train loss: 0.178
Epoch: 0/10 Iteration: 25 Train loss: 0.155
Val acc: 0.793
Epoch: 0/10 Iteration: 30 Train loss: 0.159
Epoch: 0/10 Iteration: 35 Train loss: 0.143
Epoch: 0/10 Iteration: 40 Train loss: 0.149
Epoch: 0/10 Iteration: 45 Train loss: 0.130
Epoch: 0/10 Iteration: 50 Train loss: 0.129
Val acc: 0.828
Epoch: 0/10 Iteration: 55 Train loss: 0.122
Epoch: 0/10 Iteration: 60 Train loss: 0.125
Epoch: 0/10 Iteration: 65 Train loss: 0.121
Epoch: 0/10 Iteration: 70 Train loss: 0.108
Epoch: 0/10 Iteration: 75 Train loss: 0.092
Val acc: 0.875
Epoch: 0/10 Iteration: 80 Train loss: 0.086
Epoch: 0/10 Iteration: 85 Train loss: 0.093
Epoch: 0/10 Iteration: 90 Train loss: 0.086
Epoch: 0/10 Iteration: 95 Train loss: 0.069


KeyboardInterrupt: 

## Complexity.
Training took a very long time, so I had to interrupt it partway through. The test accuracy below is therefore computed with the model restored from the last saved checkpoint.

In [49]:
## Evaluate the restored model on the held-out test split.
test_accuracy = []
with tf.Session() as sess:
    saver.restore(sess, "checkpoints/sentiment.ckpt")
    # Fetch the zero-state op that already exists in the graph rather than building
    # a new one with cell.zero_state(...) every time this cell is run.
    test_state = sess.run(initial_state)
    for x, y in get_batches(test_x, test_y, batch_size):
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,  # no dropout at evaluation time
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_accuracy.append(batch_acc)
    # Report the mean of the per-batch accuracies.
    print("Test accuracy: {:.3f}".format(np.mean(test_accuracy)))

INFO:tensorflow:Restoring parameters from checkpoints/sentiment.ckpt
Test accuracy: 0.889
