In [1]:
# Imports
from keras.datasets import imdb
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
# from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
# Parameters
# Mini-batch size, passed as batch_size to model.fit and model.evaluate.
BATCH_SIZE = 128
# Number of training epochs, passed as epochs to model.fit.
EPOCHS = 10
# Vocabulary cap passed to imdb.load_data(num_words=...); the largest word
# index observed in the loaded training data is NUM_WORDS - 1.
NUM_WORDS = 10000

In [3]:
# X_* hold the review content encoded as lists of word indices.
# y_* hold the corresponding labels: 0 or 1.
(X_train, y_train), (X_test, y_test) = imdb.load_data(
    num_words=NUM_WORDS
)

In [4]:
# Show one encoded review with its label, then the largest word index that
# occurs anywhere in the training set.
print("X_train sample: \n\n", X_train[1])
print("\ny_train sample: \n\n", y_train[1])
max_sequence = max(map(max, X_train))
print("\n\nMax range of indices: ", max_sequence)

X_train sample: 

 [1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463, 4369, 5012, 134, 26, 4, 715, 8, 118, 1634, 14, 394, 20, 13, 119, 954, 189, 102, 5, 207, 110, 3103, 21, 14, 69, 188, 8, 30, 23, 7, 4, 249, 126, 93, 4, 114, 9, 2300, 1523, 5, 647, 4, 116, 9, 35, 8163, 4, 229, 9, 340, 1322, 4, 118, 9, 4, 130, 4901, 19, 4, 1002, 5, 89, 29, 952, 46, 37, 4, 455, 9, 45, 43, 38, 1543, 1905, 398, 4, 1649, 26, 6853, 5, 163, 11, 3215, 2, 4, 1153, 9, 194, 775, 7, 8255, 2, 349, 2637, 148, 605, 2, 8003, 15, 123, 125, 68, 2, 6853, 15, 349, 165, 4362, 98, 5, 4, 228, 9, 43, 2, 1157, 15, 299, 120, 5, 120, 174, 11, 220, 175, 136, 50, 9, 4373, 228, 8255, 5, 2, 656, 245, 2350, 5, 4, 9837, 131, 152, 491, 18, 2, 32, 7464, 1212, 14, 9, 6, 371, 78, 22, 625, 64, 1382, 9, 8, 168, 145, 23, 4, 1690, 15, 16, 4, 1355, 5, 28, 6, 52, 154, 462, 33, 89, 78, 285, 16, 145, 95]

y_train sample: 

 0


Max range of indices:  9999


In [5]:
# Word -> index mapping provided by the dataset, and its inverse
# (index -> word) used for decoding reviews.
word_index = imdb.get_word_index()
reversed_word_index = {index: word for word, index in word_index.items()}

def decode_review(datapoint, reversed_word_index, index_offset=3):
    """Decode one encoded review back into a space-separated string of words.

    Args:
        datapoint: Iterable of integer word indices (one sample from X).
        reversed_word_index: Dict mapping an integer index to its word.
        index_offset: Shift between the indices stored in the dataset and the
            keys of ``reversed_word_index``. Defaults to 3 because 0, 1 and 2
            represent padding, start of sequence and unknown respectively.

    Returns:
        The decoded text. Any index that has no entry in
        ``reversed_word_index`` after removing the offset (which includes the
        reserved indices) is rendered as '?'.
    """
    return ' '.join(
        reversed_word_index.get(index - index_offset, '?') for index in datapoint
    )

# Decode and print the first training review as a sanity check.
print(decode_review(X_train[0], reversed_word_index))

? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little boy's that played the ? of norman and paul they were just brilliant children are often left out of the ? list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you thi

### Data Preparation

* The input training data (as seen in the `X_train` sample) is a list of integers. This can't be fed to the neural network directly. There are two ways to transform the input data before feeding it to the network:

    - One-hot encoding: each sample is vectorized into a `NUM_WORDS`-dimensional vector of 1s and 0s indicating the presence/absence of each word
    - Padding each training sample to make their lengths consistent and turning them into integer tensors
* This notebook follows the first approach: one-hot encoded inputs.

In [6]:
# Vectorize sequences
def vectorize_sequences(input_data, dimension=10000, dtype=np.float64):
    """Encode integer sequences as fixed-width 0/1 presence vectors.

    Args:
        input_data: Sequence of samples, each a list of integer indices.
        dimension: Width of every output row; each index in a sample must be
            a valid column position for this width.
        dtype: NumPy dtype of the result. Defaults to float64, which is what
            ``np.zeros`` produces when no dtype is given; pass ``np.float32``
            to halve the memory footprint of large inputs.

    Returns:
        Array of shape ``(len(input_data), dimension)`` in which entry
        ``[i, j]`` is 1 if index ``j`` occurs in sample ``i`` and 0 otherwise.
        Repeated indices within a sample still produce a single 1.
    """
    vectorized_result = np.zeros((len(input_data), dimension), dtype=dtype)
    for row, word_indices in enumerate(input_data):
        # Fancy indexing sets every listed column of this row in one step.
        vectorized_result[row, word_indices] = 1
    return vectorized_result

# The vector width must match the vocabulary cap used when loading the data,
# so pass NUM_WORDS explicitly rather than relying on the function's default.
X_train_vec = vectorize_sequences(X_train, dimension=NUM_WORDS)
X_test_vec = vectorize_sequences(X_test, dimension=NUM_WORDS)

# Labels are cast to float32 arrays.
y_train_vec = np.asarray(y_train).astype('float32')
y_test_vec = np.asarray(y_test).astype('float32')

print("X_train_vectorized sample: \n\n", X_train_vec[1])
print("\n\ny_train label:\t", y_train[1],"\ty_train_vectorized sample:\t",y_train_vec[1])

# Hold out the first VAL_SIZE training samples for validation; the remaining
# samples are used for training.
VAL_SIZE = 1000

X_val_vec = X_train_vec[:VAL_SIZE]
y_val_vec = y_train_vec[:VAL_SIZE]

X_train_vec = X_train_vec[VAL_SIZE:]
y_train_vec = y_train_vec[VAL_SIZE:]

X_train_vectorized sample: 

 [0. 1. 1. ... 0. 0. 0.]


y_train label:	 0 	y_train_vectorized sample:	 0.0


In [10]:
# Model architecture: two 16-unit ReLU hidden layers followed by a single
# sigmoid output unit.
# The input width is tied to NUM_WORDS so it stays in sync with the vocabulary
# cap used when loading the data instead of repeating the literal 10000.
model = Sequential()

model.add(Dense(units=16, activation='relu', input_shape=(NUM_WORDS,)))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

In [11]:
# Compile the model with the RMSprop optimizer and binary cross-entropy loss,
# tracking accuracy as the metric.
# Alternative optimizer: Adam(lr=0.002) in place of 'rmsprop' (requires
# importing Adam from keras.optimizers).
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train on the training split and report metrics on the held-out validation
# split after each epoch.
fit = model.fit(
    X_train_vec,
    y_train_vec,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val_vec, y_val_vec),
)

Train on 24000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
# Unpack the evaluation result into descriptive names instead of binding it to
# `eval`, which would shadow Python's built-in eval() for the rest of the
# notebook. The model is compiled with a single metric, so the result holds
# the loss followed by the accuracy.
test_loss, test_accuracy = model.evaluate(
    X_test_vec,
    y_test_vec,
    batch_size=BATCH_SIZE,
    verbose=1,
)
print("Test accuracy:\t", test_accuracy)

Test accuracy:	 0.8474800000381469
