In [36]:
import keras
from tensorflow.python.keras.models import  Sequential
from tensorflow.python.keras.layers import Dense,GRU,Embedding
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [37]:
import os
path = r'data/aclImdb'

training_dire=os.path.join(path,'train')

In [38]:
x_train_text=[]
y_tra_labels=[]
for label in ['pos','neg']:
    dirr=os.path.join(training_dire,label)
    for file in os.listdir(dirr):
        if file[5:] =='.txt':
            f=open(os.path.join(dirr,file))
            x_train_text.append(f.read())
            f.close()
            if label=='pos':
                y_tra_labels.append(1)
            else:
                y_tra_labels.append(0)
            

In [39]:
len(x_train_text)

1432

In [5]:
# loading the test data  
x_test_text=[]
y_test_text=[]
test_dire=os.path.join(path,'test')
for label in ['neg','pos']:
    dire=os.path.join(test_dire,label)
    for files in os.listdir(dire):
        if files[4:]=='.txt':
            f=open(os.path.join(dire,files))
            x_test_text.append(f.read())
            f.close()
            if label=='pos':
                y_test_text.append(1)
            else:
                y_test_text.append(0)
            

In [6]:
len(x_test_text)

139

In [7]:
# number of words to consider and this will based on their frequency
# so it will  be the first 10000 most frequent words 
num_words=10000


In [8]:
# creating a tokenization object
tokenizer=Tokenizer(num_words=num_words)

In [9]:
%%time
# this is to build a vocabulary by assigning a unique integer to the 
# set of words in the vocabulary
tokenizer.fit_on_texts(x_train_text + x_test_text)
# convert x_test_text in integer tokens


Wall time: 365 ms


In [10]:
# frequnecy of words
tokenizer.word_counts

OrderedDict([('scott', 41),
             ("bartlett's", 2),
             ("'offon'", 2),
             ('is', 6721),
             ('nine', 12),
             ('minutes', 167),
             ('of', 9036),
             ('pure', 32),
             ('craziness', 2),
             ('it', 4827),
             ('a', 10459),
             ('full', 107),
             ('frontal', 6),
             ('assault', 5),
             ('psychedelic', 5),
             ('pulsating', 1),
             ('epilepsy', 1),
             ('inducing', 2),
             ('flashing', 4),
             ('lights', 6),
             ('and', 10300),
             ('colours', 3),
             ('the', 21252),
             ('first', 577),
             ('true', 140),
             ('merging', 5),
             ('film', 2544),
             ('video', 91),
             ('in', 6157),
             ('avante', 2),
             ('garde', 3),
             ('cinema', 112),
             ("there's", 181),
             ('no', 873),
             ('story

In [11]:
# number of tokens
len(tokenizer.word_index)

22614

In [12]:
# convert the words in the text to integers
x_train_integer_tokens=tokenizer.texts_to_sequences(x_train_text)
x_test_integer_tokens=tokenizer.texts_to_sequences(x_test_text)

In [13]:
x_train_text[1]

"IMDb lists this as 1972 for some reason, but the other sources I've seen including the excellent program notes mark it as '68. Doesn't really matter, except that it's quite interesting to watch this abstract collage of film and video (one of the first art works to merge the two apparently) in the context of the Star Gate sequence in 2001, released the same year. Pure abstraction isn't really my thing, but I can take it in small doses and the super-saturated optically printed colors and psychedelic feel of this series of flowers, Rohrschach blots, birds, etc is pretty compelling and quite beautiful. Certainly helped paved the way for many other nascent video artists in the 70s, and deserves to be better known."

In [19]:
# sequence with the maximum lenght
max_train_tokens=max([len(seq)  for seq in x_train_integer_tokens])
max_test_tokens=max([len(seq)  for seq in x_test_integer_tokens])

In [20]:
max_test_tokens

978

In [22]:

# padding means adding zeros to the sequence
#  and can pad either pre or pos
x_train_pad=pad_sequences(x_train_integer_tokens,max_train_tokens,
                          padding="pre")
x_test_pad=pad_sequences(x_test_integer_tokens,max_test_tokens,padding="pre")

# creating RNN MODEL

In [23]:
model=Sequential()

In [25]:
embedding_size=8
model.add(Embedding(input_dim=num_words,output_dim=embedding_size,
                    input_length=max_train_tokens,name='embedding_layer'))

In [26]:
# first gru layer
# since we are passing all the sequnce in this layer to the next
#so we must set return_sequences=True
model.add(GRU(units=32,return_sequences=True))

In [27]:
# second gru layer
model.add(GRU(units=16,return_sequences=True))

In [28]:
# 3nd layer 
model.add(GRU(units=8,return_sequences=True))

In [29]:
# this will be connected to a dense layer
model.add(GRU(units=4))

In [30]:
# we are using sigmoid since we are dealing with a binary classification
# problem 
model.add(Dense(1,activation='sigmoid'))

In [31]:
model.compile(loss='binary_crossentropy',optimizer=Adam(lr=1e-4),
             metrics=['accuracy'])

In [32]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, 991, 8)            80000     
_________________________________________________________________
gru (GRU)                    (None, 991, 32)           3936      
_________________________________________________________________
gru_1 (GRU)                  (None, 991, 16)           2352      
_________________________________________________________________
gru_2 (GRU)                  (None, 991, 8)            600       
_________________________________________________________________
gru_3 (GRU)                  (None, 4)                 156       
_________________________________________________________________
dense (Dense)                (None, 1)                 5         
Total params: 87,049
Trainable params: 87,049
Non-trainable params: 0
_________________________________________________________________


In [33]:
%%time
# fitting the model on the data
model.fit(x_train_pad,y_tra_labels,epochs=10,batch_size=75,
          validation_split=0.02)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1403 samples, validate on 29 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Wall time: 5min 20s


<tensorflow.python.keras.callbacks.History at 0x6be5f57828>

In [None]:
%%time
# evaluating the model o the test set
result=model.evaluate(x_test_pad,y_test_text)

In [None]:
print('accuracy: {0:.2%}'.format(result[1]))