In [1]:
import pandas as pd
import numpy as np

In [2]:
# Keras building blocks: model container, layers, optimizer, and the text
# preprocessing helpers (tokenizer + sequence padding).
# All imports sit at column 0; a stray indent here raises IndentationError.
from keras.models import Sequential
from keras.layers import Dense, GRU, Embedding
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Load the full review table from the working directory. The columns used
# later in this notebook are `type`, `review` and `label`.
data = pd.read_csv("imdb_master.csv")

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [5]:
# Map the textual sentiment labels to integers: 'neg' -> 0, 'pos' -> 1.
dictionary = {'neg' : 0 , 'pos' : 1}
# Only the "label" column is rewritten; label values that are not keys of
# `dictionary` are left unchanged by replace().
data = data.replace({"label" : dictionary})
data.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,0,0_2.txt
1,1,test,This is an example of why the majority of acti...,0,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",0,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,0,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,0,10003_3.txt


In [6]:
# Partition the rows on the `type` column using boolean masks.
train = data[data['type'] == 'train']
test = data[data['type'] == 'test']

In [7]:
# Features are the raw review text, targets are the encoded labels.
# The training split is capped at its first 25000 rows.
# NOTE(review): the cap presumably drops trailing rows without a pos/neg
# label and relies on row order — confirm against the CSV.
x_train = train['review'][:25000]
y_train = train['label'][:25000]

# The test split is used in full.
x_test = test['review']
y_test = test['label']

In [8]:
# Convert the pandas Series into plain NumPy arrays (this drops the index).
x_train, y_train = np.array(x_train), np.array(y_train)
x_test, y_test = np.array(x_test), np.array(y_test)

In [9]:
(x_train.shape, y_train.shape) , (x_test.shape , y_test.shape)

(((25000,), (25000,)), ((25000,), (25000,)))

### Cleaning the reviews

In [10]:
import sentence_cleaner

# Run every review through the project's cleaning helper. The intermediate
# Python lists are kept under their original names.
x_train_list = [sentence_cleaner.tweet_cleaner(review) for review in x_train]
x_test_list = [sentence_cleaner.tweet_cleaner(review) for review in x_test]

# Back to NumPy arrays, replacing the uncleaned text.
x_train = np.array(x_train_list)
x_test = np.array(x_test_list)

  if input[:3] == codecs.BOM_UTF8:


In [11]:
# Cast the label arrays to 64-bit floats.
y_train = y_train.astype(np.float64)
y_test = y_test.astype(np.float64)

In [12]:
print (y_train.dtype)

float64


In [13]:
x_train[0]

u'story of man who has unnatural feelings for pig starts out with opening scene that is terrific example of absurd comedy formal orchestra audience is turned into an insane violent mob by the crazy chantings of it singers unfortunately it stays absurd the whole time with no general narrative eventually making it just too off putting even those from the era should be turned off the cryptic dialogue would make shakespeare seem easy to third grader on technical level it better than you might think with some good cinematography by future great vilmos zsigmond future stars sally kirkland and frederic forrest can be seen briefly'

In [14]:
y_train[13456]

1.0

In [15]:
x_train.shape , x_test.shape

((25000,), (25000,))

In [16]:
%%time
# Stack the train and test reviews into one 1-D array (axis 0 is the default).
data_text = np.concatenate([x_train, x_test])

CPU times: user 584 ms, sys: 1.28 s, total: 1.87 s
Wall time: 24 s


In [17]:
data_text.shape

(50000,)

In [18]:
# Vocabulary size; also used as `input_dim` of the embedding layer below.
num_words = 10000

In [19]:
# Use the shared `num_words` constant rather than a repeated literal, so the
# tokenizer vocabulary cap cannot drift from the embedding layer's input_dim.
tokenizer = Tokenizer(num_words=num_words)

In [20]:
%%time
# Build the word index from `data_text`.
# NOTE(review): `data_text` is the concatenation of x_train and x_test, so
# the vocabulary is derived from the test reviews too — confirm this is intended.
tokenizer.fit_on_texts(data_text)

CPU times: user 31.1 s, sys: 120 ms, total: 31.2 s
Wall time: 33.3 s


In [21]:
# Persist the fitted tokenizer to disk with the highest pickle protocol.

import pickle

with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [22]:
%%time
# Convert each split's cleaned text into lists of integer token ids.
x_train_tokens, x_test_tokens = map(tokenizer.texts_to_sequences,
                                    (x_train, x_test))

CPU times: user 17.8 s, sys: 124 ms, total: 17.9 s
Wall time: 17.9 s


In [23]:
len(x_train_tokens[0])

97

In [24]:
len(x_train[0].split())

105

In [25]:
# Token count of every sequence, train followed by test, as a NumPy array.
num_tokens = np.array(list(map(len, x_train_tokens + x_test_tokens)))

In [26]:
np.mean(num_tokens)

205.33826

In [27]:
np.max(num_tokens)

2098

In [28]:
# Sequence length cut-off: mean plus two standard deviations of the token
# counts, truncated to an integer.
max_tokens = int(num_tokens.mean() + 2 * num_tokens.std())
max_tokens

509

In [29]:
# Fraction of sequences strictly shorter than the cut-off (mean of a boolean mask).
np.mean(num_tokens < max_tokens)

0.9462

In [30]:
# Passed as both `padding` and `truncating` to pad_sequences below.
pad = 'pre'

In [31]:
# Pad or truncate every training sequence to exactly `max_tokens` ids.
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens,
                            padding=pad, truncating=pad)

In [32]:
# Same padding/truncation settings as the training split, applied to the test sequences.
x_test_pad  = pad_sequences(x_test_tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)

In [32]:
x_train_pad.shape , x_test_pad.shape

((25000, 509), (25000, 509))

In [33]:
x_train_pad[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [34]:
# word -> integer id mapping produced by the fitted tokenizer.
idx = tokenizer.word_index
# id -> word lookup, for turning token ids back into text.
inverse_map = {index: word for word, index in idx.items()}
# word -> id lookup, rebuilt by inverting `inverse_map`.
forward_map = {word: index for index, word in inverse_map.items()}

In [35]:
# Empty sequential container; layers are appended in the cells that follow.
model = Sequential()

In [36]:
# Length of the embedding vector for each token (the embedding layer's output_dim).
embedding_size = 8

In [37]:
# Embedding layer: maps each of the `num_words` token ids to an
# `embedding_size`-dimensional vector, for inputs of length `max_tokens`.
model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))

In [38]:
# First recurrent layer; return_sequences=True keeps the per-timestep outputs for the next GRU.
model.add(GRU(units=16, return_sequences=True))

In [39]:
# Second recurrent layer, still emitting one output per timestep.
model.add(GRU(units=8, return_sequences=True))

In [40]:
# Final recurrent layer; no return_sequences, so it outputs a single 4-unit vector per review.
model.add(GRU(units=4))

In [41]:
# Single sigmoid unit: a score between 0 and 1 to compare against the 0/1 labels.
model.add(Dense(1, activation='sigmoid'))

In [42]:
# Adam optimizer with a learning rate of 1e-3.
# NOTE(review): newer Keras releases name this argument `learning_rate` —
# confirm against the installed version before upgrading.
optimizer = Adam(lr=1e-3)

In [43]:
# Binary cross-entropy matches the single sigmoid output; accuracy is
# tracked as the only extra metric.
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

In [44]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 509, 8)            80000     
_________________________________________________________________
gru_1 (GRU)                  (None, 509, 16)           1200      
_________________________________________________________________
gru_2 (GRU)                  (None, 509, 8)            600       
_________________________________________________________________
gru_3 (GRU)                  (None, 4)                 156       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 5         
Total params: 81,961
Trainable params: 81,961
Non-trainable params: 0
_________________________________________________________________


In [46]:
import time

# Measure elapsed wall-clock time for training. time.clock() is avoided:
# it reports CPU time on Unix and was removed in Python 3.8, whereas
# time.time() behaves the same on Python 2 and 3.
start = time.time()

# Train for 3 epochs in batches of 64, holding out 5% of the training data
# for validation.
model.fit(x_train_pad, y_train,
          validation_split=0.05, epochs=3, batch_size=64, verbose=1)

end = time.time()

print ("Time taken is " + str(end-start))

Train on 23750 samples, validate on 1250 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Time taken is 2690.392088


In [54]:
# Write the trained model to an HDF5 file in the working directory.
# NOTE(review): the recorded execution counts show this cell (In [54]) ran
# after the evaluation cells (In [51], In [53]) — confirm the order on a
# fresh Restart & Run All.
model.save("model_new.h5")

In [51]:
%%time
# Evaluate on the padded test set; `result` holds the loss followed by the
# metrics given to compile() (accuracy).
result = model.evaluate(x_test_pad, y_test)

CPU times: user 3min 41s, sys: 29.6 s, total: 4min 11s
Wall time: 1min 32s


In [53]:
# result[1] is the accuracy metric (result[0] is the loss), shown as a percentage.
print("Accuracy: {0:.2%}".format(result[1]))

Accuracy: 84.83%


In [56]:
from keras.models import load_model

In [57]:
# Reload the model from the file written by model.save() to check it round-trips.
new_model = load_model("model_new.h5")

In [58]:
# NOTE(review): the recorded output for this cell shows a sequence length of
# 544, while the model built above reports 509. The stored output appears to
# come from a different run/file — re-run to confirm.
new_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 544, 8)            80000     
_________________________________________________________________
gru_1 (GRU)                  (None, 544, 16)           1200      
_________________________________________________________________
gru_2 (GRU)                  (None, 544, 8)            600       
_________________________________________________________________
gru_3 (GRU)                  (None, 4)                 156       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 5         
Total params: 81,961
Trainable params: 81,961
Non-trainable params: 0
_________________________________________________________________
