In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import keras
import keras.backend as K
from keras.layers import Flatten, Concatenate, Input , Embedding, LSTM, Dense, Dropout
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras import Sequential
from keras.models import Model

Using TensorFlow backend.


Load IMDB dataset from keras.datasets.

"This is a dataset of 25,000 movies reviews from IMDB, labeled by sentiment (positive/negative)." [[1]](https://keras.io/api/datasets/imdb/)

In [2]:
vocabulary_size = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocabulary_size, skip_top=127, oov_char= 00 )

print('Loaded dataset with {} training samples, {} test samples'.format(len(X_train), len(X_test)))
print('---review---')
print(X_train[6])
print('---label---')
print(y_train[6])

print('Maximum review length: {}'.format(
len(max((X_train + X_test), key=len))))

print('Minimum review length: {}'.format(
len(min((X_train + X_test), key=len))))

Loaded dataset with 25000 training samples, 25000 test samples
---review---
[0, 0, 365, 1234, 0, 1156, 354, 0, 0, 0, 0, 0, 1016, 0, 0, 356, 0, 0, 1349, 500, 746, 0, 200, 0, 4132, 0, 0, 0, 1117, 1831, 0, 0, 4831, 0, 0, 0, 4183, 0, 369, 0, 215, 1345, 143, 0, 0, 1838, 0, 1974, 0, 0, 0, 257, 0, 0, 486, 0, 0, 0, 0, 0, 271, 0, 196, 0, 949, 4121, 0, 0, 0, 0, 2212, 2436, 819, 0, 0, 0, 0, 180, 0, 227, 0, 0, 2494, 0, 0, 423, 0, 168, 0, 0, 0, 0, 0, 665, 0, 270, 0, 0, 0, 197, 0, 161, 0, 0, 0, 0, 0, 0, 419, 665, 0, 0, 0, 0, 0, 0, 2084, 0, 4773, 0, 0, 0, 1901]
---label---
1
Maximum review length: 2697
Minimum review length: 70


Change sequences length in order to be in a same size.

In [3]:
max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

print('shape train:',X_train.shape, 'shape test:', X_test.shape)

shape train: (25000, 500) shape test: (25000, 500)


Define layers inside the model.

In [4]:
input_seq = Input(shape=(max_words,))
embedding_size= 32
emb = Embedding(vocabulary_size, embedding_size)(input_seq)

In [5]:
input_seq

<tf.Tensor 'input_1:0' shape=(None, 500) dtype=float32>

In [6]:
emb

<tf.Tensor 'embedding_1/embedding_lookup/Identity_1:0' shape=(None, 500, 32) dtype=float32>

In [7]:
# ---------------- path one -------------------------
path_1 = LSTM(40, return_sequences= True)(emb)
print('p1:')
print(path_1)

path_1= LSTM(20, return_sequences= True)(path_1)
print('p1:')
print(path_1)

# ---------------- path two -------------------------
path_2 = LSTM(40, return_sequences= True)(emb)
print('p2:')
print(path_2)
path_2 = LSTM(20, return_sequences= True)(path_2)
print('p2:')
print(path_2)

# ---------------- concatenating -------------------------
outl = Concatenate()([path_1,path_2])
print('out:')
print(outl)

outl = Flatten()(outl)
print('out after flatten:')
print(outl)

# ---------------- final output -------------------------
outl = Dense(1, activation='softmax')(outl)
print('final out')
outl

p1:
Tensor("lstm_1/transpose_1:0", shape=(None, 500, 40), dtype=float32)
p1:
Tensor("lstm_2/transpose_1:0", shape=(None, 500, 20), dtype=float32)
p2:
Tensor("lstm_3/transpose_1:0", shape=(None, 500, 40), dtype=float32)
p2:
Tensor("lstm_4/transpose_1:0", shape=(None, 500, 20), dtype=float32)
out:
Tensor("concatenate_1/concat:0", shape=(None, 500, 40), dtype=float32)
out after flatten:
Tensor("flatten_1/Reshape:0", shape=(None, None), dtype=float32)
final out


<tf.Tensor 'dense_1/Softmax:0' shape=(None, 1) dtype=float32>

In [8]:
model= Model(inputs = input_seq, outputs = outl)
print(model.summary())

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 500)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 500, 32)      160000      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 500, 40)      11680       embedding_1[0][0]                
__________________________________________________________________________________________________
lstm_3 (LSTM)                   (None, 500, 40)      11680       embedding_1[0][0]                
____________________________________________________________________________________________

Train

In [9]:
model.compile(loss='binary_crossentropy', 
             optimizer='adam', 
             metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall()])
			 
batch_size = 512
num_epochs = 10
X_valid, y_valid = X_train[:batch_size], y_train[:batch_size]
X_train2, y_train2 = X_train[batch_size:], y_train[batch_size:]
model.fit(X_train2, y_train2, validation_data=(X_valid, y_valid), batch_size=batch_size, epochs=num_epochs)
scores = model.evaluate(X_test, y_test, verbose=0)
print('Test accuracy:', scores[1])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 24488 samples, validate on 512 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.5


In [10]:
print("accuracy:", scores[1]*100 , "\n precision:", scores[2]*100, "\n recall:", scores[3]*100)

accuracy: 50.0 
 precision: 50.0 
 recall: 100.0
