In [1]:
# load packages
import os
import sys
import re
import numpy as np
import tensorflow as tf
print(tf.__version__)

1.12.0-rc0


In [2]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data()

In [3]:
print("Train:", x_train.shape, y_train.shape)
print("Test:", x_test.shape, y_test.shape)

Train: (25000,) (25000,)
Test: (25000,) (25000,)


In [4]:
vocab = tf.keras.datasets.imdb.get_word_index()

In [5]:
len(vocab)

88584

In [6]:
keys = list(vocab.keys())
values = list(vocab.values())
# get a key-word mapping
reverse_vocab = dict()
for i in range(len(vocab)):
    reverse_vocab[values[i]] = keys[i]

In [7]:
len(reverse_vocab)

88584

In [8]:
# implement method to get back the original text
def get_original_text(vector):
    text = list()
    for v in vector:
        if v in reverse_vocab:
            text.append(reverse_vocab[v])
        else:
            continue
    return " ".join(text)
# call
# get_original_text(x_train[0])

In [9]:
# get training reviews
train_text = list()
for x in x_train:
    train_text.append(get_original_text(x))

In [10]:
# get test reviews
test_text = list()
for x in x_test:
    test_text.append(get_original_text(x))

In [11]:
# create word frequency mapping
word_frequency_mapping = dict()
def get_word_frequency(text_list):
    for text in text_list:
        text = re.sub("[^a-zA-Z0-9 ]", "", text)
        tokens = text.split()
        for tok in tokens:
            if tok not in word_frequency_mapping:
                # if not in vocab, add new word
                word_frequency_mapping[tok] = 1
            else:
                # if word present in vocab, update frequency
                word_frequency_mapping[tok] += 1

In [12]:
# call method
get_word_frequency(text_list=train_text)

In [13]:
len(word_frequency_mapping)

79341

In [14]:
# reverse the dict based on the values
sorted_word_frequency_mapping = sorted(word_frequency_mapping.items(), key=lambda x: x[1], reverse=True)
len(sorted_word_frequency_mapping)

79341

In [15]:
# create a vocabulary for the embedding
# add word for unknown words and for padding
word2idx = {"<PAD>":0, "<UNK>": 1}
for i in range(len(sorted_word_frequency_mapping)):
    word2idx[sorted_word_frequency_mapping[i][0]] = i+2
# check for the lenght of the vocab
len(word2idx)

79343

In [16]:
# get a mapping from index to word
idx2word = {val:key for key, val in word2idx.items()}
len(idx2word)

79343

In [17]:
chars = """abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,_!."""
chars = [a for a in chars]
len(chars)

67

In [18]:
char2idx = {"<UNK>": 1, "<PAD>": 0}
for i in range(len(chars)):
    char2idx[chars[i]] = i+2
len(char2idx)

69

In [19]:
# get a mapping from index to word
idx2char = {val:key for key, val in char2idx.items()}
len(idx2char)

69

In [20]:
# create character based codes for each title name
def get_char_codes(text_list):
    x_char = list()
    max_len = 256
    max_char_len = 8
    for text in text_list:
        text = re.sub("[^a-zA-Z0-9,-;.!?:’’’/\|_@#$%ˆ&*˜‘+-=' ]", "", text)
        tokens = text.split()
        total_token = list()
        for k in range(max_len):
            word_seq = list()
            for j in range(max_char_len):
                try:
                    word_seq.append(char2idx[tokens[k][j]])
                except:
                    word_seq.append(char2idx["<PAD>"])
            total_token.append(word_seq)
        x_char.append(total_token)
    return x_char

In [21]:
# generate char codes for both train and test reviews
train_char = get_char_codes(text_list=train_text)
test_char = get_char_codes(text_list=test_text)

In [22]:
# create word based codes for each title name
def get_word_codes(text_list):
    x_word = list()
    for text in text_list:
        text = re.sub("[^a-zA-Z0-9,-;.!?:’’’/\|_@#$%ˆ&*˜‘+-=' ]", "", text)
        tokens = text.split()
        total_tokens=  list()
        for tok in tokens:
            if tok in word2idx.keys():
                total_tokens.append(word2idx[tok])
            else:
                total_tokens.append(word2idx["<UNK>"])
        x_word.append(total_tokens)
    return x_word

In [23]:
# generate word codes for both train and test reviews
train_word = get_word_codes(text_list=train_text)
test_word = get_word_codes(text_list=test_text)

In [24]:
train_word = tf.keras.preprocessing.sequence.pad_sequences(train_word, maxlen=256, padding="post", truncating="post")
test_word = tf.keras.preprocessing.sequence.pad_sequences(test_word, maxlen=256, padding="post", truncating="post")

In [25]:
train_word = np.array(train_word).reshape((len(train_word), 256))
test_word = np.array(test_word).reshape((len(test_word), 256))
train_char = np.array(train_char).reshape((len(train_char), 256, 8))
test_char = np.array(test_char).reshape((len(test_char), 256, 8))
y_train = np.array(y_train).reshape((len(y_train), 1))
y_test = np.array(y_test).reshape((len(y_test), 1))

In [26]:
train_word.shape, test_word.shape

((25000, 256), (25000, 256))

In [27]:
train_char.shape,  test_char.shape

((25000, 256, 8), (25000, 256, 8))

In [28]:
y_train.shape, y_test.shape
# train_label.shape, test_label.shape

((25000, 1), (25000, 1))

In [29]:
# input word tensor
word_in = tf.keras.Input(shape=(256, ))
emb_word = tf.keras.layers.Embedding(input_dim=len(word2idx)+2, output_dim=64, input_length=256)(word_in)

# input char tensor
char_in = tf.keras.Input(shape=(256, 8, ))
emb_char = tf.keras.layers.TimeDistributed(tf.keras.layers.Embedding(input_dim=len(char2idx)+2, output_dim=32, input_length=8))(char_in)

# LSTM to get word encodings by character
char_enc = tf.keras.layers.TimeDistributed(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=512, recurrent_dropout=0.4)))(emb_char)

# main BiLSTM block
merged = tf.keras.layers.concatenate([emb_word, char_enc])

# add another BiLSTM for ner task
main_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=128))(merged)

# add a time distribute layer to work it in parallel
out = tf.keras.layers.Dense(units=2, activation="softmax")(main_lstm)

# set the model together
model = tf.keras.Model([word_in, char_in], out)

In [30]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [31]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 256, 8)       0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 256)          0                                            
__________________________________________________________________________________________________
time_distributed (TimeDistribut (None, 256, 8, 32)   2272        input_2[0][0]                    
__________________________________________________________________________________________________
embedding (Embedding)           (None, 256, 64)      5078080     input_1[0][0]                    
__________________________________________________________________________________________________
time_distr

In [32]:
# model callback config
cbk = [
    tf.keras.callbacks.ModelCheckpoint(filepath='imdb_model.weights.best.hdf5', verbose = 1, save_best_only=True, save_weights_only=False),
    tf.keras.callbacks.EarlyStopping(patience=3)
]

In [33]:
model.fit([train_word, train_char], y_train, batch_size=32, epochs=20, verbose=1, validation_data=([test_word, test_char], y_test), callbacks=cbk)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/20
Epoch 00001: val_loss improved from inf to 0.44937, saving model to imdb_model.weights.best.hdf5
Epoch 2/20
Epoch 00002: val_loss did not improve from 0.44937
Epoch 3/20
Epoch 00003: val_loss did not improve from 0.44937
Epoch 4/20
Epoch 00004: val_loss did not improve from 0.44937


<tensorflow.python.keras.callbacks.History at 0x7f803dd17a90>