In [6]:
from keras.layers import Embedding
from keras.datasets import imdb
from keras import preprocessing
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding
from keras.callbacks import EarlyStopping
import os
import numpy as np

- Embedding Layer
    - Accepts 2D Tensor input: <`num_samples`, `sequence_length`>
    - `sequence_length` should be of fixed (same) length per batch of `samples`
    - Initialized with random weights (internal dictionary of token weights), would get optimized through `backprop`

In [2]:
max_len = 20
max_features = 10000

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features)
# X_train[:5]

In [3]:
X_train = preprocessing.sequence.pad_sequences(X_train, maxlen=max_len)
X_test = preprocessing.sequence.pad_sequences(X_test, maxlen=max_len)

In [4]:
model = Sequential()

model.add(Embedding(max_features,8,input_length=max_len))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [5]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________


In [15]:
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/20
 - 3s - loss: 0.6651 - acc: 0.6228 - val_loss: 0.6016 - val_acc: 0.7052
Epoch 2/20
 - 1s - loss: 0.5009 - acc: 0.7820 - val_loss: 0.5094 - val_acc: 0.7440
Epoch 3/20
 - 1s - loss: 0.3895 - acc: 0.8388 - val_loss: 0.4942 - val_acc: 0.7498
Epoch 4/20
 - 1s - loss: 0.3155 - acc: 0.8804 - val_loss: 0.5008 - val_acc: 0.7482
Epoch 5/20
 - 1s - loss: 0.2558 - acc: 0.9113 - val_loss: 0.5174 - val_acc: 0.7482
Epoch 6/20
 - 1s - loss: 0.2065 - acc: 0.9347 - val_loss: 0.5428 - val_acc: 0.7430
Epoch 7/20
 - 1s - loss: 0.1653 - acc: 0.9557 - val_loss: 0.5696 - val_acc: 0.7392
Epoch 8/20
 - 1s - loss: 0.1317 - acc: 0.9689 - val_loss: 0.6048 - val_acc: 0.7372
Epoch 9/20
 - 2s - loss: 0.1045 - acc: 0.9784 - val_loss: 0.6395 - val_acc: 0.7350
Epoch 10/20
 - 1s - loss: 0.0827 - acc: 0.9866 - val_loss: 0.6774 - val_acc: 0.7306
Epoch 11/20
 - 1s - loss: 0.0647 - acc: 0.9908 - val_loss: 0.7194 - val_acc: 0.7318
Epoch 12/20
 - 1s - loss: 0.0505 - a

- Above network `flatten`s the embedded input (3D Float Tensor) before feeding it to the `Dense` layer. This essentially doesn't contain any sequential information, which means no context about inter-word semantics is used for training the network
- 1D Convnets or RNNs consider the sematinc information

### Using pre-trained word embeddings

In [16]:
def load_imdb_raw_data(imdb_dir):
    
    texts = []; labels = []
    
    train_dir = os.path.join(imdb_dir, 'train')
    
    for label_type in ['pos', 'neg']:
        files_dir = os.path.join(train_dir, label_type)
        
        for fname in os.listdir(files_dir):
            if fname[-4:] == '.txt':
         
                f = open(os.path.join(files_dir,fname))
                texts.append(f.read())
                f.close()
                
                if label_type == 'neg':
                    labels.append(0)
                else:
                    labels.append(1)
    
    return (texts, labels)

In [17]:
# os.getcwd()
texts, labels = load_imdb_raw_data('../data/aclImdb/aclImdb/')
print("texts: {} labels: {}".format(len(texts), len(labels)))

texts: 25000 labels: 25000


In [18]:
max_len = 100

train_samples = 20000
validation_samples = 5000

tokenizer = preprocessing.text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
data = preprocessing.sequence.pad_sequences(sequences, maxlen=max_len)
labels = np.asarray(labels)

print("data: {} labels: {}".format(data.shape, labels.shape))

data: (25000, 100) labels: (25000,)


In [19]:
# shuffle data indexes to shuffle pos/neg

indices = np.arange(data.shape[0])
np.random.shuffle(indices)

data = data[indices]
labels = labels[indices]

In [20]:
X_train = data[:train_samples]
y_train = labels[:train_samples]

X_test = data[train_samples:train_samples+validation_samples]
y_test = labels[train_samples:train_samples+validation_samples]

In [21]:
def create_glove_based_dict(path='../data/glove.6B/', file='glove.6B.100d.txt'):
    
    embedding_index = {}
    f = open(os.path.join(path,file))
    
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32') # word vector representation
        embedding_index[word] = coefs # assigned to the word/token at the index i of dictionary (i - when enumerated)
    
    f.close()
    print("Found {} word vectors".format(len(embedding_index)))

    return embedding_index

In [22]:
embedding_index = create_glove_based_dict()

Found 400000 word vectors


In [23]:
len(embedding_index['another']) # 100-d vector representation of each word in the vocabulary

100

In [24]:
embedding_index['movie']

array([ 0.38251  ,  0.14821  ,  0.60601  , -0.51533  ,  0.43992  ,
        0.061053 , -0.62716  , -0.025385 ,  0.1643   , -0.22101  ,
        0.14423  , -0.37213  , -0.21683  , -0.08895  ,  0.097904 ,
        0.6561   ,  0.64455  ,  0.47698  ,  0.83849  ,  1.6486   ,
        0.88922  , -0.1181   , -0.012465 , -0.52082  ,  0.77854  ,
        0.48723  , -0.014991 , -0.14127  , -0.34747  , -0.29595  ,
        0.1028   ,  0.57191  , -0.045594 ,  0.026443 ,  0.53816  ,
        0.32257  ,  0.40788  , -0.043599 , -0.146    , -0.48346  ,
        0.32036  ,  0.55086  , -0.76259  ,  0.43269  ,  0.61753  ,
       -0.36503  , -0.60599  , -0.79615  ,  0.3929   , -0.23668  ,
       -0.34719  , -0.61201  ,  0.54747  ,  0.94812  ,  0.20941  ,
       -2.7771   , -0.6022   ,  0.8495   ,  1.2549   ,  0.017893 ,
       -0.041901 ,  2.1147   , -0.026618 , -0.28104  ,  0.68124  ,
       -0.14165  ,  0.99249  ,  0.49879  , -0.67538  ,  0.6417   ,
        0.42303  , -0.27913  ,  0.063403 ,  0.68909  , -0.3618

In [25]:
word_index # word_index created by the tokenizer based on the imdb texts

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'he': 26,
 'be': 27,
 'one': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'who': 34,
 'so': 35,
 'from': 36,
 'like': 37,
 'her': 38,
 'or': 39,
 'just': 40,
 'about': 41,
 "it's": 42,
 'out': 43,
 'has': 44,
 'if': 45,
 'some': 46,
 'there': 47,
 'what': 48,
 'good': 49,
 'more': 50,
 'when': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'she': 56,
 'even': 57,
 'my': 58,
 'would': 59,
 'which': 60,
 'only': 61,
 'story': 62,
 'really': 63,
 'see': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'were': 68,
 'me': 69,
 'well': 70,
 'than': 71,
 'we': 72,
 'much': 73,
 'been': 74,
 'bad': 75,
 'get': 76,
 'will': 77,
 'do': 78,
 'also': 79,
 'into': 80,
 'people': 81,
 'other': 82,
 '

In [26]:
embedding_dim = 100

embedding_matrix = np.zeros((max_features, embedding_dim))

for word, idx in word_index.items():
    
    if idx < max_features:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector

In [27]:
model = Sequential()
model.add(Embedding(max_features, embedding_dim, input_length=max_len))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_2 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________


In [28]:
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

In [29]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_2 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 320,065
Non-trainable params: 1,000,000
_________________________________________________________________


In [30]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['acc'])
history = model.fit(X_train, y_train, epochs=20,batch_size=32,validation_data=(X_test, y_test))
model.save_weights('pre_trained_glove_model.h5')

Train on 20000 samples, validate on 5000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
