# Embeddings

In [None]:
import os
import numpy as np

In [None]:
import matplotlib.pyplot as plt
import pickle

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, GRU, Dense, MaxPooling1D, Conv1D, Dropout
from keras.initializers import Constant
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import TensorBoard, ModelCheckpoint
from keras.models import load_model
from keras import regularizers

In [None]:
def load_pretrained_word_vectors(path='../../data/glove.6B.50d.txt'):
    """Load pre-trained GloVe word vectors from a text file.

    Every usable line holds a token followed by its whitespace-separated
    vector components. Lines that do not contain both parts (for example a
    blank line) are skipped instead of aborting the whole load.

    Args:
        path: Location of the GloVe text file. Defaults to the file this
            notebook originally hard-coded, so existing zero-argument calls
            behave as before.

    Returns:
        dict mapping each word to a 1-D float32 NumPy array of coefficients.
    """

    print('[INFO] Loading word vectors...')
    embeddings_index = {}
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.split(maxsplit=1)
            if len(parts) != 2:
                # No token/vector pair on this line; nothing to store.
                continue
            word, coefs = parts
            # 'f' is NumPy's type code for single-precision floats.
            embeddings_index[word] = np.fromstring(coefs, 'f', sep=' ')
    print('[INFO] Found {:,} word vectors.'.format(len(embeddings_index)))
    return embeddings_index

In [None]:
# Directory holding the caption/label .npy arrays loaded below.
# It is a relative path, so it resolves against the current working directory.
data_path = '../../data/'

In [None]:
# Load the caption inputs (X_*) and their targets (y_*) for the train and
# validation splits in one pass.
# NOTE(review): allow_pickle=True unpickles arbitrary objects, so these files
# must come from a trusted source.
X_text_train, X_text_val, y_text_train, y_text_val = (
    np.load(os.path.join(data_path, array_file), allow_pickle=True)
    for array_file in (
        'X_text_train.npy',
        'X_text_val.npy',
        'y_text_train.npy',
        'y_text_val.npy',
    )
)

In [None]:
# Convert the loaded training captions to a plain Python list; later cells
# iterate over it and call str.split() on each caption.
train_captions = X_text_train.tolist()

In [None]:
# Display the first training caption as a quick sanity check.
train_captions[0]

In [None]:
# Distinct whitespace-delimited tokens across all training captions.
caption_words = list({
    token
    for text in train_captions
    for token in text.split()
})
print('[INFO] {:,} words in the dev captions'.format(len(caption_words)))

In [None]:
# VECTORIZATION

print('[INFO] Vectorize the captions into a 2D integer tensor...')
# Build the word -> integer index vocabulary from the training captions only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_captions)
# Disabled: persisting the fitted tokenizer to disk.
# NOTE(review): `config` is not defined anywhere in the visible notebook;
# confirm where it should come from before re-enabling.
# with open(config.CAPTIONS_EMBEDDINGS_TOKENIZER, 'wb') as f:
#     pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Sequences
# Encode every training caption as a list of integer word indices using the
# fitted tokenizer.
train_sequences = tokenizer.texts_to_sequences(train_captions)

In [None]:
# Padding
# Vocabulary learned by the tokenizer (word -> integer index) and its size.
word_index = tokenizer.word_index
MAX_NUM_WORDS = len(word_index)
# Measure the encoded sequences rather than whitespace-split captions: the
# tokenizer also splits on its filter characters (e.g. '-'), so a caption can
# yield more tokens than caption.split() reports, and pad_sequences would then
# silently truncate it.
MAX_SEQUENCE_LENGTH = max(len(sequence) for sequence in train_sequences)
# Disabled: persisting the sequence length to disk.
# NOTE(review): `config` is not defined anywhere in the visible notebook;
# confirm where it should come from before re-enabling.
# with open(config.CAPTIONS_MAX_SEQUENCE_LENGTH, 'w') as f:
#     f.write(str(MAX_SEQUENCE_LENGTH))

In [None]:
# X and Y TRAIN
# Bring every encoded caption to exactly MAX_SEQUENCE_LENGTH entries.
# pad_sequences is called with its default padding/truncation settings.
X_train = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
# Training targets as a NumPy array.
y_train = np.array(y_text_train)

In [None]:
# X and Y validation
print('[INFO] Preprocessing validation captions...')
val_captions = X_text_val.tolist()
# Reuse the tokenizer fitted on the training captions and the same maxlen, so
# validation inputs share the training vocabulary and tensor width.
validation_sequences = tokenizer.texts_to_sequences(val_captions)
X_val = pad_sequences(validation_sequences, maxlen=MAX_SEQUENCE_LENGTH)
# Validation targets as a NumPy array.
y_val = np.array(y_text_val)

In [None]:
# Read the pre-trained word vectors into a word -> vector dictionary.
# The loader prints its own progress messages as well.
print('[INFO] Loading word vectors')
embeddings_index = load_pretrained_word_vectors()

In [None]:
# Width of each word vector; must match the vectors in embeddings_index.
EMBEDDING_DIM = 50

print('[INFO] Creating the embeddings matrix...')
# One row per vocabulary index plus one extra row, because the matrix is
# indexed directly by the tokenizer's word indices.
num_words = 1 + min(MAX_NUM_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i <= MAX_NUM_WORDS:
        embedding_vector = embeddings_index.get(word)
        # Words without a pre-trained vector keep their all-zero row.
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
print('[INFO] Embedding Matrix\'s shape is {}'.format(embedding_matrix.shape))

In [None]:
# MODEL

# Load the pre-trained word embeddings into an Embedding layer whose weights
# start out as embedding_matrix.
# trainable=False keeps those embeddings fixed during training.
# NOTE(review): the `input_length` keyword is release-specific in Keras;
# confirm it is accepted by the installed version.
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [None]:
# Hyperparameters consumed by the model, optimizer and fit cells.
# Number of GRU units.
NUM_UNITS = 32
# Dropout rates passed to the GRU layer (inputs and recurrent state).
DROPOUT = 0.5
RECURRENT_DROPOUT = 0.5
# Adam learning rate.
LEARNING_RATE = 1e-3
# Number of training epochs.
NUM_EPOCHS = 1
# Learning-rate decay handed to Adam.
# NOTE(review): the literal 1e-3 equals LEARNING_RATE today; presumably this
# was meant to be LEARNING_RATE / NUM_EPOCHS, so confirm before changing either.
DECAY = 1e-3 / NUM_EPOCHS

In [None]:
print('[INFO] Training GRU model...')

# Integer word indices -> embedding lookup -> GRU.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = GRU(
    units=NUM_UNITS,
    dropout=DROPOUT,
    recurrent_dropout=RECURRENT_DROPOUT,
    # Emit only the last output instead of one output per timestep.
    return_sequences=False,
)(embedded_sequences)

# Fully connected head: three ReLU layers, each followed by dropout.
x = Dense(256, activation="relu")(x)
x = Dropout(0.5, seed=42)(x)
x = Dense(128, activation="relu")(x)
x = Dropout(0.5, seed=42)(x)
x = Dense(64, activation="relu")(x)
x = Dropout(0.25, seed=42)(x)

# Single sigmoid unit: one value in (0, 1) per caption.
preds = Dense(1, activation='sigmoid')(x)
model = Model(sequence_input, preds)

print('[INFO] Model\'s Summary')
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
model.summary()

In [None]:
# COMPILE

print('[INFO] Compiling model...')

# Optimizer: Adam configured with LEARNING_RATE and DECAY.
# NOTE(review): the `lr` and `decay` keyword names are release-specific in
# Keras; confirm they match the installed version.
opt = Adam(lr=LEARNING_RATE, decay=DECAY)

# Mean squared error is the training loss; MSE, MAE and MAPE are additionally
# reported as metrics.
model.compile(
    loss='mean_squared_error',
    optimizer=opt,
    metrics=['mse', 'mae', 'mape'],
)

In [None]:
# FIT

print('[INFO] Fitting model...')

tensorboard = TensorBoard(log_dir='../../logs')

# Make sure the checkpoint directory exists before the first save is attempted.
checkpoint_dir = '../../checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

# Keep only checkpoints that improve on the best validation loss so far.
# The model is compiled with plain mean squared error as its loss and has no
# regularization terms, so 'val_loss' is the same quantity as the validation
# MSE metric. Unlike the metric key, 'val_loss' does not depend on how a given
# Keras release names the 'mse' metric.
checkpoints = ModelCheckpoint(
    os.path.join(
        checkpoint_dir,
        'weights-{epoch:02d}-{val_loss:.10f}.hdf5'),
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
    period=1,
)

# shuffle=False feeds the training samples in their stored order every epoch.
H = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=NUM_EPOCHS,
    shuffle=False,
    batch_size=32,
    use_multiprocessing=True,
    workers=8,
    callbacks=[
        tensorboard,
        checkpoints,
    ],
)

In [None]:
# Display the per-epoch loss/metric values recorded by model.fit.
H.history

In [None]:
# PLOT TRAINING LOSS vs MSE

history = H.history
# Size the x-axis from what was actually recorded rather than NUM_EPOCHS, so
# the plot stays valid if training stopped early.
epochs_run = np.arange(0, len(history["loss"]))
# The recorded key for the 'mse' metric differs between Keras releases.
mse_key = "mean_squared_error" if "mean_squared_error" in history else "mse"

plt.style.use("ggplot")
plt.figure()
plt.plot(epochs_run, history["loss"], label="train_loss")
plt.plot(epochs_run, history["val_loss"], label="val_loss")
plt.plot(epochs_run, history[mse_key], label="train_MSE")
plt.plot(epochs_run, history["val_" + mse_key], label="val_MSE")
plt.title("Training Loss and MSE")
plt.xlabel("Epoch #")
plt.ylabel("Loss/MSE")
plt.legend()

# The original target used `config.RUN_LOG_FOLD_DIR.format(fold)`, but neither
# `config` nor `fold` is defined in this notebook. The figure is written next
# to the TensorBoard logs instead.
plot_dir = '../../logs'
os.makedirs(plot_dir, exist_ok=True)
plt.savefig(os.path.join(plot_dir, 'embeddings_loss_vs_MSE.png'))

In [None]:
print('[INFO] Predicting values...')
# The stray indentation on this statement was a syntax error; it belongs at
# top level. flatten() turns the model output into a 1-D array of predictions.
predicted = model.predict(X_val).flatten()