In [0]:

from keras.layers import Bidirectional, Dense, Embedding, Input, Lambda, LSTM, RepeatVector, TimeDistributed, Layer, Activation, Dropout, CuDNNLSTM
from keras.preprocessing.sequence import pad_sequences
from keras.layers.advanced_activations import ELU
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras import backend as K
from keras.models import Model
from scipy import spatial
import tensorflow as tf
import pandas as pd
import numpy as np
import codecs
import csv
import os

Using TensorFlow backend.


In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
!unzip 'drive/My Drive/train.csv.zip' 

Archive:  drive/My Drive/train.csv.zip
  inflating: train.csv               


In [0]:
BASE_DIR = 'drive/My Drive/'
TRAIN_DATA_FILE = 'train.csv'
GLOVE_EMBEDDING = BASE_DIR + 'Glove_Embeddings/glove.6B.300d.txt'
VALIDATION_SPLIT = 0.2
MAX_SEQUENCE_LENGTH = 25
MAX_NB_WORDS = 2000
EMBEDDING_DIM = 300

texts = [] 
with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        if len(values[3].split()) <= MAX_SEQUENCE_LENGTH:
            texts.append(values[3])
        if len(values[4].split()) <= MAX_SEQUENCE_LENGTH:
            texts.append(values[4])
print('Found %s texts in train.csv' % len(texts))
n_sents = len(texts)

Found 783944 texts in train.csv


In [0]:
tokenizer = Tokenizer(MAX_NB_WORDS+1, oov_token='unk') #+1 for 'unk' token
tokenizer.fit_on_texts(texts)
print('Found %s unique tokens' % len(tokenizer.word_index))
## **Key Step** to make it work correctly otherwise drops OOV tokens anyway!
tokenizer.word_index = {e:i for e,i in tokenizer.word_index.items() if i <= MAX_NB_WORDS} # <= because tokenizer is 1 indexed
tokenizer.word_index[tokenizer.oov_token] = MAX_NB_WORDS + 1
word_index = tokenizer.word_index #the dict values start from 1 so this is fine with zeropadding
index2word = {v: k for k, v in word_index.items()}
sequences = tokenizer.texts_to_sequences(texts)
data_1 = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data_1.shape)
NB_WORDS = (min(tokenizer.num_words, len(word_index))+1) #+1 for zero padding 

data_val = data_1[775000:783000]
data_train = data_1[:775000]

Found 91443 unique tokens
Shape of data tensor: (783944, 25)


In [0]:
print(data_train[0])

[   0    0    0    0    0    0    0    0    0    0    0    3    4    2
 1236   59 1236 2001    8  572    9  774  371    9   36]


In [0]:
for i in word_index:
  if word_index[i] == 2003:
    print(i)
print(word_index.items())
print(index2word)
print(texts[0])
print(sequences[0])
print(data_1[0])
print(tokenizer.num_words)

dict_items([('unk', 2001), ('the', 2), ('what', 3), ('is', 4), ('how', 5), ('a', 6), ('i', 7), ('to', 8), ('in', 9), ('do', 10), ('of', 11), ('are', 12), ('and', 13), ('can', 14), ('for', 15), ('you', 16), ('why', 17), ('best', 18), ('my', 19), ('it', 20), ('on', 21), ('does', 22), ('which', 23), ('some', 24), ('or', 25), ('be', 26), ('if', 27), ('get', 28), ('should', 29), ('with', 30), ('have', 31), ('that', 32), ('an', 33), ('your', 34), ('from', 35), ('india', 36), ('will', 37), ('people', 38), ('who', 39), ('like', 40), ('when', 41), ('good', 42), ('at', 43), ('there', 44), ('would', 45), ('between', 46), ('about', 47), ('as', 48), ('most', 49), ('quora', 50), ('one', 51), ('way', 52), ('make', 53), ('did', 54), ('not', 55), ('where', 56), ('we', 57), ('life', 58), ('by', 59), ('any', 60), ('was', 61), ('money', 62), ('so', 63), ('time', 64), ('after', 65), ('difference', 66), ('learn', 67), ('know', 68), ('they', 69), ("what's", 70), ('me', 71), ('new', 72), ('this', 73), ('has',

In [0]:
embeddings_index = {}
f = open(GLOVE_EMBEDDING, encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

glove_embedding_matrix = np.zeros((NB_WORDS+1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i < NB_WORDS+1: #+1 for 'unk' oov token
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            glove_embedding_matrix[i] = embedding_vector
        else:
            # words not found in embedding index will the word embedding of unk
            glove_embedding_matrix[i] = embeddings_index.get('unk')
print('Null word embeddings: %d' % np.sum(np.sum(glove_embedding_matrix, axis=1) == 0))

Found 400000 word vectors.
Null word embeddings: 2


In [0]:
print(embeddings_index['unk'])

[ 3.0071e-01 -4.6867e-01 -2.0617e-01 -8.0978e-01 -2.3889e-01  2.4329e-01
  1.6538e-02 -3.5687e-02 -2.2306e-01  9.5189e-01 -3.2273e-01  2.1980e-01
 -6.7524e-02 -3.7220e-01 -3.9718e-01 -4.3861e-01  1.1967e-01 -2.9964e-01
  2.8437e-02 -8.7544e-02  1.6569e-01 -4.9451e-01 -6.2011e-01 -1.6574e-01
 -9.7218e-02 -9.9474e-02 -8.0307e-02 -3.9338e-01 -2.4195e-01  3.2023e-01
 -5.3320e-01 -4.0184e-01 -6.7135e-01 -7.8561e-02  5.5546e-01  2.9997e-01
 -9.9650e-02 -6.7035e-01  1.2669e-01 -1.8618e-01 -6.2621e-02  4.5290e-01
  3.9265e-01  2.4121e-01 -4.1474e-01 -6.1890e-01 -1.0412e-01 -3.1043e-01
 -6.6788e-03 -8.3248e-01  6.5150e-01  9.0181e-01  2.4146e-02 -7.0766e-02
 -3.9580e-01 -3.6487e-01 -2.3929e-01 -1.5145e-01  2.0777e-01  5.4671e-01
 -2.5042e-01 -6.0142e-01 -5.4820e-01  7.7249e-03 -5.3288e-01  5.0325e-01
 -1.2712e-01  1.1989e-01 -6.4584e-01  3.5576e-01  1.7496e-01  1.1838e-01
 -3.2181e-01  7.4814e-02 -9.0381e-02 -2.9843e-01  1.6798e-02 -1.2735e-01
  7.3567e-01 -1.7335e-01  3.7123e-01  3.7979e-01 -5

In [0]:
batch_size = 100
max_len = MAX_SEQUENCE_LENGTH
emb_dim = EMBEDDING_DIM
latent_dim = 64
intermediate_dim = 256
epsilon_std = 1.0
kl_weight = 100         #CHANGE THIS
num_sampled=500
act = ELU()


x = Input(shape=(max_len,))
x_embed = Embedding(NB_WORDS+1, emb_dim, weights=[glove_embedding_matrix],
                            input_length=max_len, trainable=False)(x)
h = Bidirectional(CuDNNLSTM(intermediate_dim, return_sequences=False), merge_mode='concat')(x_embed)

z_mean = Dense(latent_dim)(h)
z_log_var = Dense(latent_dim)(h)

def sampling(args):
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(batch_size, latent_dim), mean=0.,
                              stddev=epsilon_std)
    return z_mean + K.exp(z_log_var / 2) * epsilon

# note that "output_shape" isn't necessary with the TensorFlow backend
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])
# we instantiate these layers separately so as to reuse them later
repeated_context = RepeatVector(max_len)
decoder_h = CuDNNLSTM(intermediate_dim, return_sequences=True)
decoder_mean = Dense(NB_WORDS+1, activation='linear') #softmax is applied in the seq2seqloss by tf #TimeDistributed()
h_decoded = decoder_h(repeated_context(z))
x_decoded_mean = decoder_mean(h_decoded)


# placeholder loss
def zero_loss(y_true, y_pred):
    return K.zeros_like(y_pred)

class CustomVariationalLayer(Layer):
    def __init__(self, **kwargs):
        self.is_placeholder = True
        super(CustomVariationalLayer, self).__init__(**kwargs)
        self.target_weights = tf.constant(np.ones((batch_size, max_len)), tf.float32)

    def vae_loss(self, x, x_decoded_mean):
        #xent_loss = K.sum(metrics.categorical_crossentropy(x, x_decoded_mean), axis=-1)
        labels = tf.cast(x, tf.int32)
        xent_loss = K.sum(tf.contrib.seq2seq.sequence_loss(x_decoded_mean, labels, 
                                                     weights=self.target_weights,
                                                     average_across_timesteps=False,
                                                     average_across_batch=False), axis=-1)#,
                                                     #softmax_loss_function=softmax_loss_f), axis=-1)#,
        kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
        xent_loss = K.mean(xent_loss)
        kl_loss = K.mean(kl_loss)
        return K.mean(xent_loss + kl_weight * kl_loss)

    def call(self, inputs):
        x = inputs[0]
        x_decoded_mean = inputs[1]
        print(x.shape, x_decoded_mean.shape)
        loss = self.vae_loss(x, x_decoded_mean)
        self.add_loss(loss, inputs=inputs)
        # we don't use this output, but it has to have the correct shape:
        return K.ones_like(x)
    
def kl_loss(x, x_decoded_mean):
    kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    kl_loss = kl_weight * kl_loss
    return kl_loss

loss_layer = CustomVariationalLayer()([x, x_decoded_mean])
vae = Model(x, [loss_layer])
opt = Adam(lr=0.01) 
vae.compile(optimizer='adam', loss=[zero_loss], metrics=[kl_loss])
vae.summary()












(?, 25) (100, 25, 2002)
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 25, 300)      600600      input_1[0][0]                    
_________________________________________________________________________________________

In [0]:
def create_model_checkpoint(dir, model_name):
    filepath = dir + '/' + model_name + ".h5" 
    directory = os.path.dirname(filepath)
    try:
        os.stat(directory)
    except:
        os.mkdir(directory)
    checkpointer = ModelCheckpoint(filepath=filepath, verbose=2, save_best_only=True)
    return checkpointer

checkpointer = create_model_checkpoint('drive/My Drive/models_250', 'vae_seq2seq_test_very_high_std')



vae.fit(data_train, data_train,
     shuffle=True,
     epochs=100,
     batch_size=batch_size,
     verbose = 2,
     validation_data=(data_val, data_val), callbacks=[checkpointer])

#print(K.eval(vae.optimizer.lr))
#K.set_value(vae.optimizer.lr, 0.01)

vae.save('drive/My Drive/models_250/vae_lstm.h5')
#vae.load_weights('models/vae_seq2seq_test.h5')

Train on 775000 samples, validate on 8000 samples
Epoch 1/100
 - 118s - loss: 61.8962 - kl_loss: 0.0049 - val_loss: 62.1455 - val_kl_loss: 0.0013

Epoch 00001: val_loss improved from inf to 62.14545, saving model to drive/My Drive/models_250/vae_seq2seq_test_very_high_std.h5
Epoch 2/100
 - 118s - loss: 61.5816 - kl_loss: 0.0052 - val_loss: 62.0417 - val_kl_loss: 6.0842e-04

Epoch 00002: val_loss improved from 62.14545 to 62.04171, saving model to drive/My Drive/models_250/vae_seq2seq_test_very_high_std.h5
Epoch 3/100
 - 118s - loss: 61.5199 - kl_loss: 0.0117 - val_loss: 62.0047 - val_kl_loss: 4.0861e-04

Epoch 00003: val_loss improved from 62.04171 to 62.00474, saving model to drive/My Drive/models_250/vae_seq2seq_test_very_high_std.h5
Epoch 4/100
 - 118s - loss: 61.4676 - kl_loss: 0.0016 - val_loss: 61.9890 - val_kl_loss: 2.6667e-04

Epoch 00004: val_loss improved from 62.00474 to 61.98897, saving model to drive/My Drive/models_250/vae_seq2seq_test_very_high_std.h5
Epoch 5/100
 - 118s

In [0]:
encoder = Model(x, z_mean)
encoder.save('drive/My Drive/models_250/encoder32dim512hid30kvocab_loss29_val34.h5')

# build a generator that can sample from the learned distribution
decoder_input = Input(shape=(latent_dim,))
_h_decoded = decoder_h(repeated_context(decoder_input))
_x_decoded_mean = decoder_mean(_h_decoded)
_x_decoded_mean = Activation('softmax')(_x_decoded_mean)
generator = Model(decoder_input, _x_decoded_mean)

In [0]:
index2word = {v: k for k, v in word_index.items()}
index2word[0] = 'pad'

#test on a validation sentence
sent_idx = 100
sent_encoded = encoder.predict(data_val[sent_idx:sent_idx+2,:])
x_test_reconstructed = generator.predict(sent_encoded, batch_size = 1)
reconstructed_indexes = np.apply_along_axis(np.argmax, 1, x_test_reconstructed[0])
#np.apply_along_axis(np.max, 1, x_test_reconstructed[0])
#np.max(np.apply_along_axis(np.max, 1, x_test_reconstructed[0]))
word_list = list(np.vectorize(index2word.get)(reconstructed_indexes))
print(' '.join(word_list))
original_sent = list(np.vectorize(index2word.get)(data_val[sent_idx]))
print(' '.join(original_sent))

pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad unk unk unk unk unk unk
pad pad pad pad pad pad pad pad pad pad pad pad pad pad pad can i go for the same career as my friend


In [0]:
# function to parse a sentence
def sent_parse(sentence, mat_shape):
    sequence = tokenizer.texts_to_sequences(sentence)
    padded_sent = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH)
    return padded_sent#[padded_sent, sent_one_hot]

# input: encoded sentence vector
# output: encoded sentence vector in dataset with highest cosine similarity
def find_similar_encoding(sent_vect):
    all_cosine = []
    for sent in sent_encoded:
        result = 1 - spatial.distance.cosine(sent_vect, sent)
        all_cosine.append(result)
    data_array = np.array(all_cosine)
    maximum = data_array.argsort()[-3:][::-1][1]
    new_vec = sent_encoded[maximum]
    return new_vec

# input: two points, integer n
# output: n equidistant points on the line between the input points (inclusive)
def shortest_homology(point_one, point_two, num):
    dist_vec = point_two - point_one
    sample = np.linspace(0, 1, num, endpoint = True)
    hom_sample = []
    for s in sample:
        hom_sample.append(point_one + s * dist_vec)
    return hom_sample

# input: original dimension sentence vector
# output: sentence text
def print_latent_sentence(sent_vect):
    sent_vect = np.reshape(sent_vect,[1,latent_dim])
    sent_reconstructed = generator.predict(sent_vect)
    sent_reconstructed = np.reshape(sent_reconstructed,[max_len,NB_WORDS+1])
    reconstructed_indexes = np.apply_along_axis(np.argmax, 1, sent_reconstructed)
    word_list = list(np.vectorize(index2word.get)(reconstructed_indexes))
    w_list = [w for w in word_list if w not in ['pad']]
    print(' '.join(w_list))
    #print(word_list)
     
def new_sents_interp(sent1, sent2, n):
    tok_sent1 = sent_parse(sent1, [MAX_SEQUENCE_LENGTH + 2])
    tok_sent2 = sent_parse(sent2, [MAX_SEQUENCE_LENGTH + 2])
    enc_sent1 = encoder.predict(tok_sent1, batch_size = 16)
    enc_sent2 = encoder.predict(tok_sent2, batch_size = 16)
    test_hom = shortest_homology(enc_sent1, enc_sent2, n)
    for point in test_hom:
        print_latent_sentence(point)

In [0]:
sentence1=['where can i find india']
mysent = sent_parse(sentence1, [MAX_SEQUENCE_LENGTH + 2])
mysent_encoded = encoder.predict(mysent, batch_size = 16)
print_latent_sentence(mysent_encoded)
print_latent_sentence(find_similar_encoding(mysent_encoded))

sentence2=['gogogo where can i find an extremely good restaurant endend']
mysent2 = sent_parse(sentence2, [MAX_SEQUENCE_LENGTH + 2])
mysent_encoded2 = encoder.predict(mysent2, batch_size = 16)
print_latent_sentence(mysent_encoded2)
print_latent_sentence(find_similar_encoding(mysent_encoded2))
print('-----------------')

# new_sents_interp(sentence1, sentence2, 5)

unk unk unk unk unk unk
unk unk unk unk unk unk
unk unk unk unk unk unk
unk unk unk unk unk unk
-----------------
