In [7]:
import numpy as np 
import tensorflow as tf 
import pandas as pd
import os
import utils
%matplotlib inline

In [8]:
# Load the raw entries from disk and extract every definition, together
# with the length (in words) of the longest one.
data_dir = "data_wordnet"
data = utils.read_dir(data_dir)
definitions, max_length = utils.get_definitions(data)

print(f"Size of vocabulary: {len(data)}")
print(f"Longest definition (words): {max_length}")
print(f"Number of definitions: {len(definitions)}")

# Lookup tables between the words used inside definitions and integer ids.
word2num, num2word = utils.get_word_dicts(definitions)
vocab_size = len(word2num)

print(f"Size of definition vocabulary: {vocab_size}")

# Encode each definition as integer ids, then pack them into one array
# (presumably padded to max_length — confirm against utils.defs_to_np).
def_vectors = utils.convert_word2int(definitions, word2num)
x_train = utils.defs_to_np(def_vectors, max_length)

print(x_train.shape)

Size of vocabulary: 117659
Longest definition (words): 54
Number of definitions: 135959
Size of definition vocabulary: 46948
(135959, 54)


In [9]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Input, Embedding, LSTM, RepeatVector, TimeDistributed, Lambda
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import categorical_crossentropy

'''
class LSTMAutoencoder(Model):
    def __init__(self, vocab_size, max_length):
        super(LSTMAutoencoder, self).__init__()
        #self.inputy = Input(shape=(None,), dtype="int32")
        self.embedding = Embedding(input_dim=50000, output_dim=64, input_length=max_length, mask_zero=True)
        self.encodingLSTM1 = LSTM(32, return_sequences=True)
        self.encodingLSTM2 = LSTM(16)
        self.repeatlayer = RepeatVector(max_length)
        self.decodingLSTM1 = LSTM(16, return_sequences=True)
        self.decodingLSTM2 = LSTM(32, return_sequences=True)
        self.denseboi = TimeDistributed(Dense(100, activation="relu"))
        self.finalDense = TimeDistributed(Dense(vocab_size, activation="softmax"))

    def call(self, inputs):
        #x = self.inputy(inputs)
        
        x = self.embedding(inputs)
        mask = self.embedding.compute_mask(inputs)
        x = self.encodingLSTM1(x, mask=mask)
        x = self.encodingLSTM2(x, mask=mask)
        x = self.repeatlayer(x)
        x = self.decodingLSTM1(x, mask=mask)
        x = self.decodingLSTM2(x, mask=mask)
        x = self.denseboi(x)
        x = self.finalDense(x)
        print(x.shape)
        return x

'''
#training embedding layer
'''
embedding = Sequential()
embedding.add(Embedding(input_dim=vocab_size+1, output_dim=64, input_length=max_length, mask_zero=True))


inputs = Input(shape=(None,))
embedding = Embedding(input_dim=vocab_size+1, output_dim=64, input_length=max_length, mask_zero=True)(inputs)
mask = Embedding(input_dim=50000, output_dim=64, input_length=max_length, mask_zero=True).compute_mask(inputs)
encodingLSTM1 = LSTM(32, return_sequences=True)(embedding, mask=mask)
encodingLSTM2 = LSTM(16)(encodingLSTM1, mask=mask)
repeatlayer = RepeatVector(max_length)(encodingLSTM2)
decodingLSTM1 = LSTM(16, return_sequences=True)(repeatlayer)
decodingLSTM2 = LSTM(32, return_sequences=True)(decodingLSTM1, mask=mask)
denseboi = TimeDistributed(Dense(100, activation="relu"))(decodingLSTM2)
finalDense = TimeDistributed(Dense(vocab_size, activation="softmax"))(denseboi)
output = finalDense
'''
# LSTM autoencoder over integer-encoded definitions, built with the Keras
# functional API.
#   encoder: Embedding -> LSTM(64, sequences) -> LSTM(32)
#   decoder: RepeatVector(max_length) -> LSTM(32) -> LSTM(64)
#            -> per-timestep Dense(100, relu) -> per-timestep softmax
inputs = Input(shape=(None,))

# Keep a handle on the Embedding layer so the padding mask is derived from
# the very layer that embeds the inputs. Previously a second, otherwise
# unused Embedding (with an unrelated input_dim=50000) was instantiated
# only to call compute_mask on it.
embedding_layer = Embedding(input_dim=vocab_size+1, output_dim=64, input_length=max_length, mask_zero=True)
embedding = embedding_layer(inputs)
# With mask_zero=True the mask marks the positions whose id is not 0.
mask = embedding_layer.compute_mask(inputs)

# Encoder: the second LSTM returns only its final state, i.e. one 32-dim
# vector per definition.
encodingLSTM1 = LSTM(64, return_sequences=True)(embedding, mask=mask)
encodingLSTM2 = LSTM(32)(encodingLSTM1, mask=mask)

# Decoder: feed the encoded vector at every one of max_length timesteps.
# The input-derived mask is passed explicitly to both decoder LSTMs.
repeatlayer = RepeatVector(max_length)(encodingLSTM2)
decodingLSTM1 = LSTM(32, return_sequences=True)(repeatlayer, mask=mask)
decodingLSTM2 = LSTM(64, return_sequences=True)(decodingLSTM1, mask=mask)

# Per-timestep classifier with vocab_size output classes.
denseboi = TimeDistributed(Dense(100, activation="relu"))(decodingLSTM2)
finalDense = TimeDistributed(Dense(vocab_size, activation="softmax"))(denseboi)
output = finalDense
# The optimizer and loss are created in the next cell.



In [10]:
# Wrap the functional graph in a Model and compile it with Adam and
# categorical cross-entropy, tracking accuracy as the only metric.
optimizer = Adam(learning_rate=0.0003)
model = Model(inputs=inputs, outputs=output)
model.compile(
    optimizer=optimizer,
    loss=categorical_crossentropy,
    metrics=["accuracy"],
)

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 64)     3004736     input_2[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_NotEqual (TensorFlo [(None, None)]       0           input_2[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     (None, None, 64)     33024       embedding[0][0]                  
______________________________________________________________________________________________

In [11]:
def to_one_hot(x, num_words=None):
    """Pair a tensor of word ids with its one-hot training target.

    Args:
        x: integer tensor of word ids. Id 0 ends up with an all-zero
            target row because its one-hot column is dropped.
        num_words: number of distinct non-zero word ids. Defaults to the
            notebook-level ``vocab_size`` so the target width always
            matches the model's softmax layer instead of relying on a
            hard-coded count.

    Returns:
        ``(x, label)`` where ``label`` has one extra trailing axis of size
        ``num_words``; word id ``k`` (k >= 1) is hot at index ``k - 1``.
    """
    if num_words is None:
        num_words = vocab_size
    # One column per id in 0..num_words.
    label = tf.one_hot(x, num_words + 1)
    # Drop the column for id 0; slicing the last axis keeps this correct
    # for inputs of any rank (e.g. already-batched sequences).
    return x, label[..., 1:]

def predict_embeddings(x):
    """Call ``embedding.predict`` on ``x`` and return the result twice.

    The same prediction object is returned in both positions of a 2-tuple.

    NOTE(review): the only live notebook-level ``embedding`` visible here
    is the output tensor of an Embedding layer, not a model; the Sequential
    ``embedding`` exists only inside a string literal. Calling this function
    as things stand looks like it would fail — confirm before use.
    NOTE(review): ``steps=54`` presumably mirrors max_length — confirm.
    """
    embedded = embedding.predict(x, steps=54)
    return embedded, embedded

In [12]:
# Cast the ids to int32, then build the input pipeline:
# per-definition (ids, one-hot target) pairs -> shuffle with a
# 1000-element buffer -> batches of 8.
x_train = x_train.astype("int32")
dataset = (
    tf.data.Dataset.from_tensor_slices(x_train)
    .map(to_one_hot)
    .shuffle(1000)
    .batch(8)
)

In [13]:
# Restore previously saved weights into the compiled model.
# NOTE(review): the checkpoint must have been written by a model with this
# same layer layout — confirm which run produced this file.
model.load_weights("32vec.5.weights.08.h5")

In [31]:
#model.fit(dataset, epochs=1, steps_per_epoch=100)

Train for 100 steps


In [None]:
# Write the model's current weights to an HDF5 file in the working directory.
model.save_weights("weights.h5")

In [14]:
# Run the model on the first 30 encoded definitions.
testkek = x_train[:30]
# NOTE(review): this rebinds `output`, which the model-building cells use
# for the symbolic output tensor; re-running `Model(inputs=inputs,
# outputs=output)` after this cell would receive the prediction array.
output = model.predict(testkek)

In [15]:
# Show the shape of the prediction array.
print(output.shape)

(30, 54, 46948)


In [16]:
# Most likely class at each timestep. The training targets drop the
# one-hot column for id 0, so class index k stands for word id k + 1.
predicted_classes = tf.argmax(output, axis=-1)
words = predicted_classes + 1
print(words.shape)

(30, 54)


In [19]:
# Look up the integer id assigned to the word "a".
print(word2num["a"])

97


In [17]:
# Print every sampled definition next to the model's reconstruction of it.
for decoded_ids, source_ids in zip(words.numpy().tolist(), testkek):
    # Original definition: words up to (not including) the first 0 id.
    print("definition: ", end =" ")
    length = 0
    for token in source_ids:
        if token == 0:
            break
        print(num2word[token], end = " ")
        length += 1

    print("\n")

    # Reconstruction: show at most as many words as the original had.
    print("encoded definition: ", end =" ")
    for token in decoded_ids[:length]:
        if token == 0:
            break
        print(num2word[token], end=" ")

    print("\n")

definition:  having the necessary means or skill or know-how or authority to do something 

encoded definition:  having the fact place or tendency or behavior or or or or something 

definition:  not having the necessary means or skill or know-how 

encoded definition:  not having the qualities force or accepted or rules 

definition:  facing away from the axis of an organ or organism 

encoded definition:  inheritance added from the axis of an organ or organism 

definition:  nearest to or facing toward the axis of an organ or organism 

encoded definition:  censorship to or depending toward the end of an organ or sounds 

definition:  facing or on the side toward the apex 

encoded definition:  facing or on the side during the apex 

definition:  facing or on the side toward the base 

encoded definition:  facing or on the side during the base 

definition:  especially of muscles 

encoded definition:  especially of muscles 

definition:  drawing away from the midline of the body or 

In [37]:
# List the GPU devices TensorFlow can see.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]