In [None]:
import numpy as np 
import tensorflow as tf 
import pandas as pd
import os
import utils

%matplotlib inline

# Authenticate with Weights & Biases WITHOUT embedding the API key in the
# notebook. wandb.login() picks the key up from the WANDB_API_KEY environment
# variable or from a previous `wandb login`, and otherwise prompts for it.
# SECURITY: a key was previously committed on this line in plain text; treat
# that key as compromised and revoke it in the W&B account settings.
import wandb
from wandb.keras import WandbCallback

wandb.login()

# Open the run that the rest of the notebook logs to; `config` is the run's
# hyperparameter store (filled in by a later cell).
wandb.init(project="USE+decoder-esc190")
config = wandb.config


In [None]:
# Read the raw dictionary entries; a later cell unpacks each entry of `data`
# as a (word, definitions, _) triple.
data_dir = "data_wordnet"
data = utils.read_dir(data_dir)
definitions, max_length, wordmap = utils.get_definitions(data)

print("Size of vocabulary: {}".format(len(data)))
print("Longest definition (words): {}".format(max_length))
print("Number of definitions: {}".format(len(definitions)))

# Two mappings built from the definitions; num2word is indexed by integer id
# below and when decoding model output. (Presumably word -> id and id -> word;
# confirm against utils.get_word_dicts.)
word2num, num2word = utils.get_word_dicts(definitions)

# Make id 0 decode to the empty string.
# NOTE(review): presumably 0 is the padding id written by utils.defs_to_np -- confirm.
num2word[0] = ""
vocab_size = len(list(word2num.keys()))

# Printed before the `+= 1` at the end of this cell, so this figure counts
# only the entries of word2num.
print("Size of definition vocabulary: {}".format(vocab_size))

def_vectors = utils.convert_word2int(definitions, word2num)

# x_train is what the tf.data pipeline further down slices into examples.
x_train = utils.defs_to_np(def_vectors, max_length)

print(x_train.shape)
# From here on vocab_size is one larger than len(word2num); it is used as the
# one-hot depth and the softmax width, presumably to leave room for id 0.
vocab_size += 1

In [None]:
import tensorflow_hub as hub
# Load the Universal Sentence Encoder (v4) from TF Hub. Nothing is embedded
# here; later cells call `embed` on definition strings to get their vectors.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")


In [None]:

import pickle

# Embed the FIRST definition of every word that has at least one definition.
# The loop variable is deliberately not called `definitions`: that name holds
# the full definition list built by the data-loading cell and must not be
# overwritten with a single word's definitions.
word2vec = {}
for word, word_definitions, _ in data:
    if word_definitions:
        embedding = embed([word_definitions[0]])
        # The encoder returns one row per input string; flatten that single
        # row to a plain 512-vector.
        word2vec[word] = np.reshape(embedding, (512,))

print("Embedded {} words".format(len(word2vec)))

# Reverse index: embedding (as a hashable tuple of floats) -> word.
vec2word = {tuple(vector): word for word, vector in word2vec.items()}

# Persist both mappings. `with` guarantees the files are flushed and closed.
with open("vec2word512", "wb") as handle:
    pickle.dump(vec2word, handle)
with open("word2vec512", "wb") as handle:
    pickle.dump(word2vec, handle)

In [None]:
# Write the word -> embedding mapping to disk; the context manager makes sure
# the file is flushed and closed even if pickling fails part-way.
with open("word2vec512", "wb") as handle:
    pickle.dump(word2vec, handle)

In [None]:
# Decoder network: consumes a (max_length, 512) sequence of embedding vectors
# and emits, for every timestep, a softmax over the definition vocabulary.

from tensorflow.keras.layers import (
    Dense,
    Embedding,
    Input,
    LSTM,
    Lambda,
    Masking,
    RepeatVector,
    TimeDistributed,
)
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam

inputlayer = Input(shape=(max_length, 512))
# Masking() is left at its defaults (Keras' default mask value is 0.0), which
# lines up with the zeroed-out padding rows produced by parse_defs.
masking_layer = Masking()(inputlayer)
decodingLSTM1 = LSTM(32, return_sequences=True)(masking_layer)
decodingLSTM2 = LSTM(64, return_sequences=True)(decodingLSTM1)
# The same dense layers are applied independently at every timestep.
denseboi = TimeDistributed(Dense(100, activation="relu"))(decodingLSTM2)
finalDense = TimeDistributed(Dense(vocab_size, activation="softmax"))(denseboi)
output = finalDense

model = Model(inputs=inputlayer, outputs=output)
optimizer = Adam(learning_rate=3e-4)
model.compile(
    loss=categorical_crossentropy,
    optimizer=optimizer,
    metrics=["accuracy"],
)

model.summary()

In [None]:
import tensorflow_hub as hub

# The Universal Sentence Encoder is already loaded by an earlier cell, and
# loading it again is slow and memory-hungry. Load it here only when this cell
# is run without that earlier cell having been executed.
try:
    embed
except NameError:
    embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [None]:
import tensorflow_hub as hub
from functools import partial

def parse_defs(definition_vec, model, num2word, vocab_size, separator=" "):
    """Turn one padded sequence of word ids into a (decoder input, label) pair.

    The ids are decoded back to words, joined into one sentence, embedded with
    ``model``, and the resulting sentence vector is repeated at every
    non-padding timestep (padding timesteps are all zeros).

    Args:
        definition_vec: 1-D integer tensor of word ids. Positions whose id is
            not greater than 0 are treated as padding.
        model: callable that maps a 1-D string tensor to one embedding per
            string; the single returned embedding must hold 512 values.
        num2word: table exposing ``lookup(ids) -> string tensor``.
        vocab_size: depth of the one-hot labels.
        separator: string placed between consecutive words when rebuilding the
            sentence. Defaults to a single space so the encoder sees normal
            text. Pass ``""`` to reproduce the previous behaviour, in which
            the words were concatenated with nothing between them; weights
            trained on that input may need retraining with the new default.

    Returns:
        ``(embedded, label)``: ``embedded`` has one 512-wide row per position
        of ``definition_vec`` (zero rows at padding positions); ``label`` is
        ``tf.one_hot(definition_vec, vocab_size)``.
    """
    words = num2word.lookup(definition_vec)

    # Collapse the word tensor into a single-element string batch. Ids that
    # decode to "" (e.g. padding) leave stray separators at the ends, so strip
    # them before embedding.
    sentence = tf.strings.reduce_join(
        words, axis=0, keepdims=True, separator=separator
    )
    sentence = tf.strings.strip(sentence)

    # Pin the static shape to one row of 512 features.
    sentence_vec = tf.reshape(model(sentence), shape=(1, 512))

    # (len, 1) mask: 1 at real words, 0 at padding. Broadcasting it against the
    # (1, 512) sentence vector repeats the vector at every real position and
    # zeroes the padding rows in one step.
    keep = tf.expand_dims(tf.cast(definition_vec > 0, sentence_vec.dtype), axis=-1)
    embedded_tens = sentence_vec * keep

    label = tf.one_hot(definition_vec, vocab_size)
    return embedded_tens, label



In [None]:
# Mirror the Python num2word dict as an in-graph lookup table so that ids can
# be decoded to words inside the tf.data pipeline.
# NOTE(review): tf.constant infers the key dtype from the Python keys; the ids
# passed to .lookup() must have that same dtype -- confirm against x_train.

keys = list(num2word)
values = list(num2word.values())

tf_num2word = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(
        keys=tf.constant(keys),
        values=tf.constant(values, dtype=tf.string),
    ),
    default_value=tf.constant(""),  # ids missing from the table decode to ""
    name="num2wordlookup",
)

In [None]:
# Training hyperparameters, stored on the W&B config object so they are
# recorded with the run. batch_size is read when the dataset is batched and
# epochs when model.fit is called.
config.batch_size = 16
#config.steps_per_epoch = 100
config.epochs = 8

In [None]:
# Input pipeline: one example per row of x_train, converted by parse_defs into
# a (masked sentence-embedding sequence, one-hot label) pair, then shuffled
# with a 1000-element buffer and batched.
model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
dataset = tf.data.Dataset.from_tensor_slices(x_train)
print(vocab_size)
dataset = (
    dataset
    .map(
        partial(
            parse_defs,
            model=embed,
            num2word=tf_num2word,
            vocab_size=vocab_size,
        )
    )
    .shuffle(1000)
    .batch(config.batch_size)
)

In [None]:
# Resume from previously saved weights. The file is resolved relative to the
# working directory and must have been saved from a model with the same
# architecture as the one built above.
model.load_weights("USED.FIXED1.weights.08.h5")

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Weights-only checkpoint callback; the epoch number is formatted into the
# file name, so each epoch gets its own file.
# NOTE(review): save_best_only is not set, so `monitor`/`mode` presumably do
# not influence which epochs are saved -- confirm this is intended.
save_model = ModelCheckpoint(
    filepath="USED.FIXED2.weights.{epoch:02d}.h5",
    monitor='accuracy',
    mode='auto',
    save_weights_only=True,
    verbose=1,
)

In [None]:
# Name the project explicitly so that, if this call starts a fresh run, it is
# filed under the same project as the run opened in the setup cell instead of
# falling back to W&B's default project.
wandb.init(project="USE+decoder-esc190")

# Train; WandbCallback streams metrics to the run and save_model writes a
# weights checkpoint after every epoch.
model.fit(dataset, epochs=config.epochs, callbacks=[WandbCallback(), save_model])

In [None]:
def output2string(words, num2word):
    """Render sequences of word ids as text.

    Args:
        words: iterable of sentences, each an iterable of ids that are valid
            keys of ``num2word``.
        num2word: mapping from id to word string.

    Returns:
        One string in which every word is followed by a single space and every
        sentence is followed by a newline.
    """
    pieces = []
    for sentence in words:
        pieces.extend(num2word[word] + " " for word in sentence)
        pieces.append("\n")
    return "".join(pieces)

In [None]:
# Sanity check: decode the one-hot labels of every batch back to text.
# `dataset` is batched, so `y` is (batch, sequence, vocab) and the argmax is
# already a list of sentences (one list of ids per example). It is therefore
# passed to output2string as-is; wrapping it in another list would make
# output2string index num2word with a whole sentence instead of a single id.
for x, y in dataset:
    words = tf.keras.backend.argmax(y, axis=-1)
    words = words.numpy().tolist()
    print(output2string(words, num2word))