# Initial Model Testing

## Imports

In [None]:
import string

import tensorflow as tf

from fun_with_words import data as fd

## Check for a GPU

In [None]:
# List every physical device TensorFlow can see. No device_type filter is
# passed, so CPU entries are listed as well; look for a GPU entry in the output.
tf.config.list_physical_devices()

## Make basic layers needed for model

In [None]:
class Tokenizer(tf.keras.layers.Layer):
    """Keras layer that breaks each input string into tokens.

    Splitting is delegated to ``tf.strings.split`` with its default
    separator.
    """

    def __init__(self, *args, **kwargs):
        # Stateless layer: everything is forwarded to the base Layer.
        super().__init__(*args, **kwargs)

    def call(self, inputs):
        """Return the tokens of every string in ``inputs``."""
        tokens = tf.strings.split(inputs)
        return tokens

    def compute_output_shape(self, input_shapes):
        """Append one unknown-length axis (the tokens) to the input shape."""
        return tuple(input_shapes) + (None,)

class CharacterSplitter(tf.keras.layers.Layer):
    """Keras layer that splits every string into single bytes.

    The ragged result of ``tf.strings.bytes_split`` is converted to a dense
    tensor with ``to_tensor()``, so shorter entries are padded.
    """

    def __init__(self, *args, **kwargs):
        # Stateless layer: everything is forwarded to the base Layer.
        super().__init__(*args, **kwargs)

    def call(self, inputs):
        """Return the per-byte split of ``inputs`` as a dense tensor."""
        ragged_bytes = tf.strings.bytes_split(inputs)
        return ragged_bytes.to_tensor()

    def compute_output_shape(self, input_shapes):
        """Append one unknown-length axis (the bytes) to the input shape."""
        return tuple(input_shapes) + (None,)


In [None]:
# Default character vocabulary: ASCII letters, digits and punctuation.
VOCAB = string.ascii_letters + string.digits + string.punctuation


class Encoder(tf.keras.layers.Layer):
    """Map string keys (single characters) to integer ids with a static table.

    Id layout:
        0                        -- the empty string, i.e. the padding that
                                    ``RaggedTensor.to_tensor()`` inserts
        1                        -- the ``OOV`` token and every key that is
                                    not in the table
        2 .. len(vocabulary) + 1 -- the vocabulary entries, in order

    Previously the ``OOV`` argument was ignored (the key was hard-coded) and
    unknown keys fell back to id 0, which made them indistinguishable from
    padding and left id 1 unreachable for real characters.

    Args:
        vocabulary: iterable of unique string keys (a ``str`` is treated as a
            sequence of characters).
        OOV: token reserved for out-of-vocabulary input; it is mapped to id 1.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.

    Raises:
        ValueError: if the padding key, ``OOV`` and the vocabulary entries are
            not pairwise distinct.
    """

    def __init__(self, vocabulary=VOCAB, OOV="OOV", **kwargs):
        super().__init__(**kwargs)
        self.vocabulary = vocabulary
        self.OOV = OOV

        # Position in this list is the id: "" -> 0, OOV -> 1, vocabulary -> 2...
        keys = ["", OOV, *vocabulary]
        if len(set(keys)) != len(keys):
            # Fail early with a readable message instead of an opaque
            # table-initialisation error.
            raise ValueError(
                "vocabulary entries must be unique and must not contain the "
                "empty string or the OOV token"
            )

        self.lookup = tf.lookup.StaticHashTable(
            initializer=tf.lookup.KeyValueTensorInitializer(
                keys=keys,
                values=list(range(len(keys))),
            ),
            # Anything that is not a known key is out-of-vocabulary.
            default_value=1,
        )

    def compute_output_shape(self, input_shape):
        """The lookup is element-wise, so the shape is unchanged."""
        return input_shape

    def call(self, inputs):
        """Return the integer id of every string in ``inputs``."""
        return self.lookup.lookup(inputs)

    def get_config(self):
        """Return the layer config, including ``vocabulary`` and ``OOV``."""
        cfg = super().get_config()
        cfg.update(vocabulary=self.vocabulary, OOV=self.OOV)
        return cfg

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer from the dict produced by ``get_config``."""
        return cls(**config)


In [None]:
def char_level_word_embedding(
    filters=(3, 5, 7),
    vocabulary=VOCAB,
    OOV="OOV",
    name="char_level_word_embedding",
    *,
    embedding_dim=100,
    conv_filters=100,
    **kwargs,
):
    """Build a functional Keras model that embeds words from their characters.

    Pipeline: string input -> Tokenizer -> CharacterSplitter -> Encoder ->
    Embedding -> one Conv1D branch per kernel size (applied per word through
    TimeDistributed) -> global max pooling over the characters of each word
    -> concatenation of all branches.

    Args:
        filters: Conv1D kernel sizes; one convolution branch is built per
            entry.
        vocabulary: characters known to the Encoder.
        OOV: out-of-vocabulary token forwarded to the Encoder.
        name: name of the returned model.
        embedding_dim: size of each character embedding vector.
        conv_filters: number of output channels of every convolution branch.
        **kwargs: accepted for backward compatibility and ignored (existing
            callers pass e.g. ``input_shape``).

    Returns:
        A ``tf.keras.Model`` taking a batch of strings; the last axis of its
        output has ``len(filters) * conv_filters`` entries.

    Raises:
        ValueError: if ``filters`` is empty.
    """
    kernel_sizes = tuple(filters)
    if not kernel_sizes:
        raise ValueError("filters must contain at least one kernel size")

    inp = tf.keras.Input(shape=(), dtype=tf.string)
    t = Tokenizer(name="tokenizer")
    c = CharacterSplitter(name="character_splitter")
    # Pass the caller's vocabulary/OOV on instead of silently using defaults.
    e = Encoder(vocabulary=vocabulary, OOV=OOV, name="encoder")
    # The Encoder emits ids 0 .. len(vocabulary) + 1 (two reserved ids ahead
    # of the vocabulary), hence len(vocabulary) + 2 embedding rows.
    emb = tf.keras.layers.Embedding(
        input_dim=len(vocabulary) + 2,
        output_dim=embedding_dim,
    )
    flayers = [
        tf.keras.layers.TimeDistributed(
            tf.keras.layers.Conv1D(
                filters=conv_filters, kernel_size=size, padding="same"
            )
        )
        for size in kernel_sizes
    ]

    # Push the input through the layers.
    embedded = emb(e(c(t(inp))))
    filtered = [layer(embedded) for layer in flayers]
    pooled = [
        tf.keras.layers.TimeDistributed(tf.keras.layers.GlobalMaxPooling1D())(f)
        for f in filtered
    ]
    # Concatenate only when there is more than one branch to join.
    if len(pooled) == 1:
        out = pooled[0]
    else:
        out = tf.keras.layers.Concatenate()(pooled)
    return tf.keras.Model(inp, out, name=name)

## Make full model

In [None]:
class LanguageModel(tf.keras.Model):
    """Model chaining the character-level word embedding, an LSTM and a dense head.

    The constructor creates the sub-layers, runs ``call`` once on a symbolic
    string input, and then initialises the base class a second time with the
    resulting ``inputs``/``outputs``.
    """
    def __init__(self, **kwargs):
        # First base-class init, done before any layer attribute is assigned.
        super().__init__(**kwargs)
        # Symbolic scalar-string input (one string per batch element).
        self.input_layer = tf.keras.Input(shape=(), dtype=tf.string)
        # NOTE(review): char_level_word_embedding, as defined in this file,
        # collects input_shape in **kwargs and does not use it — confirm it
        # is safe to drop.
        self.embedding = char_level_word_embedding(name="word_embedding", input_shape=(None,))
        # The local alias `lm` is not used again inside __init__.
        self.lm = lm = tf.keras.layers.LSTM(1024, name="language_model")
        # NOTE(review): 300 presumably matches the default width of the word
        # embedding (3 kernel sizes x 100 filters) — confirm.
        self.d = tf.keras.layers.Dense(300, name="dense_out")
        # Trace the forward pass once on the symbolic input to get the output.
        self.out = self.call(self.input_layer)
        # Second base-class init, this time with explicit inputs/outputs.
        # NOTE(review): this looks like the subclass-to-functional pattern and
        # relies on Keras accepting a repeated __init__ — confirm against the
        # TensorFlow version in use.
        super().__init__(
            inputs=self.input_layer,
            outputs=self.out,
            **kwargs,
        )

    def call(self, inputs, training=False):
        """Embed ``inputs``, run the LSTM over the result and project it.

        ``training`` is accepted but not forwarded to any sub-layer.
        """
        emb = self.embedding(inputs)
        lm = self.lm(emb)
        return self.d(lm)

In [None]:
# Instantiate the full model (word embedding -> LSTM -> dense head).
lm = LanguageModel(name="full_lang_model")

In [None]:
# Print the layer-by-layer summary of the model.
lm.summary()

## Get dataset and train

In [None]:
# Load the complete dataset from the project's data module.
# NOTE(review): later cells index rows as x[:, 0] / x[:, 1] and x[1], so each
# row presumably holds at least two string fields — confirm against
# fun_with_words.data.get_full_dataset.
data = fd.get_full_dataset()
# Slice `data` along its first axis: one dataset element per row.
ds = tf.data.Dataset.from_tensor_slices(data)

In [None]:
def _to_training_pair(batch):
    """Turn a batch of rows into an (input, target) pair.

    Column 0 is used as the model input; the target is column 1 passed
    through the model's own word-embedding sub-model.
    """
    inputs = batch[:, 0]
    targets = lm.embedding(batch[:, 1])
    return inputs, targets


# Batches of 32 rows, each mapped to an (input, target) pair.
tds = ds.batch(32).map(_to_training_pair)

In [None]:
# Configure training: Adam optimizer with a mean-squared-error loss.
lm.compile(optimizer='adam', loss='mse')

In [None]:
# Log training to TensorBoard under logs/v1. Per the Keras TensorBoard docs,
# an integer update_freq writes metrics every that many batches (here 1000);
# write_images=True additionally logs model weights as images.
callbacks = [tf.keras.callbacks.TensorBoard(log_dir="logs/v1", update_freq=1000, write_images=True)]

In [None]:
# Train for 100 epochs on the batched dataset; prefetch(1) keeps one batch
# prepared ahead of the training step.
history = lm.fit(tds.prefetch(1), epochs=100, callbacks=callbacks)

In [None]:
# Show the values recorded by fit() for each epoch.
history.history

## See what the model has learned (e.g., are there any interesting relationships in vector space?)

In [None]:
# Dataset holding only the second field (index 1) of every row in `data`.
stuff = tf.data.Dataset.from_tensor_slices([x[1] for x in data])

In [None]:
# Embed the words in batches of 100 and collect the per-batch results.
things = list(stuff.batch(100).map(lm.embedding))

In [None]:
# Join the per-batch embedding tensors along the first axis.
all_words_emb = tf.concat(things, axis=0)

In [None]:
# Dot product over the last axis; normalize=True L2-normalises both inputs
# first, so the result is a cosine similarity.
dot = tf.keras.layers.Dot(name="sim", axes=-1, normalize=True)

In [None]:
# Embed the probe word "cat" with the model's word-embedding sub-model.
p = lm.embedding(tf.constant(["cat"]))

In [None]:
# Similarity between the probe and the embedded words; all_words_emb gets a
# leading axis of size 1 before it is handed to the Dot layer.
sim = dot([p, tf.expand_dims(all_words_emb, 0)])

In [None]:
# Squeeze the similarities, restore a single leading axis, and take the 25
# highest-scoring positions; then show the matching rows of `data`.
top_25 = tf.math.top_k(tf.expand_dims(tf.squeeze(sim), 0), k=25)
[data[idx] for idx in top_25.indices.numpy()[0]]

In [None]:
# Indices of the 5 largest values along the last axis of `sim` (first row of
# the result).
tf.math.top_k(sim, 5).indices.numpy()[0]