Training an embedding with a simple dataset where the word relationships are very clear.

In [1]:
# Needed so packages in the parent directory (e.g. `modules`) can be imported
import sys
import tensorflow as tf
import json

from pathlib import Path

# Resolve the directory one level above the current working directory and make
# it importable. The membership check keeps this cell idempotent: re-running it
# does not stack duplicate entries on sys.path.
parent_dir = Path().resolve().parent
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))

In [2]:
from modules.model import Word2Vec
from modules.test_types import test_script
from modules.utils import import_dataset, merge_datasets, _pre_process_sentence, visualize_embeddings

# Evaluation data, used further down as a mapping from type name to a list of
# Pokemon names. JSON text is UTF-8, so decode it explicitly instead of relying
# on the platform's default encoding (which would garble non-ASCII names on
# systems whose locale is not UTF-8).
with open("../data/eval_data_poke_and_types.json", "r", encoding="utf-8") as f:
    POKEMONS_BY_TYPE = json.load(f)

In [3]:
# Load both training corpora up front: the "easy" evolutions-and-types file
# first, then the "hard" Pokedex-entries file.
easy, hard = (
    import_dataset(corpus_path)
    for corpus_path in (
        '../data/train_data_evolutions_and_types.json',
        '../data/train_data_pokedex_entries.json',
    )
)

# Only the easy corpus is trained on in this notebook. The alternative left by
# the original author for mixing in a slice of the hard corpus was:
#     dataset = merge_datasets(easy, hard[:1])
dataset = easy

In [22]:
# --- Hyperparameters ---------------------------------------------------------
SEED = 42           # fixed so a fresh Restart & Run All trains from the same random state
window_size = 10    # forwarded to prepare_dataset(window_size=...)
embedding_dim = 50  # second argument to Word2Vec; the embedding matrix is (vocab, 50)
epochs = 100

# Seed Python's `random`, NumPy and TensorFlow in one call, before the model is
# built and before the training pairs are generated.
tf.keras.utils.set_random_seed(SEED)

# Adam with an exponentially decaying learning rate: starts at 1e-3 and decays
# smoothly by a factor of 0.95 per 1000 optimizer steps.
optimizer = tf.keras.optimizers.legacy.Adam(
    learning_rate=tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=1e-3,
        decay_steps=1000,
        decay_rate=0.95
    )
)

# from_logits=False: the loss treats the model outputs as probabilities.
# NOTE(review): the original comment here read "better with neg examples", yet
# prepare_dataset below is called with num_negative_samples=0. Confirm which of
# the two is intended; the value is left unchanged.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)

# Build the model on the chosen corpus, generate the training pairs, then train.
word2vec = Word2Vec(dataset, embedding_dim)
train_dataset = word2vec.prepare_dataset(window_size=window_size, num_negative_samples=0)
word2vec.compile(optimizer, loss, metrics=['accuracy'])
word2vec.fit(train_dataset, epochs=epochs, verbose=1)

word2vec.summary()

Sample word pairs: [('bulbasaur', 'ivysaur'), ('bulbasaur', 'venusaur'), ('bulbasaur', 'grass'), ('bulbasaur', 'poison'), ('ivysaur', 'bulbasaur')]
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epo

In [23]:
# Pull the first weight array out of the "word_embedding" layer and display its
# shape (presumably vocabulary size x embedding_dim).
embedding_matrix = word2vec.get_layer("word_embedding").get_weights()[0]
embedding_matrix.shape


(128, 50)

In [24]:
# Organize the evaluation data loaded earlier into the structures used for plotting.

# Flat list of every Pokemon name, in the order the type groups appear.
all_pokemons = [name for members in POKEMONS_BY_TYPE.values() for name in members]

# Reverse lookup: Pokemon name -> type. A name listed under several types keeps
# the last type it appears under.
pokemon_to_type = {
    name: type_name
    for type_name, members in POKEMONS_BY_TYPE.items()
    for name in members
}

# Type of each entry in all_pokemons, then the name -> type mapping to plot.
poke_types = [pokemon_to_type[name] for name in all_pokemons]
poke_and_types = dict(zip(all_pokemons, poke_types))

# Drop every name that is missing from the model's vocabulary. The names are
# collected first so the dict is not modified while being iterated.
pokes_to_remove = [name for name in poke_and_types if name not in word2vec.vocabulary]
for name in pokes_to_remove:
    del poke_and_types[name]

In [25]:
# Plot the embeddings of the Pokemon kept in poke_and_types
# (dim=2; presumably a 2-D projection, labelled by the mapped type).
visualize_embeddings(
    word2vec,
    dim=2,
    words=poke_and_types,
    title='Word2Vec Pokembeddings',
)

In [29]:
# Run the project's evaluation script against the trained model and keep the
# returned results object for inspection in the next cell.
# test_script is imported together with the other `modules` imports near the
# top of the notebook, so every dependency is visible in one place.
results = test_script(word2vec)

Pass: 98.0% (294 of 300)
 Fail: 2.0% (6 of 300)


In [30]:
# Failure count recorded on the object returned by test_script
# (presumably the same number as the "Fail" line it printed).
results.fail_counter

6

In [19]:
# Save the trained model to a directory relative to the notebook's working directory.
# NOTE(review): this cell's execution count (19) is lower than the training
# cell's (22), so the artifact on disk may predate the training run shown in
# this notebook. Re-run this cell after training before relying on ./saved_model.
model_save_path = "./saved_model"
word2vec.save(model_save_path)

INFO:tensorflow:Assets written to: ./saved_model/assets


INFO:tensorflow:Assets written to: ./saved_model/assets
