In [1]:
import re
import string
import pandas as pd
import tensorflow as tf
keras = tf.keras

from model import Word2Vec

from tensorflow.keras.optimizers import Adam

In [2]:
df = pd.read_csv("data/pokemon_cleansed.csv")
df

Unnamed: 0,english_name,description
0,bulbasaur,grass seed pokémon there is a plant seed on b...
1,ivysaur,grass seed pokémon when the bulb on ivysaurs ...
2,venusaur,grass seed pokémon venusaurs plant blooms whe...
3,charmander,fire lizard pokémon charmander has a preferen...
4,charmeleon,fire flame pokémon charmeleon has a barbaric ...
...,...,...
146,dratini,dragon dragon pokémon dratini dwells near bodi...
147,dragonair,dragon dragon pokémon dragonair lives in prist...
148,dragonite,dragon dragon pokémon dragonite is a kindhear...
149,mewtwo,psychic genetic pokémon mewtwos dna is almost...


In [3]:
# Put together a training corpus
descriptions = list(df["description"].values)

# Clean and tokenize text
def preprocess_text(text):
    text = text.lower()  # lowercase
    text = re.sub(f"[{string.punctuation}]", "", text)  # remove punctuation
    words = text.split()  # tokenize into words
    return words

# Apply to all sentences
dataset = [preprocess_text(sentence) for sentence in descriptions]
print(dataset)



In [None]:
window_size = 10
embedding_dim = 3

epochs = 2
# Use Adam optimizer with learning rate decay
optimizer = tf.keras.optimizers.legacy.Adam(
    learning_rate=tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=1e-3,
        decay_steps=1000,
        decay_rate=0.96
    )
)

# better with neg examples
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
#loss = tf.keras.losses.CosineSimilarity()

word2vec = Word2Vec(dataset, embedding_dim)
train_dataset = word2vec.prepare_dataset(window_size)
word2vec.compile(optimizer, loss, metrics=['accuracy'])
word2vec.fit(train_dataset, epochs=epochs)

Sample word pairs: [('grass', 'seed'), ('grass', 'pokémon'), ('grass', 'there'), ('grass', 'is'), ('grass', 'a')]


2025-03-11 07:39:51.452853: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2025-03-11 07:39:51.452871: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-03-11 07:39:51.452876: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-03-11 07:39:51.453029: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-03-11 07:39:51.453044: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/2


2025-03-11 07:39:52.239321: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 2/2


<keras.src.callbacks.History at 0x1763b7f50>

In [6]:
word2vec.summary()

Model: "word2_vec"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 word_embedding (Embedding)  multiple                  4413      
                                                                 
 dot (Dot)                   multiple                  0         
                                                                 
Total params: 4413 (17.24 KB)
Trainable params: 4413 (17.24 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [7]:
# Example usage
word2vec.get_word_embedding("pikachu")

array([-0.01124234,  0.07461978,  0.03582945], dtype=float32)

In [8]:
# Example usage
word2vec.compute_similarity("bulbasaur", "seed")

0.88780105

In [9]:
# Example usage
word2vec.compute_similarity("bulbasaur", "fire")

0.8291021

In [21]:
from evaluation_tests.poke_types import POKEMONS_BY_TYPE

all_pokemons = []
for pokemons in POKEMONS_BY_TYPE.values():
    all_pokemons.extend(pokemons)

print(all_pokemons)

['charmander', 'charmeleon', 'charizard', 'vulpix', 'ninetales', 'growlithe', 'arcanine', 'ponyta', 'rapidash', 'magmar', 'flareon', 'moltres', 'squirtle', 'wartortle', 'blastoise', 'psyduck', 'golduck', 'poliwag', 'poliwhirl', 'poliwrath', 'tentacool', 'tentacruel', 'slowpoke', 'slowbro', 'seel', 'dewgong', 'shellder', 'cloyster', 'krabby', 'kingler', 'horsea', 'seadra', 'goldeen', 'seaking', 'staryu', 'starmie', 'magikarp', 'gyarados', 'lapras', 'bulbasaur', 'ivysaur', 'venusaur', 'oddish', 'vileplume', 'paras', 'parasect', 'bellsprout', 'weepinbell', 'victreebel', 'exeggcute', 'exeggutor', 'tangela']


In [22]:
pokemon_to_type = {}
for type_name, pokemon_list in POKEMONS_BY_TYPE.items():
    for pokemon in pokemon_list:
        pokemon_to_type[pokemon] = type_name

poke_types = [pokemon_to_type[poke] for poke in all_pokemons]
poke_and_types = {poke: poke_type for poke, poke_type in zip(all_pokemons, poke_types)}

In [25]:
pokes_to_remove = []
for pokemon in poke_and_types.keys():
    if not (pokemon in word2vec.vocabulary):
        pokes_to_remove.append(pokemon)         

for poke in pokes_to_remove:
    poke_and_types.pop(poke)

In [26]:
len(all_pokemons)

52

In [34]:
from model_utils import visualize_embeddings

visualize_embeddings(word2vec, dim=2, words=poke_and_types, rnd_seed=5)

In [13]:

# Save the model to a directory
model_save_path = "./saved_model"
word2vec.save(model_save_path)

INFO:tensorflow:Assets written to: ./saved_model/assets


INFO:tensorflow:Assets written to: ./saved_model/assets


In [24]:
model = tf.keras.models.load_model(model_save_path)

In [25]:
model

<keras.src.saving.legacy.saved_model.load.Word2Vec at 0x351f1f5d0>