In [None]:
import re
import string
import pandas as pd
import tensorflow as tf

from model import Word2Vec

from tensorflow.keras.optimizers import Adam

In [None]:
# Read the cleansed Pokémon table; the path is relative to the notebook's working directory
csv_path = "data/pokemon_cleansed.csv"
df = pd.read_csv(csv_path)
df

In [None]:
# Training corpus: the "description" column as a plain Python list, one entry per row
descriptions = df["description"].tolist()

# Clean and tokenize text
def preprocess_text(text):
    """Normalize a raw description and split it into word tokens.

    The text is lowercased, every character in ``string.punctuation`` is
    deleted, and the remainder is split on runs of whitespace.

    Args:
        text (str): Raw description text.

    Returns:
        list[str]: Lowercased tokens with punctuation removed. Empty,
        whitespace-only, or punctuation-only input yields ``[]``.
    """
    text = text.lower()
    # Delete punctuation via a translation table instead of interpolating
    # string.punctuation into a regex character class: unescaped, the "\]"
    # inside it is parsed as an escaped "]", so backslashes were never removed.
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text.split()

# Tokenize every description: `dataset` is a list with one token list per description
dataset = [preprocess_text(sentence) for sentence in descriptions]

# Show only the size and a short preview; printing the whole corpus floods the cell output
print(f"{len(dataset)} tokenized descriptions; first 3:")
dataset[:3]

In [None]:
# Seed the Python, NumPy and TensorFlow RNGs before the model is built so that
# Restart & Run All gives repeatable results (some GPU kernels may still be
# nondeterministic).
SEED = 123
tf.keras.utils.set_random_seed(SEED)

# Hyperparameters
window_size = 10    # forwarded to Word2Vec.prepare_dataset
embedding_dim = 10  # forwarded to the Word2Vec constructor
epochs = 10

optimizer = Adam(learning_rate=0.01)
# from_logits=True: the loss applies the sigmoid itself, so it expects raw
# (unnormalized) scores — NOTE(review): confirm the model in model.py emits logits.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

# Build the model from the tokenized corpus, derive the training set, then train
word2vec = Word2Vec(dataset, embedding_dim)
train_dataset = word2vec.prepare_dataset(window_size)
word2vec.compile(optimizer, loss)
word2vec.fit(train_dataset, epochs=epochs)

In [None]:
# Show the trained model's summary (Word2Vec is defined in model.py;
# presumably this is the Keras layer/parameter table — TODO confirm)
word2vec.summary()

In [None]:
# Example: look up the learned embedding for one word.
# Query words must be lowercase, because the corpus was lowercased during preprocessing.
word2vec.get_word_embedding("pikachu")

In [None]:
# Example: similarity between a Pokémon name and a word presumably related to it in the descriptions
word2vec.compute_similarity("bulbasaur", "seed")

In [None]:
# Example: the same Pokémon against a presumably unrelated word, for contrast with the "seed" score
word2vec.compute_similarity("bulbasaur", "fire")

In [None]:
from tests.poke_types import POKEMONS_BY_TYPE

# Flatten the per-type groups of POKEMONS_BY_TYPE into one list, keeping dict order
all_pokemons = [name for group in POKEMONS_BY_TYPE.values() for name in group]

print(all_pokemons)

In [None]:
# Keep only the Pokémon names present in the trained vocabulary.
# Build a new list instead of popping while iterating: removing an element
# mid-loop shifts the remaining items left, so the entry right after each
# removal was skipped and could stay in the list despite being out of vocabulary.
all_pokemons = [pokemon for pokemon in all_pokemons if pokemon in word2vec.vocabulary]

In [None]:
# Plot the embeddings of the selected Pokémon with dim=2; rnd_seed is fixed
# (presumably seeds the dimensionality reduction inside model.py — TODO confirm)
word2vec.visualize_embeddings(dim=2, rnd_seed=123, words=all_pokemons)

In [None]:
# Same plot with dim=3, using the same words and the same fixed rnd_seed
word2vec.visualize_embeddings(dim=3, rnd_seed=123, words=all_pokemons)