In [None]:
import re
import string
import tensorflow as tf
import numpy as np
import pandas as pd

from model import Word2Vec

from tensorflow.keras.layers import Embedding, Dot
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [2]:
# Load the cleansed Pokémon descriptions; the bare `df` on the last line renders the frame as the cell output.
# NOTE(review): the displayed "Unnamed: 0" column looks like a row index saved into the CSV —
# consider index_col=0 after confirming against the file.
df = pd.read_csv("data/pokemon_cleansed.csv")
df

Unnamed: 0,english_name,description
0,bulbasaur,grass seed pokémon there is a plant seed on b...
1,ivysaur,grass seed pokémon when the bulb on ivysaurs ...
2,venusaur,grass seed pokémon venusaurs plant blooms whe...
3,charmander,fire lizard pokémon charmander has a preferen...
4,charmeleon,fire flame pokémon charmeleon has a barbaric ...
...,...,...
146,dratini,dragon dragon pokémon dratini dwells near bodi...
147,dragonair,dragon dragon pokémon dragonair lives in prist...
148,dragonite,dragon dragon pokémon dragonite is a kindhear...
149,mewtwo,psychic genetic pokémon mewtwos dna is almost...


In [22]:
# Build the training corpus: one raw description string per row of the frame
descriptions = df["description"].tolist()

# Clean and tokenize text
def preprocess_text(text):
    """Lowercase ``text``, strip ASCII punctuation and split it into word tokens.

    Args:
        text: Raw string to normalise.

    Returns:
        List of whitespace-separated tokens with every character from
        ``string.punctuation`` removed (an empty list for empty or blank input).
    """
    text = text.lower()  # lowercase
    # Escape the punctuation before embedding it in a character class: left
    # unescaped, the backslash in string.punctuation escapes the following "]",
    # so the backslash itself would never be removed.
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # remove punctuation
    words = text.split()  # tokenize into words
    return words

# Apply to all sentences
dataset = [preprocess_text(sentence) for sentence in descriptions]
# Preview only the first few tokenized descriptions: printing the whole corpus
# floods the cell output and bloats the saved notebook.
print(f"{len(dataset)} tokenized descriptions; first 3: {dataset[:3]}")



In [23]:
# Hyperparameters: window_size is passed to prepare_dataset, embedding_dim to the
# Word2Vec constructor, epochs to fit.
window_size = 10
embedding_dim = 20
epochs = 50
seed = 42

# Seed the Python, NumPy and TensorFlow RNGs before the model and dataset are
# built, so weight initialisation and any random sampling/shuffling repeat
# across Restart & Run All.
tf.keras.utils.set_random_seed(seed)

optimizer = Adam(learning_rate=0.01)
# from_logits=True: the loss treats the model output as raw scores
# (assumes Word2Vec emits unnormalised scores — confirm in model.py).
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

word2vec = Word2Vec(dataset, embedding_dim)
train_dataset = word2vec.prepare_dataset(window_size)
word2vec.compile(optimizer, loss)
# Keep the History object so the loss curve can be inspected later and the
# cell does not end with a bare object repr.
history = word2vec.fit(train_dataset, epochs=epochs)



Word2Vec vocabulary size: 1471
Sample word pairs: [('grass', 'seed'), ('grass', 'pokémon'), ('grass', 'there'), ('grass', 'is'), ('grass', 'a')]
Batch shape: (128, 2)
Batch contents: [[1268  556]
 [1114 1429]
 [ 987  556]
 [ 758   34]
 [  11  188]
 [ 393  636]
 [1146  343]
 [1259  581]
 [ 111 1096]
 [ 276  492]
 [ 674 1406]
 [1326  223]
 [ 556 1443]
 [ 873 1171]
 [1378  343]
 [ 273 1124]
 [  11 1010]
 [1096  429]
 [ 867  942]
 [1102 1449]
 [ 849 1114]
 [ 556 1025]
 [1288  627]
 [  25 1378]
 [ 454  607]
 [1232 1423]
 [ 178 1202]
 [1096  556]
 [1081  410]
 [ 888 1096]
 [1259  178]
 [ 422  936]
 [ 627  343]
 [ 223  223]
 [ 304 1041]
 [1112  880]
 [ 521 1017]
 [ 729 1193]
 [ 989 1445]
 [1121 1422]
 [1096  986]
 [ 800 1096]
 [   6  343]
 [1378 1449]
 [1268  234]
 [1294 1349]
 [1096  181]
 [ 942   11]
 [1137 1292]
 [1112 1098]
 [1422  986]
 [1234  421]
 [  11 1096]
 [ 798   42]
 [ 998   11]
 [ 556 1114]
 [1114  223]
 [1316  760]
 [1458 1117]
 [ 585 1301]
 [ 391  223]
 [ 986 1303]
 [ 761  478

<keras.src.callbacks.History at 0x35330e850>

In [24]:
# Print the layer/parameter table. In the recorded run the 29420 embedding weights
# equal the 1471-word vocabulary times the 20 embedding dimensions.
word2vec.summary()

Model: "word2_vec_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 word_embedding (Embedding)  multiple                  29420     
                                                                 
 dot_8 (Dot)                 multiple                  0         
                                                                 
Total params: 29420 (114.92 KB)
Trainable params: 29420 (114.92 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [25]:
# Example usage: look up the learned vector for a single word
# (the recorded output has 20 float32 components).
word2vec.get_word_embedding("pikachu")

array([-0.8940277 ,  0.9868017 , -1.0189716 ,  0.9820899 , -1.1174933 ,
       -1.0046421 , -0.60882014,  0.8898803 , -1.0056114 , -0.44581392,
        0.69638526, -0.98456955,  1.0080854 ,  0.7706229 ,  0.6281304 ,
       -0.2901547 , -0.83576596, -0.98115563,  0.895597  ,  0.8978242 ],
      dtype=float32)

In [49]:
# Example usage: score a Pokémon name against a word from its own description.
# NOTE(review): the recorded result (16.8) exceeds 1, so this is not a cosine
# similarity — presumably a raw dot product; confirm in model.py.
word2vec.compute_similarity("bulbasaur", "seed")

16.817562

In [50]:
# Example usage: score the same Pokémon name against the word "fire".
# NOTE(review): the recorded score (18.5) is higher than the one recorded for
# "seed" (16.8), which is surprising — check how the score is normalised and
# how well the model trained before drawing conclusions.
word2vec.compute_similarity("bulbasaur", "fire")

18.500116