In [17]:
import re
import string
import tensorflow as tf
import numpy as np
import pandas as pd

from tensorflow.keras.layers import Embedding, Dot
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [5]:
df = pd.read_csv("data/pokemon_cleansed.csv")
df

Unnamed: 0,english_name,description
0,bulbasaur,grass seed pokémon there is a plant seed on b...
1,ivysaur,grass seed pokémon when the bulb on ivysaurs ...
2,venusaur,grass seed pokémon venusaurs plant blooms whe...
3,charmander,fire lizard pokémon charmander has a preferen...
4,charmeleon,fire flame pokémon charmeleon has a barbaric ...
...,...,...
146,dratini,dragon dragon pokémon dratini dwells near bodi...
147,dragonair,dragon dragon pokémon dragonair lives in prist...
148,dragonite,dragon dragon pokémon dragonite is a kindhear...
149,mewtwo,psychic genetic pokémon mewtwos dna is almost...


In [None]:
# Get training corpus
descriptions = list(df["description"].values)

# clean and tokenize text
def preprocess_text(text):
    text = text.lower()  # lowercase
    text = re.sub(f"[{string.punctuation}]", "", text)  # remove punctuation
    words = text.split()  # tokenize into words
    return words

# Apply to all sentences
tokenized_descriptions = [preprocess_text(sentence) for sentence in descriptions]

print(tokenized_descriptions)



In [12]:
# Create a vocabulary from tokenized sentences
vocabulary = set(word for sentence in tokenized_descriptions for word in sentence)
word2idx = {word: idx for idx, word in enumerate(vocabulary)}
idx2word = {idx: word for word, idx in word2idx.items()}

print("Vocabulary size:", len(vocabulary))
print("Vocabulary words and indexes:", word2idx)

Vocabulary size: 1471


In [15]:
# Generate (center, context) word pairs
window_size = 5  # Define how many words before/after to consider
pairs = []

for sentence in tokenized_descriptions:
    for idx, center_word in enumerate(sentence):
        context_start = max(0, idx - window_size)
        context_end = min(len(sentence), idx + window_size + 1)

        for context_idx in range(context_start, context_end):
            if context_idx != idx:  # Avoid self-pairing
                pairs.append((word2idx[center_word], word2idx[sentence[context_idx]]))

print("Sample word pairs:", [(idx2word[c], idx2word[ctx]) for c, ctx in pairs[:5]])

Sample word pairs: [('grass', 'seed'), ('grass', 'pokémon'), ('grass', 'there'), ('grass', 'is'), ('grass', 'a')]


In [50]:
# Extract center and context words as separate lists
center_words, context_words = zip(*pairs)
center_words = np.array(center_words, dtype=np.int32)
context_words = np.array(context_words, dtype=np.int32)
labels = np.ones(len(center_words), dtype=np.float32)  # Positive examples

# Create a TensorFlow dataset
dataset = tf.data.Dataset.from_tensor_slices(((center_words, context_words), labels))
dataset = dataset.map(lambda pair, label: ((tf.stack(pair), label), label))  # Ensure correct shape
dataset = dataset.shuffle(10000).batch(128)

In [52]:
# Print dataset to verify
for element in dataset.take(1):
    batch, label = element
    print("Batch shape:", batch[0].shape)
    print("Batch contents:", batch[0].numpy())
    print("Labels:", label.numpy())

Batch shape: (128, 2)
Batch contents: [[1317  801]
 [1393 1196]
 [ 912  670]
 [1183 1393]
 [ 371  211]
 [ 722  689]
 [ 912  912]
 [ 814 1196]
 [ 713  912]
 [1332  292]
 [1271  474]
 [ 582  859]
 [ 825  769]
 [ 912  211]
 [1183  112]
 [ 284    3]
 [ 211  984]
 [ 523 1216]
 [ 691 1215]
 [ 477  606]
 [   3 1158]
 [ 171  154]
 [ 277  747]
 [ 912 1138]
 [  88  211]
 [  84 1183]
 [1354  211]
 [1232 1338]
 [1393  211]
 [1196  448]
 [ 875 1442]
 [ 997  437]
 [ 221   84]
 [ 912 1326]
 [  84 1244]
 [  84  258]
 [ 311 1217]
 [ 490 1466]
 [1466 1097]
 [ 197   90]
 [ 149 1330]
 [ 315  938]
 [ 713  606]
 [ 211   43]
 [1265 1180]
 [ 424 1365]
 [1158 1096]
 [ 277  502]
 [ 648  211]
 [ 211  306]
 [1183  391]
 [ 713  448]
 [1183 1380]
 [1424 1375]
 [ 117  912]
 [ 666  294]
 [  67   67]
 [1232  579]
 [ 912 1347]
 [1179  389]
 [ 211   75]
 [ 713 1106]
 [ 912 1148]
 [ 974  630]
 [1287   83]
 [  52   54]
 [ 211  950]
 [ 955  579]
 [  84  648]
 [1183  217]
 [1365 1263]
 [ 349 1232]
 [ 508 1196]
 [1452  292]


In [109]:
# Define embedding size
embedding_dim = 20  # Small embedding space as per your preference
vocab_size = len(word2idx)  # Vocabulary size

# Define the Word2Vec model
class Word2Vec(Model):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.embedding = Embedding(vocab_size, embedding_dim, name="word_embedding")
        self.dot = Dot(axes=1, normalize=False)  # Computes similarity between embeddings
    
    def call(self, inputs):
        pair, label = inputs
        pair = tf.reshape(pair, (-1, 2))  # Ensure correct shape
        center_embedding = self.embedding(pair[:, 0])  
        context_embedding = self.embedding(pair[:, 1])
        dot_product = self.dot([center_embedding, context_embedding])
        return dot_product

# Initialize model
word2vec = Word2Vec(vocab_size, embedding_dim)

# Compile with Negative Sampling loss (Binary Cross-Entropy)
word2vec.compile(optimizer=Adam(learning_rate=0.01), loss=tf.keras.losses.BinaryCrossentropy(from_logits=True))



In [110]:
# Define labels (1 for real context words, 0 for negative samples)
labels = np.ones(len(center_words))  # Positive examples

# Train the model
word2vec.fit(dataset, epochs=20)

# Extract trained embeddings
trained_embeddings = word2vec.get_layer("word_embedding").get_weights()[0]

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [124]:
# Function to get the embedding of a word
def get_embedding(word):
    word_index = word2idx[word]
    return trained_embeddings[word_index]

def dot_product_similarity(embedding1, embedding2):
    return np.dot(embedding1, embedding2)

# Example usage
word1 = "grass"
word2 = "bulbasaur"

similarity = dot_product_similarity(
    get_embedding(word1), 
    get_embedding(word2)
)
print(f"Cosine similarity between '{word1}' and '{word2}': {similarity}")

Cosine similarity between 'grass' and 'bulbasaur': 11.282064437866211
