<a href="https://colab.research.google.com/github/gchakri4u/DNNRepo/blob/main/Skip_Gram_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Summary**
* The Skip-Gram model takes a center word as input and tries to predict its surrounding (context) words as output
* The weights of the **first projection layer (no activation function)** are the embeddings we want to extract

In [None]:
import re
import tensorflow as tf
import numpy as np

In [None]:
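# Sample Text: `sample_text.txt` must be in the working directory before the pipeline cell runs.
# Uncomment the next line to download it (e.g. on a fresh Colab runtime).
#!wget https://raw.githubusercontent.com/mshossain/TextEmbeddings/refs/heads/main/sample_text.txt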

In [None]:
def read_file(file_path):
  """Return the entire contents of a text file as a single string.

  Args:
    file_path: Path of the file to read.

  Returns:
    The decoded text of the file.

  Raises:
    OSError: If the file cannot be opened (for example, it does not exist).
    UnicodeDecodeError: If the file content is not valid UTF-8.
  """
  # Decode explicitly as UTF-8 so the result does not depend on the
  # platform's default locale encoding.
  with open(file_path, 'r', encoding='utf-8') as file:
    return file.read()

In [None]:
def tokenizer(input):
  """Turn raw text into a list of lowercase tokens without punctuation.

  Args:
    input: The text to tokenize.

  Returns:
    The whitespace-separated tokens of the lowercased text, after every
    character that is neither a word character nor whitespace was removed.
  """
  lowered = input.lower()
  # Anything that is not a word character (\w) or whitespace (\s) is dropped,
  # e.g. . ; ) ( "
  without_punctuation = re.compile(r'[^\w\s]').sub('', lowered)
  return without_punctuation.split()

In [None]:
def build_word_mapping(words):
  """Build word -> id and id -> word lookups for every unique word.

  Ids are assigned consecutively from 0 in order of first appearance.

  Args:
    words: Iterable of (hashable) tokens; duplicates are allowed.

  Returns:
    A tuple ``(word_to_id, id_to_word)`` of two dicts that are inverses of
    each other.
  """
  # dict.fromkeys keeps the first occurrence of each word, in order.
  unique_words = list(dict.fromkeys(words))
  word_to_id = {word: index for index, word in enumerate(unique_words)}
  id_to_word = dict(enumerate(unique_words))
  return word_to_id, id_to_word

In [None]:
def prepare_skipgram_data(words,word_to_id,window_size):
  """Generate (center, context) training pairs for a skip-gram model.

  Args:
    words: Sequence of tokens in corpus order.
    word_to_id: Mapping from token to integer id; must contain every token.
    window_size: Number of neighbours considered on each side of a center
      word.

  Returns:
    A list of single-entry dicts ``{center_id: context_id}``, one per
    (center, neighbour) combination, ordered by center position and then by
    neighbour position.
  """
  total = len(words)
  pairs = []
  for center_pos, center_token in enumerate(words):
    center_word = word_to_id[center_token]
    # Clamp the window to the corpus boundaries instead of testing each index.
    first = max(0, center_pos - window_size)
    last_exclusive = min(total, center_pos + window_size + 1)
    for context_pos in range(first, last_exclusive):
      if context_pos == center_pos:
        continue  # a word is not its own context
      pairs.append({center_word: word_to_id[words[context_pos]]})
  return pairs

In [None]:
class SkipGramModel(tf.keras.Model):
  """Two-layer skip-gram network: an embedding lookup followed by a Dense layer.

  Attributes:
    embedding: Embedding layer with ``vocab_size`` rows of ``embedding_dims``
      values each.
    output_layer: Dense layer with ``vocab_size`` units; no activation is
      passed to it.
  """

  def __init__(self, vocab_size, embedding_dims):
    """Create the two layers.

    Args:
      vocab_size: Number of distinct word ids.
      embedding_dims: Length of each embedding vector.
    """
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(
        input_dim=vocab_size, output_dim=embedding_dims)
    self.output_layer = tf.keras.layers.Dense(units=vocab_size)

  def call(self, inputs):
    """Forward pass: look up embeddings, then project onto the vocabulary.

    Args:
      inputs: Tensor of word ids.

    Returns:
      The Dense layer's output, with ``vocab_size`` values per input id.
    """
    # For a 1-D batch of ids this is (batch_size, embedding_dims).
    embedded = self.embedding(inputs)
    # ... and this is (batch_size, vocab_size).
    scores = self.output_layer(embedded)
    return scores

In [None]:
def create_batches_with_shuffle(data, batch_size):
  """Return the items of ``data`` in random order, grouped into batches.

  The input sequence is left untouched: shuffling is done through a random
  permutation of indices, so the caller's data keeps its original order.

  Args:
    data: Indexable sequence of training samples.
    batch_size: Maximum number of samples per batch; must be positive.

  Returns:
    A list of lists. Every batch holds ``batch_size`` samples except
    possibly the last one, which holds the remainder. Empty ``data`` yields
    an empty list.

  Raises:
    ValueError: If ``batch_size`` is not positive.
  """
  if batch_size <= 0:
    # A negative step would silently produce no batches at all.
    raise ValueError("batch_size must be a positive integer")
  order = np.random.permutation(len(data)).tolist()
  shuffled = [data[position] for position in order]
  return [shuffled[start:start + batch_size]
          for start in range(0, len(shuffled), batch_size)]

In [None]:
def train(model,data,batch_size=32,epochs=10):
  """Train ``model`` on skip-gram pairs with a hand-written gradient loop.

  Relies on the notebook-level names ``word_to_id``, ``loss_fn`` and
  ``optimizer``, which must be defined before this function is called.

  Args:
    model: Model invoked on a tensor of center-word ids; its
      ``trainable_variables`` are updated.
    data: Sequence of single-entry dicts ``{center_id: context_id}``. It is
      shuffled and batched anew each epoch via
      ``create_batches_with_shuffle``.
    batch_size: Number of pairs per batch.
    epochs: Number of passes over ``data``.

  Returns:
    None. The summed batch loss is printed once per epoch.
  """
  for epoch in range(epochs):
    total_loss = 0
    for batch in create_batches_with_shuffle(data, batch_size):
      # Every sample holds exactly one entry: key = center id, value = context id.
      pairs = [list(sample.items())[0] for sample in batch]
      center_ids = [center for center, _ in pairs]
      context_ids = [context for _, context in pairs]

      center_words_batch = tf.convert_to_tensor(center_ids)
      # Targets are one-hot rows over the whole vocabulary.
      context_words_batch = tf.one_hot(context_ids, depth=len(word_to_id))

      # Record the forward pass so gradients can be taken afterwards.
      with tf.GradientTape() as tape:
        logits = model(center_words_batch)
        loss = loss_fn(context_words_batch, logits)

      variables = model.trainable_variables
      gradients = tape.gradient(loss, variables)
      optimizer.apply_gradients(zip(gradients, variables))
      total_loss += loss
    print("Epoch: {} Loss: {}".format(epoch, total_loss))

In [None]:
# Hyper Parameters
epochs = 150
batch_size = 64
learning_rate = 0.01
embedding_dims = 15
window_size = 3  # context words taken on each side of the center word
seed = 42        # fixed seed so shuffling and weight initialisation are repeatable
#-------------------

# 0.Seed NumPy (batch shuffling) and TensorFlow (weight initialisation)
np.random.seed(seed)
tf.random.set_seed(seed)

# 1.Read File
text = read_file('sample_text.txt')
print(text)

# 2.Tokenize the file data
words = tokenizer(text)
print(words)

# 3.Create Word to Id and Id to Word Mappings
word_to_id, id_to_word = build_word_mapping(words)
print("WORD_TO_ID:",word_to_id)
print("LEN(WORD_TO_ID):",len(word_to_id))

# 4. Create Skip-Gram data {center_word,context_word}
skipgram_data = prepare_skipgram_data(words,word_to_id,window_size)
print("SKIPGRAM_DATA:",skipgram_data)

# 5.Create the model
vocab_size = len(word_to_id)
model = SkipGramModel(vocab_size, embedding_dims)
# Run one dummy forward pass so the layer weights exist before summary() is
# called; an unbuilt subclassed model has no shapes or parameter counts to report.
model(tf.constant([0]))

# 6.Define optimizer and loss function
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
# The model's final Dense layer has no softmax, so it emits raw logits.
# from_logits=True makes the loss apply the softmax itself; without it the
# logits would be misread as probabilities and training would not converge.
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

# 7.Compile the model with optimizer and Loss Function
model.compile(optimizer=optimizer, loss=loss_fn)
model.summary()

# 8.Train the Model
train(model,skipgram_data,batch_size,epochs)
model.summary()

The cat sat on the mat.
The dog lay on the rug.
The cat chased the rat.
The dog barked at the cat.
The mat was next to the rug.
The dog and the cat slept together on the mat.
The mat and the rug were dirty.
The tree is in the backyard.
The bird flew over the trees.
The bird sang in the tree.
The bird liked to play near the tree.
The tree was tall and the bird liked to sit on it.

['the', 'cat', 'sat', 'on', 'the', 'mat', 'the', 'dog', 'lay', 'on', 'the', 'rug', 'the', 'cat', 'chased', 'the', 'rat', 'the', 'dog', 'barked', 'at', 'the', 'cat', 'the', 'mat', 'was', 'next', 'to', 'the', 'rug', 'the', 'dog', 'and', 'the', 'cat', 'slept', 'together', 'on', 'the', 'mat', 'the', 'mat', 'and', 'the', 'rug', 'were', 'dirty', 'the', 'tree', 'is', 'in', 'the', 'backyard', 'the', 'bird', 'flew', 'over', 'the', 'trees', 'the', 'bird', 'sang', 'in', 'the', 'tree', 'the', 'bird', 'liked', 'to', 'play', 'near', 'the', 'tree', 'the', 'tree', 'was', 'tall', 'and', 'the', 'bird', 'liked', 'to', 'sit', 'on

Epoch: 0 Loss: 64.00416564941406
Epoch: 1 Loss: 50.62116241455078
Epoch: 2 Loss: 44.01335906982422
Epoch: 3 Loss: 55.26911926269531
Epoch: 4 Loss: 91.08551025390625
Epoch: 5 Loss: 98.94171905517578
Epoch: 6 Loss: 97.01768493652344
Epoch: 7 Loss: 94.09891510009766
Epoch: 8 Loss: 92.01155090332031
Epoch: 9 Loss: 88.90156555175781
Epoch: 10 Loss: 78.54145812988281
Epoch: 11 Loss: 62.859466552734375
Epoch: 12 Loss: 63.133644104003906
Epoch: 13 Loss: 67.70219421386719
Epoch: 14 Loss: 63.141319274902344
Epoch: 15 Loss: 66.74856567382812
Epoch: 16 Loss: 52.896366119384766
Epoch: 17 Loss: 60.88357925415039
Epoch: 18 Loss: 68.3677978515625
Epoch: 19 Loss: 59.66046142578125
Epoch: 20 Loss: 67.49078369140625
Epoch: 21 Loss: 63.42261505126953
Epoch: 22 Loss: 67.83116149902344
Epoch: 23 Loss: 59.167057037353516
Epoch: 24 Loss: 60.898460388183594
Epoch: 25 Loss: 69.54627227783203
Epoch: 26 Loss: 60.207984924316406
Epoch: 27 Loss: 55.99566650390625
Epoch: 28 Loss: 72.39945983886719
Epoch: 29 Loss: 68

In [None]:
# Pull the weight matrix out of the trained embedding layer; row i is the
# vector of the word whose id is i.
embedding_layer = model.embedding
embeddings = embedding_layer.get_weights()[0]

# Show the learned vector of every vocabulary word
for word in word_to_id:
  idx = word_to_id[word]
  print(f"Word: {word}, Embedding: {embeddings[idx]}")

Word: the, Embedding: [ 0.10177308  0.03381066  0.09507633 -0.06960264 -0.0260177  -0.03545347
 -0.03793484  0.01664033  0.08629634 -0.01554542  0.11156334 -0.08473919
  0.1147178   0.02996657  0.1844858 ]
Word: cat, Embedding: [ 0.1611595   0.18163915  0.03886642 -0.08201448 -0.03473586  0.01416276
 -0.15370496  0.12088953 -0.05016429  0.01425326  0.09057663  0.14792831
  0.0720284  -0.06700073 -0.10452331]
Word: sat, Embedding: [ 0.1782241  -0.03263541  0.15457503  0.01060199  0.09235221 -0.00439892
 -0.1034523   0.14384605  0.08423572 -0.0361719   0.02883651  0.10898793
 -0.0173884  -0.03205472 -0.19427091]
Word: on, Embedding: [ 0.03168623  0.06772245 -0.01003242  0.17845942 -0.27230483 -0.3767427
  0.16985036 -0.0455696  -0.08540525 -0.08113692 -0.09503768 -0.13793829
 -0.13747314  0.13897769 -0.11141122]
Word: mat, Embedding: [ 3.16731542e-01 -2.49721372e-04 -3.76161262e-02  1.46496668e-01
  1.68852881e-01  3.55079286e-02 -1.24433286e-01  1.23602055e-01
 -5.15296161e-02 -1.141810

In [None]:
def cosine_similarity(word1, word2):
  """Compute the cosine similarity of two words' embedding vectors.

  Reads the notebook-level ``word_to_id`` mapping and ``embeddings`` matrix.

  Args:
    word1: First word.
    word2: Second word.

  Returns:
    ``None`` if either word is missing from ``word_to_id``; ``0`` if either
    vector has zero length; otherwise the dot product of the two vectors
    divided by the product of their norms.
  """
  if not (word1 in word_to_id and word2 in word_to_id):
    return None

  vector_a = embeddings[word_to_id[word1]]
  vector_b = embeddings[word_to_id[word2]]

  norm_a = np.linalg.norm(vector_a)
  norm_b = np.linalg.norm(vector_b)

  # Guard against dividing by zero for an all-zero vector.
  if norm_a == 0 or norm_b == 0:
    return 0

  return np.dot(vector_a, vector_b) / (norm_a * norm_b)

In [None]:
# Compare two vocabulary words using the trained embeddings
word1, word2 = "cat", "dog"
similarity = cosine_similarity(word1, word2)

# cosine_similarity signals an unknown word by returning None
if similarity is None:
  print(f"One or both of the words are not in the vocabulary.")
else:
  print(f"Cosine similarity between '{word1}' and '{word2}': {similarity}")


Cosine similarity between 'cat' and 'dog': 0.12253345549106598


**References**
* https://developers.google.com/machine-learning/crash-course/embeddings/obtaining-embeddings
* https://medium.com/nearist-ai/word2vec-tutorial-the-skip-gram-model-c7926e1fdc09
* https://github.com/mshossain/TextEmbeddings
* https://medium.com/@stefanhebuaa/should-i-use-model-fit-or-tf-gradienttape-in-tensorflow-ec8664067a3